1 /*
2  * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_i18n.h"
18 #include "kmp_itt.h"
19 #include "kmp_wait_release.h"
20 #include "kmp_stats.h"
21 
22 #if OMPT_SUPPORT
23 #include "ompt-specific.h"
24 #endif
25 
26 #include "tsan_annotations.h"
27 
28 /* ------------------------------------------------------------------------ */
29 /* ------------------------------------------------------------------------ */
30 
31 
32 /* forward declaration */
33 static void __kmp_enable_tasking( kmp_task_team_t *task_team, kmp_info_t *this_thr );
34 static void __kmp_alloc_task_deque( kmp_info_t *thread, kmp_thread_data_t *thread_data );
35 static int  __kmp_realloc_task_threads_data( kmp_info_t *thread, kmp_task_team_t *task_team );
36 
37 #ifdef OMP_45_ENABLED
38 static void __kmp_bottom_half_finish_proxy( kmp_int32 gtid, kmp_task_t * ptask );
39 #endif
40 
41 #ifdef BUILD_TIED_TASK_STACK
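// Note: the tied-task stack code below is compiled only when BUILD_TIED_TASK_STACK
// is defined.  The stack is kept as a doubly linked list of blocks of
// TASK_STACK_BLOCK_SIZE entries: ts_first_block is embedded in kmp_task_stack_t,
// further blocks are chained through sb_next/sb_prev, ts_top points at the next
// free slot, and ts_entries counts the tasks currently stored.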
42 
43 //---------------------------------------------------------------------------
44 //  __kmp_trace_task_stack: print the tied tasks from the task stack in order
45 //     from top do bottom
46 //
47 //  gtid: global thread identifier for thread containing stack
48 //  thread_data: thread data for task team thread containing stack
49 //  threshold: value above which the trace statement triggers
50 //  location: string identifying call site of this function (for trace)
51 
52 static void
53 __kmp_trace_task_stack( kmp_int32 gtid, kmp_thread_data_t *thread_data, int threshold, char *location )
54 {
55     kmp_task_stack_t *task_stack = & thread_data->td.td_susp_tied_tasks;
56     kmp_taskdata_t **stack_top = task_stack -> ts_top;
57     kmp_int32 entries = task_stack -> ts_entries;
58     kmp_taskdata_t *tied_task;
59 
60     KA_TRACE(threshold, ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
61                          "first_block = %p, stack_top = %p \n",
62                          location, gtid, entries, task_stack->ts_first_block, stack_top ) );
63 
64     KMP_DEBUG_ASSERT( stack_top != NULL );
65     KMP_DEBUG_ASSERT( entries > 0 );
66 
67     while ( entries != 0 )
68     {
69         KMP_DEBUG_ASSERT( stack_top != & task_stack->ts_first_block.sb_block[0] );
70         // fix up ts_top if we need to pop from previous block
        if ( (entries & TASK_STACK_INDEX_MASK) == 0 )
72         {
73             kmp_stack_block_t *stack_block = (kmp_stack_block_t *) (stack_top) ;
74 
75             stack_block = stack_block -> sb_prev;
76             stack_top = & stack_block -> sb_block[TASK_STACK_BLOCK_SIZE];
77         }
78 
79         // finish bookkeeping
80         stack_top--;
81         entries--;
82 
83         tied_task = * stack_top;
84 
85         KMP_DEBUG_ASSERT( tied_task != NULL );
86         KMP_DEBUG_ASSERT( tied_task -> td_flags.tasktype == TASK_TIED );
87 
88         KA_TRACE(threshold, ("__kmp_trace_task_stack(%s):             gtid=%d, entry=%d, "
89                              "stack_top=%p, tied_task=%p\n",
90                              location, gtid, entries, stack_top, tied_task ) );
91     }
92     KMP_DEBUG_ASSERT( stack_top == & task_stack->ts_first_block.sb_block[0] );
93 
94     KA_TRACE(threshold, ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
95                          location, gtid ) );
96 }
97 
98 //---------------------------------------------------------------------------
99 //  __kmp_init_task_stack: initialize the task stack for the first time
100 //    after a thread_data structure is created.
101 //    It should not be necessary to do this again (assuming the stack works).
102 //
103 //  gtid: global thread identifier of calling thread
104 //  thread_data: thread data for task team thread containing stack
105 
106 static void
107 __kmp_init_task_stack( kmp_int32 gtid, kmp_thread_data_t *thread_data )
108 {
109     kmp_task_stack_t *task_stack = & thread_data->td.td_susp_tied_tasks;
110     kmp_stack_block_t *first_block;
111 
112     // set up the first block of the stack
113     first_block = & task_stack -> ts_first_block;
114     task_stack -> ts_top = (kmp_taskdata_t **) first_block;
115     memset( (void *) first_block, '\0', TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
116 
117     // initialize the stack to be empty
118     task_stack  -> ts_entries = TASK_STACK_EMPTY;
119     first_block -> sb_next = NULL;
120     first_block -> sb_prev = NULL;
121 }
122 
123 
124 //---------------------------------------------------------------------------
125 //  __kmp_free_task_stack: free the task stack when thread_data is destroyed.
126 //
127 //  gtid: global thread identifier for calling thread
128 //  thread_data: thread info for thread containing stack
129 
130 static void
131 __kmp_free_task_stack( kmp_int32 gtid, kmp_thread_data_t *thread_data )
132 {
    kmp_info_t *thread = __kmp_threads[ gtid ];   // needed to free heap-allocated blocks below
    kmp_task_stack_t *task_stack = & thread_data->td.td_susp_tied_tasks;
    kmp_stack_block_t *stack_block = & task_stack -> ts_first_block;
135 
136     KMP_DEBUG_ASSERT( task_stack -> ts_entries == TASK_STACK_EMPTY );
    // walk the block list, freeing every block after the first (ts_first_block is embedded in the stack struct)
138     while ( stack_block != NULL ) {
139         kmp_stack_block_t *next_block = (stack_block) ? stack_block -> sb_next : NULL;
140 
141         stack_block -> sb_next = NULL;
142         stack_block -> sb_prev = NULL;
143         if (stack_block != & task_stack -> ts_first_block) {
144             __kmp_thread_free( thread, stack_block );  // free the block, if not the first
145         }
146         stack_block = next_block;
147     }
148     // initialize the stack to be empty
149     task_stack -> ts_entries = 0;
150     task_stack -> ts_top = NULL;
151 }
152 
153 
154 //---------------------------------------------------------------------------
155 //  __kmp_push_task_stack: Push the tied task onto the task stack.
156 //     Grow the stack if necessary by allocating another block.
157 //
158 //  gtid: global thread identifier for calling thread
159 //  thread: thread info for thread containing stack
160 //  tied_task: the task to push on the stack
161 
162 static void
163 __kmp_push_task_stack( kmp_int32 gtid, kmp_info_t *thread, kmp_taskdata_t * tied_task )
164 {
165     // GEH - need to consider what to do if tt_threads_data not allocated yet
166     kmp_thread_data_t *thread_data = & thread -> th.th_task_team ->
167                                         tt.tt_threads_data[ __kmp_tid_from_gtid( gtid ) ];
168     kmp_task_stack_t *task_stack = & thread_data->td.td_susp_tied_tasks ;
169 
170     if ( tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser ) {
171         return;  // Don't push anything on stack if team or team tasks are serialized
172     }
173 
174     KMP_DEBUG_ASSERT( tied_task -> td_flags.tasktype == TASK_TIED );
175     KMP_DEBUG_ASSERT( task_stack -> ts_top != NULL );
176 
177     KA_TRACE(20, ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
178                   gtid, thread, tied_task ) );
179     // Store entry
180     * (task_stack -> ts_top) = tied_task;
181 
182     // Do bookkeeping for next push
183     task_stack -> ts_top++;
184     task_stack -> ts_entries++;
185 
    // ts_top may now point one slot past the end of the current block; the masked test
    // detects that boundary so we can advance into (or allocate) the next block
    if ( (task_stack -> ts_entries & TASK_STACK_INDEX_MASK) == 0 )
187     {
188         // Find beginning of this task block
189         kmp_stack_block_t *stack_block =
190              (kmp_stack_block_t *) (task_stack -> ts_top - TASK_STACK_BLOCK_SIZE);
191 
192         // Check if we already have a block
193         if ( stack_block -> sb_next != NULL )
194         {    // reset ts_top to beginning of next block
195             task_stack -> ts_top = & stack_block -> sb_next -> sb_block[0];
196         }
197         else
198         {   // Alloc new block and link it up
199             kmp_stack_block_t *new_block = (kmp_stack_block_t *)
200               __kmp_thread_calloc(thread, sizeof(kmp_stack_block_t));
201 
202             task_stack -> ts_top  = & new_block -> sb_block[0];
203             stack_block -> sb_next = new_block;
204             new_block  -> sb_prev = stack_block;
205             new_block  -> sb_next = NULL;
206 
207             KA_TRACE(30, ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
208                           gtid, tied_task, new_block ) );
209         }
210     }
211     KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid, tied_task ) );
212 }
213 
214 //---------------------------------------------------------------------------
215 //  __kmp_pop_task_stack: Pop the tied task from the task stack.  Don't return
216 //     the task, just check to make sure it matches the ending task passed in.
217 //
218 //  gtid: global thread identifier for the calling thread
219 //  thread: thread info structure containing stack
//  ending_task: the task that is ending (should match the task popped off the stack)
222 
223 static void
224 __kmp_pop_task_stack( kmp_int32 gtid, kmp_info_t *thread, kmp_taskdata_t *ending_task )
225 {
226     // GEH - need to consider what to do if tt_threads_data not allocated yet
    kmp_thread_data_t *thread_data = & thread -> th.th_task_team ->
                                        tt.tt_threads_data[ __kmp_tid_from_gtid( gtid ) ];
228     kmp_task_stack_t *task_stack = & thread_data->td.td_susp_tied_tasks ;
229     kmp_taskdata_t *tied_task;
230 
231     if ( ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser ) {
232         return;  // Don't pop anything from stack if team or team tasks are serialized
233     }
234 
235     KMP_DEBUG_ASSERT( task_stack -> ts_top != NULL );
236     KMP_DEBUG_ASSERT( task_stack -> ts_entries > 0 );
237 
238     KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid, thread ) );
239 
240     // fix up ts_top if we need to pop from previous block
    if ( (task_stack -> ts_entries & TASK_STACK_INDEX_MASK) == 0 )
242     {
243         kmp_stack_block_t *stack_block =
244            (kmp_stack_block_t *) (task_stack -> ts_top) ;
245 
246         stack_block = stack_block -> sb_prev;
247         task_stack -> ts_top = & stack_block -> sb_block[TASK_STACK_BLOCK_SIZE];
248     }
249 
250     // finish bookkeeping
251     task_stack -> ts_top--;
252     task_stack -> ts_entries--;
253 
254     tied_task = * (task_stack -> ts_top );
255 
256     KMP_DEBUG_ASSERT( tied_task != NULL );
257     KMP_DEBUG_ASSERT( tied_task -> td_flags.tasktype == TASK_TIED );
258     KMP_DEBUG_ASSERT( tied_task == ending_task );  // If we built the stack correctly
259 
260     KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid, tied_task ) );
261     return;
262 }
263 #endif /* BUILD_TIED_TASK_STACK */
264 
265 //---------------------------------------------------
266 //  __kmp_push_task: Add a task to the thread's deque
267 
268 static kmp_int32
269 __kmp_push_task(kmp_int32 gtid, kmp_task_t * task )
270 {
271     kmp_info_t *        thread = __kmp_threads[ gtid ];
272     kmp_taskdata_t *    taskdata = KMP_TASK_TO_TASKDATA(task);
273     kmp_task_team_t *   task_team = thread->th.th_task_team;
274     kmp_int32           tid = __kmp_tid_from_gtid( gtid );
275     kmp_thread_data_t * thread_data;
276 
277     KA_TRACE(20, ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata ) );
278 
279     if ( taskdata->td_flags.tiedness == TASK_UNTIED ) {
280         // untied task needs to increment counter so that the task structure is not freed prematurely
281         kmp_int32 counter = 1 + KMP_TEST_THEN_INC32(&taskdata->td_untied_count);
282         KA_TRACE(20, ( "__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
283                        gtid, counter, taskdata ) );
284     }
285 
286     // The first check avoids building task_team thread data if serialized
287     if ( taskdata->td_flags.task_serial ) {
288         KA_TRACE(20, ( "__kmp_push_task: T#%d team serialized; returning TASK_NOT_PUSHED for task %p\n",
289                        gtid, taskdata ) );
290         return TASK_NOT_PUSHED;
291     }
292 
293     // Now that serialized tasks have returned, we can assume that we are not in immediate exec mode
294     KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
295     if ( ! KMP_TASKING_ENABLED(task_team) ) {
296          __kmp_enable_tasking( task_team, thread );
297     }
298     KMP_DEBUG_ASSERT( TCR_4(task_team -> tt.tt_found_tasks) == TRUE );
299     KMP_DEBUG_ASSERT( TCR_PTR(task_team -> tt.tt_threads_data) != NULL );
300 
301     // Find tasking deque specific to encountering thread
302     thread_data = & task_team -> tt.tt_threads_data[ tid ];
303 
304     // No lock needed since only owner can allocate
305     if (thread_data -> td.td_deque == NULL ) {
306         __kmp_alloc_task_deque( thread, thread_data );
307     }
308 
309     // Check if deque is full
310     if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE(thread_data->td) )
311     {
312         KA_TRACE(20, ( "__kmp_push_task: T#%d deque is full; returning TASK_NOT_PUSHED for task %p\n",
313                        gtid, taskdata ) );
314         return TASK_NOT_PUSHED;
315     }
316 
317     // Lock the deque for the task push operation
318     __kmp_acquire_bootstrap_lock( & thread_data -> td.td_deque_lock );
319 
320 #if OMP_45_ENABLED
321     // Need to recheck as we can get a proxy task from a thread outside of OpenMP
322     if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE(thread_data->td) )
323     {
324         __kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
325         KA_TRACE(20, ( "__kmp_push_task: T#%d deque is full on 2nd check; returning TASK_NOT_PUSHED for task %p\n",
326                        gtid, taskdata ) );
327         return TASK_NOT_PUSHED;
328     }
329 #else
    // Must have room since no thread other than the calling thread can add tasks
331     KMP_DEBUG_ASSERT( TCR_4(thread_data -> td.td_deque_ntasks) < TASK_DEQUE_SIZE(thread_data->td) );
332 #endif
333 
334     thread_data -> td.td_deque[ thread_data -> td.td_deque_tail ] = taskdata;  // Push taskdata
335     // Wrap index.
336     thread_data -> td.td_deque_tail = ( thread_data -> td.td_deque_tail + 1 ) & TASK_DEQUE_MASK(thread_data->td);
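    // The deque is a circular buffer: TASK_DEQUE_SIZE(td) is a power of two (the mask
    // form relies on this), so TASK_DEQUE_MASK(td) == size-1 and "(tail + 1) & mask"
    // wraps the index (e.g. with size 256, a tail of 255 wraps back to 0).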
337     TCW_4(thread_data -> td.td_deque_ntasks, TCR_4(thread_data -> td.td_deque_ntasks) + 1);             // Adjust task count
338 
339     KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
340           "task=%p ntasks=%d head=%u tail=%u\n",
341           gtid, taskdata, thread_data->td.td_deque_ntasks,
342           thread_data->td.td_deque_head, thread_data->td.td_deque_tail) );
343 
344     __kmp_release_bootstrap_lock( & thread_data->td.td_deque_lock );
345 
346     return TASK_SUCCESSFULLY_PUSHED;
347 }
348 
349 
350 //-----------------------------------------------------------------------------------------
// __kmp_pop_current_task_from_thread: restore the current task of the given thread to its parent when the team ends
352 // this_thr: thread structure to set current_task in.
353 
354 void
355 __kmp_pop_current_task_from_thread( kmp_info_t *this_thr )
356 {
357     KF_TRACE( 10, ("__kmp_pop_current_task_from_thread(enter): T#%d this_thread=%p, curtask=%p, "
358                    "curtask_parent=%p\n",
359                    0, this_thr, this_thr -> th.th_current_task,
360                    this_thr -> th.th_current_task -> td_parent ) );
361 
362     this_thr -> th.th_current_task = this_thr -> th.th_current_task -> td_parent;
363 
364     KF_TRACE( 10, ("__kmp_pop_current_task_from_thread(exit): T#%d this_thread=%p, curtask=%p, "
365                    "curtask_parent=%p\n",
366                    0, this_thr, this_thr -> th.th_current_task,
367                    this_thr -> th.th_current_task -> td_parent ) );
368 }
369 
370 
371 //---------------------------------------------------------------------------------------
// __kmp_push_current_task_to_thread: set up the current task in the given thread for a new team
373 // this_thr: thread structure to set up
374 // team: team for implicit task data
375 // tid: thread within team to set up
376 
377 void
378 __kmp_push_current_task_to_thread( kmp_info_t *this_thr, kmp_team_t *team, int tid )
379 {
    // The thread's current task becomes the parent of the newly created implicit tasks of the new team
381     KF_TRACE( 10, ( "__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p curtask=%p "
382                     "parent_task=%p\n",
383                     tid, this_thr, this_thr->th.th_current_task,
384                     team->t.t_implicit_task_taskdata[tid].td_parent ) );
385 
386     KMP_DEBUG_ASSERT (this_thr != NULL);
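    // The master (tid 0) keeps its previous current task as the parent of the new
    // implicit task 0; every other thread inherits that same parent, so all implicit
    // tasks of the team share a single ancestor.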
387 
388     if( tid == 0 ) {
389         if( this_thr->th.th_current_task != & team -> t.t_implicit_task_taskdata[ 0 ] ) {
390             team -> t.t_implicit_task_taskdata[ 0 ].td_parent = this_thr->th.th_current_task;
391             this_thr->th.th_current_task = & team -> t.t_implicit_task_taskdata[ 0 ];
392         }
393     } else {
394         team -> t.t_implicit_task_taskdata[ tid ].td_parent = team -> t.t_implicit_task_taskdata[ 0 ].td_parent;
395         this_thr->th.th_current_task = & team -> t.t_implicit_task_taskdata[ tid ];
396     }
397 
398     KF_TRACE( 10, ( "__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p curtask=%p "
399                     "parent_task=%p\n",
400                     tid, this_thr, this_thr->th.th_current_task,
401                     team->t.t_implicit_task_taskdata[tid].td_parent ) );
402 }
403 
404 
405 //----------------------------------------------------------------------
406 // __kmp_task_start: bookkeeping for a task starting execution
407 // GTID: global thread id of calling thread
408 // task: task starting execution
409 // current_task: task suspending
410 
411 static void
412 __kmp_task_start( kmp_int32 gtid, kmp_task_t * task, kmp_taskdata_t * current_task )
413 {
414     kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
415     kmp_info_t * thread = __kmp_threads[ gtid ];
416 
417     KA_TRACE(10, ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
418                   gtid, taskdata, current_task) );
419 
420     KMP_DEBUG_ASSERT( taskdata -> td_flags.tasktype == TASK_EXPLICIT );
421 
422     // mark currently executing task as suspended
423     // TODO: GEH - make sure root team implicit task is initialized properly.
424     // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
425     current_task -> td_flags.executing = 0;
426 
427     // Add task to stack if tied
428 #ifdef BUILD_TIED_TASK_STACK
429     if ( taskdata -> td_flags.tiedness == TASK_TIED )
430     {
431         __kmp_push_task_stack( gtid, thread, taskdata );
432     }
433 #endif /* BUILD_TIED_TASK_STACK */
434 
435     // mark starting task as executing and as current task
436     thread -> th.th_current_task = taskdata;
437 
438     KMP_DEBUG_ASSERT( taskdata->td_flags.started == 0 || taskdata->td_flags.tiedness == TASK_UNTIED );
439     KMP_DEBUG_ASSERT( taskdata->td_flags.executing == 0 || taskdata->td_flags.tiedness == TASK_UNTIED );
440     taskdata -> td_flags.started = 1;
441     taskdata -> td_flags.executing = 1;
442     KMP_DEBUG_ASSERT( taskdata -> td_flags.complete == 0 );
443     KMP_DEBUG_ASSERT( taskdata -> td_flags.freed == 0 );
444 
445     // GEH TODO: shouldn't we pass some sort of location identifier here?
446     // APT: yes, we will pass location here.
447     // need to store current thread state (in a thread or taskdata structure)
448     // before setting work_state, otherwise wrong state is set after end of task
449 
450     KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n",
451                   gtid, taskdata ) );
452 
453 #if OMPT_SUPPORT
454     if (ompt_enabled &&
455         ompt_callbacks.ompt_callback(ompt_event_task_begin)) {
456         kmp_taskdata_t *parent = taskdata->td_parent;
457         ompt_callbacks.ompt_callback(ompt_event_task_begin)(
458             parent ? parent->ompt_task_info.task_id : ompt_task_id_none,
459             parent ? &(parent->ompt_task_info.frame) : NULL,
460             taskdata->ompt_task_info.task_id,
461             taskdata->ompt_task_info.function);
462     }
463 #endif
464 #if OMP_40_ENABLED && OMPT_SUPPORT && OMPT_TRACE
465     /* OMPT emit all dependences if requested by the tool */
466     if (ompt_enabled && taskdata->ompt_task_info.ndeps > 0 &&
467         ompt_callbacks.ompt_callback(ompt_event_task_dependences))
468 	{
469         ompt_callbacks.ompt_callback(ompt_event_task_dependences)(
470             taskdata->ompt_task_info.task_id,
471             taskdata->ompt_task_info.deps,
472             taskdata->ompt_task_info.ndeps
473         );
474 		/* We can now free the allocated memory for the dependencies */
475 		KMP_OMPT_DEPS_FREE (thread, taskdata->ompt_task_info.deps);
476         taskdata->ompt_task_info.deps = NULL;
477         taskdata->ompt_task_info.ndeps = 0;
478     }
479 #endif /* OMP_40_ENABLED && OMPT_SUPPORT && OMPT_TRACE */
480 
481     return;
482 }
483 
484 
485 //----------------------------------------------------------------------
486 // __kmpc_omp_task_begin_if0: report that a given serialized task has started execution
487 // loc_ref: source location information; points to beginning of task block.
488 // gtid: global thread number.
489 // task: task thunk for the started task.
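//
// Note: the compiler emits __kmpc_omp_task_begin_if0 / __kmpc_omp_task_complete_if0
// around the task body when the task is undeferred (e.g. its if-clause evaluates to
// false), so the task is executed immediately by the encountering thread instead of
// being pushed to a deque.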
490 
491 void
492 __kmpc_omp_task_begin_if0( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * task )
493 {
494     kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
495     kmp_taskdata_t * current_task = __kmp_threads[ gtid ] -> th.th_current_task;
496 
497     KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p current_task=%p\n",
498                   gtid, loc_ref, taskdata, current_task ) );
499 
500     if ( taskdata->td_flags.tiedness == TASK_UNTIED ) {
501         // untied task needs to increment counter so that the task structure is not freed prematurely
502         kmp_int32 counter = 1 + KMP_TEST_THEN_INC32(&taskdata->td_untied_count);
503         KA_TRACE(20, ( "__kmpc_omp_task_begin_if0: T#%d untied_count (%d) incremented for task %p\n",
504                        gtid, counter, taskdata ) );
505     }
506 
507     taskdata -> td_flags.task_serial = 1;  // Execute this task immediately, not deferred.
508     __kmp_task_start( gtid, task, current_task );
509 
510     KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n",
511                   gtid, loc_ref, taskdata ) );
512 
513     return;
514 }
515 
516 #ifdef TASK_UNUSED
517 //----------------------------------------------------------------------
518 // __kmpc_omp_task_begin: report that a given task has started execution
519 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
520 
521 void
522 __kmpc_omp_task_begin( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * task )
523 {
524     kmp_taskdata_t * current_task = __kmp_threads[ gtid ] -> th.th_current_task;
525 
526     KA_TRACE(10, ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
527                   gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task ) );
528 
529     __kmp_task_start( gtid, task, current_task );
530 
531     KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n",
532                   gtid, loc_ref, KMP_TASK_TO_TASKDATA(task) ) );
533 
534     return;
535 }
536 #endif // TASK_UNUSED
537 
538 
539 //-------------------------------------------------------------------------------------
540 // __kmp_free_task: free the current task space and the space for shareds
541 // gtid: Global thread ID of calling thread
542 // taskdata: task to free
543 // thread: thread data structure of caller
544 
545 static void
546 __kmp_free_task( kmp_int32 gtid, kmp_taskdata_t * taskdata, kmp_info_t * thread )
547 {
548     KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n",
549                   gtid, taskdata) );
550 
551     // Check to make sure all flags and counters have the correct values
552     KMP_DEBUG_ASSERT( taskdata->td_flags.tasktype == TASK_EXPLICIT );
553     KMP_DEBUG_ASSERT( taskdata->td_flags.executing == 0 );
554     KMP_DEBUG_ASSERT( taskdata->td_flags.complete == 1 );
555     KMP_DEBUG_ASSERT( taskdata->td_flags.freed == 0 );
556     KMP_DEBUG_ASSERT( TCR_4(taskdata->td_allocated_child_tasks) == 0  || taskdata->td_flags.task_serial == 1);
557     KMP_DEBUG_ASSERT( TCR_4(taskdata->td_incomplete_child_tasks) == 0 );
558 
559     taskdata->td_flags.freed = 1;
560     ANNOTATE_HAPPENS_BEFORE(taskdata);
561     // deallocate the taskdata and shared variable blocks associated with this task
562     #if USE_FAST_MEMORY
563         __kmp_fast_free( thread, taskdata );
564     #else /* ! USE_FAST_MEMORY */
565         __kmp_thread_free( thread, taskdata );
566     #endif
567 
568     KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n",
569                   gtid, taskdata) );
570 }
571 
572 //-------------------------------------------------------------------------------------
573 // __kmp_free_task_and_ancestors: free the current task and ancestors without children
574 //
575 // gtid: Global thread ID of calling thread
576 // taskdata: task to free
577 // thread: thread data structure of caller
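//
// Reference-counting note: td_allocated_child_tasks starts at 1 for the task itself
// (see __kmp_task_alloc) and gains one reference per child task allocated under it
// (tracked only for explicit parents, since implicit tasks are not deallocated this
// way).  Each decrement that reaches zero frees the task and moves the walk to its
// parent, stopping at an implicit task or when tasking is serialized.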
578 
579 static void
580 __kmp_free_task_and_ancestors( kmp_int32 gtid, kmp_taskdata_t * taskdata, kmp_info_t * thread )
581 {
582 #if OMP_45_ENABLED
583     // Proxy tasks must always be allowed to free their parents
584     // because they can be run in background even in serial mode.
585     kmp_int32 team_serial = ( taskdata->td_flags.team_serial ||
586         taskdata->td_flags.tasking_ser ) && !taskdata->td_flags.proxy;
587 #else
588     kmp_int32 team_serial = taskdata->td_flags.team_serial ||
589         taskdata->td_flags.tasking_ser;
590 #endif
591     KMP_DEBUG_ASSERT( taskdata -> td_flags.tasktype == TASK_EXPLICIT );
592 
593     kmp_int32 children = KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata -> td_allocated_child_tasks) ) - 1;
594     KMP_DEBUG_ASSERT( children >= 0 );
595 
596     // Now, go up the ancestor tree to see if any ancestors can now be freed.
597     while ( children == 0 )
598     {
599         kmp_taskdata_t * parent_taskdata = taskdata -> td_parent;
600 
601         KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
602                       "and freeing itself\n", gtid, taskdata) );
603 
604         // --- Deallocate my ancestor task ---
605         __kmp_free_task( gtid, taskdata, thread );
606 
607         taskdata = parent_taskdata;
608 
609         // Stop checking ancestors at implicit task
610         // instead of walking up ancestor tree to avoid premature deallocation of ancestors.
611         if ( team_serial || taskdata -> td_flags.tasktype == TASK_IMPLICIT )
612             return;
613 
614         // Predecrement simulated by "- 1" calculation
615         children = KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata -> td_allocated_child_tasks) ) - 1;
616         KMP_DEBUG_ASSERT( children >= 0 );
617     }
618 
619     KA_TRACE(20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
620                   "not freeing it yet\n", gtid, taskdata, children) );
621 }
622 
623 //---------------------------------------------------------------------
624 // __kmp_task_finish: bookkeeping to do when a task finishes execution
625 // gtid: global thread ID for calling thread
626 // task: task to be finished
627 // resumed_task: task to be resumed.  (may be NULL if task is serialized)
628 
629 static void
630 __kmp_task_finish( kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t *resumed_task )
631 {
632     kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
633     kmp_info_t * thread = __kmp_threads[ gtid ];
634     kmp_task_team_t * task_team = thread->th.th_task_team; // might be NULL for serial teams...
635     kmp_int32 children = 0;
636 
637 #if OMPT_SUPPORT
638     if (ompt_enabled &&
639         ompt_callbacks.ompt_callback(ompt_event_task_end)) {
641         ompt_callbacks.ompt_callback(ompt_event_task_end)(
642             taskdata->ompt_task_info.task_id);
643     }
644 #endif
645 
646     KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming task %p\n",
647                   gtid, taskdata, resumed_task) );
648 
649     KMP_DEBUG_ASSERT( taskdata -> td_flags.tasktype == TASK_EXPLICIT );
650 
651     // Pop task from stack if tied
652 #ifdef BUILD_TIED_TASK_STACK
653     if ( taskdata -> td_flags.tiedness == TASK_TIED )
654     {
655         __kmp_pop_task_stack( gtid, thread, taskdata );
656     }
657 #endif /* BUILD_TIED_TASK_STACK */
658 
659     if ( taskdata->td_flags.tiedness == TASK_UNTIED ) {
660         // untied task needs to check the counter so that the task structure is not freed prematurely
661         kmp_int32 counter = KMP_TEST_THEN_DEC32(&taskdata->td_untied_count) - 1;
662         KA_TRACE(20, ( "__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
663                        gtid, counter, taskdata ) );
664         if ( counter > 0 ) {
665             // untied task is not done, to be continued possibly by other thread, do not free it now
666             if (resumed_task == NULL) {
667                 KMP_DEBUG_ASSERT( taskdata->td_flags.task_serial );
668                 resumed_task = taskdata->td_parent;  // In a serialized task, the resumed task is the parent
669             }
670             thread->th.th_current_task = resumed_task; // restore current_task
671             resumed_task->td_flags.executing = 1;  // resume previous task
672             KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, resuming task %p\n",
673                           gtid, taskdata, resumed_task) );
674             return;
675         }
676     }
677 
678     KMP_DEBUG_ASSERT( taskdata -> td_flags.complete == 0 );
679     taskdata -> td_flags.complete = 1;   // mark the task as completed
680     KMP_DEBUG_ASSERT( taskdata -> td_flags.started == 1 );
681     KMP_DEBUG_ASSERT( taskdata -> td_flags.freed == 0 );
682 
683     // Only need to keep track of count if team parallel and tasking not serialized
684     if ( !( taskdata -> td_flags.team_serial || taskdata -> td_flags.tasking_ser ) ) {
685         // Predecrement simulated by "- 1" calculation
686         children = KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata -> td_parent -> td_incomplete_child_tasks) ) - 1;
687         KMP_DEBUG_ASSERT( children >= 0 );
688 #if OMP_40_ENABLED
689         if ( taskdata->td_taskgroup )
690             KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata->td_taskgroup->count) );
691 #if OMP_45_ENABLED
692     }
693     // if we found proxy tasks there could exist a dependency chain
694     // with the proxy task as origin
695     if ( !( taskdata -> td_flags.team_serial || taskdata -> td_flags.tasking_ser ) || (task_team && task_team->tt.tt_found_proxy_tasks) ) {
696 #endif
697         __kmp_release_deps(gtid,taskdata);
698 #endif
699     }
700 
    // td_flags.executing must be marked as 0 after __kmp_release_deps has been called.
    // Otherwise, if a task is executed immediately from the release_deps code,
    // the flag will be reset to 1 again by this same function.
704     KMP_DEBUG_ASSERT( taskdata -> td_flags.executing == 1 );
705     taskdata -> td_flags.executing = 0;  // suspend the finishing task
706 
707     KA_TRACE(20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
708                   gtid, taskdata, children) );
709 
710 #if OMP_40_ENABLED
    /* If the task's destructor thunk flag has been set, we need to invoke the
       destructor thunk that has been generated by the compiler.
       The code is placed here, since at this point other tasks might have been released,
       hence overlapping the destructor invocations with some other work in the
       released tasks.  The OpenMP spec is not specific on when the destructors are
       invoked, so we should be free to choose.
    */
718     if (taskdata->td_flags.destructors_thunk) {
719         kmp_routine_entry_t destr_thunk = task->data1.destructors;
720         KMP_ASSERT(destr_thunk);
721         destr_thunk(gtid, task);
722     }
723 #endif // OMP_40_ENABLED
724 
725     // bookkeeping for resuming task:
726     // GEH - note tasking_ser => task_serial
727     KMP_DEBUG_ASSERT( (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
728                        taskdata->td_flags.task_serial);
729     if ( taskdata->td_flags.task_serial )
730     {
731         if (resumed_task == NULL) {
732             resumed_task = taskdata->td_parent;  // In a serialized task, the resumed task is the parent
733         }
734         else
735 #if OMP_45_ENABLED
736              if ( !(task_team && task_team->tt.tt_found_proxy_tasks) )
737 #endif
738         {
739             // verify resumed task passed in points to parent
740             KMP_DEBUG_ASSERT( resumed_task == taskdata->td_parent );
741         }
742     }
743     else {
        KMP_DEBUG_ASSERT( resumed_task != NULL );        // verify that resumed task is passed as argument
745     }
746 
747     // Free this task and then ancestor tasks if they have no children.
748     // Restore th_current_task first as suggested by John:
    // johnmc: so that if an asynchronous inquiry peers into the runtime system
750     // it doesn't see the freed task as the current task.
751     thread->th.th_current_task = resumed_task;
752     __kmp_free_task_and_ancestors(gtid, taskdata, thread);
753 
754     // TODO: GEH - make sure root team implicit task is initialized properly.
755     // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
756     resumed_task->td_flags.executing = 1;  // resume previous task
757 
758     KA_TRACE(10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
759                   gtid, taskdata, resumed_task) );
760 
761     return;
762 }
763 
764 //---------------------------------------------------------------------
765 // __kmpc_omp_task_complete_if0: report that a task has completed execution
766 // loc_ref: source location information; points to end of task block.
767 // gtid: global thread number.
768 // task: task thunk for the completed task.
769 
770 void
771 __kmpc_omp_task_complete_if0( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task )
772 {
773     KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
774                   gtid, loc_ref, KMP_TASK_TO_TASKDATA(task) ) );
775 
776     __kmp_task_finish( gtid, task, NULL );  // this routine will provide task to resume
777 
778     KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
779                   gtid, loc_ref, KMP_TASK_TO_TASKDATA(task) ) );
780 
781     return;
782 }
783 
784 #ifdef TASK_UNUSED
785 //---------------------------------------------------------------------
786 // __kmpc_omp_task_complete: report that a task has completed execution
787 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
788 
789 void
790 __kmpc_omp_task_complete( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task )
791 {
792     KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n",
793                   gtid, loc_ref, KMP_TASK_TO_TASKDATA(task) ) );
794 
795     __kmp_task_finish( gtid, task, NULL );  // Not sure how to find task to resume
796 
797     KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n",
798                   gtid, loc_ref, KMP_TASK_TO_TASKDATA(task) ) );
799     return;
800 }
801 #endif // TASK_UNUSED
802 
803 
804 #if OMPT_SUPPORT
805 //----------------------------------------------------------------------------------------------------
806 // __kmp_task_init_ompt:
807 //   Initialize OMPT fields maintained by a task. This will only be called after
808 //   ompt_tool, so we already know whether ompt is enabled or not.
809 
810 static inline void
811 __kmp_task_init_ompt( kmp_taskdata_t * task, int tid, void * function )
812 {
813     if (ompt_enabled) {
814         task->ompt_task_info.task_id = __ompt_task_id_new(tid);
815         task->ompt_task_info.function = function;
816         task->ompt_task_info.frame.exit_runtime_frame = NULL;
817         task->ompt_task_info.frame.reenter_runtime_frame = NULL;
818 #if OMP_40_ENABLED
819         task->ompt_task_info.ndeps = 0;
820         task->ompt_task_info.deps = NULL;
821 #endif /* OMP_40_ENABLED */
822     }
823 }
824 #endif
825 
826 
827 //----------------------------------------------------------------------------------------------------
828 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit task for a given thread
829 //
830 // loc_ref:  reference to source location of parallel region
831 // this_thr:  thread data structure corresponding to implicit task
832 // team: team for this_thr
833 // tid: thread id of given thread within team
834 // set_curr_task: TRUE if need to push current task to thread
// NOTE: Routine does not set up the implicit task ICVs.  This is assumed to have already been done elsewhere.
836 // TODO: Get better loc_ref.  Value passed in may be NULL
837 
838 void
839 __kmp_init_implicit_task( ident_t *loc_ref, kmp_info_t *this_thr, kmp_team_t *team, int tid, int set_curr_task )
840 {
841     kmp_taskdata_t * task   = & team->t.t_implicit_task_taskdata[ tid ];
842 
843     KF_TRACE(10, ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
844                   tid, team, task, set_curr_task ? "TRUE" : "FALSE" ) );
845 
846     task->td_task_id  = KMP_GEN_TASK_ID();
847     task->td_team     = team;
848 //    task->td_parent   = NULL;  // fix for CQ230101 (broken parent task info in debugger)
849     task->td_ident    = loc_ref;
850     task->td_taskwait_ident   = NULL;
851     task->td_taskwait_counter = 0;
852     task->td_taskwait_thread  = 0;
853 
854     task->td_flags.tiedness    = TASK_TIED;
855     task->td_flags.tasktype    = TASK_IMPLICIT;
856 #if OMP_45_ENABLED
857     task->td_flags.proxy       = TASK_FULL;
858 #endif
859 
860     // All implicit tasks are executed immediately, not deferred
861     task->td_flags.task_serial = 1;
862     task->td_flags.tasking_ser = ( __kmp_tasking_mode == tskm_immediate_exec );
863     task->td_flags.team_serial = ( team->t.t_serialized ) ? 1 : 0;
864 
865     task->td_flags.started     = 1;
866     task->td_flags.executing   = 1;
867     task->td_flags.complete    = 0;
868     task->td_flags.freed       = 0;
869 
870 #if OMP_40_ENABLED
871     task->td_depnode = NULL;
872 #endif
873 
874     if (set_curr_task) {  // only do this initialization the first time a thread is created
875         task->td_incomplete_child_tasks = 0;
876         task->td_allocated_child_tasks  = 0; // Not used because do not need to deallocate implicit task
877 #if OMP_40_ENABLED
878         task->td_taskgroup = NULL;           // An implicit task does not have taskgroup
879         task->td_dephash = NULL;
880 #endif
881         __kmp_push_current_task_to_thread( this_thr, team, tid );
882     } else {
883         KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
884         KMP_DEBUG_ASSERT(task->td_allocated_child_tasks  == 0);
885     }
886 
887 #if OMPT_SUPPORT
888     __kmp_task_init_ompt(task, tid, NULL);
889 #endif
890 
891     KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n",
892                   tid, team, task ) );
893 }
894 
895 
896 //-----------------------------------------------------------------------------
// __kmp_finish_implicit_task: Release resources associated with implicit tasks
// at the end of parallel regions. Some resources are kept for reuse in the
// next parallel region.
//
// thread:  thread data structure corresponding to implicit task
//
903 void
904 __kmp_finish_implicit_task(kmp_info_t *thread)
905 {
906     kmp_taskdata_t *task = thread->th.th_current_task;
907     if (task->td_dephash)
908         __kmp_dephash_free_entries(thread, task->td_dephash);
909 }
910 
911 
912 //-----------------------------------------------------------------------------
// __kmp_free_implicit_task: Release resources associated with implicit tasks
// when these tasks are destroyed.
//
// thread:  thread data structure corresponding to implicit task
//
918 void
919 __kmp_free_implicit_task(kmp_info_t *thread)
920 {
921     kmp_taskdata_t *task = thread->th.th_current_task;
922     if (task->td_dephash)
923         __kmp_dephash_free(thread, task->td_dephash);
924     task->td_dephash = NULL;
925 }
926 
927 
// Round up a size to a multiple of val, which must be a power of two
929 // Used to insert padding between structures co-allocated using a single malloc() call
930 static size_t
931 __kmp_round_up_to_val( size_t size, size_t val ) {
932     if ( size & ( val - 1 ) ) {
933         size &= ~ ( val - 1 );
934         if ( size <= KMP_SIZE_T_MAX - val ) {
935             size += val;    // Round up if there is no overflow.
936         }; // if
937     }; // if
938     return size;
} // __kmp_round_up_to_val
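
// Example: __kmp_round_up_to_val( 13, 8 ) clears the low bits (13 & ~7 == 8) and
// then adds val, returning 16; a size that is already a multiple of val (e.g. 16)
// is returned unchanged.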
940 
941 
942 //---------------------------------------------------------------------------------
943 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
944 //
945 // loc_ref: source location information
946 // gtid: global thread number.
// flags: include tiedness & task type (explicit vs. implicit) of the 'new' task encountered.
948 //        Converted from kmp_int32 to kmp_tasking_flags_t in routine.
949 // sizeof_kmp_task_t:  Size in bytes of kmp_task_t data structure including private vars accessed in task.
950 // sizeof_shareds:  Size in bytes of array of pointers to shared vars accessed in task.
951 // task_entry: Pointer to task code entry point generated by compiler.
952 // returns: a pointer to the allocated kmp_task_t structure (task).
953 
954 kmp_task_t *
955 __kmp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_tasking_flags_t *flags,
956                   size_t sizeof_kmp_task_t, size_t sizeof_shareds,
957                   kmp_routine_entry_t task_entry )
958 {
959     kmp_task_t *task;
960     kmp_taskdata_t *taskdata;
961     kmp_info_t *thread = __kmp_threads[ gtid ];
962     kmp_team_t *team = thread->th.th_team;
963     kmp_taskdata_t *parent_task = thread->th.th_current_task;
964     size_t shareds_offset;
965 
966     KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
967                   "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
968                   gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
969                   sizeof_shareds, task_entry) );
970 
971     if ( parent_task->td_flags.final ) {
972         if (flags->merged_if0) {
973         }
974         flags->final = 1;
975     }
976 
977 #if OMP_45_ENABLED
978     if ( flags->proxy == TASK_PROXY ) {
979         flags->tiedness = TASK_UNTIED;
980         flags->merged_if0 = 1;
981 
        /* We may be running in a serialized parallel region or in tskm_immediate_exec mode;
           either way, tasking support must be enabled for the proxy task. */
983         if ( (thread->th.th_task_team) == NULL ) {
            /* This should only happen if the team is serialized;
               set up a task team and propagate it to the thread.
            */
987             KMP_DEBUG_ASSERT(team->t.t_serialized);
988             KA_TRACE(30,("T#%d creating task team in __kmp_task_alloc for proxy task\n", gtid));
989             __kmp_task_team_setup(thread,team,1); // 1 indicates setup the current team regardless of nthreads
990             thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
991         }
992         kmp_task_team_t * task_team = thread->th.th_task_team;
993 
994         /* tasking must be enabled now as the task might not be pushed */
995         if ( !KMP_TASKING_ENABLED( task_team ) ) {
996             KA_TRACE(30,("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
997             __kmp_enable_tasking( task_team, thread );
998             kmp_int32 tid = thread->th.th_info.ds.ds_tid;
999             kmp_thread_data_t * thread_data = & task_team -> tt.tt_threads_data[ tid ];
1000             // No lock needed since only owner can allocate
1001             if (thread_data -> td.td_deque == NULL ) {
1002                 __kmp_alloc_task_deque( thread, thread_data );
1003             }
1004         }
1005 
1006         if ( task_team->tt.tt_found_proxy_tasks == FALSE )
1007           TCW_4(task_team -> tt.tt_found_proxy_tasks, TRUE);
1008     }
1009 #endif
1010 
1011     // Calculate shared structure offset including padding after kmp_task_t struct
1012     // to align pointers in shared struct
1013     shareds_offset = sizeof( kmp_taskdata_t ) + sizeof_kmp_task_t;
1014     shareds_offset = __kmp_round_up_to_val( shareds_offset, sizeof( void * ));
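    // Resulting single-allocation layout (see KMP_TASKDATA_TO_TASK below):
    //   [ kmp_taskdata_t | kmp_task_t + private vars | padding to pointer size | shareds ]
    // with task->shareds set to byte offset shareds_offset from the start of taskdata.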
1015 
1016     // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1017     KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n",
1018                   gtid, shareds_offset) );
1019     KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n",
1020                   gtid, sizeof_shareds) );
1021 
1022     // Avoid double allocation here by combining shareds with taskdata
1023     #if USE_FAST_MEMORY
1024     taskdata = (kmp_taskdata_t *) __kmp_fast_allocate( thread, shareds_offset + sizeof_shareds );
1025     #else /* ! USE_FAST_MEMORY */
1026     taskdata = (kmp_taskdata_t *) __kmp_thread_malloc( thread, shareds_offset + sizeof_shareds );
1027     #endif /* USE_FAST_MEMORY */
1028     ANNOTATE_HAPPENS_AFTER(taskdata);
1029 
1030     task                      = KMP_TASKDATA_TO_TASK(taskdata);
1031 
1032     // Make sure task & taskdata are aligned appropriately
1033 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
1034     KMP_DEBUG_ASSERT( ( ((kmp_uintptr_t)taskdata) & (sizeof(double)-1) ) == 0 );
1035     KMP_DEBUG_ASSERT( ( ((kmp_uintptr_t)task) & (sizeof(double)-1) ) == 0 );
1036 #else
1037     KMP_DEBUG_ASSERT( ( ((kmp_uintptr_t)taskdata) & (sizeof(_Quad)-1) ) == 0 );
1038     KMP_DEBUG_ASSERT( ( ((kmp_uintptr_t)task) & (sizeof(_Quad)-1) ) == 0 );
1039 #endif
1040     if (sizeof_shareds > 0) {
1041         // Avoid double allocation here by combining shareds with taskdata
1042         task->shareds         = & ((char *) taskdata)[ shareds_offset ];
1043         // Make sure shareds struct is aligned to pointer size
1044         KMP_DEBUG_ASSERT( ( ((kmp_uintptr_t)task->shareds) & (sizeof(void *)-1) ) == 0 );
1045     } else {
1046         task->shareds         = NULL;
1047     }
1048     task->routine             = task_entry;
1049     task->part_id             = 0;      // AC: Always start with 0 part id
1050 
1051     taskdata->td_task_id      = KMP_GEN_TASK_ID();
1052     taskdata->td_team         = team;
1053     taskdata->td_alloc_thread = thread;
1054     taskdata->td_parent       = parent_task;
1055     taskdata->td_level        = parent_task->td_level + 1; // increment nesting level
1056     taskdata->td_untied_count = 0;
1057     taskdata->td_ident        = loc_ref;
1058     taskdata->td_taskwait_ident   = NULL;
1059     taskdata->td_taskwait_counter = 0;
1060     taskdata->td_taskwait_thread  = 0;
1061     KMP_DEBUG_ASSERT( taskdata->td_parent != NULL );
1062 #if OMP_45_ENABLED
1063     // avoid copying icvs for proxy tasks
1064     if ( flags->proxy == TASK_FULL )
1065 #endif
1066        copy_icvs( &taskdata->td_icvs, &taskdata->td_parent->td_icvs );
1067 
1068     taskdata->td_flags.tiedness    = flags->tiedness;
1069     taskdata->td_flags.final       = flags->final;
1070     taskdata->td_flags.merged_if0  = flags->merged_if0;
1071 #if OMP_40_ENABLED
1072     taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
1073 #endif // OMP_40_ENABLED
1074 #if OMP_45_ENABLED
1075     taskdata->td_flags.proxy           = flags->proxy;
1076     taskdata->td_task_team         = thread->th.th_task_team;
1077     taskdata->td_size_alloc        = shareds_offset + sizeof_shareds;
1078 #endif
1079     taskdata->td_flags.tasktype    = TASK_EXPLICIT;
1080 
1081     // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1082     taskdata->td_flags.tasking_ser = ( __kmp_tasking_mode == tskm_immediate_exec );
1083 
1084     // GEH - TODO: fix this to copy parent task's value of team_serial flag
1085     taskdata->td_flags.team_serial = ( team->t.t_serialized ) ? 1 : 0;
1086 
1087     // GEH - Note we serialize the task if the team is serialized to make sure implicit parallel region
1088     //       tasks are not left until program termination to execute.  Also, it helps locality to execute
1089     //       immediately.
1090     taskdata->td_flags.task_serial = ( parent_task->td_flags.final
1091       || taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser );
1092 
1093     taskdata->td_flags.started     = 0;
1094     taskdata->td_flags.executing   = 0;
1095     taskdata->td_flags.complete    = 0;
1096     taskdata->td_flags.freed       = 0;
1097 
1098     taskdata->td_flags.native      = flags->native;
1099 
1100     taskdata->td_incomplete_child_tasks = 0;
1101     taskdata->td_allocated_child_tasks  = 1; // start at one because counts current task and children
1102 #if OMP_40_ENABLED
1103     taskdata->td_taskgroup = parent_task->td_taskgroup; // task inherits the taskgroup from the parent task
1104     taskdata->td_dephash = NULL;
1105     taskdata->td_depnode = NULL;
1106 #endif
1107 
1108     // Only need to keep track of child task counts if team parallel and tasking not serialized or if it is a proxy task
1109 #if OMP_45_ENABLED
1110     if ( flags->proxy == TASK_PROXY || !( taskdata -> td_flags.team_serial || taskdata -> td_flags.tasking_ser ) )
1111 #else
1112     if ( !( taskdata -> td_flags.team_serial || taskdata -> td_flags.tasking_ser ) )
1113 #endif
1114     {
1115         KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_incomplete_child_tasks) );
1116 #if OMP_40_ENABLED
1117         if ( parent_task->td_taskgroup )
1118             KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_taskgroup->count) );
1119 #endif
1120         // Only need to keep track of allocated child tasks for explicit tasks since implicit not deallocated
1121         if ( taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT ) {
1122             KMP_TEST_THEN_INC32( (kmp_int32 *)(& taskdata->td_parent->td_allocated_child_tasks) );
1123         }
1124     }
1125 
1126     KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1127                   gtid, taskdata, taskdata->td_parent) );
1128     ANNOTATE_HAPPENS_BEFORE(task);
1129 
1130 #if OMPT_SUPPORT
1131     __kmp_task_init_ompt(taskdata, gtid, (void*) task_entry);
1132 #endif
1133 
1134     return task;
1135 }
1136 
1137 
1138 kmp_task_t *
1139 __kmpc_omp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
1140                        size_t sizeof_kmp_task_t, size_t sizeof_shareds,
1141                        kmp_routine_entry_t task_entry )
1142 {
1143     kmp_task_t *retval;
1144     kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *) & flags;
1145 
1146     input_flags->native = FALSE;
1147     // __kmp_task_alloc() sets up all other runtime flags
1148 
1149 #if OMP_45_ENABLED
1150     KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s) "
1151                   "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1152                   gtid, loc_ref, input_flags->tiedness ? "tied  " : "untied",
1153                   input_flags->proxy ? "proxy" : "",
1154                   sizeof_kmp_task_t, sizeof_shareds, task_entry) );
1155 #else
1156     KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s) "
1157                   "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1158                   gtid, loc_ref, input_flags->tiedness ? "tied  " : "untied",
1159                   sizeof_kmp_task_t, sizeof_shareds, task_entry) );
1160 #endif
1161 
1162     retval = __kmp_task_alloc( loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1163                                sizeof_shareds, task_entry );
1164 
1165     KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval) );
1166 
1167     return retval;
1168 }
1169 
1170 //-----------------------------------------------------------
1171 //  __kmp_invoke_task: invoke the specified task
1172 //
1173 // gtid: global thread ID of caller
1174 // task: the task to invoke
// current_task: the task to resume after task invocation
1176 
1177 static void
1178 __kmp_invoke_task( kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t * current_task )
1179 {
1180     kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    kmp_uint64 cur_time;
#endif
1182 #if OMP_40_ENABLED
1183     int discard = 0 /* false */;
1184 #endif
1185     KA_TRACE(30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1186                   gtid, taskdata, current_task) );
1187     KMP_DEBUG_ASSERT(task);
1188 #if OMP_45_ENABLED
1189     if ( taskdata->td_flags.proxy == TASK_PROXY &&
1190          taskdata->td_flags.complete == 1)
1191          {
1192             // This is a proxy task that was already completed but it needs to run
1193             // its bottom-half finish
1194             KA_TRACE(30, ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1195                   gtid, taskdata) );
1196 
1197             __kmp_bottom_half_finish_proxy(gtid,task);
1198 
1199             KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for proxy task %p, resuming task %p\n", gtid, taskdata, current_task) );
1200 
1201             return;
1202          }
1203 #endif
1204 
1205 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1206     if(__kmp_forkjoin_frames_mode == 3) {
1207         // Get the current time stamp to measure task execution time to correct barrier imbalance time
1208         cur_time = __itt_get_timestamp();
1209     }
1210 #endif
1211 
1212 #if OMP_45_ENABLED
1213     // Proxy tasks are not handled by the runtime
1214     if ( taskdata->td_flags.proxy != TASK_PROXY ) {
1215 #endif
1216       ANNOTATE_HAPPENS_AFTER(task);
1217       __kmp_task_start( gtid, task, current_task );
1218 #if OMP_45_ENABLED
1219     }
1220 #endif
1221 
1222 #if OMPT_SUPPORT
1223     ompt_thread_info_t oldInfo;
1224     kmp_info_t * thread;
1225     if (ompt_enabled) {
        // Store the thread's OMPT state and restore it after the task
1227         thread = __kmp_threads[ gtid ];
1228         oldInfo = thread->th.ompt_thread_info;
1229         thread->th.ompt_thread_info.wait_id = 0;
1230         thread->th.ompt_thread_info.state = ompt_state_work_parallel;
1231         taskdata->ompt_task_info.frame.exit_runtime_frame = __builtin_frame_address(0);
1232     }
1233 #endif
1234 
1235 #if OMP_40_ENABLED
1236     // TODO: cancel tasks if the parallel region has also been cancelled
1237     // TODO: check if this sequence can be hoisted above __kmp_task_start
1238     // if cancellation has been enabled for this run ...
1239     if (__kmp_omp_cancellation) {
1240         kmp_info_t *this_thr = __kmp_threads [ gtid ];
1241         kmp_team_t * this_team = this_thr->th.th_team;
1242         kmp_taskgroup_t * taskgroup = taskdata->td_taskgroup;
1243         if ((taskgroup && taskgroup->cancel_request) || (this_team->t.t_cancel_request == cancel_parallel)) {
1244             KMP_COUNT_BLOCK(TASK_cancelled);
1245             // this task belongs to a task group and we need to cancel it
1246             discard = 1 /* true */;
1247         }
1248     }
1249 
1250     //
1251     // Invoke the task routine and pass in relevant data.
1252     // Thunks generated by gcc take a different argument list.
1253     //
1254     if (!discard) {
1255 #if KMP_STATS_ENABLED
1256         KMP_COUNT_BLOCK(TASK_executed);
1257         switch(KMP_GET_THREAD_STATE()) {
1258          case FORK_JOIN_BARRIER: KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar); break;
1259          case PLAIN_BARRIER: KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar); break;
1260          case TASKYIELD: KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield); break;
1261          case TASKWAIT: KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait); break;
1262          case TASKGROUP: KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup); break;
1263          default: KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate); break;
1264         }
1265 #endif // KMP_STATS_ENABLED
1266 #endif // OMP_40_ENABLED
1267 
1268 #if OMPT_SUPPORT && OMPT_TRACE
1269         /* let OMPT know that we're about to run this task */
1270         if (ompt_enabled &&
1271              ompt_callbacks.ompt_callback(ompt_event_task_switch))
1272         {
1273           ompt_callbacks.ompt_callback(ompt_event_task_switch)(
1274             current_task->ompt_task_info.task_id,
1275             taskdata->ompt_task_info.task_id);
1276         }
1277 #endif
1278 
1279 #ifdef KMP_GOMP_COMPAT
1280         if (taskdata->td_flags.native) {
1281             ((void (*)(void *))(*(task->routine)))(task->shareds);
1282         }
1283         else
1284 #endif /* KMP_GOMP_COMPAT */
1285         {
1286             (*(task->routine))(gtid, task);
1287         }
1288         KMP_POP_PARTITIONED_TIMER();
1289 
1290 #if OMPT_SUPPORT && OMPT_TRACE
        /* let OMPT know that we're switching back to the task that was running before */
1292         if (ompt_enabled &&
1293              ompt_callbacks.ompt_callback(ompt_event_task_switch))
1294         {
1295           ompt_callbacks.ompt_callback(ompt_event_task_switch)(
1296             taskdata->ompt_task_info.task_id,
1297             current_task->ompt_task_info.task_id);
1298         }
1299 #endif
1300 
1301 #if OMP_40_ENABLED
1302     }
1303 #endif // OMP_40_ENABLED
1304 
1305 
1306 #if OMPT_SUPPORT
1307     if (ompt_enabled) {
1308         thread->th.ompt_thread_info = oldInfo;
1309         taskdata->ompt_task_info.frame.exit_runtime_frame = NULL;
1310     }
1311 #endif
1312 
1313 #if OMP_45_ENABLED
1314     // Proxy tasks are not handled by the runtime
1315     if ( taskdata->td_flags.proxy != TASK_PROXY ) {
1316 #endif
1317       ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent);
1318       __kmp_task_finish( gtid, task, current_task );
1319 #if OMP_45_ENABLED
1320     }
1321 #endif
1322 
1323 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1324     // Barrier imbalance - correct arrive time after the task finished
1325     if(__kmp_forkjoin_frames_mode == 3) {
1326         kmp_info_t *this_thr = __kmp_threads [ gtid ];
1327         if(this_thr->th.th_bar_arrive_time) {
1328             this_thr->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1329         }
1330     }
1331 #endif
1332     KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1333                   gtid, taskdata, current_task) );
1334     return;
1335 }
1336 
1337 //-----------------------------------------------------------------------
1338 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1339 //
1340 // loc_ref: location of original task pragma (ignored)
1341 // gtid: Global Thread ID of encountering thread
// new_task: task thunk allocated by __kmp_omp_task_alloc() for the "new task"
1343 // Returns:
//    TASK_CURRENT_NOT_QUEUED (0) if the current task was not suspended and queued for later resumption.
//    TASK_CURRENT_QUEUED (1) if the current task was suspended and queued to be resumed later.
1346 
1347 kmp_int32
1348 __kmpc_omp_task_parts( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task)
1349 {
1350     kmp_taskdata_t * new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1351 
1352     KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n",
1353                   gtid, loc_ref, new_taskdata ) );
1354 
1355     /* Should we execute the new task or queue it?   For now, let's just always try to
1356        queue it.  If the queue fills up, then we'll execute it.  */
1357 
1358     if ( __kmp_push_task( gtid, new_task ) == TASK_NOT_PUSHED ) // if cannot defer
1359     {                                                           // Execute this task immediately
1360         kmp_taskdata_t * current_task = __kmp_threads[ gtid ] -> th.th_current_task;
1361         new_taskdata->td_flags.task_serial = 1;
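        // The task runs immediately (undeferred) on this thread; task_serial
        // records that it was not deferred to a task deque.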
1362         __kmp_invoke_task( gtid, new_task, current_task );
1363     }
1364 
1365     KA_TRACE(10, ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1366                   "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n", gtid, loc_ref,
1367                   new_taskdata ) );
1368 
1369     ANNOTATE_HAPPENS_BEFORE(new_task);
1370     return TASK_CURRENT_NOT_QUEUED;
1371 }
1372 
1373 //---------------------------------------------------------------------
1374 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
1375 // gtid: Global Thread ID of encountering thread
1376 // new_task: non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
1377 // serialize_immediate: if TRUE then if the task is executed immediately its execution will be serialized
1378 // returns:
1379 //
//    TASK_CURRENT_NOT_QUEUED (0) if the current task was not suspended and queued for later resumption.
//    TASK_CURRENT_QUEUED (1) if the current task was suspended and queued to be resumed later.
1382 kmp_int32
1383 __kmp_omp_task( kmp_int32 gtid, kmp_task_t * new_task, bool serialize_immediate )
1384 {
1385     kmp_taskdata_t * new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1386 
1387 #if OMPT_SUPPORT
1388     if (ompt_enabled) {
1389         new_taskdata->ompt_task_info.frame.reenter_runtime_frame =
1390             __builtin_frame_address(1);
1391     }
1392 #endif
1393 
1394     /* Should we execute the new task or queue it?   For now, let's just always try to
1395        queue it.  If the queue fills up, then we'll execute it.  */
1396 #if OMP_45_ENABLED
1397     if ( new_taskdata->td_flags.proxy == TASK_PROXY || __kmp_push_task( gtid, new_task ) == TASK_NOT_PUSHED ) // if cannot defer
1398 #else
1399     if ( __kmp_push_task( gtid, new_task ) == TASK_NOT_PUSHED ) // if cannot defer
1400 #endif
1401     {                                                           // Execute this task immediately
1402         kmp_taskdata_t * current_task = __kmp_threads[ gtid ] -> th.th_current_task;
1403         if ( serialize_immediate )
1404           new_taskdata -> td_flags.task_serial = 1;
1405         __kmp_invoke_task( gtid, new_task, current_task );
1406     }
1407 
1408 #if OMPT_SUPPORT
1409     if (ompt_enabled) {
1410         new_taskdata->ompt_task_info.frame.reenter_runtime_frame = NULL;
1411     }
1412 #endif
1413 
1414     ANNOTATE_HAPPENS_BEFORE(new_task);
1415     return TASK_CURRENT_NOT_QUEUED;
1416 }
1417 
1418 //---------------------------------------------------------------------
1419 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a non-thread-switchable task from
1420 // the parent thread only!
1421 // loc_ref: location of original task pragma (ignored)
1422 // gtid: Global Thread ID of encountering thread
1423 // new_task: non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
1424 // returns:
1425 //
//    TASK_CURRENT_NOT_QUEUED (0) if the current task was not suspended and queued for later resumption.
//    TASK_CURRENT_QUEUED (1) if the current task was suspended and queued to be resumed later.
1428 
1429 kmp_int32
1430 __kmpc_omp_task( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task)
1431 {
1432     kmp_int32 res;
1433     KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1434 
1435 #if KMP_DEBUG
1436     kmp_taskdata_t * new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1437 #endif
1438     KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n",
1439                   gtid, loc_ref, new_taskdata ) );
1440 
1441     res =  __kmp_omp_task(gtid,new_task,true);
1442 
1443     KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1444                   gtid, loc_ref, new_taskdata ) );
1445     return res;
1446 }
1447 
1448 //-------------------------------------------------------------------------------------
1449 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are complete
1450 
1451 kmp_int32
1452 __kmpc_omp_taskwait( ident_t *loc_ref, kmp_int32 gtid )
1453 {
1454     kmp_taskdata_t * taskdata;
1455     kmp_info_t * thread;
1456     int thread_finished = FALSE;
1457     KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
1458 
1459     KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref) );
1460 
1461     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1462         // GEH TODO: shouldn't we have some sort of OMPRAP API calls here to mark begin wait?
1463 
1464         thread = __kmp_threads[ gtid ];
1465         taskdata = thread -> th.th_current_task;
1466 
1467 #if OMPT_SUPPORT && OMPT_TRACE
1468         ompt_task_id_t my_task_id;
1469         ompt_parallel_id_t my_parallel_id;
1470 
1471         if (ompt_enabled) {
1472             kmp_team_t *team = thread->th.th_team;
1473             my_task_id = taskdata->ompt_task_info.task_id;
1474             my_parallel_id = team->t.ompt_team_info.parallel_id;
1475 
1476             taskdata->ompt_task_info.frame.reenter_runtime_frame = __builtin_frame_address(1);
1477             if (ompt_callbacks.ompt_callback(ompt_event_taskwait_begin)) {
1478                 ompt_callbacks.ompt_callback(ompt_event_taskwait_begin)(
1479                                 my_parallel_id, my_task_id);
1480             }
1481         }
1482 #endif
1483 
        // Debugger: The taskwait is active. Store the location and the thread that encountered the taskwait.
1485 #if USE_ITT_BUILD
1486         // Note: These values are used by ITT events as well.
1487 #endif /* USE_ITT_BUILD */
1488         taskdata->td_taskwait_counter += 1;
1489         taskdata->td_taskwait_ident    = loc_ref;
1490         taskdata->td_taskwait_thread   = gtid + 1;
1491 
1492 #if USE_ITT_BUILD
1493         void * itt_sync_obj = __kmp_itt_taskwait_object( gtid );
1494         if ( itt_sync_obj != NULL )
1495             __kmp_itt_taskwait_starting( gtid, itt_sync_obj );
1496 #endif /* USE_ITT_BUILD */
1497 
1498         bool must_wait = ! taskdata->td_flags.team_serial && ! taskdata->td_flags.final;
1499 
1500 #if OMP_45_ENABLED
1501         must_wait = must_wait || (thread->th.th_task_team != NULL && thread->th.th_task_team->tt.tt_found_proxy_tasks);
1502 #endif
1503         if (must_wait)
1504         {
1505             kmp_flag_32 flag(&(taskdata->td_incomplete_child_tasks), 0U);
1506             while ( TCR_4(taskdata -> td_incomplete_child_tasks) != 0 ) {
1507                 flag.execute_tasks(thread, gtid, FALSE, &thread_finished
1508                                    USE_ITT_BUILD_ARG(itt_sync_obj), __kmp_task_stealing_constraint );
1509             }
1510         }
1511 #if USE_ITT_BUILD
1512         if ( itt_sync_obj != NULL )
1513             __kmp_itt_taskwait_finished( gtid, itt_sync_obj );
1514 #endif /* USE_ITT_BUILD */
1515 
1516         // GEH TODO: shouldn't we have some sort of OMPRAP API calls here to mark end of wait?
1517         // Debugger:  The taskwait is completed. Location remains, but thread is negated.
1518         taskdata->td_taskwait_thread = - taskdata->td_taskwait_thread;
1519 
1520 #if OMPT_SUPPORT && OMPT_TRACE
1521         if (ompt_enabled) {
1522             if (ompt_callbacks.ompt_callback(ompt_event_taskwait_end)) {
1523                 ompt_callbacks.ompt_callback(ompt_event_taskwait_end)(
1524                                 my_parallel_id, my_task_id);
1525             }
1526             taskdata->ompt_task_info.frame.reenter_runtime_frame = NULL;
1527         }
1528 #endif
1529         ANNOTATE_HAPPENS_AFTER(taskdata);
1530     }
1531 
1532     KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
1533                   "returning TASK_CURRENT_NOT_QUEUED\n", gtid, taskdata) );
1534 
1535     return TASK_CURRENT_NOT_QUEUED;
1536 }
1537 
1538 
1539 //-------------------------------------------------
1540 // __kmpc_omp_taskyield: switch to a different task
1541 
1542 kmp_int32
1543 __kmpc_omp_taskyield( ident_t *loc_ref, kmp_int32 gtid, int end_part )
1544 {
1545     kmp_taskdata_t * taskdata;
1546     kmp_info_t * thread;
1547     int thread_finished = FALSE;
1548 
1549     KMP_COUNT_BLOCK(OMP_TASKYIELD);
1550     KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
1551 
1552     KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
1553                   gtid, loc_ref, end_part) );
1554 
1555     if ( __kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel ) {
1556         // GEH TODO: shouldn't we have some sort of OMPRAP API calls here to mark begin wait?
1557 
1558         thread = __kmp_threads[ gtid ];
1559         taskdata = thread -> th.th_current_task;
1560         // Should we model this as a task wait or not?
        // Debugger: The taskwait is active. Store the location and the thread that encountered the taskwait.
1562 #if USE_ITT_BUILD
1563         // Note: These values are used by ITT events as well.
1564 #endif /* USE_ITT_BUILD */
1565         taskdata->td_taskwait_counter += 1;
1566         taskdata->td_taskwait_ident    = loc_ref;
1567         taskdata->td_taskwait_thread   = gtid + 1;
1568 
1569 #if USE_ITT_BUILD
1570         void * itt_sync_obj = __kmp_itt_taskwait_object( gtid );
1571         if ( itt_sync_obj != NULL )
1572             __kmp_itt_taskwait_starting( gtid, itt_sync_obj );
1573 #endif /* USE_ITT_BUILD */
1574         if ( ! taskdata->td_flags.team_serial ) {
1575             kmp_task_team_t * task_team = thread->th.th_task_team;
1576             if (task_team != NULL) {
1577                 if (KMP_TASKING_ENABLED(task_team)) {
1578                     __kmp_execute_tasks_32( thread, gtid, NULL, FALSE, &thread_finished
1579                                             USE_ITT_BUILD_ARG(itt_sync_obj), __kmp_task_stealing_constraint );
1580                 }
1581             }
1582         }
1583 #if USE_ITT_BUILD
1584         if ( itt_sync_obj != NULL )
1585             __kmp_itt_taskwait_finished( gtid, itt_sync_obj );
1586 #endif /* USE_ITT_BUILD */
1587 
1588         // GEH TODO: shouldn't we have some sort of OMPRAP API calls here to mark end of wait?
1589         // Debugger:  The taskwait is completed. Location remains, but thread is negated.
1590         taskdata->td_taskwait_thread = - taskdata->td_taskwait_thread;
1591     }
1592 
1593     KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
1594                   "returning TASK_CURRENT_NOT_QUEUED\n", gtid, taskdata) );
1595 
1596     return TASK_CURRENT_NOT_QUEUED;
1597 }
1598 
1599 
1600 #if OMP_40_ENABLED
1601 //-------------------------------------------------------------------------------------
1602 // __kmpc_taskgroup: Start a new taskgroup
1603 
1604 void
1605 __kmpc_taskgroup( ident_t* loc, int gtid )
1606 {
1607     kmp_info_t      * thread = __kmp_threads[ gtid ];
1608     kmp_taskdata_t  * taskdata = thread->th.th_current_task;
1609     kmp_taskgroup_t * tg_new =
1610         (kmp_taskgroup_t *)__kmp_thread_malloc( thread, sizeof( kmp_taskgroup_t ) );
1611     KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new) );
1612     tg_new->count = 0;
1613     tg_new->cancel_request = cancel_noreq;
1614     tg_new->parent = taskdata->td_taskgroup;
1615     taskdata->td_taskgroup = tg_new;
1616 }
1617 
1618 
1619 //-------------------------------------------------------------------------------------
1620 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
1621 //                       and its descendants are complete
1622 
1623 void
1624 __kmpc_end_taskgroup( ident_t* loc, int gtid )
1625 {
1626     kmp_info_t      * thread = __kmp_threads[ gtid ];
1627     kmp_taskdata_t  * taskdata = thread->th.th_current_task;
1628     kmp_taskgroup_t * taskgroup = taskdata->td_taskgroup;
1629     int thread_finished = FALSE;
1630 
1631     KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc) );
1632     KMP_DEBUG_ASSERT( taskgroup != NULL );
1633     KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
1634 
1635     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1636 #if USE_ITT_BUILD
1637         // For ITT the taskgroup wait is similar to taskwait until we need to distinguish them
1638         void * itt_sync_obj = __kmp_itt_taskwait_object( gtid );
1639         if ( itt_sync_obj != NULL )
1640             __kmp_itt_taskwait_starting( gtid, itt_sync_obj );
1641 #endif /* USE_ITT_BUILD */
1642 
1643 #if OMP_45_ENABLED
1644         if ( ! taskdata->td_flags.team_serial || (thread->th.th_task_team != NULL && thread->th.th_task_team->tt.tt_found_proxy_tasks) )
1645 #else
1646         if ( ! taskdata->td_flags.team_serial )
1647 #endif
1648         {
1649             kmp_flag_32 flag(&(taskgroup->count), 0U);
1650             while ( TCR_4(taskgroup->count) != 0 ) {
1651                 flag.execute_tasks(thread, gtid, FALSE, &thread_finished
1652                                    USE_ITT_BUILD_ARG(itt_sync_obj), __kmp_task_stealing_constraint );
1653             }
1654         }
1655 
1656 #if USE_ITT_BUILD
1657         if ( itt_sync_obj != NULL )
1658             __kmp_itt_taskwait_finished( gtid, itt_sync_obj );
1659 #endif /* USE_ITT_BUILD */
1660     }
1661     KMP_DEBUG_ASSERT( taskgroup->count == 0 );
1662 
1663     // Restore parent taskgroup for the current task
1664     taskdata->td_taskgroup = taskgroup->parent;
1665     __kmp_thread_free( thread, taskgroup );
1666 
1667     KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n", gtid, taskdata) );
1668     ANNOTATE_HAPPENS_AFTER(taskdata);
1669 }
1670 #endif
1671 
1672 
1673 //------------------------------------------------------
1674 // __kmp_remove_my_task: remove a task from my own deque
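// gtid: global thread id of the calling thread (the owner of the deque)
// task_team: task team whose per-thread deque array is used
// is_constrained: nonzero if the task scheduling constraint must be honored
//     (a tied task is only taken if it is a child of the current task)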
1675 
1676 static kmp_task_t *
1677 __kmp_remove_my_task( kmp_info_t * thread, kmp_int32 gtid, kmp_task_team_t *task_team,
1678                       kmp_int32 is_constrained )
1679 {
1680     kmp_task_t * task;
1681     kmp_taskdata_t * taskdata;
1682     kmp_thread_data_t *thread_data;
1683     kmp_uint32 tail;
1684 
1685     KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
1686     KMP_DEBUG_ASSERT( task_team -> tt.tt_threads_data != NULL ); // Caller should check this condition
1687 
    thread_data = & task_team -> tt.tt_threads_data[ __kmp_tid_from_gtid( gtid ) ];
1689 
1690     KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
1691                   gtid, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head,
1692                   thread_data->td.td_deque_tail) );
1693 
1694     if (TCR_4(thread_data -> td.td_deque_ntasks) == 0) {
1695         KA_TRACE(10, ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: ntasks=%d head=%u tail=%u\n",
1696                       gtid, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head,
1697                       thread_data->td.td_deque_tail) );
1698         return NULL;
1699     }
1700 
1701     __kmp_acquire_bootstrap_lock( & thread_data -> td.td_deque_lock );
1702 
1703     if (TCR_4(thread_data -> td.td_deque_ntasks) == 0) {
1704         __kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
1705         KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: ntasks=%d head=%u tail=%u\n",
1706                       gtid, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head,
1707                       thread_data->td.td_deque_tail) );
1708         return NULL;
1709     }
1710 
1711     tail = ( thread_data -> td.td_deque_tail - 1 ) & TASK_DEQUE_MASK(thread_data->td);  // Wrap index.
1712     taskdata = thread_data -> td.td_deque[ tail ];
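    // The owning thread pops from the tail of its deque (LIFO order), while
    // __kmp_steal_task() removes tasks from the head (FIFO order).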
1713 
1714     if (is_constrained && (taskdata->td_flags.tiedness == TASK_TIED)) {
1715         // we need to check if the candidate obeys task scheduling constraint:
1716         // only child of current task can be scheduled
1717         kmp_taskdata_t * current = thread->th.th_current_task;
1718         kmp_int32        level = current->td_level;
1719         kmp_taskdata_t * parent = taskdata->td_parent;
1720         while ( parent != current && parent->td_level > level ) {
1721             parent = parent->td_parent;  // check generation up to the level of the current task
1722             KMP_DEBUG_ASSERT(parent != NULL);
1723         }
1724         if ( parent != current ) {
1725             // If the tail task is not a child, then no other child can appear in the deque.
1726             __kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
1727             KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: ntasks=%d head=%u tail=%u\n",
1728                           gtid, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head,
1729                           thread_data->td.td_deque_tail) );
1730             return NULL;
1731         }
1732     }
1733 
1734     thread_data -> td.td_deque_tail = tail;
1735     TCW_4(thread_data -> td.td_deque_ntasks, thread_data -> td.td_deque_ntasks - 1);
1736 
1737     __kmp_release_bootstrap_lock( & thread_data->td.td_deque_lock );
1738 
1739     KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d task %p removed: ntasks=%d head=%u tail=%u\n",
1740                   gtid, taskdata, thread_data->td.td_deque_ntasks, thread_data->td.td_deque_head,
1741                   thread_data->td.td_deque_tail) );
1742 
1743     task = KMP_TASKDATA_TO_TASK( taskdata );
1744     return task;
1745 }
1746 
1747 
1748 //-----------------------------------------------------------
1749 // __kmp_steal_task: remove a task from another thread's deque
1750 // Assume that calling thread has already checked existence of
1751 // task_team thread_data before calling this routine.
1752 
1753 static kmp_task_t *
1754 __kmp_steal_task( kmp_info_t *victim, kmp_int32 gtid, kmp_task_team_t *task_team,
1755                   volatile kmp_uint32 *unfinished_threads, int *thread_finished,
1756                   kmp_int32 is_constrained )
1757 {
1758     kmp_task_t * task;
1759     kmp_taskdata_t * taskdata;
1760     kmp_thread_data_t *victim_td, *threads_data;
1761     kmp_int32 victim_tid;
1762 
1763     KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
1764 
1765     threads_data = task_team -> tt.tt_threads_data;
1766     KMP_DEBUG_ASSERT( threads_data != NULL );  // Caller should check this condition
1767 
1768     victim_tid = victim->th.th_info.ds.ds_tid;
1769     victim_td = & threads_data[ victim_tid ];
1770 
1771     KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: task_team=%p ntasks=%d "
1772                   "head=%u tail=%u\n",
1773                   gtid, __kmp_gtid_from_thread( victim ), task_team, victim_td->td.td_deque_ntasks,
1774                   victim_td->td.td_deque_head, victim_td->td.td_deque_tail) );
1775 
1776     if ( (TCR_4(victim_td -> td.td_deque_ntasks) == 0) || // Caller should not check this condition
1777          (TCR_PTR(victim->th.th_task_team) != task_team)) // GEH: why would this happen?
1778     {
1779         KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: task_team=%p "
1780                       "ntasks=%d head=%u tail=%u\n",
1781                       gtid, __kmp_gtid_from_thread( victim ), task_team, victim_td->td.td_deque_ntasks,
1782                       victim_td->td.td_deque_head, victim_td->td.td_deque_tail) );
1783         return NULL;
1784     }
1785 
1786     __kmp_acquire_bootstrap_lock( & victim_td -> td.td_deque_lock );
1787 
1788     // Check again after we acquire the lock
1789     if ( (TCR_4(victim_td -> td.td_deque_ntasks) == 0) ||
1790          (TCR_PTR(victim->th.th_task_team) != task_team)) // GEH: why would this happen?
1791     {
1792         __kmp_release_bootstrap_lock( & victim_td -> td.td_deque_lock );
1793         KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: task_team=%p "
1794                       "ntasks=%d head=%u tail=%u\n",
1795                       gtid, __kmp_gtid_from_thread( victim ), task_team, victim_td->td.td_deque_ntasks,
1796                       victim_td->td.td_deque_head, victim_td->td.td_deque_tail) );
1797         return NULL;
1798     }
1799 
1800     KMP_DEBUG_ASSERT( victim_td -> td.td_deque != NULL );
1801 
1802     taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
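    // The candidate is the task at the head of the victim's deque; the victim
    // itself pops from the tail, so the oldest queued task is stolen here.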
1803     if ( is_constrained ) {
1804         // we need to check if the candidate obeys task scheduling constraint:
1805         // only descendant of current task can be scheduled
1806         kmp_taskdata_t * current = __kmp_threads[ gtid ]->th.th_current_task;
1807         kmp_int32        level = current->td_level;
1808         kmp_taskdata_t * parent = taskdata->td_parent;
1809         while ( parent != current && parent->td_level > level ) {
1810             parent = parent->td_parent;  // check generation up to the level of the current task
1811             KMP_DEBUG_ASSERT(parent != NULL);
1812         }
1813         if ( parent != current ) {
1814             // If the head task is not a descendant of the current task then do not
1815             // steal it. No other task in victim's deque can be a descendant of the
1816             // current task.
1817             __kmp_release_bootstrap_lock( & victim_td -> td.td_deque_lock );
1818             KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: task_team=%p "
1819                           "ntasks=%d head=%u tail=%u\n",
1820                           gtid, __kmp_gtid_from_thread( threads_data[victim_tid].td.td_thr ),
1821                           task_team, victim_td->td.td_deque_ntasks,
1822                           victim_td->td.td_deque_head, victim_td->td.td_deque_tail) );
1823             return NULL;
1824         }
1825     }
1826     // Bump head pointer and Wrap.
1827     victim_td->td.td_deque_head = (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
1828     if (*thread_finished) {
1829         // We need to un-mark this victim as a finished victim.  This must be done before
1830         // releasing the lock, or else other threads (starting with the master victim)
1831         // might be prematurely released from the barrier!!!
1832         kmp_uint32 count;
1833 
1834         count = KMP_TEST_THEN_INC32( (kmp_int32 *)unfinished_threads );
1835 
1836         KA_TRACE(20, ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
1837                       gtid, count + 1, task_team) );
1838 
1839         *thread_finished = FALSE;
1840     }
1841     TCW_4(victim_td -> td.td_deque_ntasks, TCR_4(victim_td -> td.td_deque_ntasks) - 1);
1842 
1843     __kmp_release_bootstrap_lock( & victim_td -> td.td_deque_lock );
1844 
1845     KMP_COUNT_BLOCK(TASK_stolen);
1846     KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d stole task %p from T#%d: task_team=%p "
1847                   "ntasks=%d head=%u tail=%u\n",
1848                   gtid, taskdata, __kmp_gtid_from_thread( victim ), task_team,
1849                   victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
1850                   victim_td->td.td_deque_tail) );
1851 
1852     task = KMP_TASKDATA_TO_TASK( taskdata );
1853     return task;
1854 }
1855 
1856 
1857 //-----------------------------------------------------------------------------
1858 // __kmp_execute_tasks_template: Choose and execute tasks until either the condition
// is satisfied (return true) or there are none left (return false).
1860 // final_spin is TRUE if this is the spin at the release barrier.
1861 // thread_finished indicates whether the thread is finished executing all
1862 // the tasks it has on its deque, and is at the release barrier.
1863 // spinner is the location on which to spin.
1864 // spinner == NULL means only execute a single task and return.
1865 // checker is the value to check to terminate the spin.
1866 template <class C>
1867 static inline int __kmp_execute_tasks_template(kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
1868                                                int *thread_finished
1869                                                USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained)
1870 {
1871     kmp_task_team_t *     task_team = thread->th.th_task_team;
1872     kmp_thread_data_t *   threads_data;
1873     kmp_task_t *          task;
1874     kmp_info_t *          other_thread;
1875     kmp_taskdata_t *      current_task = thread -> th.th_current_task;
1876     volatile kmp_uint32 * unfinished_threads;
1877     kmp_int32             nthreads, victim=-2, use_own_tasks=1, new_victim=0, tid=thread->th.th_info.ds.ds_tid;
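    // victim == -2 means no steal target has been chosen yet in this call; it is
    // then set from td_deque_last_stolen, where -1 means there is no remembered
    // victim and a random thread is picked instead.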
1878 
1879     KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
1880     KMP_DEBUG_ASSERT( thread == __kmp_threads[ gtid ] );
1881 
1882     if (task_team == NULL) return FALSE;
1883 
1884     KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d *thread_finished=%d\n",
1885                   gtid, final_spin, *thread_finished) );
1886 
1887     threads_data = (kmp_thread_data_t *)TCR_PTR(task_team -> tt.tt_threads_data);
1888     KMP_DEBUG_ASSERT( threads_data != NULL );
1889 
1890     nthreads = task_team -> tt.tt_nproc;
1891     unfinished_threads = &(task_team -> tt.tt_unfinished_threads);
1892 #if OMP_45_ENABLED
1893     KMP_DEBUG_ASSERT( nthreads > 1 || task_team->tt.tt_found_proxy_tasks);
1894 #else
1895     KMP_DEBUG_ASSERT( nthreads > 1 );
1896 #endif
1897     KMP_DEBUG_ASSERT( (int)(TCR_4(*unfinished_threads)) >= 0 );
1898 
1899     while (1) { // Outer loop keeps trying to find tasks in case of single thread getting tasks from target constructs
1900         while (1) { // Inner loop to find a task and execute it
1901             task = NULL;
1902             if (use_own_tasks) { // check on own queue first
1903                 task = __kmp_remove_my_task( thread, gtid, task_team, is_constrained );
1904             }
1905             if ((task == NULL) && (nthreads > 1)) { // Steal a task
1906                 int asleep = 1;
1907                 use_own_tasks = 0;
1908                 // Try to steal from the last place I stole from successfully.
1909                 if (victim == -2) { // haven't stolen anything yet
1910                     victim = threads_data[tid].td.td_deque_last_stolen;
1911                     if (victim != -1) // if we have a last stolen from victim, get the thread
1912                         other_thread = threads_data[victim].td.td_thr;
1913                 }
1914                 if (victim != -1) { // found last victim
1915                     asleep = 0;
1916                 }
1917                 else if (!new_victim) { // no recent steals and we haven't already used a new victim; select a random thread
1918                     do { // Find a different thread to steal work from.
1919                         // Pick a random thread. Initial plan was to cycle through all the threads, and only return if
1920                         // we tried to steal from every thread, and failed.  Arch says that's not such a great idea.
1921                         victim = __kmp_get_random(thread) % (nthreads - 1);
1922                         if (victim >= tid) {
1923                             ++victim;  // Adjusts random distribution to exclude self
1924                         }
1925                         // Found a potential victim
1926                         other_thread = threads_data[victim].td.td_thr;
1927                         // There is a slight chance that __kmp_enable_tasking() did not wake up all threads
1928                         // waiting at the barrier.  If victim is sleeping, then wake it up.  Since we were going to
1929                         // pay the cache miss penalty for referencing another thread's kmp_info_t struct anyway,
1930                         // the check shouldn't cost too much performance at this point. In extra barrier mode, tasks
1931                         // do not sleep at the separate tasking barrier, so this isn't a problem.
1932                         asleep = 0;
1933                         if ( ( __kmp_tasking_mode == tskm_task_teams ) &&
1934                              (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
1935                              (TCR_PTR(other_thread->th.th_sleep_loc) != NULL)) {
1936                             asleep = 1;
1937                             __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread), other_thread->th.th_sleep_loc);
                            // A sleeping thread should not have any tasks on its queue. There is a slight
1939                             // possibility that it resumes, steals a task from another thread, which spawns more
1940                             // tasks, all in the time that it takes this thread to check => don't write an assertion
1941                             // that the victim's queue is empty.  Try stealing from a different thread.
1942                         }
1943                     } while (asleep);
1944                 }
1945 
1946                 if (!asleep) {
1947                     // We have a victim to try to steal from
1948                     task = __kmp_steal_task(other_thread, gtid, task_team, unfinished_threads, thread_finished, is_constrained);
1949                 }
1950                 if (task != NULL) { // set last stolen to victim
1951                     if (threads_data[tid].td.td_deque_last_stolen != victim) {
1952                         threads_data[tid].td.td_deque_last_stolen = victim;
                        // The pre-refactored code did not try more than 1 successful new victim,
                        // unless the last one generated more local tasks; new_victim keeps track of this
1955                         new_victim = 1;
1956                     }
1957                 }
1958                 else { // No tasks found; unset last_stolen
1959                     KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
1960                     victim = -2; // no successful victim found
1961                 }
1962             }
1963 
1964             if (task == NULL) // break out of tasking loop
1965                 break;
1966 
1967             // Found a task; execute it
1968 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1969             if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) {
1970                 if ( itt_sync_obj == NULL ) { // we are at fork barrier where we could not get the object reliably
1971                     itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
1972                 }
1973                 __kmp_itt_task_starting( itt_sync_obj );
1974             }
1975 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1976             __kmp_invoke_task( gtid, task, current_task );
1977 #if USE_ITT_BUILD
1978             if ( itt_sync_obj != NULL ) __kmp_itt_task_finished( itt_sync_obj );
1979 #endif /* USE_ITT_BUILD */
1980             // If this thread is only partway through the barrier and the condition is met, then return now,
1981             // so that the barrier gather/release pattern can proceed. If this thread is in the last spin loop
1982             // in the barrier, waiting to be released, we know that the termination condition will not be
            // satisfied, so don't waste any cycles checking it.
1984             if (flag == NULL || (!final_spin && flag->done_check())) {
1985                 KA_TRACE(15, ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n", gtid) );
1986                 return TRUE;
1987             }
1988             if (thread->th.th_task_team == NULL) {
1989                 break;
1990             }
1991             KMP_YIELD( __kmp_library == library_throughput );   // Yield before executing next task
1992             // If execution of a stolen task results in more tasks being placed on our run queue, reset use_own_tasks
1993             if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
1994                 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned other tasks, restart\n", gtid));
1995                 use_own_tasks = 1;
1996                 new_victim = 0;
1997             }
1998         }
1999 
2000         // The task source has been exhausted. If in final spin loop of barrier, check if termination condition is satisfied.
2001 #if OMP_45_ENABLED
2002         // The work queue may be empty but there might be proxy tasks still executing
2003         if (final_spin && TCR_4(current_task->td_incomplete_child_tasks) == 0)
2004 #else
2005         if (final_spin)
2006 #endif
2007         {
2008             // First, decrement the #unfinished threads, if that has not already been done.  This decrement
2009             // might be to the spin location, and result in the termination condition being satisfied.
2010             if (! *thread_finished) {
2011                 kmp_uint32 count;
2012 
2013                 count = KMP_TEST_THEN_DEC32( (kmp_int32 *)unfinished_threads ) - 1;
2014                 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec unfinished_threads to %d task_team=%p\n",
2015                               gtid, count, task_team) );
2016                 *thread_finished = TRUE;
2017             }
2018 
2019             // It is now unsafe to reference thread->th.th_team !!!
2020             // Decrementing task_team->tt.tt_unfinished_threads can allow the master thread to pass through
2021             // the barrier, where it might reset each thread's th.th_team field for the next parallel region.
2022             // If we can steal more work, we know that this has not happened yet.
2023             if (flag != NULL && flag->done_check()) {
2024                 KA_TRACE(15, ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n", gtid) );
2025                 return TRUE;
2026             }
2027         }
2028 
2029         // If this thread's task team is NULL, master has recognized that there are no more tasks; bail out
2030         if (thread->th.th_task_team == NULL) {
2031             KA_TRACE(15, ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid) );
2032             return FALSE;
2033         }
2034 
2035 #if OMP_45_ENABLED
2036         // We could be getting tasks from target constructs; if this is the only thread, keep trying to execute
2037         // tasks from own queue
2038         if (nthreads == 1)
2039             use_own_tasks = 1;
2040         else
2041 #endif
2042         {
2043             KA_TRACE(15, ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid) );
2044             return FALSE;
2045         }
2046     }
2047 }
2048 
2049 int __kmp_execute_tasks_32(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
2050                            int *thread_finished
2051                            USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained)
2052 {
2053     return __kmp_execute_tasks_template(thread, gtid, flag, final_spin, thread_finished
2054                                         USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2055 }
2056 
2057 int __kmp_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
2058                            int *thread_finished
2059                            USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained)
2060 {
2061     return __kmp_execute_tasks_template(thread, gtid, flag, final_spin, thread_finished
2062                                         USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2063 }
2064 
2065 int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
2066                                int *thread_finished
2067                                USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained)
2068 {
2069     return __kmp_execute_tasks_template(thread, gtid, flag, final_spin, thread_finished
2070                                         USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
2071 }
2072 
2073 
2074 
2075 //-----------------------------------------------------------------------------
// __kmp_enable_tasking: Set up the task team's threads_data array and resume
// threads sleeping at the next barrier so they can assist in executing enqueued tasks.
// The first thread in sets up the array atomically.
2079 
2080 static void
2081 __kmp_enable_tasking( kmp_task_team_t *task_team, kmp_info_t *this_thr )
2082 {
2083     kmp_thread_data_t *threads_data;
2084     int nthreads, i, is_init_thread;
2085 
2086     KA_TRACE( 10, ( "__kmp_enable_tasking(enter): T#%d\n",
2087                     __kmp_gtid_from_thread( this_thr ) ) );
2088 
2089     KMP_DEBUG_ASSERT(task_team != NULL);
2090     KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
2091 
2092     nthreads = task_team->tt.tt_nproc;
2093     KMP_DEBUG_ASSERT(nthreads > 0);
2094     KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
2095 
2096     // Allocate or increase the size of threads_data if necessary
2097     is_init_thread = __kmp_realloc_task_threads_data( this_thr, task_team );
2098 
2099     if (!is_init_thread) {
2100         // Some other thread already set up the array.
2101         KA_TRACE( 20, ( "__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
2102                         __kmp_gtid_from_thread( this_thr ) ) );
2103         return;
2104     }
2105     threads_data = (kmp_thread_data_t *)TCR_PTR(task_team -> tt.tt_threads_data);
2106     KMP_DEBUG_ASSERT( threads_data != NULL );
2107 
2108     if ( ( __kmp_tasking_mode == tskm_task_teams ) &&
2109          ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) )
2110     {
2111         // Release any threads sleeping at the barrier, so that they can steal
2112         // tasks and execute them.  In extra barrier mode, tasks do not sleep
2113         // at the separate tasking barrier, so this isn't a problem.
2114         for (i = 0; i < nthreads; i++) {
2115             volatile void *sleep_loc;
2116             kmp_info_t *thread = threads_data[i].td.td_thr;
2117 
2118             if (i == this_thr->th.th_info.ds.ds_tid) {
2119                 continue;
2120             }
2121             // Since we haven't locked the thread's suspend mutex lock at this
2122             // point, there is a small window where a thread might be putting
2123             // itself to sleep, but hasn't set the th_sleep_loc field yet.
2124             // To work around this, __kmp_execute_tasks_template() periodically checks
            // to see if other threads are sleeping (using the same random
2126             // mechanism that is used for task stealing) and awakens them if
2127             // they are.
2128             if ( ( sleep_loc = TCR_PTR( thread -> th.th_sleep_loc) ) != NULL )
2129             {
2130                 KF_TRACE( 50, ( "__kmp_enable_tasking: T#%d waking up thread T#%d\n",
2131                                  __kmp_gtid_from_thread( this_thr ),
2132                                  __kmp_gtid_from_thread( thread ) ) );
2133                 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
2134             }
2135             else {
2136                 KF_TRACE( 50, ( "__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
2137                                  __kmp_gtid_from_thread( this_thr ),
2138                                  __kmp_gtid_from_thread( thread ) ) );
2139             }
2140         }
2141     }
2142 
2143     KA_TRACE( 10, ( "__kmp_enable_tasking(exit): T#%d\n",
2144                     __kmp_gtid_from_thread( this_thr ) ) );
2145 }
2146 
2147 
2148 /* ------------------------------------------------------------------------ */
/* // TODO: Check the comment consistency
 * Utility routines for "task teams".  A task team (kmp_task_team_t) is kind of
 * like a shadow of the kmp_team_t data struct, with a different lifetime.
 * After a child thread checks into a barrier and calls __kmp_release() from
 * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
 * longer assume that the kmp_team_t structure is intact (at any moment, the
 * master thread may exit the barrier code and free the team data structure,
 * and return the threads to the thread pool).
 *
 * This does not work with the tasking code, as the thread is still
 * expected to participate in the execution of any tasks that may have been
 * spawned by a member of the team, and the thread still needs access to
 * each thread in the team, so that it can steal work from it.
 *
 * Enter the existence of the kmp_task_team_t struct.  It employs a reference
 * counting mechanism, and is allocated by the master thread before calling
 * __kmp_<barrier_kind>_release, and then is released by the last thread to
 * exit __kmp_<barrier_kind>_release at the next barrier.  I.e. the lifetimes
 * of the kmp_task_team_t structs for consecutive barriers can overlap
 * (and will, unless the master thread is the last thread to exit the barrier
 * release phase, which is not typical).
 *
 * The existence of such a struct is useful outside the context of tasking,
 * but for now, I'm trying to keep it specific to the OMP_30_ENABLED macro,
 * so that any performance differences show up when comparing the 2.5 vs. 3.0
 * libraries.
 *
 * We currently use the existence of the threads array as an indicator that
 * tasks were spawned since the last barrier.  If the structure is to be
 * useful outside the context of tasking, then this will have to change, but
 * not setting the field minimizes the performance impact of tasking on
 * barriers, when no explicit tasks were spawned (pushed, actually).
 */
2182 
2183 
2184 static kmp_task_team_t *__kmp_free_task_teams = NULL;           // Free list for task_team data structures
2185 // Lock for task team data structures
2186 static kmp_bootstrap_lock_t __kmp_task_team_lock = KMP_BOOTSTRAP_LOCK_INITIALIZER( __kmp_task_team_lock );
2187 
2188 
2189 //------------------------------------------------------------------------------
2190 // __kmp_alloc_task_deque:
// Allocates a task deque for a particular thread, and initializes the necessary
2192 // data structures relating to the deque.  This only happens once per thread
2193 // per task team since task teams are recycled.
2194 // No lock is needed during allocation since each thread allocates its own
2195 // deque.
2196 
2197 static void
2198 __kmp_alloc_task_deque( kmp_info_t *thread, kmp_thread_data_t *thread_data )
2199 {
2200     __kmp_init_bootstrap_lock( & thread_data -> td.td_deque_lock );
2201     KMP_DEBUG_ASSERT( thread_data -> td.td_deque == NULL );
2202 
2203     // Initialize last stolen task field to "none"
2204     thread_data -> td.td_deque_last_stolen = -1;
2205 
2206     KMP_DEBUG_ASSERT( TCR_4(thread_data -> td.td_deque_ntasks) == 0 );
2207     KMP_DEBUG_ASSERT( thread_data -> td.td_deque_head == 0 );
2208     KMP_DEBUG_ASSERT( thread_data -> td.td_deque_tail == 0 );
2209 
2210     KE_TRACE( 10, ( "__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
2211                    __kmp_gtid_from_thread( thread ), INITIAL_TASK_DEQUE_SIZE, thread_data ) );
2212     // Allocate space for task deque, and zero the deque
2213     // Cannot use __kmp_thread_calloc() because threads not around for
2214     // kmp_reap_task_team( ).
2215     thread_data -> td.td_deque = (kmp_taskdata_t **)
2216             __kmp_allocate( INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
    thread_data -> td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
2218 }
2219 
2220 //------------------------------------------------------------------------------
2221 // __kmp_realloc_task_deque:
2222 // Re-allocates a task deque for a particular thread, copies the content from the old deque
2223 // and adjusts the necessary data structures relating to the deque.
// This operation must be done with the deque_lock held
2225 
2226 static void __kmp_realloc_task_deque ( kmp_info_t *thread, kmp_thread_data_t *thread_data )
2227 {
2228     kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
2229     kmp_int32 new_size = 2 * size;
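    // The deque capacity only ever doubles here, so it stays a power of two and
    // TASK_DEQUE_MASK() can wrap indices with a cheap bitwise AND.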
2230 
2231     KE_TRACE( 10, ( "__kmp_realloc_task_deque: T#%d reallocating deque[from %d to %d] for thread_data %p\n",
2232                   __kmp_gtid_from_thread( thread ), size, new_size, thread_data ) );
2233 
2234     kmp_taskdata_t ** new_deque = (kmp_taskdata_t **) __kmp_allocate( new_size * sizeof(kmp_taskdata_t *));
2235 
2236     int i,j;
2237     for ( i = thread_data->td.td_deque_head, j = 0; j < size; i = (i+1) & TASK_DEQUE_MASK(thread_data->td), j++ )
2238        new_deque[j] = thread_data->td.td_deque[i];
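    // The copy starts at the old head and walks the ring in order, so the existing
    // entries land at indices [0, size) of the new deque; head/tail are reset below.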
2239 
2240     __kmp_free(thread_data->td.td_deque);
2241 
2242     thread_data -> td.td_deque_head = 0;
2243     thread_data -> td.td_deque_tail = size;
2244     thread_data -> td.td_deque = new_deque;
2245     thread_data -> td.td_deque_size = new_size;
2246 }
2247 
2248 //------------------------------------------------------------------------------
2249 // __kmp_free_task_deque:
2250 // Deallocates a task deque for a particular thread.
2251 // Happens at library deallocation so don't need to reset all thread data fields.
2252 
2253 static void
2254 __kmp_free_task_deque( kmp_thread_data_t *thread_data )
2255 {
2256     __kmp_acquire_bootstrap_lock( & thread_data -> td.td_deque_lock );
2257 
2258     if ( thread_data -> td.td_deque != NULL ) {
2259         TCW_4(thread_data -> td.td_deque_ntasks, 0);
2260          __kmp_free( thread_data -> td.td_deque );
2261         thread_data -> td.td_deque = NULL;
2262     }
2263     __kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
2264 
2265 #ifdef BUILD_TIED_TASK_STACK
2266     // GEH: Figure out what to do here for td_susp_tied_tasks
2267     if ( thread_data -> td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY ) {
2268         __kmp_free_task_stack( __kmp_thread_from_gtid( gtid ), thread_data );
2269     }
2270 #endif // BUILD_TIED_TASK_STACK
2271 }
2272 
2273 
2274 //------------------------------------------------------------------------------
2275 // __kmp_realloc_task_threads_data:
2276 // Allocates a threads_data array for a task team, either by allocating an initial
2277 // array or enlarging an existing array.  Only the first thread to get the lock
// allocs or enlarges the array and re-initializes the array elements.
2279 // That thread returns "TRUE", the rest return "FALSE".
2280 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
2281 // The current size is given by task_team -> tt.tt_max_threads.
2282 
2283 static int
2284 __kmp_realloc_task_threads_data( kmp_info_t *thread, kmp_task_team_t *task_team )
2285 {
2286     kmp_thread_data_t ** threads_data_p;
2287     kmp_int32            nthreads, maxthreads;
2288     int                  is_init_thread = FALSE;
2289 
2290     if ( TCR_4(task_team -> tt.tt_found_tasks) ) {
2291         // Already reallocated and initialized.
2292         return FALSE;
2293     }
2294 
2295     threads_data_p = & task_team -> tt.tt_threads_data;
2296     nthreads   = task_team -> tt.tt_nproc;
2297     maxthreads = task_team -> tt.tt_max_threads;
2298 
2299     // All threads must lock when they encounter the first task of the implicit task
2300     // region to make sure threads_data fields are (re)initialized before used.
2301     __kmp_acquire_bootstrap_lock( & task_team -> tt.tt_threads_lock );
2302 
2303     if ( ! TCR_4(task_team -> tt.tt_found_tasks) ) {
2304         // first thread to enable tasking
2305         kmp_team_t *team = thread -> th.th_team;
2306         int i;
2307 
2308         is_init_thread = TRUE;
2309         if ( maxthreads < nthreads ) {
2310 
2311             if ( *threads_data_p != NULL ) {
2312                 kmp_thread_data_t *old_data = *threads_data_p;
2313                 kmp_thread_data_t *new_data = NULL;
2314 
2315                 KE_TRACE( 10, ( "__kmp_realloc_task_threads_data: T#%d reallocating "
2316                                "threads data for task_team %p, new_size = %d, old_size = %d\n",
2317                                __kmp_gtid_from_thread( thread ), task_team,
2318                                nthreads, maxthreads ) );
2319                 // Reallocate threads_data to have more elements than current array
2320                 // Cannot use __kmp_thread_realloc() because threads not around for
2321                 // kmp_reap_task_team( ).  Note all new array entries are initialized
2322                 // to zero by __kmp_allocate().
2323                 new_data = (kmp_thread_data_t *)
2324                             __kmp_allocate( nthreads * sizeof(kmp_thread_data_t) );
2325                 // copy old data to new data
2326                 KMP_MEMCPY_S( (void *) new_data, nthreads * sizeof(kmp_thread_data_t),
2327                               (void *) old_data,
                              maxthreads * sizeof(kmp_thread_data_t) );  // copy whole kmp_thread_data_t elements
2329 
2330 #ifdef BUILD_TIED_TASK_STACK
2331                 // GEH: Figure out if this is the right thing to do
2332                 for (i = maxthreads; i < nthreads; i++) {
2333                     kmp_thread_data_t *thread_data = & (*threads_data_p)[i];
2334                     __kmp_init_task_stack( __kmp_gtid_from_thread( thread ), thread_data );
2335                 }
2336 #endif // BUILD_TIED_TASK_STACK
2337                 // Install the new data and free the old data
2338                 (*threads_data_p) = new_data;
2339                 __kmp_free( old_data );
2340             }
2341             else {
2342                 KE_TRACE( 10, ( "__kmp_realloc_task_threads_data: T#%d allocating "
2343                                "threads data for task_team %p, size = %d\n",
2344                                __kmp_gtid_from_thread( thread ), task_team, nthreads ) );
2345                 // Make the initial allocate for threads_data array, and zero entries
2346                 // Cannot use __kmp_thread_calloc() because threads not around for
2347                 // kmp_reap_task_team( ).
2348                 ANNOTATE_IGNORE_WRITES_BEGIN();
2349                 *threads_data_p = (kmp_thread_data_t *)
2350                                   __kmp_allocate( nthreads * sizeof(kmp_thread_data_t) );
2351                 ANNOTATE_IGNORE_WRITES_END();
2352 #ifdef BUILD_TIED_TASK_STACK
2353                 // GEH: Figure out if this is the right thing to do
2354                 for (i = 0; i < nthreads; i++) {
2355                     kmp_thread_data_t *thread_data = & (*threads_data_p)[i];
2356                     __kmp_init_task_stack( __kmp_gtid_from_thread( thread ), thread_data );
2357                 }
2358 #endif // BUILD_TIED_TASK_STACK
2359             }
2360             task_team -> tt.tt_max_threads = nthreads;
2361         }
2362         else {
2363             // If array has (more than) enough elements, go ahead and use it
2364             KMP_DEBUG_ASSERT( *threads_data_p != NULL );
2365         }
2366 
2367         // initialize threads_data pointers back to thread_info structures
2368         for (i = 0; i < nthreads; i++) {
2369             kmp_thread_data_t *thread_data = & (*threads_data_p)[i];
2370             thread_data -> td.td_thr = team -> t.t_threads[i];
2371 
2372             if ( thread_data -> td.td_deque_last_stolen >= nthreads) {
2373                 // The last stolen field survives across teams / barrier, and the number
2374                 // of threads may have changed.  It's possible (likely?) that a new
2375                 // parallel region will exhibit the same behavior as the previous region.
2376                 thread_data -> td.td_deque_last_stolen = -1;
2377             }
2378         }
2379 
2380         KMP_MB();
2381         TCW_SYNC_4(task_team -> tt.tt_found_tasks, TRUE);
2382     }
2383 
2384     __kmp_release_bootstrap_lock( & task_team -> tt.tt_threads_lock );
2385     return is_init_thread;
2386 }
2387 
2388 
2389 //------------------------------------------------------------------------------
2390 // __kmp_free_task_threads_data:
2391 // Deallocates a threads_data array for a task team, including any attached
2392 // tasking deques.  Only occurs at library shutdown.
2393 
2394 static void
2395 __kmp_free_task_threads_data( kmp_task_team_t *task_team )
2396 {
2397     __kmp_acquire_bootstrap_lock( & task_team -> tt.tt_threads_lock );
2398     if ( task_team -> tt.tt_threads_data != NULL ) {
2399         int i;
2400         for (i = 0; i < task_team->tt.tt_max_threads; i++ ) {
2401             __kmp_free_task_deque( & task_team -> tt.tt_threads_data[i] );
2402         }
2403         __kmp_free( task_team -> tt.tt_threads_data );
2404         task_team -> tt.tt_threads_data = NULL;
2405     }
2406     __kmp_release_bootstrap_lock( & task_team -> tt.tt_threads_lock );
2407 }
2408 
2409 
2410 //------------------------------------------------------------------------------
2411 // __kmp_allocate_task_team:
2412 // Allocates a task team associated with a specific team, taking it from
2413 // the global task team free list if possible.  Also initializes data structures.
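//
// Free task teams form a simple LIFO list linked through tt.tt_next and headed
// by __kmp_free_task_teams.  A minimal sketch of the pop performed below while
// holding __kmp_task_team_lock (the actual code uses TCW_PTR for the head update):
//
//     task_team             = __kmp_free_task_teams;   // take the head
//     __kmp_free_task_teams = task_team->tt.tt_next;   // unlink it
//     task_team->tt.tt_next = NULL;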
2414 
2415 static kmp_task_team_t *
2416 __kmp_allocate_task_team( kmp_info_t *thread, kmp_team_t *team )
2417 {
2418     kmp_task_team_t *task_team = NULL;
2419     int nthreads;
2420 
2421     KA_TRACE( 20, ( "__kmp_allocate_task_team: T#%d entering; team = %p\n",
2422                     (thread ? __kmp_gtid_from_thread( thread ) : -1), team ) );
2423 
2424     if (TCR_PTR(__kmp_free_task_teams) != NULL) {
2425         // Take a task team from the task team pool
2426         __kmp_acquire_bootstrap_lock( &__kmp_task_team_lock );
2427         if (__kmp_free_task_teams != NULL) {
2428             task_team = __kmp_free_task_teams;
2429             TCW_PTR(__kmp_free_task_teams, task_team -> tt.tt_next);
2430             task_team -> tt.tt_next = NULL;
2431         }
2432         __kmp_release_bootstrap_lock( &__kmp_task_team_lock );
2433     }
2434 
2435     if (task_team == NULL) {
2436         KE_TRACE( 10, ( "__kmp_allocate_task_team: T#%d allocating "
2437                        "task team for team %p\n",
2438                        __kmp_gtid_from_thread( thread ), team ) );
2439         // Allocate a new task team if one is not available.
        // Cannot use __kmp_thread_malloc() because the threads may no longer be
        // around when __kmp_reap_task_teams( ) is called.
2442         task_team = (kmp_task_team_t *) __kmp_allocate( sizeof(kmp_task_team_t) );
2443         __kmp_init_bootstrap_lock( & task_team -> tt.tt_threads_lock );
2444         //task_team -> tt.tt_threads_data = NULL;   // AC: __kmp_allocate zeroes returned memory
2445         //task_team -> tt.tt_max_threads = 0;
2446         //task_team -> tt.tt_next = NULL;
2447     }
2448 
2449     TCW_4(task_team -> tt.tt_found_tasks, FALSE);
2450 #if OMP_45_ENABLED
2451     TCW_4(task_team -> tt.tt_found_proxy_tasks, FALSE);
2452 #endif
2453     task_team -> tt.tt_nproc = nthreads = team->t.t_nproc;
2454 
2455     TCW_4( task_team -> tt.tt_unfinished_threads, nthreads );
2456     TCW_4( task_team -> tt.tt_active, TRUE );
2457 
2458     KA_TRACE( 20, ( "__kmp_allocate_task_team: T#%d exiting; task_team = %p unfinished_threads init'd to %d\n",
2459                     (thread ? __kmp_gtid_from_thread( thread ) : -1), task_team, task_team -> tt.tt_unfinished_threads) );
2460     return task_team;
2461 }
2462 
2463 
2464 //------------------------------------------------------------------------------
2465 // __kmp_free_task_team:
2466 // Frees the task team associated with a specific thread, and adds it
2467 // to the global task team free list.
2468 
2469 void
2470 __kmp_free_task_team( kmp_info_t *thread, kmp_task_team_t *task_team )
2471 {
2472     KA_TRACE( 20, ( "__kmp_free_task_team: T#%d task_team = %p\n",
2473                     thread ? __kmp_gtid_from_thread( thread ) : -1, task_team ) );
2474 
2475     // Put task team back on free list
2476     __kmp_acquire_bootstrap_lock( & __kmp_task_team_lock );
2477 
2478     KMP_DEBUG_ASSERT( task_team -> tt.tt_next == NULL );
2479     task_team -> tt.tt_next = __kmp_free_task_teams;
2480     TCW_PTR(__kmp_free_task_teams, task_team);
2481 
2482     __kmp_release_bootstrap_lock( & __kmp_task_team_lock );
2483 }
2484 
2485 
2486 //------------------------------------------------------------------------------
2487 // __kmp_reap_task_teams:
2488 // Free all the task teams on the task team free list.
2489 // Should only be done during library shutdown.
2490 // Cannot do anything that needs a thread structure or gtid since they are already gone.
2491 
2492 void
2493 __kmp_reap_task_teams( void )
2494 {
2495     kmp_task_team_t   *task_team;
2496 
2497     if ( TCR_PTR(__kmp_free_task_teams) != NULL ) {
2498         // Free all task_teams on the free list
2499         __kmp_acquire_bootstrap_lock( &__kmp_task_team_lock );
2500         while ( ( task_team = __kmp_free_task_teams ) != NULL ) {
2501             __kmp_free_task_teams = task_team -> tt.tt_next;
2502             task_team -> tt.tt_next = NULL;
2503 
2504             // Free threads_data if necessary
2505             if ( task_team -> tt.tt_threads_data != NULL ) {
2506                 __kmp_free_task_threads_data( task_team );
2507             }
2508             __kmp_free( task_team );
2509         }
2510         __kmp_release_bootstrap_lock( &__kmp_task_team_lock );
2511     }
2512 }
2513 
2514 //------------------------------------------------------------------------------
2515 // __kmp_wait_to_unref_task_teams:
2516 // Some threads could still be in the fork barrier release code, possibly
2517 // trying to steal tasks.  Wait for each thread to unreference its task team.
2518 //
2519 void
2520 __kmp_wait_to_unref_task_teams(void)
2521 {
2522     kmp_info_t *thread;
2523     kmp_uint32 spins;
2524     int done;
2525 
2526     KMP_INIT_YIELD( spins );
2527 
2528     for (;;) {
2529         done = TRUE;
2530 
        // TODO: GEH - this may be wrong because some synchronization would be necessary
        //             in case threads are added to the pool during the traversal.
        //             Need to verify that the lock for the thread pool is held when
        //             calling this routine.
2535         for (thread = (kmp_info_t *)__kmp_thread_pool;
2536              thread != NULL;
2537              thread = thread->th.th_next_pool)
2538         {
2539 #if KMP_OS_WINDOWS
2540             DWORD exit_val;
2541 #endif
2542             if ( TCR_PTR(thread->th.th_task_team) == NULL ) {
2543                 KA_TRACE( 10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
2544                                __kmp_gtid_from_thread( thread ) ) );
2545                 continue;
2546             }
2547 #if KMP_OS_WINDOWS
2548             // TODO: GEH - add this check for Linux* OS / OS X* as well?
2549             if (!__kmp_is_thread_alive(thread, &exit_val)) {
2550                 thread->th.th_task_team = NULL;
2551                 continue;
2552             }
2553 #endif
2554 
2555             done = FALSE;  // Because th_task_team pointer is not NULL for this thread
2556 
2557             KA_TRACE( 10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to unreference task_team\n",
2558                            __kmp_gtid_from_thread( thread ) ) );
2559 
2560             if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
2561                 volatile void *sleep_loc;
2562                 // If the thread is sleeping, awaken it.
2563                 if ( ( sleep_loc = TCR_PTR( thread->th.th_sleep_loc) ) != NULL ) {
2564                     KA_TRACE( 10, ( "__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
2565                                     __kmp_gtid_from_thread( thread ), __kmp_gtid_from_thread( thread ) ) );
2566                     __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
2567                 }
2568             }
2569         }
2570         if (done) {
2571             break;
2572         }
2573 
2574         // If we are oversubscribed,
2575         // or have waited a bit (and library mode is throughput), yield.
2576         // Pause is in the following code.
2577         KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2578         KMP_YIELD_SPIN( spins );        // Yields only if KMP_LIBRARY=throughput
2579     }
2580 }
2581 
2582 
2583 //------------------------------------------------------------------------------
2584 // __kmp_task_team_setup:  Create a task_team for the current team, but use
2585 // an already created, unused one if it already exists.
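//
// A minimal sketch of the double-buffered ("parity") scheme assumed below: the
// team keeps two task-team slots, indexed by the encountering thread's th_task_state,
//
//     kmp_task_team_t *current = team->t.t_task_team[ this_thr->th.th_task_state ];     // in use now
//     kmp_task_team_t *next    = team->t.t_task_team[ 1 - this_thr->th.th_task_state ]; // for the next region
//
// The master prepares 'next' here; workers pick it up in __kmp_task_team_sync()
// when they toggle th_task_state after the barrier release.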
2586 void
2587 __kmp_task_team_setup( kmp_info_t *this_thr, kmp_team_t *team, int always )
2588 {
2589     KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
2590 
2591     // If this task_team hasn't been created yet, allocate it. It will be used in the region after the next.
2592     // If it exists, it is the current task team and shouldn't be touched yet as it may still be in use.
2593     if (team->t.t_task_team[this_thr->th.th_task_state] == NULL && (always || team->t.t_nproc > 1) ) {
2594         team->t.t_task_team[this_thr->th.th_task_state] = __kmp_allocate_task_team( this_thr, team );
2595         KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p for team %d at parity=%d\n",
2596                       __kmp_gtid_from_thread(this_thr), team->t.t_task_team[this_thr->th.th_task_state],
2597                       ((team != NULL) ? team->t.t_id : -1), this_thr->th.th_task_state));
2598     }
2599 
2600     // After threads exit the release, they will call sync, and then point to this other task_team; make sure it is
2601     // allocated and properly initialized. As threads spin in the barrier release phase, they will continue to use the
2602     // previous task_team struct(above), until they receive the signal to stop checking for tasks (they can't safely
2603     // reference the kmp_team_t struct, which could be reallocated by the master thread). No task teams are formed for
2604     // serialized teams.
2605     if (team->t.t_nproc > 1) {
2606         int other_team = 1 - this_thr->th.th_task_state;
        if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
            team->t.t_task_team[other_team] = __kmp_allocate_task_team( this_thr, team );
            KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new task_team %p for team %d at parity=%d\n",
                          __kmp_gtid_from_thread( this_thr ), team->t.t_task_team[other_team],
                          ((team != NULL) ? team->t.t_id : -1), other_team ));
        }
2613         else { // Leave the old task team struct in place for the upcoming region; adjust as needed
2614             kmp_task_team_t *task_team = team->t.t_task_team[other_team];
2615             if (!task_team->tt.tt_active || team->t.t_nproc != task_team->tt.tt_nproc) {
2616                 TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
2617                 TCW_4(task_team->tt.tt_found_tasks, FALSE);
2618 #if OMP_45_ENABLED
2619                 TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
2620 #endif
2621                 TCW_4(task_team->tt.tt_unfinished_threads, team->t.t_nproc );
2622                 TCW_4(task_team->tt.tt_active, TRUE );
2623             }
2624             // if team size has changed, the first thread to enable tasking will realloc threads_data if necessary
2625             KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team %p for team %d at parity=%d\n",
2626                           __kmp_gtid_from_thread( this_thr ), team->t.t_task_team[other_team],
2627                           ((team != NULL) ? team->t.t_id : -1), other_team ));
2628         }
2629     }
2630 }
2631 
2632 
2633 //------------------------------------------------------------------------------
2634 // __kmp_task_team_sync: Propagation of task team data from team to threads
2635 // which happens just after the release phase of a team barrier.  This may be
2636 // called by any thread, but only for teams with # threads > 1.
2637 
2638 void
2639 __kmp_task_team_sync( kmp_info_t *this_thr, kmp_team_t *team )
2640 {
2641     KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
2642 
2643     // Toggle the th_task_state field, to switch which task_team this thread refers to
2644     this_thr->th.th_task_state = 1 - this_thr->th.th_task_state;
2645     // It is now safe to propagate the task team pointer from the team struct to the current thread.
2646     TCW_PTR(this_thr->th.th_task_team, team->t.t_task_team[this_thr->th.th_task_state]);
2647     KA_TRACE(20, ("__kmp_task_team_sync: Thread T#%d task team switched to task_team %p from Team #%d (parity=%d)\n",
2648                   __kmp_gtid_from_thread( this_thr ), this_thr->th.th_task_team,
2649                   ((team != NULL) ? team->t.t_id : -1), this_thr->th.th_task_state));
2650 }
2651 
2652 
2653 //--------------------------------------------------------------------------------------------
2654 // __kmp_task_team_wait: Master thread waits for outstanding tasks after the barrier gather
2655 // phase.  Only called by master thread if #threads in team > 1 or if proxy tasks were created.
2656 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off by passing in 0
2657 // optionally as the last argument. When wait is zero, master thread does not wait for
2658 // unfinished_threads to reach 0.
2659 void
2660 __kmp_task_team_wait( kmp_info_t *this_thr, kmp_team_t *team
2661                       USE_ITT_BUILD_ARG(void * itt_sync_obj)
2662                       , int wait)
2663 {
2664     kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
2665 
2666     KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
2667     KMP_DEBUG_ASSERT( task_team == this_thr->th.th_task_team );
2668 
2669     if ( ( task_team != NULL ) && KMP_TASKING_ENABLED(task_team) ) {
2670         if (wait) {
2671             KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks (for unfinished_threads to reach 0) on task_team = %p\n",
2672                           __kmp_gtid_from_thread(this_thr), task_team));
2673             // Worker threads may have dropped through to release phase, but could still be executing tasks. Wait
2674             // here for tasks to complete. To avoid memory contention, only master thread checks termination condition.
2675             kmp_flag_32 flag(&task_team->tt.tt_unfinished_threads, 0U);
2676             flag.wait(this_thr, TRUE
2677                       USE_ITT_BUILD_ARG(itt_sync_obj));
2678         }
2679         // Deactivate the old task team, so that the worker threads will stop referencing it while spinning.
2680         KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: "
2681                       "setting active to false, setting local and team's pointer to NULL\n",
2682                       __kmp_gtid_from_thread(this_thr), task_team));
2683 #if OMP_45_ENABLED
2684         KMP_DEBUG_ASSERT( task_team->tt.tt_nproc > 1 || task_team->tt.tt_found_proxy_tasks == TRUE );
2685         TCW_SYNC_4( task_team->tt.tt_found_proxy_tasks, FALSE );
2686 #else
2687         KMP_DEBUG_ASSERT( task_team->tt.tt_nproc > 1 );
2688 #endif
2689         TCW_SYNC_4( task_team->tt.tt_active, FALSE );
2690         KMP_MB();
2691 
2692         TCW_PTR(this_thr->th.th_task_team, NULL);
2693     }
2694 }
2695 
2696 
2697 //------------------------------------------------------------------------------
2698 // __kmp_tasking_barrier:
// This routine may only be called when __kmp_tasking_mode == tskm_extra_barrier.
2700 // Internal function to execute all tasks prior to a regular barrier or a
2701 // join barrier.  It is a full barrier itself, which unfortunately turns
2702 // regular barriers into double barriers and join barriers into 1 1/2
2703 // barriers.
2704 void
2705 __kmp_tasking_barrier( kmp_team_t *team, kmp_info_t *thread, int gtid )
2706 {
2707     volatile kmp_uint32 *spin = &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads;
2708     int flag = FALSE;
2709     KMP_DEBUG_ASSERT( __kmp_tasking_mode == tskm_extra_barrier );
2710 
2711 #if USE_ITT_BUILD
2712     KMP_FSYNC_SPIN_INIT( spin, (kmp_uint32*) NULL );
2713 #endif /* USE_ITT_BUILD */
2714     kmp_flag_32 spin_flag(spin, 0U);
2715     while (! spin_flag.execute_tasks(thread, gtid, TRUE, &flag
2716                                      USE_ITT_BUILD_ARG(NULL), 0 ) ) {
2717 #if USE_ITT_BUILD
2718         // TODO: What about itt_sync_obj??
2719         KMP_FSYNC_SPIN_PREPARE( spin );
2720 #endif /* USE_ITT_BUILD */
2721 
2722         if( TCR_4(__kmp_global.g.g_done) ) {
2723             if( __kmp_global.g.g_abort )
2724                 __kmp_abort_thread( );
2725             break;
2726         }
2727         KMP_YIELD( TRUE );       // GH: We always yield here
2728     }
2729 #if USE_ITT_BUILD
2730     KMP_FSYNC_SPIN_ACQUIRED( (void*) spin );
2731 #endif /* USE_ITT_BUILD */
2732 }
2733 
2734 
2735 #if OMP_45_ENABLED
2736 
/* __kmp_give_task puts a task into a given thread's queue if:
    - the queue for that thread has been created
    - there is space in that queue

    Because of this, __kmp_push_task needs to check if there's space after getting the lock.
 */
2743 static bool __kmp_give_task ( kmp_info_t *thread, kmp_int32 tid, kmp_task_t * task, kmp_int32 pass )
2744 {
2745     kmp_taskdata_t *    taskdata = KMP_TASK_TO_TASKDATA(task);
    kmp_task_team_t *   task_team = taskdata->td_task_team;
2747 
2748     KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n", taskdata, tid ) );
2749 
2750     // If task_team is NULL something went really bad...
2751     KMP_DEBUG_ASSERT( task_team != NULL );
2752 
2753     bool result = false;
2754     kmp_thread_data_t * thread_data = & task_team -> tt.tt_threads_data[ tid ];
2755 
2756     if (thread_data -> td.td_deque == NULL ) {
2757         // There's no queue in this thread, go find another one
2758         // We're guaranteed that at least one thread has a queue
2759         KA_TRACE(30, ("__kmp_give_task: thread %d has no queue while giving task %p.\n", tid, taskdata ) );
2760         return result;
2761     }
2762 
2763     if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE(thread_data->td) )
2764     {
2765         KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", taskdata, tid ) );
2766 
        // if this deque is bigger than the pass ratio, give a chance to another thread
2768         if ( TASK_DEQUE_SIZE(thread_data->td)/INITIAL_TASK_DEQUE_SIZE >= pass ) return result;
2769 
2770         __kmp_acquire_bootstrap_lock( & thread_data-> td.td_deque_lock );
2771         __kmp_realloc_task_deque(thread,thread_data);
2772 
2773     } else {
2774 
2775        __kmp_acquire_bootstrap_lock( & thread_data-> td.td_deque_lock );
2776 
2777        if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE(thread_data->td) )
2778        {
2779            KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", taskdata, tid ) );
2780 
           // if this deque is bigger than the pass ratio, give a chance to another thread
2782            if ( TASK_DEQUE_SIZE(thread_data->td)/INITIAL_TASK_DEQUE_SIZE >= pass )
2783               goto release_and_exit;
2784 
2785            __kmp_realloc_task_deque(thread,thread_data);
2786        }
2787     }
2788 
2789     // lock is held here, and there is space in the deque
2790 
2791     thread_data -> td.td_deque[ thread_data -> td.td_deque_tail ] = taskdata;
2792     // Wrap index.
2793     thread_data -> td.td_deque_tail = ( thread_data -> td.td_deque_tail + 1 ) & TASK_DEQUE_MASK(thread_data->td);
2794     TCW_4(thread_data -> td.td_deque_ntasks, TCR_4(thread_data -> td.td_deque_ntasks) + 1);
2795 
2796     result = true;
2797     KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n", taskdata, tid ) );
2798 
2799 release_and_exit:
2800     __kmp_release_bootstrap_lock( & thread_data-> td.td_deque_lock );
2801 
    return result;
2803 }
2804 
2805 
/* The finish of a proxy task is divided into two pieces:
    - the top half can be done from a thread outside the team
    - the bottom half must be run from a thread within the team

    In order to run the bottom half, the task gets queued back into one of the threads of the team.
    Once the td_incomplete_child_tasks counter of the parent is decremented, the threads can leave the barriers.
    So the bottom half needs to be queued before the counter is decremented. The top half is therefore divided into two parts:
    - things that can be run before queuing the bottom half
    - things that must be run after queuing the bottom half

    This creates a second race, as the bottom half can free the task before the second top half is executed. To avoid this,
    we use the td_incomplete_child_tasks counter of the proxy task to synchronize the top and bottom halves.
*/
2819 
2820 static void __kmp_first_top_half_finish_proxy( kmp_taskdata_t * taskdata )
2821 {
2822     KMP_DEBUG_ASSERT( taskdata -> td_flags.tasktype == TASK_EXPLICIT );
2823     KMP_DEBUG_ASSERT( taskdata -> td_flags.proxy == TASK_PROXY );
2824     KMP_DEBUG_ASSERT( taskdata -> td_flags.complete == 0 );
2825     KMP_DEBUG_ASSERT( taskdata -> td_flags.freed == 0 );
2826 
2827     taskdata -> td_flags.complete = 1;   // mark the task as completed
2828 
2829     if ( taskdata->td_taskgroup )
2830        KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata->td_taskgroup->count) );
2831 
    // Create an imaginary child for this task so that the bottom half cannot release the task before we have completed the second top half
2833     TCI_4(taskdata->td_incomplete_child_tasks);
2834 }
2835 
2836 static void __kmp_second_top_half_finish_proxy( kmp_taskdata_t * taskdata )
2837 {
2838     kmp_int32 children = 0;
2839 
2840     // Predecrement simulated by "- 1" calculation
2841     children = KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata -> td_parent -> td_incomplete_child_tasks) ) - 1;
2842     KMP_DEBUG_ASSERT( children >= 0 );
2843 
    // Remove the imaginary child
2845     TCD_4(taskdata->td_incomplete_child_tasks);
2846 }
2847 
2848 static void __kmp_bottom_half_finish_proxy( kmp_int32 gtid, kmp_task_t * ptask )
2849 {
2850     kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(ptask);
2851     kmp_info_t * thread = __kmp_threads[ gtid ];
2852 
2853     KMP_DEBUG_ASSERT( taskdata -> td_flags.proxy == TASK_PROXY );
2854     KMP_DEBUG_ASSERT( taskdata -> td_flags.complete == 1 ); // top half must run before bottom half
2855 
2856     // We need to wait to make sure the top half is finished
2857     // Spinning here should be ok as this should happen quickly
2858     while ( TCR_4(taskdata->td_incomplete_child_tasks) > 0 ) ;
2859 
2860     __kmp_release_deps(gtid,taskdata);
2861     __kmp_free_task_and_ancestors(gtid, taskdata, thread);
2862 }
2863 
2864 /*!
2865 @ingroup TASKING
2866 @param gtid Global Thread ID of encountering thread
@param ptask Task whose execution is completed

Execute the completion of a proxy task from a thread that is part of the team. Runs both top halves and the bottom half directly.
2870 */
2871 void __kmpc_proxy_task_completed( kmp_int32 gtid, kmp_task_t *ptask )
2872 {
2873     KMP_DEBUG_ASSERT( ptask != NULL );
2874     kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(ptask);
2875     KA_TRACE(10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n", gtid, taskdata ) );
2876 
2877     KMP_DEBUG_ASSERT( taskdata->td_flags.proxy == TASK_PROXY );
2878 
2879     __kmp_first_top_half_finish_proxy(taskdata);
2880     __kmp_second_top_half_finish_proxy(taskdata);
2881     __kmp_bottom_half_finish_proxy(gtid,ptask);
2882 
2883     KA_TRACE(10, ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n", gtid, taskdata ) );
2884 }
2885 
2886 /*!
2887 @ingroup TASKING
@param ptask Task whose execution is completed

Execute the completion of a proxy task from a thread that does not belong to the team.
2891 */
2892 void __kmpc_proxy_task_completed_ooo ( kmp_task_t *ptask )
2893 {
2894     KMP_DEBUG_ASSERT( ptask != NULL );
2895     kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(ptask);
2896 
2897     KA_TRACE(10, ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n", taskdata ) );
2898 
2899     KMP_DEBUG_ASSERT( taskdata->td_flags.proxy == TASK_PROXY );
2900 
2901     __kmp_first_top_half_finish_proxy(taskdata);
2902 
    // Enqueue the task so that its bottom half is completed by a thread within the corresponding team
2904     kmp_team_t * team = taskdata->td_team;
2905     kmp_int32 nthreads = team->t.t_nproc;
2906     kmp_info_t *thread;
2907 
    // This should be similar to start_k = __kmp_get_random( thread ) % nthreads, but we cannot use __kmp_get_random here
2909     kmp_int32 start_k = 0;
2910     kmp_int32 pass = 1;
2911     kmp_int32 k = start_k;
2912 
2913     do {
        // For now we're just linearly trying to find a thread
2915         thread = team->t.t_threads[k];
2916         k = (k+1) % nthreads;
2917 
2918         // we did a full pass through all the threads
2919         if ( k == start_k ) pass = pass << 1;
2920 
2921     } while ( !__kmp_give_task( thread, k,  ptask, pass ) );
2922 
2923     __kmp_second_top_half_finish_proxy(taskdata);
2924 
2925     KA_TRACE(10, ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n", taskdata ) );
2926 }
2927 
2928 //---------------------------------------------------------------------------------
2929 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task for taskloop
2930 //
2931 // thread:   allocating thread
2932 // task_src: pointer to source task to be duplicated
2933 // returns:  a pointer to the allocated kmp_task_t structure (task).
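//
// A minimal sketch of the shareds rebasing performed below: the shareds block
// (when present) lives inside the same allocation as the taskdata, so after the
// memcpy its pointer must be recomputed relative to the new base:
//
//     shareds_offset = (char*)task_src->shareds - (char*)taskdata_src;
//     task->shareds  = &((char*)taskdata)[shareds_offset];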
2934 kmp_task_t *
2935 __kmp_task_dup_alloc( kmp_info_t *thread, kmp_task_t *task_src )
2936 {
2937     kmp_task_t     *task;
2938     kmp_taskdata_t *taskdata;
2939     kmp_taskdata_t *taskdata_src;
2940     kmp_taskdata_t *parent_task = thread->th.th_current_task;
2941     size_t shareds_offset;
2942     size_t task_size;
2943 
2944     KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread, task_src) );
2945     taskdata_src = KMP_TASK_TO_TASKDATA( task_src );
    KMP_DEBUG_ASSERT( taskdata_src->td_flags.proxy == TASK_FULL ); // it should not be a proxy task
2947     KMP_DEBUG_ASSERT( taskdata_src->td_flags.tasktype == TASK_EXPLICIT );
2948     task_size = taskdata_src->td_size_alloc;
2949 
2950     // Allocate a kmp_taskdata_t block and a kmp_task_t block.
2951     KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread, task_size) );
2952     #if USE_FAST_MEMORY
2953     taskdata = (kmp_taskdata_t *)__kmp_fast_allocate( thread, task_size );
2954     #else
2955     taskdata = (kmp_taskdata_t *)__kmp_thread_malloc( thread, task_size );
2956     #endif /* USE_FAST_MEMORY */
2957     KMP_MEMCPY(taskdata, taskdata_src, task_size);
2958 
2959     task = KMP_TASKDATA_TO_TASK(taskdata);
2960 
    // Initialize the new task (only the fields that need values different from the source)
2962     taskdata->td_task_id = KMP_GEN_TASK_ID();
    if( task->shareds != NULL ) { // need to set up the shareds pointer
2964         shareds_offset = (char*)task_src->shareds - (char*)taskdata_src;
2965         task->shareds = &((char*)taskdata)[shareds_offset];
2966         KMP_DEBUG_ASSERT( (((kmp_uintptr_t)task->shareds) & (sizeof(void*)-1)) == 0 );
2967     }
2968     taskdata->td_alloc_thread = thread;
2969     taskdata->td_taskgroup = parent_task->td_taskgroup; // task inherits the taskgroup from the parent task
2970 
2971     // Only need to keep track of child task counts if team parallel and tasking not serialized
2972     if ( !( taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser ) ) {
2973         KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_incomplete_child_tasks) );
2974         if ( parent_task->td_taskgroup )
2975             KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_taskgroup->count) );
        // Only need to keep track of allocated child tasks for explicit tasks, since implicit tasks are not deallocated
2977         if ( taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT )
2978             KMP_TEST_THEN_INC32( (kmp_int32 *)(& taskdata->td_parent->td_allocated_child_tasks) );
2979     }
2980 
2981     KA_TRACE(20, ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
2982                   thread, taskdata, taskdata->td_parent) );
2983 #if OMPT_SUPPORT
2984     __kmp_task_init_ompt(taskdata, thread->th.th_info.ds.ds_gtid, (void*)task->routine);
2985 #endif
2986     return task;
2987 }
2988 
// Routine optionally generated by the compiler for setting the lastprivate flag
// and calling needed constructors for private/firstprivate objects
// (used to form taskloop tasks from the pattern task)
2992 typedef void(*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
2993 
2994 //---------------------------------------------------------------------------------
2995 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
2996 //
2997 // loc       Source location information
2998 // gtid      Global thread ID
2999 // task      Task with whole loop iteration range
3000 // lb        Pointer to loop lower bound
3001 // ub        Pointer to loop upper bound
3002 // st        Loop stride
3003 // sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
3004 // grainsize Schedule value if specified
// task_dup  Task duplication routine
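//
// A worked example of the schedule computation below, assuming sched == 1
// (grainsize clause) with grainsize = 3 and trip count tc = 10:
//     num_tasks = tc / grainsize = 3
//     grainsize = tc / num_tasks = 3
//     extras    = tc % num_tasks = 1
// so the first task gets 4 iterations and the remaining two get 3, satisfying
// tc == num_tasks * grainsize + extras.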
3006 void
3007 __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
3008                 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
3009                 int sched, kmp_uint64 grainsize, void *task_dup )
3010 {
3011     KMP_COUNT_BLOCK(OMP_TASKLOOP);
3012     KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
3013     p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
3014     kmp_uint64 tc;
3015     kmp_uint64 lower = *lb; // compiler provides global bounds here
3016     kmp_uint64 upper = *ub;
3017     kmp_uint64 i, num_tasks = 0, extras = 0;
3018     kmp_info_t *thread = __kmp_threads[gtid];
3019     kmp_taskdata_t *current_task = thread->th.th_current_task;
3020     kmp_task_t *next_task;
3021     kmp_int32 lastpriv = 0;
3022     size_t lower_offset = (char*)lb - (char*)task; // remember offset of lb in the task structure
3023     size_t upper_offset = (char*)ub - (char*)task; // remember offset of ub in the task structure
3024 
3025     // compute trip count
3026     if ( st == 1 ) {   // most common case
3027         tc = upper - lower + 1;
3028     } else if ( st < 0 ) {
3029         tc = (lower - upper) / (-st) + 1;
3030     } else {       // st > 0
3031         tc = (upper - lower) / st + 1;
3032     }
3033     if(tc == 0) {
3034         KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d zero-trip loop\n", gtid));
3035         // free the pattern task and exit
3036         __kmp_task_start( gtid, task, current_task );
3037         // do not execute anything for zero-trip loop
3038         __kmp_task_finish( gtid, task, current_task );
3039         return;
3040     }
3041 
3042     // compute num_tasks/grainsize based on the input provided
3043     switch( sched ) {
3044     case 0: // no schedule clause specified, we can choose the default
3045             // let's try to schedule (team_size*10) tasks
3046         grainsize = thread->th.th_team_nproc * 10;
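        // no break: fall through and treat the value chosen above as num_tasks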
3047     case 2: // num_tasks provided
3048         if( grainsize > tc ) {
3049             num_tasks = tc;   // too big num_tasks requested, adjust values
3050             grainsize = 1;
3051             extras = 0;
3052         } else {
3053             num_tasks = grainsize;
3054             grainsize = tc / num_tasks;
3055             extras = tc % num_tasks;
3056         }
3057         break;
3058     case 1: // grainsize provided
3059         if( grainsize > tc ) {
3060             num_tasks = 1;    // too big grainsize requested, adjust values
3061             grainsize = tc;
3062             extras = 0;
3063         } else {
3064             num_tasks = tc / grainsize;
3065             grainsize = tc / num_tasks; // adjust grainsize for balanced distribution of iterations
3066             extras = tc % num_tasks;
3067         }
3068         break;
3069     default:
3070         KMP_ASSERT2(0, "unknown scheduling of taskloop");
3071     }
3072     KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
3073     KMP_DEBUG_ASSERT(num_tasks > extras);
3074     KMP_DEBUG_ASSERT(num_tasks > 0);
3075     KA_TRACE(20, ("__kmpc_taskloop: T#%d will launch: num_tasks %lld, grainsize %lld, extras %lld\n",
3076                   gtid, num_tasks, grainsize, extras));
3077 
3078     // Main loop, launch num_tasks tasks, assign grainsize iterations each task
3079     for( i = 0; i < num_tasks; ++i ) {
3080         kmp_uint64 chunk_minus_1;
3081         if( extras == 0 ) {
3082             chunk_minus_1 = grainsize - 1;
3083         } else {
3084             chunk_minus_1 = grainsize;
3085             --extras; // first extras iterations get bigger chunk (grainsize+1)
3086         }
3087         upper = lower + st * chunk_minus_1;
3088         if( i == num_tasks - 1 ) {
3089             // schedule the last task, set lastprivate flag
3090             lastpriv = 1;
3091 #if KMP_DEBUG
3092             if( st == 1 )
3093                 KMP_DEBUG_ASSERT(upper == *ub);
3094             else if( st > 0 )
3095                 KMP_DEBUG_ASSERT(upper+st > *ub);
3096             else
3097                 KMP_DEBUG_ASSERT(upper+st < *ub);
3098 #endif
3099         }
3100         next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
3101         *(kmp_uint64*)((char*)next_task + lower_offset) = lower; // adjust task-specific bounds
3102         *(kmp_uint64*)((char*)next_task + upper_offset) = upper;
3103         if( ptask_dup != NULL )
            ptask_dup(next_task, task, lastpriv); // set lastprivate flag, construct firstprivates, etc.
3105         KA_TRACE(20, ("__kmpc_taskloop: T#%d schedule task %p: lower %lld, upper %lld (offsets %p %p)\n",
3106                       gtid, next_task, lower, upper, lower_offset, upper_offset));
3107         __kmp_omp_task(gtid, next_task, true); // schedule new task
3108         lower = upper + st; // adjust lower bound for the next iteration
3109     }
3110     // free the pattern task and exit
3111     __kmp_task_start( gtid, task, current_task );
3112     // do not execute the pattern task, just do bookkeeping
3113     __kmp_task_finish( gtid, task, current_task );
3114 }
3115 
3116 /*!
3117 @ingroup TASKING
3118 @param loc       Source location information
3119 @param gtid      Global thread ID
3120 @param task      Task structure
3121 @param if_val    Value of the if clause
3122 @param lb        Pointer to loop lower bound
3123 @param ub        Pointer to loop upper bound
3124 @param st        Loop stride
3125 @param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
3126 @param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
3127 @param grainsize Schedule value if specified
@param task_dup  Task duplication routine
3129 
3130 Execute the taskloop construct.
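
For reference, a sketch of the user-level construct this entry point serves
(compiler lowering details vary; lo, hi, and body() are placeholders). With a
grainsize clause the compiler would pass sched = 1 and the clause value in grainsize:

@code
    #pragma omp taskloop grainsize(4)
    for (int i = lo; i <= hi; ++i)
        body(i);
@endcode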
3131 */
3132 void
3133 __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
3134                 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
3135                 int nogroup, int sched, kmp_uint64 grainsize, void *task_dup )
3136 {
3137     kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
3138     KMP_DEBUG_ASSERT( task != NULL );
3139 
3140     KA_TRACE(10, ("__kmpc_taskloop(enter): T#%d, pattern task %p, lb %lld ub %lld st %lld, grain %llu(%d)\n",
3141         gtid, taskdata, *lb, *ub, st, grainsize, sched));
3142 
    // check the if-clause value first
3144     if( if_val == 0 ) { // if(0) specified, mark task as serial
3145         taskdata->td_flags.task_serial = 1;
3146         taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
3147     }
3148     if( nogroup == 0 ) {
3149         __kmpc_taskgroup( loc, gtid );
3150     }
3151 
3152     if( 1 /* AC: use some heuristic here to choose task scheduling method */ ) {
3153         __kmp_taskloop_linear( loc, gtid, task, lb, ub, st, sched, grainsize, task_dup );
3154     }
3155 
3156     if( nogroup == 0 ) {
3157         __kmpc_end_taskgroup( loc, gtid );
3158     }
3159     KA_TRACE(10, ("__kmpc_taskloop(exit): T#%d\n", gtid));
3160 }
3161 
3162 #endif
3163