1 /*
2  * kmp_taskq.cpp -- TASKQ support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 //                     The LLVM Compiler Infrastructure
8 //
9 // This file is dual licensed under the MIT and the University of Illinois Open
10 // Source Licenses. See LICENSE.txt for details.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "kmp.h"
15 #include "kmp_error.h"
16 #include "kmp_i18n.h"
17 #include "kmp_io.h"
18 
19 #define MAX_MESSAGE 512
20 
21 /* Taskq routines and global variables */
22 
23 #define KMP_DEBUG_REF_CTS(x) KF_TRACE(1, x);
24 
25 #define THREAD_ALLOC_FOR_TASKQ
26 
27 static int in_parallel_context(kmp_team_t *team) {
28   return !team->t.t_serialized;
29 }
30 
31 static void __kmp_taskq_eo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
32   int gtid = *gtid_ref;
33   int tid = __kmp_tid_from_gtid(gtid);
34   kmp_uint32 my_token;
35   kmpc_task_queue_t *taskq;
36   kmp_taskq_t *tq = &__kmp_threads[gtid]->th.th_team->t.t_taskq;
37 
38   if (__kmp_env_consistency_check)
39 #if KMP_USE_DYNAMIC_LOCK
40     __kmp_push_sync(gtid, ct_ordered_in_taskq, loc_ref, NULL, 0);
41 #else
42     __kmp_push_sync(gtid, ct_ordered_in_taskq, loc_ref, NULL);
43 #endif
44 
45   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) {
46     KMP_MB(); /* Flush all pending memory write invalidates.  */
47 
48     /* GEH - need check here under stats to make sure   */
49     /*       inside task (curr_thunk[*tid_ref] != NULL) */
50 
51     my_token = tq->tq_curr_thunk[tid]->th_tasknum;
52 
53     taskq = tq->tq_curr_thunk[tid]->th.th_shareds->sv_queue;
54 
55     KMP_WAIT_YIELD(&taskq->tq_tasknum_serving, my_token, KMP_EQ, NULL);
56     KMP_MB();
57   }
58 }
59 
60 static void __kmp_taskq_xo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61   int gtid = *gtid_ref;
62   int tid = __kmp_tid_from_gtid(gtid);
63   kmp_uint32 my_token;
64   kmp_taskq_t *tq = &__kmp_threads[gtid]->th.th_team->t.t_taskq;
65 
66   if (__kmp_env_consistency_check)
67     __kmp_pop_sync(gtid, ct_ordered_in_taskq, loc_ref);
68 
69   if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) {
70     KMP_MB(); /* Flush all pending memory write invalidates.  */
71 
72     /* GEH - need check here under stats to make sure */
73     /*       inside task (curr_thunk[tid] != NULL)    */
74 
75     my_token = tq->tq_curr_thunk[tid]->th_tasknum;
76 
77     KMP_MB(); /* Flush all pending memory write invalidates.  */
78 
79     tq->tq_curr_thunk[tid]->th.th_shareds->sv_queue->tq_tasknum_serving =
80         my_token + 1;
81 
82     KMP_MB(); /* Flush all pending memory write invalidates.  */
83   }
84 }
85 
86 static void __kmp_taskq_check_ordered(kmp_int32 gtid, kmpc_thunk_t *thunk) {
87   kmp_uint32 my_token;
88   kmpc_task_queue_t *taskq;
89 
90   /* assume we are always called from an active parallel context */
91 
92   KMP_MB(); /* Flush all pending memory write invalidates.  */
93 
94   my_token = thunk->th_tasknum;
95 
96   taskq = thunk->th.th_shareds->sv_queue;
97 
98   if (taskq->tq_tasknum_serving <= my_token) {
99     KMP_WAIT_YIELD(&taskq->tq_tasknum_serving, my_token, KMP_GE, NULL);
100     KMP_MB();
101     taskq->tq_tasknum_serving = my_token + 1;
102     KMP_MB();
103   }
104 }
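/* Informal summary of the ORDERED token hand-off implemented by the three
 * routines above (a reading of this code; th_tasknum itself is assigned at
 * enqueue time elsewhere in the runtime):
 *   - tq_tasknum_serving names the task whose ORDERED section may run next.
 *   - __kmp_taskq_eo() spins until tq_tasknum_serving equals the thunk's
 *     th_tasknum before the ORDERED section is entered.
 *   - __kmp_taskq_xo() then advances tq_tasknum_serving to th_tasknum + 1.
 *   - __kmp_taskq_check_ordered() performs the same advance after a task in
 *     an ORDERED taskq completes, so the token is still passed on even if
 *     the task body never reached an ORDERED construct. */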
105 
106 #ifdef KMP_DEBUG
107 
108 static void __kmp_dump_TQF(kmp_int32 flags) {
109   if (flags & TQF_IS_ORDERED)
110     __kmp_printf("ORDERED ");
111   if (flags & TQF_IS_LASTPRIVATE)
112     __kmp_printf("LAST_PRIV ");
113   if (flags & TQF_IS_NOWAIT)
114     __kmp_printf("NOWAIT ");
115   if (flags & TQF_HEURISTICS)
116     __kmp_printf("HEURIST ");
117   if (flags & TQF_INTERFACE_RESERVED1)
118     __kmp_printf("RESERV1 ");
119   if (flags & TQF_INTERFACE_RESERVED2)
120     __kmp_printf("RESERV2 ");
121   if (flags & TQF_INTERFACE_RESERVED3)
122     __kmp_printf("RESERV3 ");
123   if (flags & TQF_INTERFACE_RESERVED4)
124     __kmp_printf("RESERV4 ");
125   if (flags & TQF_IS_LAST_TASK)
126     __kmp_printf("LAST_TASK ");
127   if (flags & TQF_TASKQ_TASK)
128     __kmp_printf("TASKQ_TASK ");
129   if (flags & TQF_RELEASE_WORKERS)
130     __kmp_printf("RELEASE ");
131   if (flags & TQF_ALL_TASKS_QUEUED)
132     __kmp_printf("ALL_QUEUED ");
133   if (flags & TQF_PARALLEL_CONTEXT)
134     __kmp_printf("PARALLEL ");
135   if (flags & TQF_DEALLOCATED)
136     __kmp_printf("DEALLOC ");
137   if (!(flags & (TQF_INTERNAL_FLAGS | TQF_INTERFACE_FLAGS)))
138     __kmp_printf("(NONE)");
139 }
140 
141 static void __kmp_dump_thunk(kmp_taskq_t *tq, kmpc_thunk_t *thunk,
142                              kmp_int32 global_tid) {
143   int i;
144   int nproc = __kmp_threads[global_tid]->th.th_team->t.t_nproc;
145 
146   __kmp_printf("\tThunk at %p on (%d):  ", thunk, global_tid);
147 
148   if (thunk != NULL) {
149     for (i = 0; i < nproc; i++) {
150       if (tq->tq_curr_thunk[i] == thunk) {
151         __kmp_printf("[%i] ", i);
152       }
153     }
154     __kmp_printf("th_shareds=%p, ", thunk->th.th_shareds);
155     __kmp_printf("th_task=%p, ", thunk->th_task);
156     __kmp_printf("th_encl_thunk=%p, ", thunk->th_encl_thunk);
157     __kmp_printf("th_status=%d, ", thunk->th_status);
158     __kmp_printf("th_tasknum=%u, ", thunk->th_tasknum);
159     __kmp_printf("th_flags=");
160     __kmp_dump_TQF(thunk->th_flags);
161   }
162 
163   __kmp_printf("\n");
164 }
165 
166 static void __kmp_dump_thunk_stack(kmpc_thunk_t *thunk, kmp_int32 thread_num) {
167   kmpc_thunk_t *th;
168 
169   __kmp_printf("    Thunk stack for T#%d:  ", thread_num);
170 
171   for (th = thunk; th != NULL; th = th->th_encl_thunk)
172     __kmp_printf("%p ", th);
173 
174   __kmp_printf("\n");
175 }
176 
177 static void __kmp_dump_task_queue(kmp_taskq_t *tq, kmpc_task_queue_t *queue,
178                                   kmp_int32 global_tid) {
179   int qs, count, i;
180   kmpc_thunk_t *thunk;
181   kmpc_task_queue_t *taskq;
182 
183   __kmp_printf("Task Queue at %p on (%d):\n", queue, global_tid);
184 
185   if (queue != NULL) {
186     int in_parallel = queue->tq_flags & TQF_PARALLEL_CONTEXT;
187 
188     if (__kmp_env_consistency_check) {
189       __kmp_printf("    tq_loc             : ");
190     }
191     if (in_parallel) {
192 
193       // if (queue->tq.tq_parent != 0)
194       //__kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
195 
196       //__kmp_acquire_lock(& queue->tq_link_lck, global_tid);
197 
198       // Make sure data structures are in consistent state before querying them
199       // Seems to work without this for digital/alpha, needed for IBM/RS6000
200       KMP_MB();
201 
202       __kmp_printf("    tq_parent          : %p\n", queue->tq.tq_parent);
203       __kmp_printf("    tq_first_child     : %p\n", queue->tq_first_child);
204       __kmp_printf("    tq_next_child      : %p\n", queue->tq_next_child);
205       __kmp_printf("    tq_prev_child      : %p\n", queue->tq_prev_child);
206       __kmp_printf("    tq_ref_count       : %d\n", queue->tq_ref_count);
207 
208       //__kmp_release_lock(& queue->tq_link_lck, global_tid);
209 
210       // if (queue->tq.tq_parent != 0)
211       //__kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
212 
213       //__kmp_acquire_lock(& queue->tq_free_thunks_lck, global_tid);
214       //__kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
215 
216       // Make sure data structures are in consistent state before querying them
217       // Seems to work without this for digital/alpha, needed for IBM/RS6000
218       KMP_MB();
219     }
220 
221     __kmp_printf("    tq_shareds         : ");
222     for (i = 0; i < ((queue == tq->tq_root) ? queue->tq_nproc : 1); i++)
223       __kmp_printf("%p ", queue->tq_shareds[i].ai_data);
224     __kmp_printf("\n");
225 
226     if (in_parallel) {
227       __kmp_printf("    tq_tasknum_queuing : %u\n", queue->tq_tasknum_queuing);
228       __kmp_printf("    tq_tasknum_serving : %u\n", queue->tq_tasknum_serving);
229     }
230 
231     __kmp_printf("    tq_queue           : %p\n", queue->tq_queue);
232     __kmp_printf("    tq_thunk_space     : %p\n", queue->tq_thunk_space);
233     __kmp_printf("    tq_taskq_slot      : %p\n", queue->tq_taskq_slot);
234 
235     __kmp_printf("    tq_free_thunks     : ");
236     for (thunk = queue->tq_free_thunks; thunk != NULL;
237          thunk = thunk->th.th_next_free)
238       __kmp_printf("%p ", thunk);
239     __kmp_printf("\n");
240 
241     __kmp_printf("    tq_nslots          : %d\n", queue->tq_nslots);
242     __kmp_printf("    tq_head            : %d\n", queue->tq_head);
243     __kmp_printf("    tq_tail            : %d\n", queue->tq_tail);
244     __kmp_printf("    tq_nfull           : %d\n", queue->tq_nfull);
245     __kmp_printf("    tq_hiwat           : %d\n", queue->tq_hiwat);
246     __kmp_printf("    tq_flags           : ");
247     __kmp_dump_TQF(queue->tq_flags);
248     __kmp_printf("\n");
249 
250     if (in_parallel) {
251       __kmp_printf("    tq_th_thunks       : ");
252       for (i = 0; i < queue->tq_nproc; i++) {
253         __kmp_printf("%d ", queue->tq_th_thunks[i].ai_data);
254       }
255       __kmp_printf("\n");
256     }
257 
258     __kmp_printf("\n");
259     __kmp_printf("    Queue slots:\n");
260 
261     qs = queue->tq_tail;
262     for (count = 0; count < queue->tq_nfull; ++count) {
263       __kmp_printf("(%d)", qs);
264       __kmp_dump_thunk(tq, queue->tq_queue[qs].qs_thunk, global_tid);
265       qs = (qs + 1) % queue->tq_nslots;
266     }
267 
268     __kmp_printf("\n");
269 
270     if (in_parallel) {
271       if (queue->tq_taskq_slot != NULL) {
272         __kmp_printf("    TaskQ slot:\n");
273         __kmp_dump_thunk(tq, CCAST(kmpc_thunk_t *, queue->tq_taskq_slot),
274                          global_tid);
275         __kmp_printf("\n");
276       }
277       //__kmp_release_lock(& queue->tq_queue_lck, global_tid);
278       //__kmp_release_lock(& queue->tq_free_thunks_lck, global_tid);
279     }
280   }
281 
282   __kmp_printf("    Taskq freelist: ");
283 
284   //__kmp_acquire_lock( & tq->tq_freelist_lck, global_tid );
285 
286   // Make sure data structures are in consistent state before querying them
287   // Seems to work without this call for digital/alpha, needed for IBM/RS6000
288   KMP_MB();
289 
290   for (taskq = tq->tq_freelist; taskq != NULL; taskq = taskq->tq.tq_next_free)
291     __kmp_printf("%p ", taskq);
292 
293   //__kmp_release_lock( & tq->tq_freelist_lck, global_tid );
294 
295   __kmp_printf("\n\n");
296 }
297 
298 static void __kmp_aux_dump_task_queue_tree(kmp_taskq_t *tq,
299                                            kmpc_task_queue_t *curr_queue,
300                                            kmp_int32 level,
301                                            kmp_int32 global_tid) {
302   int i, count, qs;
303   int nproc = __kmp_threads[global_tid]->th.th_team->t.t_nproc;
304   kmpc_task_queue_t *queue = curr_queue;
305 
306   if (curr_queue == NULL)
307     return;
308 
309   __kmp_printf("    ");
310 
311   for (i = 0; i < level; i++)
312     __kmp_printf("  ");
313 
314   __kmp_printf("%p", curr_queue);
315 
316   for (i = 0; i < nproc; i++) {
317     if (tq->tq_curr_thunk[i] &&
318         tq->tq_curr_thunk[i]->th.th_shareds->sv_queue == curr_queue) {
319       __kmp_printf(" [%i]", i);
320     }
321   }
322 
323   __kmp_printf(":");
324 
325   //__kmp_acquire_lock(& curr_queue->tq_queue_lck, global_tid);
326 
327   // Make sure data structures are in consistent state before querying them
328   // Seems to work without this call for digital/alpha, needed for IBM/RS6000
329   KMP_MB();
330 
331   qs = curr_queue->tq_tail;
332 
333   for (count = 0; count < curr_queue->tq_nfull; ++count) {
334     __kmp_printf("%p ", curr_queue->tq_queue[qs].qs_thunk);
335     qs = (qs + 1) % curr_queue->tq_nslots;
336   }
337 
338   //__kmp_release_lock(& curr_queue->tq_queue_lck, global_tid);
339 
340   __kmp_printf("\n");
341 
342   if (curr_queue->tq_first_child) {
343     //__kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
344 
345     // Make sure data structures are in consistent state before querying them
346     // Seems to work without this call for digital/alpha, needed for IBM/RS6000
347     KMP_MB();
348 
349     if (curr_queue->tq_first_child) {
350       for (queue = CCAST(kmpc_task_queue_t *, curr_queue->tq_first_child);
351            queue != NULL; queue = queue->tq_next_child) {
352         __kmp_aux_dump_task_queue_tree(tq, queue, level + 1, global_tid);
353       }
354     }
355 
356     //__kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
357   }
358 }
359 
360 static void __kmp_dump_task_queue_tree(kmp_taskq_t *tq,
361                                        kmpc_task_queue_t *tqroot,
362                                        kmp_int32 global_tid) {
363   __kmp_printf("TaskQ Tree at root %p on (%d):\n", tqroot, global_tid);
364 
365   __kmp_aux_dump_task_queue_tree(tq, tqroot, 0, global_tid);
366 
367   __kmp_printf("\n");
368 }
369 #endif
370 
371 /* New taskq storage routines that try to minimize overhead of mallocs but
372    still provide cache line alignment. */
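/* Illustrative sketch of the layout produced by __kmp_taskq_allocate() below
 * (a reading of the code, not a documented ABI):
 *
 *   orig_addr        addr (rounded up to a cache line)        returned pointer
 *      |                |                                          |
 *      v                v                                          v
 *      [ .. padding .. ][ void *orig_addr slot ][ size bytes of usable space ]
 *
 * The void* slot written at addr records orig_addr so __kmpc_taskq_free()
 * can recover the block that was actually allocated. */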
373 static void *__kmp_taskq_allocate(size_t size, kmp_int32 global_tid) {
374   void *addr, *orig_addr;
375   size_t bytes;
376 
377   KB_TRACE(5, ("__kmp_taskq_allocate: called size=%d, gtid=%d\n", (int)size,
378                global_tid));
379 
380   bytes = sizeof(void *) + CACHE_LINE + size;
381 
382 #ifdef THREAD_ALLOC_FOR_TASKQ
383   orig_addr =
384       (void *)__kmp_thread_malloc(__kmp_thread_from_gtid(global_tid), bytes);
385 #else
386   KE_TRACE(10, ("%%%%%% MALLOC( %d )\n", bytes));
387   orig_addr = (void *)KMP_INTERNAL_MALLOC(bytes);
388 #endif /* THREAD_ALLOC_FOR_TASKQ */
389 
390   if (orig_addr == 0)
391     KMP_FATAL(OutOfHeapMemory);
392 
393   addr = orig_addr;
394 
395   if (((kmp_uintptr_t)addr & (CACHE_LINE - 1)) != 0) {
396     KB_TRACE(50, ("__kmp_taskq_allocate:  adjust for cache alignment\n"));
397     addr = (void *)(((kmp_uintptr_t)addr + CACHE_LINE) & ~(CACHE_LINE - 1));
398   }
399 
400   (*(void **)addr) = orig_addr;
401 
402   KB_TRACE(10,
403            ("__kmp_taskq_allocate:  allocate: %p, use: %p - %p, size: %d, "
404             "gtid: %d\n",
405             orig_addr, ((void **)addr) + 1,
406             ((char *)(((void **)addr) + 1)) + size - 1, (int)size, global_tid));
407 
408   return (((void **)addr) + 1);
409 }
410 
411 static void __kmpc_taskq_free(void *p, kmp_int32 global_tid) {
412   KB_TRACE(5, ("__kmpc_taskq_free: called addr=%p, gtid=%d\n", p, global_tid));
413 
414   KB_TRACE(10, ("__kmpc_taskq_free:  freeing: %p, gtid: %d\n",
415                 (*(((void **)p) - 1)), global_tid));
416 
417 #ifdef THREAD_ALLOC_FOR_TASKQ
418   __kmp_thread_free(__kmp_thread_from_gtid(global_tid), *(((void **)p) - 1));
419 #else
420   KMP_INTERNAL_FREE(*(((void **)p) - 1));
421 #endif /* THREAD_ALLOC_FOR_TASKQ */
422 }
423 
424 /* Keep freed kmpc_task_queue_t on an internal freelist and recycle since
425    they're of constant size. */
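/* (The queue freelist is guarded by tq->tq_freelist_lck: __kmp_alloc_taskq()
 * below pops a recycled kmpc_task_queue_t from it when one is available, and
 * __kmp_free_taskq() pushes finished queues back onto it.) */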
426 
427 static kmpc_task_queue_t *
428 __kmp_alloc_taskq(kmp_taskq_t *tq, int in_parallel, kmp_int32 nslots,
429                   kmp_int32 nthunks, kmp_int32 nshareds, kmp_int32 nproc,
430                   size_t sizeof_thunk, size_t sizeof_shareds,
431                   kmpc_thunk_t **new_taskq_thunk, kmp_int32 global_tid) {
432   kmp_int32 i;
433   size_t bytes;
434   kmpc_task_queue_t *new_queue;
435   kmpc_aligned_shared_vars_t *shared_var_array;
436   char *shared_var_storage;
437   char *pt; /* for doing byte-adjusted address computations */
438 
439   __kmp_acquire_lock(&tq->tq_freelist_lck, global_tid);
440 
441   // Make sure data structures are in consistent state before querying them
442   // Seems to work without this call for digital/alpha, needed for IBM/RS6000
443   KMP_MB();
444 
445   if (tq->tq_freelist) {
446     new_queue = tq->tq_freelist;
447     tq->tq_freelist = tq->tq_freelist->tq.tq_next_free;
448 
449     KMP_DEBUG_ASSERT(new_queue->tq_flags & TQF_DEALLOCATED);
450 
451     new_queue->tq_flags = 0;
452 
453     __kmp_release_lock(&tq->tq_freelist_lck, global_tid);
454   } else {
455     __kmp_release_lock(&tq->tq_freelist_lck, global_tid);
456 
457     new_queue = (kmpc_task_queue_t *)__kmp_taskq_allocate(
458         sizeof(kmpc_task_queue_t), global_tid);
459     new_queue->tq_flags = 0;
460   }
461 
462   /*  space in the task queue for queue slots (allocate as one big chunk */
463   /* of storage including new_taskq_task space)                          */
464 
465   sizeof_thunk +=
466       (CACHE_LINE - (sizeof_thunk % CACHE_LINE)); /* pad to cache line size */
467   pt = (char *)__kmp_taskq_allocate(nthunks * sizeof_thunk, global_tid);
468   new_queue->tq_thunk_space = (kmpc_thunk_t *)pt;
469   *new_taskq_thunk = (kmpc_thunk_t *)(pt + (nthunks - 1) * sizeof_thunk);
470 
471   /*  chain the allocated thunks into a freelist for this queue  */
472 
473   new_queue->tq_free_thunks = (kmpc_thunk_t *)pt;
474 
475   for (i = 0; i < (nthunks - 2); i++) {
476     ((kmpc_thunk_t *)(pt + i * sizeof_thunk))->th.th_next_free =
477         (kmpc_thunk_t *)(pt + (i + 1) * sizeof_thunk);
478 #ifdef KMP_DEBUG
479     ((kmpc_thunk_t *)(pt + i * sizeof_thunk))->th_flags = TQF_DEALLOCATED;
480 #endif
481   }
482 
483   ((kmpc_thunk_t *)(pt + (nthunks - 2) * sizeof_thunk))->th.th_next_free = NULL;
484 #ifdef KMP_DEBUG
485   ((kmpc_thunk_t *)(pt + (nthunks - 2) * sizeof_thunk))->th_flags =
486       TQF_DEALLOCATED;
487 #endif
488 
489   /* initialize the locks */
490 
491   if (in_parallel) {
492     __kmp_init_lock(&new_queue->tq_link_lck);
493     __kmp_init_lock(&new_queue->tq_free_thunks_lck);
494     __kmp_init_lock(&new_queue->tq_queue_lck);
495   }
496 
497   /* now allocate the slots */
498 
499   bytes = nslots * sizeof(kmpc_aligned_queue_slot_t);
500   new_queue->tq_queue =
501       (kmpc_aligned_queue_slot_t *)__kmp_taskq_allocate(bytes, global_tid);
502 
503   /*  space for array of pointers to shared variable structures */
504   sizeof_shareds += sizeof(kmpc_task_queue_t *);
505   sizeof_shareds +=
506       (CACHE_LINE - (sizeof_shareds % CACHE_LINE)); /* pad to cache line size */
507 
508   bytes = nshareds * sizeof(kmpc_aligned_shared_vars_t);
509   shared_var_array =
510       (kmpc_aligned_shared_vars_t *)__kmp_taskq_allocate(bytes, global_tid);
511 
512   bytes = nshareds * sizeof_shareds;
513   shared_var_storage = (char *)__kmp_taskq_allocate(bytes, global_tid);
514 
515   for (i = 0; i < nshareds; i++) {
516     shared_var_array[i].ai_data =
517         (kmpc_shared_vars_t *)(shared_var_storage + i * sizeof_shareds);
518     shared_var_array[i].ai_data->sv_queue = new_queue;
519   }
520   new_queue->tq_shareds = shared_var_array;
521 
522   /* array for number of outstanding thunks per thread */
523 
524   if (in_parallel) {
525     bytes = nproc * sizeof(kmpc_aligned_int32_t);
526     new_queue->tq_th_thunks =
527         (kmpc_aligned_int32_t *)__kmp_taskq_allocate(bytes, global_tid);
528     new_queue->tq_nproc = nproc;
529 
530     for (i = 0; i < nproc; i++)
531       new_queue->tq_th_thunks[i].ai_data = 0;
532   }
533 
534   return new_queue;
535 }
536 
537 static void __kmp_free_taskq(kmp_taskq_t *tq, kmpc_task_queue_t *p,
538                              int in_parallel, kmp_int32 global_tid) {
539   __kmpc_taskq_free(p->tq_thunk_space, global_tid);
540   __kmpc_taskq_free(p->tq_queue, global_tid);
541 
542   /* free shared var structure storage */
543   __kmpc_taskq_free(CCAST(kmpc_shared_vars_t *, p->tq_shareds[0].ai_data),
544                     global_tid);
545   /* free array of pointers to shared vars storage */
546   __kmpc_taskq_free(p->tq_shareds, global_tid);
547 
548 #ifdef KMP_DEBUG
549   p->tq_first_child = NULL;
550   p->tq_next_child = NULL;
551   p->tq_prev_child = NULL;
552   p->tq_ref_count = -10;
553   p->tq_shareds = NULL;
554   p->tq_tasknum_queuing = 0;
555   p->tq_tasknum_serving = 0;
556   p->tq_queue = NULL;
557   p->tq_thunk_space = NULL;
558   p->tq_taskq_slot = NULL;
559   p->tq_free_thunks = NULL;
560   p->tq_nslots = 0;
561   p->tq_head = 0;
562   p->tq_tail = 0;
563   p->tq_nfull = 0;
564   p->tq_hiwat = 0;
565 
566   if (in_parallel) {
567     int i;
568 
569     for (i = 0; i < p->tq_nproc; i++)
570       p->tq_th_thunks[i].ai_data = 0;
571   }
572   if (__kmp_env_consistency_check)
573     p->tq_loc = NULL;
574   KMP_DEBUG_ASSERT(p->tq_flags & TQF_DEALLOCATED);
575   p->tq_flags = TQF_DEALLOCATED;
576 #endif /* KMP_DEBUG */
577 
578   if (in_parallel) {
579     __kmpc_taskq_free(p->tq_th_thunks, global_tid);
580     __kmp_destroy_lock(&p->tq_link_lck);
581     __kmp_destroy_lock(&p->tq_queue_lck);
582     __kmp_destroy_lock(&p->tq_free_thunks_lck);
583   }
584 #ifdef KMP_DEBUG
585   p->tq_th_thunks = NULL;
586 #endif /* KMP_DEBUG */
587 
588   // Make sure data structures are in consistent state before querying them
589   // Seems to work without this call for digital/alpha, needed for IBM/RS6000
590   KMP_MB();
591 
592   __kmp_acquire_lock(&tq->tq_freelist_lck, global_tid);
593   p->tq.tq_next_free = tq->tq_freelist;
594 
595   tq->tq_freelist = p;
596   __kmp_release_lock(&tq->tq_freelist_lck, global_tid);
597 }
598 
599 /* Once a group of thunks has been allocated for use in a particular queue,
600    these are managed via a per-queue freelist.
601    We force a check that there's always a thunk free if we need one. */
602 
603 static kmpc_thunk_t *__kmp_alloc_thunk(kmpc_task_queue_t *queue,
604                                        int in_parallel, kmp_int32 global_tid) {
605   kmpc_thunk_t *fl;
606 
607   if (in_parallel) {
608     __kmp_acquire_lock(&queue->tq_free_thunks_lck, global_tid);
609     // Make sure data structures are in consistent state before querying them
610     // Seems to work without this call for digital/alpha, needed for IBM/RS6000
611     KMP_MB();
612   }
613 
614   fl = queue->tq_free_thunks;
615 
616   KMP_DEBUG_ASSERT(fl != NULL);
617 
618   queue->tq_free_thunks = fl->th.th_next_free;
619   fl->th_flags = 0;
620 
621   if (in_parallel)
622     __kmp_release_lock(&queue->tq_free_thunks_lck, global_tid);
623 
624   return fl;
625 }
626 
627 static void __kmp_free_thunk(kmpc_task_queue_t *queue, kmpc_thunk_t *p,
628                              int in_parallel, kmp_int32 global_tid) {
629 #ifdef KMP_DEBUG
630   p->th_task = 0;
631   p->th_encl_thunk = 0;
632   p->th_status = 0;
633   p->th_tasknum = 0;
634 /* Also could zero pointers to private vars */
635 #endif
636 
637   if (in_parallel) {
638     __kmp_acquire_lock(&queue->tq_free_thunks_lck, global_tid);
639     // Make sure data structures are in consistent state before querying them
640     // Seems to work without this call for digital/alpha, needed for IBM/RS6000
641     KMP_MB();
642   }
643 
644   p->th.th_next_free = queue->tq_free_thunks;
645   queue->tq_free_thunks = p;
646 
647 #ifdef KMP_DEBUG
648   p->th_flags = TQF_DEALLOCATED;
649 #endif
650 
651   if (in_parallel)
652     __kmp_release_lock(&queue->tq_free_thunks_lck, global_tid);
653 }
654 
655 /*  returns nonzero if the queue just became full after the enqueue  */
656 static kmp_int32 __kmp_enqueue_task(kmp_taskq_t *tq, kmp_int32 global_tid,
657                                     kmpc_task_queue_t *queue,
658                                     kmpc_thunk_t *thunk, int in_parallel) {
659   kmp_int32 ret;
660 
661   /*  dkp: can we get around the lock in the TQF_RELEASE_WORKERS case (only the
662    * master is executing then)  */
663   if (in_parallel) {
664     __kmp_acquire_lock(&queue->tq_queue_lck, global_tid);
665     // Make sure data structures are in consistent state before querying them
666     // Seems to work without this call for digital/alpha, needed for IBM/RS6000
667     KMP_MB();
668   }
669 
670   KMP_DEBUG_ASSERT(queue->tq_nfull < queue->tq_nslots); // check queue not full
671 
672   queue->tq_queue[(queue->tq_head)++].qs_thunk = thunk;
673 
674   if (queue->tq_head >= queue->tq_nslots)
675     queue->tq_head = 0;
676 
677   (queue->tq_nfull)++;
678 
679   KMP_MB(); /* to assure that nfull is seen to increase before
680                TQF_ALL_TASKS_QUEUED is set */
681 
682   ret = (in_parallel) ? (queue->tq_nfull == queue->tq_nslots) : FALSE;
683 
684   if (in_parallel) {
685     /* don't need to wait until workers are released before unlocking */
686     __kmp_release_lock(&queue->tq_queue_lck, global_tid);
687 
688     if (tq->tq_global_flags & TQF_RELEASE_WORKERS) {
689       // If just creating the root queue, the worker threads are waiting at a
690       // join barrier until now, when there's something in the queue for them to
691       // do; release them now to do work. This should only be done when this is
692       // the first task enqueued, so reset the flag here also.
693       tq->tq_global_flags &= ~TQF_RELEASE_WORKERS; /* no lock needed, workers
694                                                       are still in spin mode */
695       // avoid releasing barrier twice if taskq_task switches threads
696       KMP_MB();
697 
698       __kmpc_end_barrier_master(NULL, global_tid);
699     }
700   }
701 
702   return ret;
703 }
704 
705 static kmpc_thunk_t *__kmp_dequeue_task(kmp_int32 global_tid,
706                                         kmpc_task_queue_t *queue,
707                                         int in_parallel) {
708   kmpc_thunk_t *pt;
709   int tid = __kmp_tid_from_gtid(global_tid);
710 
711   KMP_DEBUG_ASSERT(queue->tq_nfull > 0); /*  check queue not empty  */
712 
713   if (queue->tq.tq_parent != NULL && in_parallel) {
714     int ct;
715     __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
716     ct = ++(queue->tq_ref_count);
717     __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
718     KMP_DEBUG_REF_CTS(
719         ("line %d gtid %d: Q %p inc %d\n", __LINE__, global_tid, queue, ct));
720   }
721 
722   pt = queue->tq_queue[(queue->tq_tail)++].qs_thunk;
723 
724   if (queue->tq_tail >= queue->tq_nslots)
725     queue->tq_tail = 0;
726 
727   if (in_parallel) {
728     queue->tq_th_thunks[tid].ai_data++;
729 
730     KMP_MB(); /* necessary so ai_data increment is propagated to other threads
731                  immediately (digital) */
732 
733     KF_TRACE(200, ("__kmp_dequeue_task: T#%d(:%d) now has %d outstanding "
734                    "thunks from queue %p\n",
735                    global_tid, tid, queue->tq_th_thunks[tid].ai_data, queue));
736   }
737 
738   (queue->tq_nfull)--;
739 
740 #ifdef KMP_DEBUG
741   KMP_MB();
742 
743   /* necessary so (queue->tq_nfull > 0) above succeeds after tq_nfull is
744    * decremented */
745 
746   KMP_DEBUG_ASSERT(queue->tq_nfull >= 0);
747 
748   if (in_parallel) {
749     KMP_DEBUG_ASSERT(queue->tq_th_thunks[tid].ai_data <=
750                      __KMP_TASKQ_THUNKS_PER_TH);
751   }
752 #endif
753 
754   return pt;
755 }
756 
757 /* Find the next (non-null) task to dequeue and return it.
758  * This is never called unless in_parallel=TRUE
759  *
760  * Here are the rules for deciding which queue to take the task from:
761  * 1.  Walk up the task queue tree from the current queue's parent and look
762  *      on the way up (for loop, below).
763  * 2.  Do a depth-first search back down the tree from the root and
764  *      look (find_task_in_descendant_queue()).
765  *
766  * Here are the rules for deciding which task to take from a queue
767  * (__kmp_find_task_in_queue ()):
768  * 1.  Never take the last task from a queue if TQF_IS_LASTPRIVATE; this task
769  *     must be staged to make sure we execute the last one with
770  *     TQF_IS_LAST_TASK at the end of task queue execution.
771  * 2.  If the queue length is below some high water mark and the taskq task
772  *     is enqueued, prefer running the taskq task.
773  * 3.  Otherwise, take a (normal) task from the queue.
774  *
775  * If we do all this and return pt == NULL at the bottom of this routine,
776  * this means there are no more tasks to execute (except possibly for
777  * TQF_IS_LASTPRIVATE).
778  */
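/* Condensed sketch of the selection order implemented below (illustrative
 * only; the code itself is authoritative):
 *
 *   if a taskq task is parked in tq_taskq_slot and tq_nfull <= tq_hiwat:
 *       run the taskq task so it can generate more work
 *   else if the queue is empty, or this thread already holds
 *           __KMP_TASKQ_THUNKS_PER_TH outstanding thunks:
 *       return NULL
 *   else if more than one task is queued, or TQF_IS_LASTPRIVATE is not set:
 *       dequeue a normal task
 *   else if TQF_IS_LAST_TASK is set:
 *       dequeue the final task and tag it TQF_IS_LAST_TASK for copy-out
 */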
779 
780 static kmpc_thunk_t *__kmp_find_task_in_queue(kmp_int32 global_tid,
781                                               kmpc_task_queue_t *queue) {
782   kmpc_thunk_t *pt = NULL;
783   int tid = __kmp_tid_from_gtid(global_tid);
784 
785   /* To prevent deadlock from tq_queue_lck if queue already deallocated */
786   if (!(queue->tq_flags & TQF_DEALLOCATED)) {
787 
788     __kmp_acquire_lock(&queue->tq_queue_lck, global_tid);
789 
790     /* Check again to avoid race in __kmpc_end_taskq() */
791     if (!(queue->tq_flags & TQF_DEALLOCATED)) {
792       // Make sure data structures are in consistent state before querying them
793       // Seems to work without this for digital/alpha, needed for IBM/RS6000
794       KMP_MB();
795 
796       if ((queue->tq_taskq_slot != NULL) &&
797           (queue->tq_nfull <= queue->tq_hiwat)) {
798         /* if there's enough room in the queue and the dispatcher */
799         /* (taskq task) is available, schedule more tasks         */
800         pt = CCAST(kmpc_thunk_t *, queue->tq_taskq_slot);
801         queue->tq_taskq_slot = NULL;
802       } else if (queue->tq_nfull == 0 ||
803                  queue->tq_th_thunks[tid].ai_data >=
804                      __KMP_TASKQ_THUNKS_PER_TH) {
805         /* do nothing if no thunks available or this thread can't */
806         /* run any because it already is executing too many       */
807         pt = NULL;
808       } else if (queue->tq_nfull > 1) {
809         /*  always safe to schedule a task even if TQF_IS_LASTPRIVATE  */
810 
811         pt = __kmp_dequeue_task(global_tid, queue, TRUE);
812       } else if (!(queue->tq_flags & TQF_IS_LASTPRIVATE)) {
813         // one thing in queue, always safe to schedule if !TQF_IS_LASTPRIVATE
814         pt = __kmp_dequeue_task(global_tid, queue, TRUE);
815       } else if (queue->tq_flags & TQF_IS_LAST_TASK) {
816         /* TQF_IS_LASTPRIVATE, one thing in queue, kmpc_end_taskq_task()   */
817         /* has been run so this is last task, run with TQF_IS_LAST_TASK so */
818         /* instrumentation does copy-out.                                  */
819         pt = __kmp_dequeue_task(global_tid, queue, TRUE);
820         pt->th_flags |=
821             TQF_IS_LAST_TASK; /* don't need test_then_or since already locked */
822       }
823     }
824 
825     /* GEH - What happens here if is lastprivate, but not last task? */
826     __kmp_release_lock(&queue->tq_queue_lck, global_tid);
827   }
828 
829   return pt;
830 }
831 
832 /* Walk a tree of queues starting at queue's first child and return a non-NULL
833    thunk if one can be scheduled. Must only be called when in_parallel=TRUE */
834 
835 static kmpc_thunk_t *
836 __kmp_find_task_in_descendant_queue(kmp_int32 global_tid,
837                                     kmpc_task_queue_t *curr_queue) {
838   kmpc_thunk_t *pt = NULL;
839   kmpc_task_queue_t *queue = curr_queue;
840 
841   if (curr_queue->tq_first_child != NULL) {
842     __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
843     // Make sure data structures are in consistent state before querying them
844     // Seems to work without this call for digital/alpha, needed for IBM/RS6000
845     KMP_MB();
846 
847     queue = CCAST(kmpc_task_queue_t *, curr_queue->tq_first_child);
848     if (queue == NULL) {
849       __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
850       return NULL;
851     }
852 
853     while (queue != NULL) {
854       int ct;
855       kmpc_task_queue_t *next;
856 
857       ct = ++(queue->tq_ref_count);
858       __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
859       KMP_DEBUG_REF_CTS(
860           ("line %d gtid %d: Q %p inc %d\n", __LINE__, global_tid, queue, ct));
861 
862       pt = __kmp_find_task_in_queue(global_tid, queue);
863 
864       if (pt != NULL) {
865         int ct;
866 
867         __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
868         // Make sure data structures in consistent state before querying them
869         // Seems to work without this for digital/alpha, needed for IBM/RS6000
870         KMP_MB();
871 
872         ct = --(queue->tq_ref_count);
873         KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n", __LINE__,
874                            global_tid, queue, ct));
875         KMP_DEBUG_ASSERT(queue->tq_ref_count >= 0);
876 
877         __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
878 
879         return pt;
880       }
881 
882       /* although reference count stays active during descendant walk, shouldn't
883          matter  since if children still exist, reference counts aren't being
884          monitored anyway   */
885 
886       pt = __kmp_find_task_in_descendant_queue(global_tid, queue);
887 
888       if (pt != NULL) {
889         int ct;
890 
891         __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
892         // Make sure data structures in consistent state before querying them
893         // Seems to work without this for digital/alpha, needed for IBM/RS6000
894         KMP_MB();
895 
896         ct = --(queue->tq_ref_count);
897         KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n", __LINE__,
898                            global_tid, queue, ct));
899         KMP_DEBUG_ASSERT(ct >= 0);
900 
901         __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
902 
903         return pt;
904       }
905 
906       __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
907       // Make sure data structures in consistent state before querying them
908       // Seems to work without this for digital/alpha, needed for IBM/RS6000
909       KMP_MB();
910 
911       next = queue->tq_next_child;
912 
913       ct = --(queue->tq_ref_count);
914       KMP_DEBUG_REF_CTS(
915           ("line %d gtid %d: Q %p dec %d\n", __LINE__, global_tid, queue, ct));
916       KMP_DEBUG_ASSERT(ct >= 0);
917 
918       queue = next;
919     }
920 
921     __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
922   }
923 
924   return pt;
925 }
926 
927 /* Walk up the taskq tree looking for a task to execute. If we get to the root,
928    search the tree for a descendant queue task. Must only be called when
929    in_parallel=TRUE */
930 static kmpc_thunk_t *
931 __kmp_find_task_in_ancestor_queue(kmp_taskq_t *tq, kmp_int32 global_tid,
932                                   kmpc_task_queue_t *curr_queue) {
933   kmpc_task_queue_t *queue;
934   kmpc_thunk_t *pt;
935 
936   pt = NULL;
937 
938   if (curr_queue->tq.tq_parent != NULL) {
939     queue = curr_queue->tq.tq_parent;
940 
941     while (queue != NULL) {
942       if (queue->tq.tq_parent != NULL) {
943         int ct;
944         __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
945         // Make sure data structures in consistent state before querying them
946         // Seems to work without this for digital/alpha, needed for IBM/RS6000
947         KMP_MB();
948 
949         ct = ++(queue->tq_ref_count);
950         __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
951         KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p inc %d\n", __LINE__,
952                            global_tid, queue, ct));
953       }
954 
955       pt = __kmp_find_task_in_queue(global_tid, queue);
956       if (pt != NULL) {
957         if (queue->tq.tq_parent != NULL) {
958           int ct;
959           __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
960           // Make sure data structures in consistent state before querying them
961           // Seems to work without this for digital/alpha, needed for IBM/RS6000
962           KMP_MB();
963 
964           ct = --(queue->tq_ref_count);
965           KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n", __LINE__,
966                              global_tid, queue, ct));
967           KMP_DEBUG_ASSERT(ct >= 0);
968 
969           __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
970         }
971 
972         return pt;
973       }
974 
975       if (queue->tq.tq_parent != NULL) {
976         int ct;
977         __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
978         // Make sure data structures in consistent state before querying them
979         // Seems to work without this for digital/alpha, needed for IBM/RS6000
980         KMP_MB();
981 
982         ct = --(queue->tq_ref_count);
983         KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n", __LINE__,
984                            global_tid, queue, ct));
985         KMP_DEBUG_ASSERT(ct >= 0);
986       }
987       queue = queue->tq.tq_parent;
988 
989       if (queue != NULL)
990         __kmp_release_lock(&queue->tq_link_lck, global_tid);
991     }
992   }
993 
994   pt = __kmp_find_task_in_descendant_queue(global_tid, tq->tq_root);
995 
996   return pt;
997 }
998 
999 static int __kmp_taskq_tasks_finished(kmpc_task_queue_t *queue) {
1000   int i;
1001 
1002   /* KMP_MB(); */ /* is this really necessary? */
1003 
1004   for (i = 0; i < queue->tq_nproc; i++) {
1005     if (queue->tq_th_thunks[i].ai_data != 0)
1006       return FALSE;
1007   }
1008 
1009   return TRUE;
1010 }
1011 
1012 static int __kmp_taskq_has_any_children(kmpc_task_queue_t *queue) {
1013   return (queue->tq_first_child != NULL);
1014 }
1015 
1016 static void __kmp_remove_queue_from_tree(kmp_taskq_t *tq, kmp_int32 global_tid,
1017                                          kmpc_task_queue_t *queue,
1018                                          int in_parallel) {
1019 #ifdef KMP_DEBUG
1020   kmp_int32 i;
1021   kmpc_thunk_t *thunk;
1022 #endif
1023 
1024   KF_TRACE(50,
1025            ("Before Deletion of TaskQ at %p on (%d):\n", queue, global_tid));
1026   KF_DUMP(50, __kmp_dump_task_queue(tq, queue, global_tid));
1027 
1028   /*  sub-queue in a recursion, not the root task queue  */
1029   KMP_DEBUG_ASSERT(queue->tq.tq_parent != NULL);
1030 
1031   if (in_parallel) {
1032     __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1033     // Make sure data structures are in consistent state before querying them
1034     // Seems to work without this call for digital/alpha, needed for IBM/RS6000
1035     KMP_MB();
1036   }
1037 
1038   KMP_DEBUG_ASSERT(queue->tq_first_child == NULL);
1039 
1040   /*  unlink queue from its siblings if any at this level  */
1041   if (queue->tq_prev_child != NULL)
1042     queue->tq_prev_child->tq_next_child = queue->tq_next_child;
1043   if (queue->tq_next_child != NULL)
1044     queue->tq_next_child->tq_prev_child = queue->tq_prev_child;
1045   if (queue->tq.tq_parent->tq_first_child == queue)
1046     queue->tq.tq_parent->tq_first_child = queue->tq_next_child;
1047 
1048   queue->tq_prev_child = NULL;
1049   queue->tq_next_child = NULL;
1050 
1051   if (in_parallel) {
1052     KMP_DEBUG_REF_CTS(
1053         ("line %d gtid %d: Q %p waiting for ref_count of %d to reach 1\n",
1054          __LINE__, global_tid, queue, queue->tq_ref_count));
1055 
1056     /* wait until all other threads have stopped accessing this queue */
1057     while (queue->tq_ref_count > 1) {
1058       __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1059 
1060       KMP_WAIT_YIELD((volatile kmp_uint32 *)&queue->tq_ref_count, 1, KMP_LE,
1061                      NULL);
1062 
1063       __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1064       // Make sure data structures are in consistent state before querying them
1065       // Seems to work without this for digital/alpha, needed for IBM/RS6000
1066       KMP_MB();
1067     }
1068 
1069     __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1070   }
1071 
1072   KMP_DEBUG_REF_CTS(
1073       ("line %d gtid %d: Q %p freeing queue\n", __LINE__, global_tid, queue));
1074 
1075 #ifdef KMP_DEBUG
1076   KMP_DEBUG_ASSERT(queue->tq_flags & TQF_ALL_TASKS_QUEUED);
1077   KMP_DEBUG_ASSERT(queue->tq_nfull == 0);
1078 
1079   for (i = 0; i < queue->tq_nproc; i++) {
1080     KMP_DEBUG_ASSERT(queue->tq_th_thunks[i].ai_data == 0);
1081   }
1082 
1083   i = 0;
1084   for (thunk = queue->tq_free_thunks; thunk != NULL;
1085        thunk = thunk->th.th_next_free)
1086     ++i;
1087 
1088   KMP_ASSERT(i ==
1089              queue->tq_nslots + (queue->tq_nproc * __KMP_TASKQ_THUNKS_PER_TH));
1090 #endif
1091 
1092   /*  release storage for queue entry  */
1093   __kmp_free_taskq(tq, queue, TRUE, global_tid);
1094 
1095   KF_TRACE(50, ("After Deletion of TaskQ at %p on (%d):\n", queue, global_tid));
1096   KF_DUMP(50, __kmp_dump_task_queue_tree(tq, tq->tq_root, global_tid));
1097 }
1098 
1099 /* Starting from indicated queue, proceed downward through tree and remove all
1100    taskqs which are finished, but only go down to taskqs which have the "nowait"
1101    clause present.  Assume this is only called when in_parallel=TRUE. */
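/* The removal test applied below, restated as a sketch that mirrors the code:
 * a child queue is unlinked and freed only when it has the nowait clause
 * (TQF_IS_NOWAIT), all of its tasks have been queued (TQF_ALL_TASKS_QUEUED),
 * its slots are empty (tq_nfull == 0), no thread still holds one of its
 * thunks, and it has no child queues of its own. */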
1102 
1103 static void __kmp_find_and_remove_finished_child_taskq(
1104     kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *curr_queue) {
1105   kmpc_task_queue_t *queue = curr_queue;
1106 
1107   if (curr_queue->tq_first_child != NULL) {
1108     __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
1109     // Make sure data structures are in consistent state before querying them
1110     // Seems to work without this call for digital/alpha, needed for IBM/RS6000
1111     KMP_MB();
1112 
1113     queue = CCAST(kmpc_task_queue_t *, curr_queue->tq_first_child);
1114     if (queue == NULL) {
1115       __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
1116       return;
1117     }
1118 
1119     while (queue != NULL) {
1120       kmpc_task_queue_t *next;
1121       int ct = ++(queue->tq_ref_count);
1122       KMP_DEBUG_REF_CTS(
1123           ("line %d gtid %d: Q %p inc %d\n", __LINE__, global_tid, queue, ct));
1124 
1125       /* although reference count stays active during descendant walk, */
1126       /* shouldn't matter since if children still exist, reference     */
1127       /* counts aren't being monitored anyway                          */
1128 
1129       if (queue->tq_flags & TQF_IS_NOWAIT) {
1130         __kmp_find_and_remove_finished_child_taskq(tq, global_tid, queue);
1131 
1132         if ((queue->tq_flags & TQF_ALL_TASKS_QUEUED) &&
1133             (queue->tq_nfull == 0) && __kmp_taskq_tasks_finished(queue) &&
1134             !__kmp_taskq_has_any_children(queue)) {
1135 
1136           /* Only remove this if we have not already marked it for deallocation.
1137              This should prevent multiple threads from trying to free this. */
1138 
1139           if (__kmp_test_lock(&queue->tq_queue_lck, global_tid)) {
1140             if (!(queue->tq_flags & TQF_DEALLOCATED)) {
1141               queue->tq_flags |= TQF_DEALLOCATED;
1142               __kmp_release_lock(&queue->tq_queue_lck, global_tid);
1143 
1144               __kmp_remove_queue_from_tree(tq, global_tid, queue, TRUE);
1145 
1146               /* Can't do any more here since can't be sure where sibling queue
1147                * is so just exit this level */
1148               return;
1149             } else {
1150               __kmp_release_lock(&queue->tq_queue_lck, global_tid);
1151             }
1152           }
1153           /* otherwise, just fall through and decrement reference count */
1154         }
1155       }
1156 
1157       __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
1158       // Make sure data structures are in consistent state before querying them
1159       // Seems to work without this for digital/alpha, needed for IBM/RS6000
1160       KMP_MB();
1161 
1162       next = queue->tq_next_child;
1163 
1164       ct = --(queue->tq_ref_count);
1165       KMP_DEBUG_REF_CTS(
1166           ("line %d gtid %d: Q %p dec %d\n", __LINE__, global_tid, queue, ct));
1167       KMP_DEBUG_ASSERT(ct >= 0);
1168 
1169       queue = next;
1170     }
1171 
1172     __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
1173   }
1174 }
1175 
1176 /* Starting from indicated queue, proceed downward through tree and remove all
1177    taskq's assuming all are finished and assuming NO other threads are executing
1178    at this point. */
1179 static void __kmp_remove_all_child_taskq(kmp_taskq_t *tq, kmp_int32 global_tid,
1180                                          kmpc_task_queue_t *queue) {
1181   kmpc_task_queue_t *next_child;
1182 
1183   queue = CCAST(kmpc_task_queue_t *, queue->tq_first_child);
1184 
1185   while (queue != NULL) {
1186     __kmp_remove_all_child_taskq(tq, global_tid, queue);
1187 
1188     next_child = queue->tq_next_child;
1189     queue->tq_flags |= TQF_DEALLOCATED;
1190     __kmp_remove_queue_from_tree(tq, global_tid, queue, FALSE);
1191     queue = next_child;
1192   }
1193 }
1194 
1195 static void __kmp_execute_task_from_queue(kmp_taskq_t *tq, ident_t *loc,
1196                                           kmp_int32 global_tid,
1197                                           kmpc_thunk_t *thunk,
1198                                           int in_parallel) {
1199   kmpc_task_queue_t *queue = thunk->th.th_shareds->sv_queue;
1200   kmp_int32 tid = __kmp_tid_from_gtid(global_tid);
1201 
1202   KF_TRACE(100, ("After dequeueing this Task on (%d):\n", global_tid));
1203   KF_DUMP(100, __kmp_dump_thunk(tq, thunk, global_tid));
1204   KF_TRACE(100, ("Task Queue: %p looks like this (%d):\n", queue, global_tid));
1205   KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid));
1206 
1207   /* For the taskq task, the curr_thunk push and pop pairs are set up as
1208    * follows:
1209    *
1210    * happens exactly once:
1211    * 1) __kmpc_taskq             : push (if returning thunk only)
1212    * 4) __kmpc_end_taskq_task    : pop
1213    *
1214    * optionally happens *each* time taskq task is dequeued/enqueued:
1215    * 2) __kmpc_taskq_task        : pop
1216    * 3) __kmp_execute_task_from_queue  : push
1217    *
1218    * execution ordering:  1,(2,3)*,4
1219    */
1220 
1221   if (!(thunk->th_flags & TQF_TASKQ_TASK)) {
1222     kmp_int32 index = (queue == tq->tq_root) ? tid : 0;
1223     thunk->th.th_shareds =
1224         CCAST(kmpc_shared_vars_t *, queue->tq_shareds[index].ai_data);
1225 
1226     if (__kmp_env_consistency_check) {
1227       __kmp_push_workshare(global_tid,
1228                            (queue->tq_flags & TQF_IS_ORDERED) ? ct_task_ordered
1229                                                               : ct_task,
1230                            queue->tq_loc);
1231     }
1232   } else {
1233     if (__kmp_env_consistency_check)
1234       __kmp_push_workshare(global_tid, ct_taskq, queue->tq_loc);
1235   }
1236 
1237   if (in_parallel) {
1238     thunk->th_encl_thunk = tq->tq_curr_thunk[tid];
1239     tq->tq_curr_thunk[tid] = thunk;
1240 
1241     KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid));
1242   }
1243 
1244   KF_TRACE(50, ("Begin Executing Thunk %p from queue %p on (%d)\n", thunk,
1245                 queue, global_tid));
1246   thunk->th_task(global_tid, thunk);
1247   KF_TRACE(50, ("End Executing Thunk %p from queue %p on (%d)\n", thunk, queue,
1248                 global_tid));
1249 
1250   if (!(thunk->th_flags & TQF_TASKQ_TASK)) {
1251     if (__kmp_env_consistency_check)
1252       __kmp_pop_workshare(global_tid,
1253                           (queue->tq_flags & TQF_IS_ORDERED) ? ct_task_ordered
1254                                                              : ct_task,
1255                           queue->tq_loc);
1256 
1257     if (in_parallel) {
1258       tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
1259       thunk->th_encl_thunk = NULL;
1260       KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid));
1261     }
1262 
1263     if ((thunk->th_flags & TQF_IS_ORDERED) && in_parallel) {
1264       __kmp_taskq_check_ordered(global_tid, thunk);
1265     }
1266 
1267     __kmp_free_thunk(queue, thunk, in_parallel, global_tid);
1268 
1269     KF_TRACE(100, ("T#%d After freeing thunk: %p, TaskQ looks like this:\n",
1270                    global_tid, thunk));
1271     KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid));
1272 
1273     if (in_parallel) {
1274       KMP_MB(); /* needed so thunk put on free list before outstanding thunk
1275                    count is decremented */
1276 
1277       KMP_DEBUG_ASSERT(queue->tq_th_thunks[tid].ai_data >= 1);
1278 
1279       KF_TRACE(
1280           200,
1281           ("__kmp_execute_task_from_queue: T#%d has %d thunks in queue %p\n",
1282            global_tid, queue->tq_th_thunks[tid].ai_data - 1, queue));
1283 
1284       queue->tq_th_thunks[tid].ai_data--;
1285 
1286       /* KMP_MB(); */ /* is MB really necessary ? */
1287     }
1288 
1289     if (queue->tq.tq_parent != NULL && in_parallel) {
1290       int ct;
1291       __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1292       ct = --(queue->tq_ref_count);
1293       __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1294       KMP_DEBUG_REF_CTS(
1295           ("line %d gtid %d: Q %p dec %d\n", __LINE__, global_tid, queue, ct));
1296       KMP_DEBUG_ASSERT(ct >= 0);
1297     }
1298   }
1299 }
1300 
1301 /* starts a taskq; creates and returns a thunk for the taskq_task        */
1302 /* also, returns pointer to shared vars for this thread in "shareds" arg */
1303 kmpc_thunk_t *__kmpc_taskq(ident_t *loc, kmp_int32 global_tid,
1304                            kmpc_task_t taskq_task, size_t sizeof_thunk,
1305                            size_t sizeof_shareds, kmp_int32 flags,
1306                            kmpc_shared_vars_t **shareds) {
1307   int in_parallel;
1308   kmp_int32 nslots, nthunks, nshareds, nproc;
1309   kmpc_task_queue_t *new_queue, *curr_queue;
1310   kmpc_thunk_t *new_taskq_thunk;
1311   kmp_info_t *th;
1312   kmp_team_t *team;
1313   kmp_taskq_t *tq;
1314   kmp_int32 tid;
1315 
1316   KE_TRACE(10, ("__kmpc_taskq called (%d)\n", global_tid));
1317 
1318   th = __kmp_threads[global_tid];
1319   team = th->th.th_team;
1320   tq = &team->t.t_taskq;
1321   nproc = team->t.t_nproc;
1322   tid = __kmp_tid_from_gtid(global_tid);
1323 
1324   /* find out whether this is a parallel taskq or serialized one. */
1325   in_parallel = in_parallel_context(team);
1326 
1327   if (!tq->tq_root) {
1328     if (in_parallel) {
1329       /* Vector ORDERED SECTION to taskq version */
1330       th->th.th_dispatch->th_deo_fcn = __kmp_taskq_eo;
1331 
1332       /* Vector ORDERED SECTION to taskq version */
1333       th->th.th_dispatch->th_dxo_fcn = __kmp_taskq_xo;
1334     }
1335 
1336     if (in_parallel) {
1337       // This shouldn't be a barrier region boundary; it will confuse the user.
1338       /* Need the boundary to be at the end of the taskq instead. */
1339       if (__kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL)) {
1340         /* Creating the active root queue, and we are not the master thread. */
1341         /* The master thread below created the queue and tasks have been     */
1342         /* enqueued, and the master thread released this barrier.  This      */
1343         /* worker thread can now proceed and execute tasks.  See also the    */
1344         /* TQF_RELEASE_WORKERS which is used to handle this case.            */
1345         *shareds =
1346             CCAST(kmpc_shared_vars_t *, tq->tq_root->tq_shareds[tid].ai_data);
1347         KE_TRACE(10, ("__kmpc_taskq return (%d)\n", global_tid));
1348 
1349         return NULL;
1350       }
1351     }
1352 
1353     /* master thread only executes this code */
1354     if (tq->tq_curr_thunk_capacity < nproc) {
1355       if (tq->tq_curr_thunk)
1356         __kmp_free(tq->tq_curr_thunk);
1357       else {
1358         /* only need to do this once at outer level, i.e. when tq_curr_thunk is
1359          * still NULL */
1360         __kmp_init_lock(&tq->tq_freelist_lck);
1361       }
1362 
1363       tq->tq_curr_thunk =
1364           (kmpc_thunk_t **)__kmp_allocate(nproc * sizeof(kmpc_thunk_t *));
1365       tq->tq_curr_thunk_capacity = nproc;
1366     }
1367 
1368     if (in_parallel)
1369       tq->tq_global_flags = TQF_RELEASE_WORKERS;
1370   }
1371 
1372   /* dkp: in future, if flags & TQF_HEURISTICS, will choose nslots based */
1373   /*      on some heuristics (e.g., depth of queue nesting?).            */
1374   nslots = (in_parallel) ? (2 * nproc) : 1;
1375 
1376   /* There must be nproc * __KMP_TASKQ_THUNKS_PER_TH extra slots for pending */
1377   /* jobs being executed by other threads, and one extra for taskq slot */
1378   nthunks = (in_parallel) ? (nslots + (nproc * __KMP_TASKQ_THUNKS_PER_TH) + 1)
1379                           : nslots + 2;
1380 
1381   /* Only the root taskq gets a per-thread array of shareds.       */
1382   /* The rest of the taskq's only get one copy of the shared vars. */
1383   nshareds = (!tq->tq_root && in_parallel) ? nproc : 1;
1384 
1385   /*  create overall queue data structure and its components that require
1386    * allocation */
1387   new_queue = __kmp_alloc_taskq(tq, in_parallel, nslots, nthunks, nshareds,
1388                                 nproc, sizeof_thunk, sizeof_shareds,
1389                                 &new_taskq_thunk, global_tid);
1390 
1391   /*  rest of new_queue initializations  */
1392   new_queue->tq_flags = flags & TQF_INTERFACE_FLAGS;
1393 
1394   if (in_parallel) {
1395     new_queue->tq_tasknum_queuing = 0;
1396     new_queue->tq_tasknum_serving = 0;
1397     new_queue->tq_flags |= TQF_PARALLEL_CONTEXT;
1398   }
1399 
1400   new_queue->tq_taskq_slot = NULL;
1401   new_queue->tq_nslots = nslots;
1402   new_queue->tq_hiwat = HIGH_WATER_MARK(nslots);
1403   new_queue->tq_nfull = 0;
1404   new_queue->tq_head = 0;
1405   new_queue->tq_tail = 0;
1406   new_queue->tq_loc = loc;
1407 
1408   if ((new_queue->tq_flags & TQF_IS_ORDERED) && in_parallel) {
1409     /* prepare to serve the first-queued task's ORDERED directive */
1410     new_queue->tq_tasknum_serving = 1;
1411 
1412     /* Vector ORDERED SECTION to taskq version */
1413     th->th.th_dispatch->th_deo_fcn = __kmp_taskq_eo;
1414 
1415     /* Vector ORDERED SECTION to taskq version */
1416     th->th.th_dispatch->th_dxo_fcn = __kmp_taskq_xo;
1417   }

  /*  create a new thunk for the taskq_task in the new_queue  */
  *shareds = CCAST(kmpc_shared_vars_t *, new_queue->tq_shareds[0].ai_data);

  new_taskq_thunk->th.th_shareds = *shareds;
  new_taskq_thunk->th_task = taskq_task;
  new_taskq_thunk->th_flags = new_queue->tq_flags | TQF_TASKQ_TASK;
  new_taskq_thunk->th_status = 0;

  KMP_DEBUG_ASSERT(new_taskq_thunk->th_flags & TQF_TASKQ_TASK);

  // Make sure these inits complete before threads start using this queue
  /* KMP_MB(); */ // (necessary?)

  /* insert the new task queue into the tree, but only after all fields
   * initialized */

  if (in_parallel) {
    if (!tq->tq_root) {
      new_queue->tq.tq_parent = NULL;
      new_queue->tq_first_child = NULL;
      new_queue->tq_next_child = NULL;
      new_queue->tq_prev_child = NULL;
      new_queue->tq_ref_count = 1;
      tq->tq_root = new_queue;
    } else {
      curr_queue = tq->tq_curr_thunk[tid]->th.th_shareds->sv_queue;
      new_queue->tq.tq_parent = curr_queue;
      new_queue->tq_first_child = NULL;
      new_queue->tq_prev_child = NULL;
      new_queue->tq_ref_count = 1; /* for the thread that built the queue */

      KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p alloc %d\n", __LINE__,
                         global_tid, new_queue, new_queue->tq_ref_count));

      __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);

      // Make sure data structures are in consistent state before querying them
      // Seems to work without this for digital/alpha, needed for IBM/RS6000
      KMP_MB();

      new_queue->tq_next_child =
          CCAST(struct kmpc_task_queue_t *, curr_queue->tq_first_child);

      if (curr_queue->tq_first_child != NULL)
        curr_queue->tq_first_child->tq_prev_child = new_queue;

      curr_queue->tq_first_child = new_queue;

      __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
    }

    /* set up thunk stack only after code that determines curr_queue above */
    new_taskq_thunk->th_encl_thunk = tq->tq_curr_thunk[tid];
    tq->tq_curr_thunk[tid] = new_taskq_thunk;

    KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid));
  } else {
    new_taskq_thunk->th_encl_thunk = 0;
    new_queue->tq.tq_parent = NULL;
    new_queue->tq_first_child = NULL;
    new_queue->tq_next_child = NULL;
    new_queue->tq_prev_child = NULL;
    new_queue->tq_ref_count = 1;
  }

#ifdef KMP_DEBUG
  KF_TRACE(150, ("Creating TaskQ Task on (%d):\n", global_tid));
  KF_DUMP(150, __kmp_dump_thunk(tq, new_taskq_thunk, global_tid));

  if (in_parallel) {
    KF_TRACE(25,
             ("After TaskQ at %p Creation on (%d):\n", new_queue, global_tid));
  } else {
    KF_TRACE(25, ("After Serial TaskQ at %p Creation on (%d):\n", new_queue,
                  global_tid));
  }

  KF_DUMP(25, __kmp_dump_task_queue(tq, new_queue, global_tid));

  if (in_parallel) {
    KF_DUMP(50, __kmp_dump_task_queue_tree(tq, tq->tq_root, global_tid));
  }
#endif /* KMP_DEBUG */

  if (__kmp_env_consistency_check)
    __kmp_push_workshare(global_tid, ct_taskq, new_queue->tq_loc);

  KE_TRACE(10, ("__kmpc_taskq return (%d)\n", global_tid));

  return new_taskq_thunk;
}
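
/* Editorial overview (a sketch inferred from the per-routine comments below;
   the authoritative calling sequence is generated by the compiler):

     __kmpc_taskq()           creates a queue and returns its taskq thunk
     __kmpc_taskq_task()      enqueues the task-generating (taskq) thunk
     __kmpc_task_buffer() and __kmpc_task()  allocate and enqueue each task
     __kmpc_end_taskq_task()  marks that all tasks have been generated
     __kmpc_end_taskq()       drains remaining work and destroys the queue  */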

/*  ends a taskq; last thread out destroys the queue  */

void __kmpc_end_taskq(ident_t *loc, kmp_int32 global_tid,
                      kmpc_thunk_t *taskq_thunk) {
#ifdef KMP_DEBUG
  kmp_int32 i;
#endif
  kmp_taskq_t *tq;
  int in_parallel;
  kmp_info_t *th;
  kmp_int32 is_outermost;
  kmpc_task_queue_t *queue;
  kmpc_thunk_t *thunk;
  int nproc;

  KE_TRACE(10, ("__kmpc_end_taskq called (%d)\n", global_tid));

  tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
  nproc = __kmp_threads[global_tid]->th.th_team->t.t_nproc;

  /* For the outermost taskq only, all but one thread will have taskq_thunk ==
   * NULL */
  queue = (taskq_thunk == NULL) ? tq->tq_root
                                : taskq_thunk->th.th_shareds->sv_queue;

  KE_TRACE(50, ("__kmpc_end_taskq queue=%p (%d) \n", queue, global_tid));
  is_outermost = (queue == tq->tq_root);
  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);

  if (in_parallel) {
    kmp_uint32 spins;

    /* this is just a safeguard to release the waiting threads if */
    /* the outermost taskq never queues a task                    */

    if (is_outermost && (KMP_MASTER_GTID(global_tid))) {
      if (tq->tq_global_flags & TQF_RELEASE_WORKERS) {
        /* no lock needed, workers are still in spin mode */
        tq->tq_global_flags &= ~TQF_RELEASE_WORKERS;

        __kmp_end_split_barrier(bs_plain_barrier, global_tid);
      }
    }

    /* keep dequeueing work until all tasks are queued and dequeued */

    do {
      /* wait until something is available to dequeue */
      KMP_INIT_YIELD(spins);

      while ((queue->tq_nfull == 0) && (queue->tq_taskq_slot == NULL) &&
             (!__kmp_taskq_has_any_children(queue)) &&
             (!(queue->tq_flags & TQF_ALL_TASKS_QUEUED))) {
        KMP_YIELD_WHEN(TRUE, spins);
      }

      /* check to see if we can execute tasks in the queue */
      while (((queue->tq_nfull != 0) || (queue->tq_taskq_slot != NULL)) &&
             (thunk = __kmp_find_task_in_queue(global_tid, queue)) != NULL) {
        KF_TRACE(50, ("Found thunk: %p in primary queue %p (%d)\n", thunk,
                      queue, global_tid));
        __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
      }

      /* see if work can be found in a descendant queue */
      if ((__kmp_taskq_has_any_children(queue)) &&
          (thunk = __kmp_find_task_in_descendant_queue(global_tid, queue)) !=
              NULL) {

        KF_TRACE(50,
                 ("Stole thunk: %p in descendant queue: %p while waiting in "
                  "queue: %p (%d)\n",
                  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));

        __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
      }

    } while ((!(queue->tq_flags & TQF_ALL_TASKS_QUEUED)) ||
             (queue->tq_nfull != 0));

    KF_TRACE(50, ("All tasks queued and dequeued in queue: %p (%d)\n", queue,
                  global_tid));

    /* while not all tasks are finished, keep executing any work found
       in descendant queues */

    while ((!__kmp_taskq_tasks_finished(queue)) &&
           (thunk = __kmp_find_task_in_descendant_queue(global_tid, queue)) !=
               NULL) {

      KF_TRACE(50, ("Stole thunk: %p in descendant queue: %p while waiting in "
                    "queue: %p (%d)\n",
                    thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));

      __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
    }

    KF_TRACE(50, ("No work found in descendant queues or all work finished in "
                  "queue: %p (%d)\n",
                  queue, global_tid));

    if (!is_outermost) {
      /* need to return if NOWAIT present and not outermost taskq */

      if (queue->tq_flags & TQF_IS_NOWAIT) {
        __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
        queue->tq_ref_count--;
        KMP_DEBUG_ASSERT(queue->tq_ref_count >= 0);
        __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);

        KE_TRACE(
            10, ("__kmpc_end_taskq return for nowait case (%d)\n", global_tid));

        return;
      }

      __kmp_find_and_remove_finished_child_taskq(tq, global_tid, queue);

      /* WAIT until all tasks are finished and no child queues exist before
       * proceeding */
      KMP_INIT_YIELD(spins);

      while (!__kmp_taskq_tasks_finished(queue) ||
             __kmp_taskq_has_any_children(queue)) {
        thunk = __kmp_find_task_in_ancestor_queue(tq, global_tid, queue);

        if (thunk != NULL) {
          KF_TRACE(50,
                   ("Stole thunk: %p in ancestor queue: %p while waiting in "
                    "queue: %p (%d)\n",
                    thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
          __kmp_execute_task_from_queue(tq, loc, global_tid, thunk,
                                        in_parallel);
        }

        KMP_YIELD_WHEN(thunk == NULL, spins);

        __kmp_find_and_remove_finished_child_taskq(tq, global_tid, queue);
      }

      __kmp_acquire_lock(&queue->tq_queue_lck, global_tid);
      if (!(queue->tq_flags & TQF_DEALLOCATED)) {
        queue->tq_flags |= TQF_DEALLOCATED;
      }
      __kmp_release_lock(&queue->tq_queue_lck, global_tid);

      /* only the allocating thread can deallocate the queue */
      if (taskq_thunk != NULL) {
        __kmp_remove_queue_from_tree(tq, global_tid, queue, TRUE);
      }

      KE_TRACE(
          10,
          ("__kmpc_end_taskq return for non_outermost queue, wait case (%d)\n",
           global_tid));

      return;
    }

    // Outermost Queue: steal work from descendants until all tasks are finished

    KMP_INIT_YIELD(spins);

    while (!__kmp_taskq_tasks_finished(queue)) {
      thunk = __kmp_find_task_in_descendant_queue(global_tid, queue);

      if (thunk != NULL) {
        KF_TRACE(50,
                 ("Stole thunk: %p in descendant queue: %p while waiting in "
                  "queue: %p (%d)\n",
                  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));

        __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
      }

      KMP_YIELD_WHEN(thunk == NULL, spins);
    }

    /* Need this barrier to prevent destruction of queue before threads have all
     * executed above code */
    /* This may need to be done earlier when NOWAIT is implemented for the
     * outermost level */

    if (!__kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL)) {
      /* the queue->tq_flags & TQF_IS_NOWAIT case is not yet handled here;   */
      /* for right now, everybody waits, and the master thread destroys the  */
      /* remaining queues.                                                   */

      __kmp_remove_all_child_taskq(tq, global_tid, queue);

      /* Now destroy the root queue */
      KF_TRACE(100, ("T#%d Before Deletion of top-level TaskQ at %p:\n",
                     global_tid, queue));
      KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid));

#ifdef KMP_DEBUG
      /*  the root queue entry  */
      KMP_DEBUG_ASSERT((queue->tq.tq_parent == NULL) &&
                       (queue->tq_next_child == NULL));

      /*  children must all be gone by now because of barrier above */
      KMP_DEBUG_ASSERT(queue->tq_first_child == NULL);

      for (i = 0; i < nproc; i++) {
        KMP_DEBUG_ASSERT(queue->tq_th_thunks[i].ai_data == 0);
      }

      for (i = 0, thunk = queue->tq_free_thunks; thunk != NULL;
           i++, thunk = thunk->th.th_next_free)
        ;

      KMP_DEBUG_ASSERT(i ==
                       queue->tq_nslots + (nproc * __KMP_TASKQ_THUNKS_PER_TH));

      for (i = 0; i < nproc; i++) {
        KMP_DEBUG_ASSERT(!tq->tq_curr_thunk[i]);
      }
#endif
      /*  unlink the root queue entry  */
      tq->tq_root = NULL;

      /*  release storage for root queue entry  */
      KF_TRACE(50, ("After Deletion of top-level TaskQ at %p on (%d):\n", queue,
                    global_tid));

      queue->tq_flags |= TQF_DEALLOCATED;
      __kmp_free_taskq(tq, queue, in_parallel, global_tid);

      KF_DUMP(50, __kmp_dump_task_queue_tree(tq, tq->tq_root, global_tid));

      /* release the workers now that the data structures are up to date */
      __kmp_end_split_barrier(bs_plain_barrier, global_tid);
    }

    th = __kmp_threads[global_tid];

    /* Reset ORDERED SECTION to parallel version */
    th->th.th_dispatch->th_deo_fcn = 0;

    /* Reset ORDERED SECTION to parallel version */
    th->th.th_dispatch->th_dxo_fcn = 0;
  } else {
    /* in serial execution context, dequeue the last task  */
    /* and execute it, if there were any tasks encountered */

    if (queue->tq_nfull > 0) {
      KMP_DEBUG_ASSERT(queue->tq_nfull == 1);

      thunk = __kmp_dequeue_task(global_tid, queue, in_parallel);

      if (queue->tq_flags & TQF_IS_LAST_TASK) {
        /* TQF_IS_LASTPRIVATE, one thing in queue, __kmpc_end_taskq_task() */
        /* has been run so this is last task, run with TQF_IS_LAST_TASK so */
        /* instrumentation does copy-out.                                  */

        /* no need for test_then_or call since already locked */
        thunk->th_flags |= TQF_IS_LAST_TASK;
      }

      KF_TRACE(50, ("T#%d found thunk: %p in serial queue: %p\n", global_tid,
                    thunk, queue));

      __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
    }

    // destroy the unattached serial queue now that there is no more work to do
    KF_TRACE(100, ("Before Deletion of Serialized TaskQ at %p on (%d):\n",
                   queue, global_tid));
    KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid));

#ifdef KMP_DEBUG
    i = 0;
    for (thunk = queue->tq_free_thunks; thunk != NULL;
         thunk = thunk->th.th_next_free)
      ++i;
    KMP_DEBUG_ASSERT(i == queue->tq_nslots + 1);
#endif
    /*  release storage for unattached serial queue  */
    KF_TRACE(50,
             ("Serialized TaskQ at %p deleted on (%d).\n", queue, global_tid));

    queue->tq_flags |= TQF_DEALLOCATED;
    __kmp_free_taskq(tq, queue, in_parallel, global_tid);
  }

  KE_TRACE(10, ("__kmpc_end_taskq return (%d)\n", global_tid));
}

/*  Enqueues a task for thunk previously created by __kmpc_task_buffer. */
/*  Returns nonzero if just filled up queue  */

kmp_int32 __kmpc_task(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk) {
  kmp_int32 ret;
  kmpc_task_queue_t *queue;
  int in_parallel;
  kmp_taskq_t *tq;

  KE_TRACE(10, ("__kmpc_task called (%d)\n", global_tid));

  KMP_DEBUG_ASSERT(!(thunk->th_flags &
                     TQF_TASKQ_TASK)); /*  thunk->th_task is a regular task  */

  tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
  queue = thunk->th.th_shareds->sv_queue;
  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);

  if (in_parallel && (thunk->th_flags & TQF_IS_ORDERED))
    thunk->th_tasknum = ++queue->tq_tasknum_queuing;

  /* For serial execution dequeue the preceding task and execute it, if one
   * exists */
  /* This cannot be the last task.  That one is handled in __kmpc_end_taskq */

  if (!in_parallel && queue->tq_nfull > 0) {
    kmpc_thunk_t *prev_thunk;

    KMP_DEBUG_ASSERT(queue->tq_nfull == 1);

    prev_thunk = __kmp_dequeue_task(global_tid, queue, in_parallel);

    KF_TRACE(50, ("T#%d found thunk: %p in serial queue: %p\n", global_tid,
                  prev_thunk, queue));

    __kmp_execute_task_from_queue(tq, loc, global_tid, prev_thunk, in_parallel);
  }

  /* The instrumentation sequence is:  __kmpc_task_buffer(), initialize private
     variables, __kmpc_task().  The __kmpc_task_buffer routine checks that the
     task queue is not full and allocates a thunk (which is then passed to
     __kmpc_task()).  So, the enqueue below should never fail due to a full
     queue. */
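  /* Illustrative sketch only (editorial, not compiled): per task, the
     compiler is expected to emit roughly

         kmpc_thunk_t *t =
             __kmpc_task_buffer(loc, global_tid, taskq_thunk, task_rtn);
         ... copy the task's private/firstprivate values into *t ...
         __kmpc_task(loc, global_tid, t);

     where taskq_thunk is the enclosing taskq's thunk and task_rtn is the
     outlined task routine (both names are hypothetical placeholders). */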

  KF_TRACE(100, ("Enqueueing this Task on (%d):\n", global_tid));
  KF_DUMP(100, __kmp_dump_thunk(tq, thunk, global_tid));

  ret = __kmp_enqueue_task(tq, global_tid, queue, thunk, in_parallel);

  KF_TRACE(100, ("Task Queue looks like this on (%d):\n", global_tid));
  KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid));

  KE_TRACE(10, ("__kmpc_task return (%d)\n", global_tid));

  return ret;
}

/*  enqueues a taskq_task for thunk previously created by __kmpc_taskq  */
/*  this should never be called unless in a parallel context            */

void __kmpc_taskq_task(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk,
                       kmp_int32 status) {
  kmpc_task_queue_t *queue;
  kmp_taskq_t *tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
  int tid = __kmp_tid_from_gtid(global_tid);

  KE_TRACE(10, ("__kmpc_taskq_task called (%d)\n", global_tid));
  KF_TRACE(100, ("TaskQ Task argument thunk on (%d):\n", global_tid));
  KF_DUMP(100, __kmp_dump_thunk(tq, thunk, global_tid));

  queue = thunk->th.th_shareds->sv_queue;

  if (__kmp_env_consistency_check)
    __kmp_pop_workshare(global_tid, ct_taskq, loc);

  /*  thunk->th_task is the taskq_task  */
  KMP_DEBUG_ASSERT(thunk->th_flags & TQF_TASKQ_TASK);

  /*  not supposed to call __kmpc_taskq_task if it's already enqueued  */
  KMP_DEBUG_ASSERT(queue->tq_taskq_slot == NULL);

  /* dequeue taskq thunk from curr_thunk stack */
  tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
  thunk->th_encl_thunk = NULL;

  KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid));

  thunk->th_status = status;

  // Flush thunk->th_status before taskq_task enqueued to avoid race condition
  KMP_MB();

  /* enqueue taskq_task in thunk into special slot in queue     */
  /* GEH - probably don't need to lock taskq slot since only one */
  /*       thread enqueues & already a lock set at dequeue point */

  queue->tq_taskq_slot = thunk;

  KE_TRACE(10, ("__kmpc_taskq_task return (%d)\n", global_tid));
}

/* ends a taskq_task; done generating tasks  */

void __kmpc_end_taskq_task(ident_t *loc, kmp_int32 global_tid,
                           kmpc_thunk_t *thunk) {
  kmp_taskq_t *tq;
  kmpc_task_queue_t *queue;
  int in_parallel;
  int tid;

  KE_TRACE(10, ("__kmpc_end_taskq_task called (%d)\n", global_tid));

  tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
  queue = thunk->th.th_shareds->sv_queue;
  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
  tid = __kmp_tid_from_gtid(global_tid);

  if (__kmp_env_consistency_check)
    __kmp_pop_workshare(global_tid, ct_taskq, loc);

  if (in_parallel) {
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
    KMP_TEST_THEN_OR32(RCAST(volatile kmp_uint32 *, &queue->tq_flags),
                       TQF_ALL_TASKS_QUEUED);
#else
    {
      __kmp_acquire_lock(&queue->tq_queue_lck, global_tid);

      // Make sure data structures are in consistent state before querying them
      // Seems to work without this for digital/alpha, needed for IBM/RS6000
      KMP_MB();

      queue->tq_flags |= TQF_ALL_TASKS_QUEUED;
      __kmp_release_lock(&queue->tq_queue_lck, global_tid);
    }
#endif
  }

  if (thunk->th_flags & TQF_IS_LASTPRIVATE) {
    /* Normally, __kmp_find_task_in_queue() refuses to schedule the last task in
       the queue if TQF_IS_LASTPRIVATE so we can positively identify that last
       task and run it with its TQF_IS_LAST_TASK bit turned on in th_flags.
       When __kmpc_end_taskq_task() is called we are done generating all the
       tasks, so we know the last one in the queue is the lastprivate task.
       Mark the queue as having gotten to this state via tq_flags &
       TQF_IS_LAST_TASK; when that task actually executes mark it via th_flags &
       TQF_IS_LAST_TASK (this th_flags bit signals the instrumented code to do
       copy-outs after execution). */
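    /* Illustrative source-level example (editorial; hypothetical syntax and
       variable names):

           #pragma intel omp taskq lastprivate(x)
           for (...) {
             #pragma intel omp task
             { x = compute(...); }
           }

       The final value of x must come from the last task generated, so only
       the task that runs with TQF_IS_LAST_TASK set does the copy-out. */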
    if (!in_parallel) {
      /* No synchronization needed for serial context */
      queue->tq_flags |= TQF_IS_LAST_TASK;
    } else {
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
      KMP_TEST_THEN_OR32(RCAST(volatile kmp_uint32 *, &queue->tq_flags),
                         TQF_IS_LAST_TASK);
#else
      {
        __kmp_acquire_lock(&queue->tq_queue_lck, global_tid);

        // Make sure data structures are in consistent state before querying
        // them; seems to work without this for digital/alpha, needed for
        // IBM/RS6000
        KMP_MB();

        queue->tq_flags |= TQF_IS_LAST_TASK;
        __kmp_release_lock(&queue->tq_queue_lck, global_tid);
      }
#endif
      /* to prevent race condition where last task is dequeued but */
      /* flag isn't visible yet (not sure about this)              */
      KMP_MB();
    }
  }

  /* dequeue taskq thunk from curr_thunk stack */
  if (in_parallel) {
    tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
    thunk->th_encl_thunk = NULL;

    KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid));
  }

  KE_TRACE(10, ("__kmpc_end_taskq_task return (%d)\n", global_tid));
}

/* returns thunk for a regular task based on taskq_thunk              */
/* (__kmpc_taskq_task does the analogous thing for a TQF_TASKQ_TASK)  */

kmpc_thunk_t *__kmpc_task_buffer(ident_t *loc, kmp_int32 global_tid,
                                 kmpc_thunk_t *taskq_thunk, kmpc_task_t task) {
  kmp_taskq_t *tq;
  kmpc_task_queue_t *queue;
  kmpc_thunk_t *new_thunk;
  int in_parallel;

  KE_TRACE(10, ("__kmpc_task_buffer called (%d)\n", global_tid));

  KMP_DEBUG_ASSERT(
      taskq_thunk->th_flags &
      TQF_TASKQ_TASK); /*  taskq_thunk->th_task is the taskq_task  */

  tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
  queue = taskq_thunk->th.th_shareds->sv_queue;
  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);

  /* The instrumentation sequence is:  __kmpc_task_buffer(), initialize private
     variables, __kmpc_task().  The __kmpc_task_buffer routine checks that the
     task queue is not full and allocates a thunk (which is then passed to
     __kmpc_task()).  So, we can pre-allocate a thunk here assuming it will be
     the next to be enqueued in __kmpc_task(). */

  new_thunk = __kmp_alloc_thunk(queue, in_parallel, global_tid);
  new_thunk->th.th_shareds =
      CCAST(kmpc_shared_vars_t *, queue->tq_shareds[0].ai_data);
  new_thunk->th_encl_thunk = NULL;
  new_thunk->th_task = task;

  /* GEH - shouldn't need to lock the read of tq_flags here */
  new_thunk->th_flags = queue->tq_flags & TQF_INTERFACE_FLAGS;

  new_thunk->th_status = 0;

  KMP_DEBUG_ASSERT(!(new_thunk->th_flags & TQF_TASKQ_TASK));

  KF_TRACE(100, ("Creating Regular Task on (%d):\n", global_tid));
  KF_DUMP(100, __kmp_dump_thunk(tq, new_thunk, global_tid));

  KE_TRACE(10, ("__kmpc_task_buffer return (%d)\n", global_tid));

  return new_thunk;
}