/*
 * kmp_taskdeps.cpp
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

//#define KMP_SUPPORT_GRAPH_OUTPUT 1

#include "kmp.h"
#include "kmp_io.h"
#include "kmp_wait_release.h"
#include "kmp_taskdeps.h"
#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

// TODO: Improve memory allocation? Keep a list of pre-allocated structures?
// Allocate in blocks? Re-use finished list entries?
// TODO: don't use atomic ref counters for stack-allocated nodes.
// TODO: find an alternative to atomic refs for heap-allocated nodes?
// TODO: Finish graph output support
// TODO: kmp_lock_t seems a tad too big (and heavyweight) for this. Check other
// runtime locks.
// TODO: Any ITT support needed?

#ifdef KMP_SUPPORT_GRAPH_OUTPUT
static std::atomic<kmp_int32> kmp_node_id_seed = ATOMIC_VAR_INIT(0);
#endif

static void __kmp_init_node(kmp_depnode_t *node) {
  node->dn.successors = NULL;
  node->dn.task = NULL; // will point to the right task
  // once dependences have been processed
  for (int i = 0; i < MAX_MTX_DEPS; ++i)
    node->dn.mtx_locks[i] = NULL;
  node->dn.mtx_num_locks = 0;
  __kmp_init_lock(&node->dn.lock);
  KMP_ATOMIC_ST_RLX(&node->dn.nrefs, 1); // init creates the first reference
#ifdef KMP_SUPPORT_GRAPH_OUTPUT
  node->dn.id = KMP_ATOMIC_INC(&kmp_node_id_seed);
#endif
}

static inline kmp_depnode_t *__kmp_node_ref(kmp_depnode_t *node) {
  KMP_ATOMIC_INC(&node->dn.nrefs);
  return node;
}

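// Bucket counts for the dependence hash. Implicit (master) tasks start with a
// larger table than explicit tasks; sizes[] lists the bucket counts used by
// successive generations when the table is grown, capped at MAX_GEN.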
enum { KMP_DEPHASH_OTHER_SIZE = 97, KMP_DEPHASH_MASTER_SIZE = 997 };

size_t sizes[] = {997, 2003, 4001, 8191, 16001, 32003, 64007, 131071, 270029};
const size_t MAX_GEN = 8;

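// Hash a dependence address into a bucket index in [0, hsize).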
static inline size_t __kmp_dephash_hash(kmp_intptr_t addr, size_t hsize) {
  // TODO alternate to try: set = (((Addr64)(addrUsefulBits * 9.618)) %
  // m_num_sets );
  return ((addr >> 6) ^ (addr >> 2)) % hsize;
}

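// Grow the dependence hash to the next generation's size and rehash all
// existing entries into the new table. Once MAX_GEN has been reached the
// current table is returned unchanged.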
static kmp_dephash_t *__kmp_dephash_extend(kmp_info_t *thread,
                                           kmp_dephash_t *current_dephash) {
  kmp_dephash_t *h;

  size_t gen = current_dephash->generation + 1;
  if (gen >= MAX_GEN)
    return current_dephash;
  size_t new_size = sizes[gen];

  size_t size_to_allocate =
      new_size * sizeof(kmp_dephash_entry_t *) + sizeof(kmp_dephash_t);

#if USE_FAST_MEMORY
  h = (kmp_dephash_t *)__kmp_fast_allocate(thread, size_to_allocate);
#else
  h = (kmp_dephash_t *)__kmp_thread_malloc(thread, size_to_allocate);
#endif

  h->size = new_size;
  h->nelements = current_dephash->nelements;
  h->buckets = (kmp_dephash_entry **)(h + 1);
  h->generation = gen;
  h->nconflicts = 0;
  // insert existing elements in the new table
  for (size_t i = 0; i < current_dephash->size; i++) {
    kmp_dephash_entry_t *next, *entry;
    for (entry = current_dephash->buckets[i]; entry; entry = next) {
      next = entry->next_in_bucket;
      // Compute the new hash using the new size, and insert the entry in
      // the new bucket.
      size_t new_bucket = __kmp_dephash_hash(entry->addr, h->size);
      entry->next_in_bucket = h->buckets[new_bucket];
      if (entry->next_in_bucket) {
        h->nconflicts++;
      }
      h->buckets[new_bucket] = entry;
    }
  }

  // Free old hash table
#if USE_FAST_MEMORY
  __kmp_fast_free(thread, current_dephash);
#else
  __kmp_thread_free(thread, current_dephash);
#endif

  return h;
}

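// Allocate and initialize the dependence hash for current_task. Implicit
// tasks get a larger initial table than explicit tasks.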
static kmp_dephash_t *__kmp_dephash_create(kmp_info_t *thread,
                                           kmp_taskdata_t *current_task) {
  kmp_dephash_t *h;

  size_t h_size;

  if (current_task->td_flags.tasktype == TASK_IMPLICIT)
    h_size = KMP_DEPHASH_MASTER_SIZE;
  else
    h_size = KMP_DEPHASH_OTHER_SIZE;

  size_t size = h_size * sizeof(kmp_dephash_entry_t *) + sizeof(kmp_dephash_t);

#if USE_FAST_MEMORY
  h = (kmp_dephash_t *)__kmp_fast_allocate(thread, size);
#else
  h = (kmp_dephash_t *)__kmp_thread_malloc(thread, size);
#endif
  h->size = h_size;

  h->generation = 0;
  h->nelements = 0;
  h->nconflicts = 0;
  h->buckets = (kmp_dephash_entry **)(h + 1);

  for (size_t i = 0; i < h_size; i++)
    h->buckets[i] = 0;

  return h;
}

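// Values for entry->last_flag: records whether the list of "in" dependences
// or the list of "mutexinoutset" dependences was updated most recently for
// this address.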
#define ENTRY_LAST_INS 0
#define ENTRY_LAST_MTXS 1

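// Find the hash entry tracking dependences on addr, creating it if needed.
// The table is grown first once the number of recorded conflicts reaches the
// number of buckets.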
static kmp_dephash_entry *__kmp_dephash_find(kmp_info_t *thread,
                                             kmp_dephash_t **hash,
                                             kmp_intptr_t addr) {
  kmp_dephash_t *h = *hash;
  if (h->nelements != 0 && h->nconflicts / h->size >= 1) {
    *hash = __kmp_dephash_extend(thread, h);
    h = *hash;
  }
  size_t bucket = __kmp_dephash_hash(addr, h->size);

  kmp_dephash_entry_t *entry;
  for (entry = h->buckets[bucket]; entry; entry = entry->next_in_bucket)
    if (entry->addr == addr)
      break;

  if (entry == NULL) {
// create entry. This is only done by one thread so no locking required
#if USE_FAST_MEMORY
    entry = (kmp_dephash_entry_t *)__kmp_fast_allocate(
        thread, sizeof(kmp_dephash_entry_t));
#else
    entry = (kmp_dephash_entry_t *)__kmp_thread_malloc(
        thread, sizeof(kmp_dephash_entry_t));
#endif
    entry->addr = addr;
    entry->last_out = NULL;
    entry->last_ins = NULL;
    entry->last_mtxs = NULL;
    entry->last_flag = ENTRY_LAST_INS;
    entry->mtx_lock = NULL;
    entry->next_in_bucket = h->buckets[bucket];
    h->buckets[bucket] = entry;
    h->nelements++;
    if (entry->next_in_bucket)
      h->nconflicts++;
  }
  return entry;
}

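// Prepend node to a dependence node list, taking a new reference on the node.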
static kmp_depnode_list_t *__kmp_add_node(kmp_info_t *thread,
                                          kmp_depnode_list_t *list,
                                          kmp_depnode_t *node) {
  kmp_depnode_list_t *new_head;

#if USE_FAST_MEMORY
  new_head = (kmp_depnode_list_t *)__kmp_fast_allocate(
      thread, sizeof(kmp_depnode_list_t));
#else
  new_head = (kmp_depnode_list_t *)__kmp_thread_malloc(
      thread, sizeof(kmp_depnode_list_t));
#endif

  new_head->node = __kmp_node_ref(node);
  new_head->next = list;

  return new_head;
}

static inline void __kmp_track_dependence(kmp_int32 gtid,
                                          kmp_depnode_t *source,
                                          kmp_depnode_t *sink,
                                          kmp_task_t *sink_task) {
#ifdef KMP_SUPPORT_GRAPH_OUTPUT
  kmp_taskdata_t *task_source = KMP_TASK_TO_TASKDATA(source->dn.task);
  // do not use sink->dn.task as that is only filled after the dependencies
  // are already processed!
  kmp_taskdata_t *task_sink = KMP_TASK_TO_TASKDATA(sink_task);

  __kmp_printf("%d(%s) -> %d(%s)\n", source->dn.id,
               task_source->td_ident->psource, sink->dn.id,
               task_sink->td_ident->psource);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
  /* OMPT tracks dependences between tasks (a=source, b=sink) in which
     task a blocks the execution of b through the
     ompt_callback_task_dependence callback */
  if (ompt_enabled.ompt_callback_task_dependence) {
    kmp_taskdata_t *task_source = KMP_TASK_TO_TASKDATA(source->dn.task);
    ompt_data_t *sink_data;
    if (sink_task)
      sink_data = &(KMP_TASK_TO_TASKDATA(sink_task)->ompt_task_info.task_data);
    else
      sink_data = &__kmp_threads[gtid]->th.ompt_thread_info.task_data;

    ompt_callbacks.ompt_callback(ompt_callback_task_dependence)(
        &(task_source->ompt_task_info.task_data), sink_data);
  }
#endif /* OMPT_SUPPORT && OMPT_OPTIONAL */
}

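// Register node as a successor of every element of plist whose task has not
// yet finished (dn.task is still set). Returns the number of predecessors
// that were actually linked.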
static inline kmp_int32
__kmp_depnode_link_successor(kmp_int32 gtid, kmp_info_t *thread,
                             kmp_task_t *task, kmp_depnode_t *node,
                             kmp_depnode_list_t *plist) {
  if (!plist)
    return 0;
  kmp_int32 npredecessors = 0;
  // link node as successor of list elements
  for (kmp_depnode_list_t *p = plist; p; p = p->next) {
    kmp_depnode_t *dep = p->node;
    if (dep->dn.task) {
      KMP_ACQUIRE_DEPNODE(gtid, dep);
      if (dep->dn.task) {
        __kmp_track_dependence(gtid, dep, node, task);
        dep->dn.successors = __kmp_add_node(thread, dep->dn.successors, node);
        KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to "
                      "%p\n",
                      gtid, KMP_TASK_TO_TASKDATA(dep->dn.task),
                      KMP_TASK_TO_TASKDATA(task)));
        npredecessors++;
      }
      KMP_RELEASE_DEPNODE(gtid, dep);
    }
  }
  return npredecessors;
}

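// Single-predecessor variant: register source as a successor of sink if
// sink's task has not yet finished. Returns the number of predecessors linked
// (0 or 1).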
static inline kmp_int32 __kmp_depnode_link_successor(kmp_int32 gtid,
                                                     kmp_info_t *thread,
                                                     kmp_task_t *task,
                                                     kmp_depnode_t *source,
                                                     kmp_depnode_t *sink) {
  if (!sink)
    return 0;
  kmp_int32 npredecessors = 0;
  if (sink->dn.task) {
    // synchronously add source to sink's list of successors
    KMP_ACQUIRE_DEPNODE(gtid, sink);
    if (sink->dn.task) {
      __kmp_track_dependence(gtid, sink, source, task);
      sink->dn.successors = __kmp_add_node(thread, sink->dn.successors, source);
      KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to "
                    "%p\n",
                    gtid, KMP_TASK_TO_TASKDATA(sink->dn.task),
                    KMP_TASK_TO_TASKDATA(task)));
      npredecessors++;
    }
    KMP_RELEASE_DEPNODE(gtid, sink);
  }
  return npredecessors;
}

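// Process one dependence list (dep_list) for the new task: for each address,
// look up its hash entry and link the new node behind the relevant previous
// "out"/"in"/"mutexinoutset" nodes. When filter is true, entries whose
// base_addr was zeroed out by __kmp_check_deps (duplicates) are skipped.
// Returns the number of predecessors found for the new node.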
template <bool filter>
static inline kmp_int32
__kmp_process_deps(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t **hash,
                   bool dep_barrier, kmp_int32 ndeps,
                   kmp_depend_info_t *dep_list, kmp_task_t *task) {
  KA_TRACE(30, ("__kmp_process_deps<%d>: T#%d processing %d dependencies : "
                "dep_barrier = %d\n",
                filter, gtid, ndeps, dep_barrier));

  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_int32 npredecessors = 0;
  for (kmp_int32 i = 0; i < ndeps; i++) {
    const kmp_depend_info_t *dep = &dep_list[i];

    if (filter && dep->base_addr == 0)
      continue; // skip filtered entries

    kmp_dephash_entry_t *info =
        __kmp_dephash_find(thread, hash, dep->base_addr);
    kmp_depnode_t *last_out = info->last_out;
    kmp_depnode_list_t *last_ins = info->last_ins;
    kmp_depnode_list_t *last_mtxs = info->last_mtxs;

    if (dep->flags.out) { // out --> clean lists of ins and mtxs if any
      if (last_ins || last_mtxs) {
        if (info->last_flag == ENTRY_LAST_INS) { // INS were last
          npredecessors +=
              __kmp_depnode_link_successor(gtid, thread, task, node, last_ins);
        } else { // MTXS were last
          npredecessors +=
              __kmp_depnode_link_successor(gtid, thread, task, node, last_mtxs);
        }
        __kmp_depnode_list_free(thread, last_ins);
        __kmp_depnode_list_free(thread, last_mtxs);
        info->last_ins = NULL;
        info->last_mtxs = NULL;
      } else {
        npredecessors +=
            __kmp_depnode_link_successor(gtid, thread, task, node, last_out);
      }
      __kmp_node_deref(thread, last_out);
      if (dep_barrier) {
        // if this is a sync point in the serial sequence, then the previous
        // outputs are guaranteed to be completed after the execution of this
        // task so the previous output nodes can be cleared.
        info->last_out = NULL;
      } else {
        info->last_out = __kmp_node_ref(node);
      }
    } else if (dep->flags.in) {
      // in --> link node to either last_out or last_mtxs, clean earlier deps
      if (last_mtxs) {
        npredecessors +=
            __kmp_depnode_link_successor(gtid, thread, task, node, last_mtxs);
        __kmp_node_deref(thread, last_out);
        info->last_out = NULL;
        if (info->last_flag == ENTRY_LAST_MTXS && last_ins) { // MTXS were last
          // clean old INS before creating new list
          __kmp_depnode_list_free(thread, last_ins);
          info->last_ins = NULL;
        }
      } else {
        // link node as successor of the last_out if any
        npredecessors +=
            __kmp_depnode_link_successor(gtid, thread, task, node, last_out);
      }
      info->last_flag = ENTRY_LAST_INS;
      info->last_ins = __kmp_add_node(thread, info->last_ins, node);
    } else {
      KMP_DEBUG_ASSERT(dep->flags.mtx == 1);
      // mtx --> link node to either last_out or last_ins, clean earlier deps
      if (last_ins) {
        npredecessors +=
            __kmp_depnode_link_successor(gtid, thread, task, node, last_ins);
        __kmp_node_deref(thread, last_out);
        info->last_out = NULL;
        if (info->last_flag == ENTRY_LAST_INS && last_mtxs) { // INS were last
          // clean old MTXS before creating new list
          __kmp_depnode_list_free(thread, last_mtxs);
          info->last_mtxs = NULL;
        }
      } else {
        // link node as successor of the last_out if any
        npredecessors +=
            __kmp_depnode_link_successor(gtid, thread, task, node, last_out);
      }
      info->last_flag = ENTRY_LAST_MTXS;
      info->last_mtxs = __kmp_add_node(thread, info->last_mtxs, node);
      if (info->mtx_lock == NULL) {
        info->mtx_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
        __kmp_init_lock(info->mtx_lock);
      }
      KMP_DEBUG_ASSERT(node->dn.mtx_num_locks < MAX_MTX_DEPS);
      kmp_int32 m;
      // Save lock in node's array
      for (m = 0; m < MAX_MTX_DEPS; ++m) {
        // sort pointers in decreasing order to avoid potential livelock
        if (node->dn.mtx_locks[m] < info->mtx_lock) {
          KMP_DEBUG_ASSERT(node->dn.mtx_locks[node->dn.mtx_num_locks] == NULL);
          for (int n = node->dn.mtx_num_locks; n > m; --n) {
            // shift right all lesser non-NULL pointers
            KMP_DEBUG_ASSERT(node->dn.mtx_locks[n - 1] != NULL);
            node->dn.mtx_locks[n] = node->dn.mtx_locks[n - 1];
          }
          node->dn.mtx_locks[m] = info->mtx_lock;
          break;
        }
      }
      KMP_DEBUG_ASSERT(m < MAX_MTX_DEPS); // must break from loop
      node->dn.mtx_num_locks++;
    }
  }
  KA_TRACE(30, ("__kmp_process_deps<%d>: T#%d found %d predecessors\n", filter,
                gtid, npredecessors));
  return npredecessors;
}

#define NO_DEP_BARRIER (false)
#define DEP_BARRIER (true)

// returns true if the task has any outstanding dependence
static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node,
                             kmp_task_t *task, kmp_dephash_t **hash,
                             bool dep_barrier, kmp_int32 ndeps,
                             kmp_depend_info_t *dep_list,
                             kmp_int32 ndeps_noalias,
                             kmp_depend_info_t *noalias_dep_list) {
  int i, n_mtxs = 0;
#if KMP_DEBUG
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
#endif
  KA_TRACE(20, ("__kmp_check_deps: T#%d checking dependencies for task %p : %d "
                "possibly aliased dependencies, %d non-aliased dependencies : "
                "dep_barrier=%d .\n",
                gtid, taskdata, ndeps, ndeps_noalias, dep_barrier));

  // Filter deps in dep_list
  // TODO: Different algorithm for large dep_list ( > 10 ? )
  for (i = 0; i < ndeps; i++) {
    if (dep_list[i].base_addr != 0) {
      for (int j = i + 1; j < ndeps; j++) {
        if (dep_list[i].base_addr == dep_list[j].base_addr) {
          dep_list[i].flags.in |= dep_list[j].flags.in;
          dep_list[i].flags.out |=
              (dep_list[j].flags.out ||
               (dep_list[i].flags.in && dep_list[j].flags.mtx) ||
               (dep_list[i].flags.mtx && dep_list[j].flags.in));
          dep_list[i].flags.mtx =
              (dep_list[i].flags.mtx | dep_list[j].flags.mtx) &&
              !dep_list[i].flags.out;
          dep_list[j].base_addr = 0; // Mark j element as void
        }
      }
      if (dep_list[i].flags.mtx) {
        // limit number of mtx deps to MAX_MTX_DEPS per node
        if (n_mtxs < MAX_MTX_DEPS && task != NULL) {
          ++n_mtxs;
        } else {
          dep_list[i].flags.in = 1; // downgrade mutexinoutset to inout
          dep_list[i].flags.out = 1;
          dep_list[i].flags.mtx = 0;
        }
      }
    }
  }

  // This doesn't need to be atomic as no other thread is going to access this
  // node just yet.
  // npredecessors is set to -1 to ensure that none of the releasing tasks
  // queues this task before we have finished processing all the dependencies.
  node->dn.npredecessors = -1;

  // used to pack all npredecessors additions into a single atomic operation at
  // the end
  int npredecessors;

  npredecessors = __kmp_process_deps<true>(gtid, node, hash, dep_barrier, ndeps,
                                           dep_list, task);
  npredecessors += __kmp_process_deps<false>(
      gtid, node, hash, dep_barrier, ndeps_noalias, noalias_dep_list, task);

  node->dn.task = task;
  KMP_MB();

  // Account for our initial fake value
  npredecessors++;

  // Update predecessors and obtain current value to check if there are still
  // any outstanding dependences (some tasks may have finished while we
  // processed the dependences)
  npredecessors =
      node->dn.npredecessors.fetch_add(npredecessors) + npredecessors;

  KA_TRACE(20, ("__kmp_check_deps: T#%d found %d predecessors for task %p \n",
                gtid, npredecessors, taskdata));

  // beyond this point the task could be queued (and executed) by a releasing
  // task...
  return npredecessors > 0 ? true : false;
}

/*!
@ingroup TASKING
@param loc_ref location of the original task directive
@param gtid Global Thread ID of encountering thread
@param new_task task thunk allocated by __kmp_omp_task_alloc() for the ''new
task''
@param ndeps Number of depend items with possible aliasing
@param dep_list List of depend items with possible aliasing
@param ndeps_noalias Number of depend items with no aliasing
@param noalias_dep_list List of depend items with no aliasing

@return Returns either TASK_CURRENT_NOT_QUEUED if the current task was not
suspended and queued, or TASK_CURRENT_QUEUED if it was suspended and queued

Schedule a non-thread-switchable task with dependences for execution
*/
kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
                                    kmp_task_t *new_task, kmp_int32 ndeps,
                                    kmp_depend_info_t *dep_list,
                                    kmp_int32 ndeps_noalias,
                                    kmp_depend_info_t *noalias_dep_list) {

  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
  KA_TRACE(10, ("__kmpc_omp_task_with_deps(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, new_taskdata));
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *current_task = thread->th.th_current_task;

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
    if (!current_task->ompt_task_info.frame.enter_frame.ptr)
      current_task->ompt_task_info.frame.enter_frame.ptr =
          OMPT_GET_FRAME_ADDRESS(0);
    if (ompt_enabled.ompt_callback_task_create) {
      ompt_data_t task_data = ompt_data_none;
      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
          current_task ? &(current_task->ompt_task_info.task_data) : &task_data,
          current_task ? &(current_task->ompt_task_info.frame) : NULL,
          &(new_taskdata->ompt_task_info.task_data),
          ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 1,
          OMPT_LOAD_OR_GET_RETURN_ADDRESS(gtid));
    }

    new_taskdata->ompt_task_info.frame.enter_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);
  }

#if OMPT_OPTIONAL
  /* OMPT grab all dependences if requested by the tool */
  if (ndeps + ndeps_noalias > 0 && ompt_enabled.ompt_callback_dependences) {
    kmp_int32 i;

    int ompt_ndeps = ndeps + ndeps_noalias;
    ompt_dependence_t *ompt_deps = (ompt_dependence_t *)KMP_OMPT_DEPS_ALLOC(
        thread, (ndeps + ndeps_noalias) * sizeof(ompt_dependence_t));

    KMP_ASSERT(ompt_deps != NULL);

    for (i = 0; i < ndeps; i++) {
      ompt_deps[i].variable.ptr = (void *)dep_list[i].base_addr;
      if (dep_list[i].flags.in && dep_list[i].flags.out)
        ompt_deps[i].dependence_type = ompt_dependence_type_inout;
      else if (dep_list[i].flags.out)
        ompt_deps[i].dependence_type = ompt_dependence_type_out;
      else if (dep_list[i].flags.in)
        ompt_deps[i].dependence_type = ompt_dependence_type_in;
      else if (dep_list[i].flags.mtx)
        ompt_deps[i].dependence_type = ompt_dependence_type_mutexinoutset;
    }
    for (i = 0; i < ndeps_noalias; i++) {
      ompt_deps[ndeps + i].variable.ptr = (void *)noalias_dep_list[i].base_addr;
      if (noalias_dep_list[i].flags.in && noalias_dep_list[i].flags.out)
        ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_inout;
      else if (noalias_dep_list[i].flags.out)
        ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_out;
      else if (noalias_dep_list[i].flags.in)
        ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_in;
      else if (noalias_dep_list[i].flags.mtx)
        ompt_deps[ndeps + i].dependence_type =
            ompt_dependence_type_mutexinoutset;
    }
    ompt_callbacks.ompt_callback(ompt_callback_dependences)(
        &(new_taskdata->ompt_task_info.task_data), ompt_deps, ompt_ndeps);
    /* We can now free the allocated memory for the dependencies */
    /* For OMPD we might want to delay the free until end of this function */
    KMP_OMPT_DEPS_FREE(thread, ompt_deps);
  }
#endif /* OMPT_OPTIONAL */
#endif /* OMPT_SUPPORT */

  bool serial = current_task->td_flags.team_serial ||
                current_task->td_flags.tasking_ser ||
                current_task->td_flags.final;
  kmp_task_team_t *task_team = thread->th.th_task_team;
  serial = serial && !(task_team && task_team->tt.tt_found_proxy_tasks);

  if (!serial && (ndeps > 0 || ndeps_noalias > 0)) {
    /* if no dependencies have been tracked yet, create the dependence hash */
    if (current_task->td_dephash == NULL)
      current_task->td_dephash = __kmp_dephash_create(thread, current_task);

#if USE_FAST_MEMORY
    kmp_depnode_t *node =
        (kmp_depnode_t *)__kmp_fast_allocate(thread, sizeof(kmp_depnode_t));
#else
    kmp_depnode_t *node =
        (kmp_depnode_t *)__kmp_thread_malloc(thread, sizeof(kmp_depnode_t));
#endif

    __kmp_init_node(node);
    new_taskdata->td_depnode = node;

    if (__kmp_check_deps(gtid, node, new_task, &current_task->td_dephash,
                         NO_DEP_BARRIER, ndeps, dep_list, ndeps_noalias,
                         noalias_dep_list)) {
      KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d task had blocking "
                    "dependencies: "
                    "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
                    gtid, loc_ref, new_taskdata));
#if OMPT_SUPPORT
      if (ompt_enabled.enabled) {
        current_task->ompt_task_info.frame.enter_frame = ompt_data_none;
      }
#endif
      return TASK_CURRENT_NOT_QUEUED;
    }
  } else {
    KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d ignored dependencies "
                  "for task (serialized) "
                  "loc=%p task=%p\n",
                  gtid, loc_ref, new_taskdata));
  }

  KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d task had no blocking "
                "dependencies : "
                "loc=%p task=%p, transferring to __kmp_omp_task\n",
                gtid, loc_ref, new_taskdata));

  kmp_int32 ret = __kmp_omp_task(gtid, new_task, true);
#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
    current_task->ompt_task_info.frame.enter_frame = ompt_data_none;
  }
#endif
  return ret;
}

#if OMPT_SUPPORT
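// Report the schedule and completion of the pseudo-task that represents a
// taskwait with a depend clause to OMPT, then reset its task data.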
void __ompt_taskwait_dep_finish(kmp_taskdata_t *current_task,
                                ompt_data_t *taskwait_task_data) {
  if (ompt_enabled.ompt_callback_task_schedule) {
    ompt_data_t task_data = ompt_data_none;
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        current_task ? &(current_task->ompt_task_info.task_data) : &task_data,
        ompt_task_switch, taskwait_task_data);
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        taskwait_task_data, ompt_task_complete,
        current_task ? &(current_task->ompt_task_info.task_data) : &task_data);
  }
  current_task->ompt_task_info.frame.enter_frame.ptr = NULL;
  *taskwait_task_data = ompt_data_none;
}
#endif /* OMPT_SUPPORT */

/*!
@ingroup TASKING
@param loc_ref location of the original task directive
@param gtid Global Thread ID of encountering thread
@param ndeps Number of depend items with possible aliasing
@param dep_list List of depend items with possible aliasing
@param ndeps_noalias Number of depend items with no aliasing
@param noalias_dep_list List of depend items with no aliasing

Blocks the current task until all specified dependencies have been fulfilled.
*/
void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps,
                          kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias,
                          kmp_depend_info_t *noalias_dep_list) {
  KA_TRACE(10, ("__kmpc_omp_wait_deps(enter): T#%d loc=%p\n", gtid, loc_ref));

  if (ndeps == 0 && ndeps_noalias == 0) {
    KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no dependencies to "
                  "wait upon : loc=%p\n",
                  gtid, loc_ref));
    return;
  }
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *current_task = thread->th.th_current_task;

#if OMPT_SUPPORT
  // this function represents a taskwait construct with depend clause
  // We signal 4 events:
  //  - creation of the taskwait task
  //  - dependences of the taskwait task
  //  - schedule and finish of the taskwait task
  ompt_data_t *taskwait_task_data = &thread->th.ompt_thread_info.task_data;
  KMP_ASSERT(taskwait_task_data->ptr == NULL);
  if (ompt_enabled.enabled) {
    if (!current_task->ompt_task_info.frame.enter_frame.ptr)
      current_task->ompt_task_info.frame.enter_frame.ptr =
          OMPT_GET_FRAME_ADDRESS(0);
    if (ompt_enabled.ompt_callback_task_create) {
      ompt_data_t task_data = ompt_data_none;
      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
          current_task ? &(current_task->ompt_task_info.task_data) : &task_data,
          current_task ? &(current_task->ompt_task_info.frame) : NULL,
          taskwait_task_data,
          ompt_task_explicit | ompt_task_undeferred | ompt_task_mergeable, 1,
          OMPT_LOAD_OR_GET_RETURN_ADDRESS(gtid));
    }
  }

#if OMPT_OPTIONAL
  /* OMPT grab all dependences if requested by the tool */
  if (ndeps + ndeps_noalias > 0 && ompt_enabled.ompt_callback_dependences) {
    kmp_int32 i;

    int ompt_ndeps = ndeps + ndeps_noalias;
    ompt_dependence_t *ompt_deps = (ompt_dependence_t *)KMP_OMPT_DEPS_ALLOC(
        thread, (ndeps + ndeps_noalias) * sizeof(ompt_dependence_t));

    KMP_ASSERT(ompt_deps != NULL);

    for (i = 0; i < ndeps; i++) {
      ompt_deps[i].variable.ptr = (void *)dep_list[i].base_addr;
      if (dep_list[i].flags.in && dep_list[i].flags.out)
        ompt_deps[i].dependence_type = ompt_dependence_type_inout;
      else if (dep_list[i].flags.out)
        ompt_deps[i].dependence_type = ompt_dependence_type_out;
      else if (dep_list[i].flags.in)
        ompt_deps[i].dependence_type = ompt_dependence_type_in;
      else if (dep_list[i].flags.mtx)
        ompt_deps[i].dependence_type = ompt_dependence_type_mutexinoutset;
    }
    for (i = 0; i < ndeps_noalias; i++) {
      ompt_deps[ndeps + i].variable.ptr = (void *)noalias_dep_list[i].base_addr;
      if (noalias_dep_list[i].flags.in && noalias_dep_list[i].flags.out)
        ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_inout;
      else if (noalias_dep_list[i].flags.out)
        ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_out;
      else if (noalias_dep_list[i].flags.in)
        ompt_deps[ndeps + i].dependence_type = ompt_dependence_type_in;
      else if (noalias_dep_list[i].flags.mtx)
        ompt_deps[ndeps + i].dependence_type =
            ompt_dependence_type_mutexinoutset;
    }
    ompt_callbacks.ompt_callback(ompt_callback_dependences)(
        taskwait_task_data, ompt_deps, ompt_ndeps);
    /* We can now free the allocated memory for the dependencies */
    /* For OMPD we might want to delay the free until end of this function */
    KMP_OMPT_DEPS_FREE(thread, ompt_deps);
    ompt_deps = NULL;
  }
#endif /* OMPT_OPTIONAL */
#endif /* OMPT_SUPPORT */

  // We can return immediately as:
  // - dependences are not computed in serial teams (except with proxy tasks)
  // - if the dephash is not yet created it means we have nothing to wait for
  bool ignore = current_task->td_flags.team_serial ||
                current_task->td_flags.tasking_ser ||
                current_task->td_flags.final;
  ignore = ignore && thread->th.th_task_team != NULL &&
           thread->th.th_task_team->tt.tt_found_proxy_tasks == FALSE;
  ignore = ignore || current_task->td_dephash == NULL;

  if (ignore) {
    KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no blocking "
                  "dependencies : loc=%p\n",
                  gtid, loc_ref));
#if OMPT_SUPPORT
    __ompt_taskwait_dep_finish(current_task, taskwait_task_data);
#endif /* OMPT_SUPPORT */
    return;
  }

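  // Use a stack-allocated dependence node to stand in for the (non-existent)
  // task of this taskwait: register it against the current dependences, then
  // execute other tasks until all of its predecessors have completed.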
  kmp_depnode_t node = {0};
  __kmp_init_node(&node);
  // the stack owns the node
  __kmp_node_ref(&node);

  if (!__kmp_check_deps(gtid, &node, NULL, &current_task->td_dephash,
                        DEP_BARRIER, ndeps, dep_list, ndeps_noalias,
                        noalias_dep_list)) {
    KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no blocking "
                  "dependencies : loc=%p\n",
                  gtid, loc_ref));
#if OMPT_SUPPORT
    __ompt_taskwait_dep_finish(current_task, taskwait_task_data);
#endif /* OMPT_SUPPORT */
    return;
  }

  int thread_finished = FALSE;
  kmp_flag_32<false, false> flag(
      (std::atomic<kmp_uint32> *)&node.dn.npredecessors, 0U);
  while (node.dn.npredecessors > 0) {
    flag.execute_tasks(thread, gtid, FALSE,
                       &thread_finished USE_ITT_BUILD_ARG(NULL),
                       __kmp_task_stealing_constraint);
  }

#if OMPT_SUPPORT
  __ompt_taskwait_dep_finish(current_task, taskwait_task_data);
#endif /* OMPT_SUPPORT */
  KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d finished waiting : loc=%p\n",
                gtid, loc_ref));
}