/*
 * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_wait_release.h"
#include "kmp_taskdeps.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

/* forward declaration */
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
                                 kmp_info_t *this_thr);
static void __kmp_alloc_task_deque(kmp_info_t *thread,
                                   kmp_thread_data_t *thread_data);
static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
                                           kmp_task_team_t *task_team);
static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);

#ifdef BUILD_TIED_TASK_STACK

// __kmp_trace_task_stack: print the tied tasks from the task stack in order
// from top to bottom
//
// gtid: global thread identifier for thread containing stack
// thread_data: thread data for task team thread containing stack
// threshold: value above which the trace statement triggers
// location: string identifying call site of this function (for trace)
static void __kmp_trace_task_stack(kmp_int32 gtid,
                                   kmp_thread_data_t *thread_data,
                                   int threshold, char *location) {
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_taskdata_t **stack_top = task_stack->ts_top;
  kmp_int32 entries = task_stack->ts_entries;
  kmp_taskdata_t *tied_task;

  KA_TRACE(
      threshold,
      ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
       "first_block = %p, stack_top = %p \n",
       location, gtid, entries, task_stack->ts_first_block, stack_top));

  KMP_DEBUG_ASSERT(stack_top != NULL);
  KMP_DEBUG_ASSERT(entries > 0);

  while (entries != 0) {
    KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
    // fix up ts_top if we need to pop from previous block
    if ((entries & TASK_STACK_INDEX_MASK) == 0) {
      kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);

      stack_block = stack_block->sb_prev;
      stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
    }

    // finish bookkeeping
    stack_top--;
    entries--;

    tied_task = *stack_top;

    KMP_DEBUG_ASSERT(tied_task != NULL);
    KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);

    KA_TRACE(threshold,
             ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, "
              "stack_top=%p, tied_task=%p\n",
              location, gtid, entries, stack_top, tied_task));
  }
  KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);

  KA_TRACE(threshold,
           ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
            location, gtid));
}
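// The tied-task stack used above and below is a linked list of fixed-size
// blocks (kmp_stack_block_t), each holding TASK_STACK_BLOCK_SIZE
// kmp_taskdata_t* entries and chained through sb_next/sb_prev. ts_top points
// at the next free slot and ts_entries counts all stored entries, so
// ts_entries & TASK_STACK_INDEX_MASK (presumably TASK_STACK_BLOCK_SIZE - 1)
// gives the index within the current block; an index of zero means a block
// boundary was reached and ts_top has to be moved to the adjacent block, as
// the push/pop routines below do.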
// __kmp_init_task_stack: initialize the task stack for the first time
// after a thread_data structure is created.
// It should not be necessary to do this again (assuming the stack works).
//
// gtid: global thread identifier of calling thread
// thread_data: thread data for task team thread containing stack
static void __kmp_init_task_stack(kmp_int32 gtid,
                                  kmp_thread_data_t *thread_data) {
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_stack_block_t *first_block;

  // set up the first block of the stack
  first_block = &task_stack->ts_first_block;
  task_stack->ts_top = (kmp_taskdata_t **)first_block;
  memset((void *)first_block, '\0',
         TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));

  // initialize the stack to be empty
  task_stack->ts_entries = TASK_STACK_EMPTY;
  first_block->sb_next = NULL;
  first_block->sb_prev = NULL;
}

// __kmp_free_task_stack: free the task stack when thread_data is destroyed.
//
// gtid: global thread identifier for calling thread
// thread_data: thread info for thread containing stack
static void __kmp_free_task_stack(kmp_int32 gtid,
                                  kmp_thread_data_t *thread_data) {
  kmp_info_t *thread = __kmp_threads[gtid]; // owner of the extra blocks
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_stack_block_t *stack_block = &task_stack->ts_first_block;

  KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
  // free from the second block of the stack
  while (stack_block != NULL) {
    kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;

    stack_block->sb_next = NULL;
    stack_block->sb_prev = NULL;
    if (stack_block != &task_stack->ts_first_block) {
      __kmp_thread_free(thread,
                        stack_block); // free the block, if not the first
    }
    stack_block = next_block;
  }
  // initialize the stack to be empty
  task_stack->ts_entries = 0;
  task_stack->ts_top = NULL;
}
// __kmp_push_task_stack: Push the tied task onto the task stack.
// Grow the stack if necessary by allocating another block.
//
// gtid: global thread identifier for calling thread
// thread: thread info for thread containing stack
// tied_task: the task to push on the stack
static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
                                  kmp_taskdata_t *tied_task) {
  // GEH - need to consider what to do if tt_threads_data not allocated yet
  kmp_thread_data_t *thread_data =
      &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;

  if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
    return; // Don't push anything on stack if team or team tasks are serialized
  }

  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);

  KA_TRACE(20,
           ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
            gtid, thread, tied_task));
  // Store entry
  *(task_stack->ts_top) = tied_task;

  // Do bookkeeping for next push
  task_stack->ts_top++;
  task_stack->ts_entries++;

  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
    // Find beginning of this task block
    kmp_stack_block_t *stack_block =
        (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);

    // Check if we already have a block
    if (stack_block->sb_next !=
        NULL) { // reset ts_top to beginning of next block
      task_stack->ts_top = &stack_block->sb_next->sb_block[0];
    } else { // Alloc new block and link it up
      kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
          thread, sizeof(kmp_stack_block_t));

      task_stack->ts_top = &new_block->sb_block[0];
      stack_block->sb_next = new_block;
      new_block->sb_prev = stack_block;
      new_block->sb_next = NULL;

      KA_TRACE(
          30,
          ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
           gtid, tied_task, new_block));
    }
  }
  KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
                tied_task));
}
// __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return
// the task, just check to make sure it matches the ending task passed in.
//
// gtid: global thread identifier for the calling thread
// thread: thread info structure containing stack
// tied_task: the task popped off the stack
// ending_task: the task that is ending (should match popped task)
static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
                                 kmp_taskdata_t *ending_task) {
  // GEH - need to consider what to do if tt_threads_data not allocated yet
  kmp_thread_data_t *thread_data =
      &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_taskdata_t *tied_task;

  if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
    // Don't pop anything from stack if team or team tasks are serialized
    return;
  }

  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
  KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);

  KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
                thread));

  // fix up ts_top if we need to pop from previous block
  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
    kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);

    stack_block = stack_block->sb_prev;
    task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
  }

  // finish bookkeeping
  task_stack->ts_top--;
  task_stack->ts_entries--;

  tied_task = *(task_stack->ts_top);

  KMP_DEBUG_ASSERT(tied_task != NULL);
  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
  KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly

  KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
                tied_task));
  return;
}
#endif /* BUILD_TIED_TASK_STACK */
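// A sketch of the Task Scheduling Constraint check that follows (a reading
// aid, not additional logic): with is_constrained set, a new tied task
// tasknew may only run on this thread if it descends from
// taskcurr->td_last_tied. The loop walks tasknew's td_parent chain while the
// ancestor's td_level is still greater than td_last_tied's level and rejects
// the task if the walk does not end at td_last_tied itself.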
// returns 1 if new task is allowed to execute, 0 otherwise
// checks Task Scheduling constraint (if requested) and
// mutexinoutset dependencies if any
static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
                                  const kmp_taskdata_t *tasknew,
                                  const kmp_taskdata_t *taskcurr) {
  if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
    // Check if the candidate obeys the Task Scheduling Constraints (TSC)
    // only descendant of all deferred tied tasks can be scheduled, checking
    // the last one is enough, as it in turn is the descendant of all others
    kmp_taskdata_t *current = taskcurr->td_last_tied;
    KMP_DEBUG_ASSERT(current != NULL);
    // check if the task is not suspended on barrier
    if (current->td_flags.tasktype == TASK_EXPLICIT ||
        current->td_taskwait_thread > 0) { // <= 0 on barrier
      kmp_int32 level = current->td_level;
      kmp_taskdata_t *parent = tasknew->td_parent;
      while (parent != current && parent->td_level > level) {
        // check generation up to the level of the current task
        parent = parent->td_parent;
        KMP_DEBUG_ASSERT(parent != NULL);
      }
      if (parent != current)
        return false;
    }
  }
  // Check mutexinoutset dependencies, acquire locks
  kmp_depnode_t *node = tasknew->td_depnode;
  if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
    for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
      KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
      if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
        continue;
      // could not get the lock, release previous locks
      for (int j = i - 1; j >= 0; --j)
        __kmp_release_lock(node->dn.mtx_locks[j], gtid);
      return false;
    }
    // negative num_locks means all locks acquired successfully
    node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
  }
  return true;
}

// __kmp_realloc_task_deque:
// Re-allocates a task deque for a particular thread, copies the content from
// the old deque and adjusts the necessary data structures relating to the
// deque. This operation must be done with the deque_lock being held
static void __kmp_realloc_task_deque(kmp_info_t *thread,
                                     kmp_thread_data_t *thread_data) {
  kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
  kmp_int32 new_size = 2 * size;

  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
                "%d] for thread_data %p\n",
                __kmp_gtid_from_thread(thread), size, new_size, thread_data));

  kmp_taskdata_t **new_deque =
      (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));

  int i, j;
  for (i = thread_data->td.td_deque_head, j = 0; j < size;
       i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
    new_deque[j] = thread_data->td.td_deque[i];

  __kmp_free(thread_data->td.td_deque);

  thread_data->td.td_deque_head = 0;
  thread_data->td.td_deque_tail = size;
  thread_data->td.td_deque = new_deque;
  thread_data->td.td_deque_size = new_size;
}

static kmp_task_pri_t *__kmp_alloc_task_pri_list() {
  kmp_task_pri_t *l = (kmp_task_pri_t *)__kmp_allocate(sizeof(kmp_task_pri_t));
  kmp_thread_data_t *thread_data = &l->td;
  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
  thread_data->td.td_deque_last_stolen = -1;
  KE_TRACE(20, ("__kmp_alloc_task_pri_list: T#%d allocating deque[%d] "
                "for thread_data %p\n",
                __kmp_get_gtid(), INITIAL_TASK_DEQUE_SIZE, thread_data));
  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
      INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
  return l;
}
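// Shape of the shared priority-task list used below (an illustration, with
// invented priority values): task_team->tt.tt_task_pri_list is a singly
// linked list of kmp_task_pri_t nodes sorted by descending priority, e.g.
// pri=9 -> pri=4 -> pri=1, each node owning one deque that all threads of the
// team access under its td_deque_lock. Lookup is linear, which assumes the
// number of distinct priority values in a program stays small.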
// The function finds the deque of priority tasks with given priority, or
// allocates a new deque and puts it into the sorted (high -> low) list of
// deques. Deques of non-default priority tasks are shared between all threads
// in team, as opposed to per-thread deques of tasks with default priority.
// The function is called under the lock task_team->tt.tt_task_pri_lock.
static kmp_thread_data_t *
__kmp_get_priority_deque_data(kmp_task_team_t *task_team, kmp_int32 pri) {
  kmp_thread_data_t *thread_data;
  kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
  if (lst->priority == pri) {
    // Found queue of tasks with given priority.
    thread_data = &lst->td;
  } else if (lst->priority < pri) {
    // All current priority queues contain tasks with lower priority.
    // Allocate new one for given priority tasks.
    kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
    thread_data = &list->td;
    list->priority = pri;
    list->next = lst;
    task_team->tt.tt_task_pri_list = list;
  } else { // task_team->tt.tt_task_pri_list->priority > pri
    kmp_task_pri_t *next_queue = lst->next;
    while (next_queue && next_queue->priority > pri) {
      lst = next_queue;
      next_queue = lst->next;
    }
    // lst->priority > pri && (next == NULL || pri >= next->priority)
    if (next_queue == NULL) {
      // No queue with pri priority, need to allocate new one.
      kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
      thread_data = &list->td;
      list->priority = pri;
      list->next = NULL;
      lst->next = list;
    } else if (next_queue->priority == pri) {
      // Found queue of tasks with given priority.
      thread_data = &next_queue->td;
    } else { // lst->priority > pri > next->priority
      // insert newly allocated between existing queues
      kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
      thread_data = &list->td;
      list->priority = pri;
      list->next = next_queue;
      lst->next = list;
    }
  }
  return thread_data;
}

// __kmp_push_priority_task: Add a task to the team's priority task deque
static kmp_int32 __kmp_push_priority_task(kmp_int32 gtid, kmp_info_t *thread,
                                          kmp_taskdata_t *taskdata,
                                          kmp_task_team_t *task_team,
                                          kmp_int32 pri) {
  kmp_thread_data_t *thread_data = NULL;
  KA_TRACE(20,
           ("__kmp_push_priority_task: T#%d trying to push task %p, pri %d.\n",
            gtid, taskdata, pri));

  // Find task queue specific to priority value
  kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
  if (UNLIKELY(lst == NULL)) {
    __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
    if (task_team->tt.tt_task_pri_list == NULL) {
      // List of queues is still empty, allocate one.
      kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
      thread_data = &list->td;
      list->priority = pri;
      list->next = NULL;
      task_team->tt.tt_task_pri_list = list;
    } else {
      // Another thread initialized a queue. Check if it fits and get
      // thread_data.
      thread_data = __kmp_get_priority_deque_data(task_team, pri);
    }
    __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
  } else {
    if (lst->priority == pri) {
      // Found queue of tasks with given priority.
      thread_data = &lst->td;
    } else {
      __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
      thread_data = __kmp_get_priority_deque_data(task_team, pri);
      __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
    }
  }
  KMP_DEBUG_ASSERT(thread_data);

  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
  // Check if deque is full
  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    if (__kmp_enable_task_throttling &&
        __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                              thread->th.th_current_task)) {
      __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
      KA_TRACE(20, ("__kmp_push_priority_task: T#%d deque is full; returning "
                    "TASK_NOT_PUSHED for task %p\n",
                    gtid, taskdata));
      return TASK_NOT_PUSHED;
    } else {
      // expand deque to push the task which is not allowed to execute
      __kmp_realloc_task_deque(thread, thread_data);
    }
  }
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
                   TASK_DEQUE_SIZE(thread_data->td));
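  // The deque is a power-of-two ring buffer: head/tail are advanced with
  // "& TASK_DEQUE_MASK(...)" (presumably the deque size minus one), so indices
  // wrap without a division, and td_deque_ntasks carries the element count
  // because head == tail alone cannot distinguish empty from full.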
  // Push taskdata.
  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
  // Wrap index.
  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
  KMP_FSYNC_RELEASING(taskdata); // releasing child
  KA_TRACE(20, ("__kmp_push_priority_task: T#%d returning "
                "TASK_SUCCESSFULLY_PUSHED: task=%p ntasks=%d head=%u tail=%u\n",
                gtid, taskdata, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
  task_team->tt.tt_num_task_pri++; // atomic inc
  return TASK_SUCCESSFULLY_PUSHED;
}

// __kmp_push_task: Add a task to the thread's deque
static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);

  // If we encounter a hidden helper task, and the current thread is not a
  // hidden helper thread, we have to give the task to any hidden helper thread
  // starting from its shadow one.
  if (UNLIKELY(taskdata->td_flags.hidden_helper &&
               !KMP_HIDDEN_HELPER_THREAD(gtid))) {
    kmp_int32 shadow_gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
    __kmpc_give_task(task, __kmp_tid_from_gtid(shadow_gtid));
    // Signal the hidden helper threads.
    __kmp_hidden_helper_worker_thread_signal();
    return TASK_SUCCESSFULLY_PUSHED;
  }

  kmp_task_team_t *task_team = thread->th.th_task_team;
  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
  kmp_thread_data_t *thread_data;

  KA_TRACE(20,
           ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));

  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(
        20,
        ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
         gtid, counter, taskdata));
  }

  // The first check avoids building task_team thread data if serialized
  if (UNLIKELY(taskdata->td_flags.task_serial)) {
    KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
                  "TASK_NOT_PUSHED for task %p\n",
                  gtid, taskdata));
    return TASK_NOT_PUSHED;
  }

  // Now that serialized tasks have returned, we can assume that we are not in
  // immediate exec mode
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  if (UNLIKELY(!KMP_TASKING_ENABLED(task_team))) {
    __kmp_enable_tasking(task_team, thread);
  }
  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);

  if (taskdata->td_flags.priority_specified && task->data2.priority > 0 &&
      __kmp_max_task_priority > 0) {
    int pri = KMP_MIN(task->data2.priority, __kmp_max_task_priority);
    return __kmp_push_priority_task(gtid, thread, taskdata, task_team, pri);
  }

  // Find tasking deque specific to encountering thread
  thread_data = &task_team->tt.tt_threads_data[tid];

  // No lock needed since only owner can allocate. If the task is hidden_helper,
  // we don't need it either because we have initialized the deque for hidden
  // helper thread data.
  if (UNLIKELY(thread_data->td.td_deque == NULL)) {
    __kmp_alloc_task_deque(thread, thread_data);
  }

  int locked = 0;
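  // Task throttling, summarized (no extra logic here): when the deque is full
  // and __kmp_enable_task_throttling is on, a task that the Task Scheduling
  // Constraint allows to run is reported back as TASK_NOT_PUSHED so the caller
  // executes it immediately, bounding deque growth; a task that may not run
  // yet instead forces __kmp_realloc_task_deque to double the deque.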
  // Check if deque is full
  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    if (__kmp_enable_task_throttling &&
        __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                              thread->th.th_current_task)) {
      KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
                    "TASK_NOT_PUSHED for task %p\n",
                    gtid, taskdata));
      return TASK_NOT_PUSHED;
    } else {
      __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
      locked = 1;
      if (TCR_4(thread_data->td.td_deque_ntasks) >=
          TASK_DEQUE_SIZE(thread_data->td)) {
        // expand deque to push the task which is not allowed to execute
        __kmp_realloc_task_deque(thread, thread_data);
      }
    }
  }
  // Lock the deque for the task push operation
  if (!locked) {
    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
    // Need to recheck as we can get a proxy task from thread outside of OpenMP
    if (TCR_4(thread_data->td.td_deque_ntasks) >=
        TASK_DEQUE_SIZE(thread_data->td)) {
      if (__kmp_enable_task_throttling &&
          __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                                thread->th.th_current_task)) {
        __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
        KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
                      "returning TASK_NOT_PUSHED for task %p\n",
                      gtid, taskdata));
        return TASK_NOT_PUSHED;
      } else {
        // expand deque to push the task which is not allowed to execute
        __kmp_realloc_task_deque(thread, thread_data);
      }
    }
  }
  // Must have room since no thread can add tasks but calling thread
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
                   TASK_DEQUE_SIZE(thread_data->td));

  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
      taskdata; // Push taskdata
  // Wrap index.
  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
  KMP_FSYNC_RELEASING(taskdata); // releasing child
  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
                "task=%p ntasks=%d head=%u tail=%u\n",
                gtid, taskdata, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));

  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  return TASK_SUCCESSFULLY_PUSHED;
}

// __kmp_pop_current_task_from_thread: set up current task from called thread
// when team ends
//
// this_thr: thread structure to set current_task in.
void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));

  this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;

  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));
}

// __kmp_push_current_task_to_thread: set up current task in called thread for a
// new team
//
// this_thr: thread structure to set up
// team: team for implicit task data
// tid: thread within team to set up
void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
                                       int tid) {
  // current task of the thread is a parent of the new just created implicit
  // tasks of new team
  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));

  KMP_DEBUG_ASSERT(this_thr != NULL);

  if (tid == 0) {
    if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
      team->t.t_implicit_task_taskdata[0].td_parent =
          this_thr->th.th_current_task;
      this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
    }
  } else {
    team->t.t_implicit_task_taskdata[tid].td_parent =
        team->t.t_implicit_task_taskdata[0].td_parent;
    this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
  }

  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));
}
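// Flag lifecycle handled by __kmp_task_start below and __kmp_task_finish
// further down (a reading aid): a task moves through started -> executing ->
// complete -> freed; "executing" is cleared on the task being suspended and
// set on the task being switched to, and the debug asserts in both routines
// check exactly these transitions (untied tasks are exempt from some of them
// because they can be re-entered).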
// __kmp_task_start: bookkeeping for a task starting execution
//
// GTID: global thread id of calling thread
// task: task starting execution
// current_task: task suspending
static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
                             kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];

  KA_TRACE(10,
           ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
            gtid, taskdata, current_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  // mark currently executing task as suspended
  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
  current_task->td_flags.executing = 0;

// Add task to stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_push_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  // mark starting task as executing and as current task
  thread->th.th_current_task = taskdata;

  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  taskdata->td_flags.started = 1;
  taskdata->td_flags.executing = 1;
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  // GEH TODO: shouldn't we pass some sort of location identifier here?
  // APT: yes, we will pass location here.
  // need to store current thread state (in a thread or taskdata structure)
  // before setting work_state, otherwise wrong state is set after end of task

  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));

  return;
}

#if OMPT_SUPPORT
//------------------------------------------------------------------------------
// __ompt_task_init:
// Initialize OMPT fields maintained by a task. This will only be called after
// ompt_start_tool, so we already know whether ompt is enabled or not.

static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
  // The calls to __ompt_task_init already have the ompt_enabled condition.
  task->ompt_task_info.task_data.value = 0;
  task->ompt_task_info.frame.exit_frame = ompt_data_none;
  task->ompt_task_info.frame.enter_frame = ompt_data_none;
  task->ompt_task_info.frame.exit_frame_flags =
      ompt_frame_runtime | ompt_frame_framepointer;
  task->ompt_task_info.frame.enter_frame_flags =
      ompt_frame_runtime | ompt_frame_framepointer;
}

// __ompt_task_start:
// Build and trigger task-begin event
static inline void __ompt_task_start(kmp_task_t *task,
                                     kmp_taskdata_t *current_task,
                                     kmp_int32 gtid) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  ompt_task_status_t status = ompt_task_switch;
  if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
    status = ompt_task_yield;
    __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
  }
  /* let OMPT know that we're about to run this task */
  if (ompt_enabled.ompt_callback_task_schedule) {
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        &(current_task->ompt_task_info.task_data), status,
        &(taskdata->ompt_task_info.task_data));
  }
  taskdata->ompt_task_info.scheduling_parent = current_task;
}

// __ompt_task_finish:
// Build and trigger final task-schedule event
static inline void __ompt_task_finish(kmp_task_t *task,
                                      kmp_taskdata_t *resumed_task,
                                      ompt_task_status_t status) {
  if (ompt_enabled.ompt_callback_task_schedule) {
    kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
    if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
        taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
      status = ompt_task_cancel;
    }

    /* let OMPT know that we're returning to the callee task */
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        &(taskdata->ompt_task_info.task_data), status,
        (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
  }
}
#endif
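// The __kmpc_omp_task_begin_if0 entry point below uses the same pattern as
// __kmp_task_finish further down: a template <bool ompt> body instantiated
// once behind an OMPT_NOINLINE wrapper for the tool-enabled path and once
// with ompt=false for the common path, so the OMPT code can be optimized
// away when no tool is attached.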
template <bool ompt>
static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
                                               kmp_task_t *task,
                                               void *frame_address,
                                               void *return_address) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
                "current_task=%p\n",
                gtid, loc_ref, taskdata, current_task));

  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
                  "incremented for task %p\n",
                  gtid, counter, taskdata));
  }

  taskdata->td_flags.task_serial =
      1; // Execute this task immediately, not deferred.
  __kmp_task_start(gtid, task, current_task);

#if OMPT_SUPPORT
  if (ompt) {
    if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
      current_task->ompt_task_info.frame.enter_frame.ptr =
          taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
      current_task->ompt_task_info.frame.enter_frame_flags =
          taskdata->ompt_task_info.frame.exit_frame_flags =
              ompt_frame_application | ompt_frame_framepointer;
    }
    if (ompt_enabled.ompt_callback_task_create) {
      ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
          &(parent_info->task_data), &(parent_info->frame),
          &(taskdata->ompt_task_info.task_data),
          ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
          return_address);
    }
    __ompt_task_start(task, current_task, gtid);
  }
#endif // OMPT_SUPPORT

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, taskdata));
}

#if OMPT_SUPPORT
OMPT_NOINLINE
static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                           kmp_task_t *task,
                                           void *frame_address,
                                           void *return_address) {
  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
                                           return_address);
}
#endif // OMPT_SUPPORT

// __kmpc_omp_task_begin_if0: report that a given serialized task has started
// execution
//
// loc_ref: source location information; points to beginning of task block.
// gtid: global thread number.
// task: task thunk for the started task.
void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
                               kmp_task_t *task) {
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
    __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
                                   OMPT_GET_FRAME_ADDRESS(1),
                                   OMPT_LOAD_RETURN_ADDRESS(gtid));
    return;
  }
#endif
  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
}

#ifdef TASK_UNUSED
// __kmpc_omp_task_begin: report that a given task has started execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(
      10,
      ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
       gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));

  __kmp_task_start(gtid, task, current_task);

  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED

// __kmp_free_task: free the current task space and the space for shareds
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
                            kmp_info_t *thread) {
  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
                taskdata));

  // Check to make sure all flags and counters have the correct values
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
                   taskdata->td_flags.task_serial == 1);
  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);
  kmp_task_t *task = KMP_TASKDATA_TO_TASK(taskdata);
  // Clear data to not be re-used later by mistake.
  task->data1.destructors = NULL;
  task->data2.priority = 0;

  taskdata->td_flags.freed = 1;
  // deallocate the taskdata and shared variable blocks associated with this task
#if USE_FAST_MEMORY
  __kmp_fast_free(thread, taskdata);
#else /* ! USE_FAST_MEMORY */
  __kmp_thread_free(thread, taskdata);
#endif
  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
}
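// Reference counting used by __kmp_free_task_and_ancestors below (a reading
// aid): td_allocated_child_tasks starts at 1 for the task itself and is
// incremented for every child allocated to it, so the "- 1" after
// KMP_ATOMIC_DEC simulates a pre-decrement; a task's storage is reclaimed,
// and the walk continues to its parent, only once that count drops to zero.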
// __kmp_free_task_and_ancestors: free the current task and ancestors without
// children
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
                                          kmp_taskdata_t *taskdata,
                                          kmp_info_t *thread) {
  // Proxy tasks must always be allowed to free their parents
  // because they can be run in background even in serial mode.
  kmp_int32 team_serial =
      (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
      !taskdata->td_flags.proxy;
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
  KMP_DEBUG_ASSERT(children >= 0);

  // Now, go up the ancestor tree to see if any ancestors can now be freed.
  while (children == 0) {
    kmp_taskdata_t *parent_taskdata = taskdata->td_parent;

    KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
                  "and freeing itself\n",
                  gtid, taskdata));

    // --- Deallocate my ancestor task ---
    __kmp_free_task(gtid, taskdata, thread);

    taskdata = parent_taskdata;

    if (team_serial)
      return;
    // Stop checking ancestors at implicit task instead of walking up ancestor
    // tree to avoid premature deallocation of ancestors.
    if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
      if (taskdata->td_dephash) { // do we need to cleanup dephash?
        int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
        kmp_tasking_flags_t flags_old = taskdata->td_flags;
        if (children == 0 && flags_old.complete == 1) {
          kmp_tasking_flags_t flags_new = flags_old;
          flags_new.complete = 0;
          if (KMP_COMPARE_AND_STORE_ACQ32(
                  RCAST(kmp_int32 *, &taskdata->td_flags),
                  *RCAST(kmp_int32 *, &flags_old),
                  *RCAST(kmp_int32 *, &flags_new))) {
            KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
                           "dephash of implicit task %p\n",
                           gtid, taskdata));
            // cleanup dephash of finished implicit task
            __kmp_dephash_free_entries(thread, taskdata->td_dephash);
          }
        }
      }
      return;
    }
    // Predecrement simulated by "- 1" calculation
    children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
    KMP_DEBUG_ASSERT(children >= 0);
  }

  KA_TRACE(
      20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
           "not freeing it yet\n",
           gtid, taskdata, children));
}

// Only need to keep track of child task counts if any of the following:
// 1. team parallel and tasking not serialized;
// 2. it is a proxy or detachable or hidden helper task
// 3. the children counter of its parent task is greater than 0.
// The reason for the 3rd one is a serialized team that found a detached or
// hidden helper task T. In this case, the execution of T is still deferred,
// and it is also possible that a regular task depends on T. In this case, if
// we don't track the children, task synchronization will be broken.
static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) {
  kmp_tasking_flags_t flags = taskdata->td_flags;
  bool ret = !(flags.team_serial || flags.tasking_ser);
  ret = ret || flags.proxy == TASK_PROXY ||
        flags.detachable == TASK_DETACHABLE || flags.hidden_helper;
  ret = ret ||
        KMP_ATOMIC_LD_ACQ(&taskdata->td_parent->td_incomplete_child_tasks) > 0;
  return ret;
}

// __kmp_task_finish: bookkeeping to do when a task finishes execution
//
// gtid: global thread ID for calling thread
// task: task to be finished
// resumed_task: task to be resumed. (may be NULL if task is serialized)
//
// template<ompt>: effectively ompt_enabled.enabled!=0
// the version with ompt=false is inlined, allowing the compiler to optimize
// away all ompt code in this case
template <bool ompt>
static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *resumed_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_task_team_t *task_team =
      thread->th.th_task_team; // might be NULL for serial teams...
#if KMP_DEBUG
  kmp_int32 children = 0;
#endif
  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
                "task %p\n",
                gtid, taskdata, resumed_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

// Pop task from stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_pop_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */
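  // td_untied_count protocol (summary of existing logic): the counter is
  // incremented whenever an untied task is deferred or begun (see
  // __kmp_push_task and __kmpc_omp_task_begin_if0 above) and decremented
  // here; only when it reaches zero has the last portion of the untied task
  // finished, so intermediate finishes return early below without completing
  // or freeing the task.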
  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
    // untied task needs to check the counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
    KA_TRACE(
        20,
        ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
         gtid, counter, taskdata));
    if (counter > 0) {
      // untied task is not done, to be continued possibly by other thread, do
      // not free it now
      if (resumed_task == NULL) {
        KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
        resumed_task = taskdata->td_parent; // In a serialized task, the resumed
        // task is the parent
      }
      thread->th.th_current_task = resumed_task; // restore current_task
      resumed_task->td_flags.executing = 1; // resume previous task
      KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
                    "resuming task %p\n",
                    gtid, taskdata, resumed_task));
      return;
    }
  }

  // bookkeeping for resuming task:
  // GEH - note tasking_ser => task_serial
  KMP_DEBUG_ASSERT(
      (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
      taskdata->td_flags.task_serial);
  if (taskdata->td_flags.task_serial) {
    if (resumed_task == NULL) {
      resumed_task = taskdata->td_parent; // In a serialized task, the resumed
      // task is the parent
    }
  } else {
    KMP_DEBUG_ASSERT(resumed_task !=
                     NULL); // verify that resumed task is passed as argument
  }

  /* If the tasks' destructor thunk flag has been set, we need to invoke the
     destructor thunk that has been generated by the compiler. The code is
     placed here, since at this point other tasks might have been released
     hence overlapping the destructor invocations with some other work in the
     released tasks. The OpenMP spec is not specific on when the destructors
     are invoked, so we should be free to choose. */
  if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
    kmp_routine_entry_t destr_thunk = task->data1.destructors;
    KMP_ASSERT(destr_thunk);
    destr_thunk(gtid, task);
  }

  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  bool detach = false;
  if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) {
    if (taskdata->td_allow_completion_event.type ==
        KMP_EVENT_ALLOW_COMPLETION) {
      // event hasn't been fulfilled yet. Try to detach task.
      __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
      if (taskdata->td_allow_completion_event.type ==
          KMP_EVENT_ALLOW_COMPLETION) {
        // task finished execution
        KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
        taskdata->td_flags.executing = 0; // suspend the finishing task

#if OMPT_SUPPORT
        // For a detached task which is not completed, we switch back here;
        // the later omp_fulfill_event signals completion.
        // Locking is necessary to avoid a race with ompt_task_late_fulfill.
        if (ompt)
          __ompt_task_finish(task, resumed_task, ompt_task_detach);
#endif

        // no access to taskdata after this point!
        // __kmp_fulfill_event might free taskdata at any time from now

        taskdata->td_flags.proxy = TASK_PROXY; // proxify!
        detach = true;
      }
      __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
    }
  }

  if (!detach) {
    taskdata->td_flags.complete = 1; // mark the task as completed

#if OMPT_SUPPORT
    // This is not a detached task, we are done here
    if (ompt)
      __ompt_task_finish(task, resumed_task, ompt_task_complete);
#endif
    // TODO: What would be the balance between the conditions in the function
    // and an atomic operation?
    if (__kmp_track_children_task(taskdata)) {
      __kmp_release_deps(gtid, taskdata);
      // Predecrement simulated by "- 1" calculation
#if KMP_DEBUG
      children = -1 +
#endif
          KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
      KMP_DEBUG_ASSERT(children >= 0);
      if (taskdata->td_taskgroup)
        KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
    } else if (task_team && (task_team->tt.tt_found_proxy_tasks ||
                             task_team->tt.tt_hidden_helper_task_encountered)) {
      // if we found proxy or hidden helper tasks there could exist a dependency
      // chain with the proxy task as origin
      __kmp_release_deps(gtid, taskdata);
    }
    // td_flags.executing must be marked as 0 after __kmp_release_deps has been
    // called. Otherwise, if a task is executed immediately from the
    // release_deps code, the flag will be reset to 1 again by this same
    // function
    KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
    taskdata->td_flags.executing = 0; // suspend the finishing task
  }

  KA_TRACE(
      20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
           gtid, taskdata, children));

  // Free this task and then ancestor tasks if they have no children.
  // Restore th_current_task first as suggested by John:
  // johnmc: if an asynchronous inquiry peers into the runtime system
  // it doesn't see the freed task as the current task.
  thread->th.th_current_task = resumed_task;
  if (!detach)
    __kmp_free_task_and_ancestors(gtid, taskdata, thread);

  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
  resumed_task->td_flags.executing = 1; // resume previous task

  KA_TRACE(
      10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
           gtid, taskdata, resumed_task));

  return;
}

template <bool ompt>
static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
                                                  kmp_int32 gtid,
                                                  kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
  KMP_DEBUG_ASSERT(gtid >= 0);
  // this routine will provide task to resume
  __kmp_task_finish<ompt>(gtid, task, NULL);

  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));

#if OMPT_SUPPORT
  if (ompt) {
    ompt_frame_t *ompt_frame;
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    ompt_frame->enter_frame = ompt_data_none;
    ompt_frame->enter_frame_flags =
        ompt_frame_runtime | ompt_frame_framepointer;
  }
#endif

  return;
}

#if OMPT_SUPPORT
OMPT_NOINLINE
void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                       kmp_task_t *task) {
  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
}
#endif // OMPT_SUPPORT

// __kmpc_omp_task_complete_if0: report that a task has completed execution
//
// loc_ref: source location information; points to end of task block.
// gtid: global thread number.
// task: task thunk for the completed task.
void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *task) {
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
    return;
  }
#endif
  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
}

#ifdef TASK_UNUSED
// __kmpc_omp_task_complete: report that a task has completed execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
                              kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));

  __kmp_task_finish<false>(gtid, task,
                           NULL); // Not sure how to find task to resume

  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED

// __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
// task for a given thread
//
// loc_ref: reference to source location of parallel region
// this_thr: thread data structure corresponding to implicit task
// team: team for this_thr
// tid: thread id of given thread within team
// set_curr_task: TRUE if need to push current task to thread
// NOTE: Routine does not set up the implicit task ICVS. This is assumed to
// have already been done elsewhere.
// TODO: Get better loc_ref. Value passed in may be NULL
void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
                              kmp_team_t *team, int tid, int set_curr_task) {
  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];

  KF_TRACE(
      10,
      ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
       tid, team, task, set_curr_task ? "TRUE" : "FALSE"));

  task->td_task_id = KMP_GEN_TASK_ID();
  task->td_team = team;
  // task->td_parent = NULL; // fix for CQ230101 (broken parent task info
  // in debugger)
  task->td_ident = loc_ref;
  task->td_taskwait_ident = NULL;
  task->td_taskwait_counter = 0;
  task->td_taskwait_thread = 0;

  task->td_flags.tiedness = TASK_TIED;
  task->td_flags.tasktype = TASK_IMPLICIT;
  task->td_flags.proxy = TASK_FULL;

  // All implicit tasks are executed immediately, not deferred
  task->td_flags.task_serial = 1;
  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  task->td_flags.started = 1;
  task->td_flags.executing = 1;
  task->td_flags.complete = 0;
  task->td_flags.freed = 0;

  task->td_depnode = NULL;
  task->td_last_tied = task;
  task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;

  if (set_curr_task) { // only do this init first time thread is created
    KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
    // Not used: don't need to deallocate implicit task
    KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
    task->td_taskgroup = NULL; // An implicit task does not have taskgroup
    task->td_dephash = NULL;
    __kmp_push_current_task_to_thread(this_thr, team, tid);
  } else {
    KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
    KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
  }

#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(task, tid);
#endif

  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
                team, task));
}
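// Note on the dephash cleanup pattern used below and in
// __kmp_free_task_and_ancestors above: the whole kmp_tasking_flags_t bitfield
// is reinterpreted as a 32-bit word (RCAST to kmp_int32 *) so that clearing
// the "complete" bit can be done with one compare-and-swap; only the thread
// that wins the CAS frees the dephash entries, which presumably prevents a
// double free when the last child task and the implicit task finish
// concurrently.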
// __kmp_finish_implicit_task: Release resources associated with implicit tasks
// at the end of parallel regions. Some resources are kept for reuse in the next
// parallel region.
//
// thread: thread data structure corresponding to implicit task
void __kmp_finish_implicit_task(kmp_info_t *thread) {
  kmp_taskdata_t *task = thread->th.th_current_task;
  if (task->td_dephash) {
    int children;
    task->td_flags.complete = 1;
    children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
    kmp_tasking_flags_t flags_old = task->td_flags;
    if (children == 0 && flags_old.complete == 1) {
      kmp_tasking_flags_t flags_new = flags_old;
      flags_new.complete = 0;
      if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
                                      *RCAST(kmp_int32 *, &flags_old),
                                      *RCAST(kmp_int32 *, &flags_new))) {
        KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
                       "dephash of implicit task %p\n",
                       thread->th.th_info.ds.ds_gtid, task));
        __kmp_dephash_free_entries(thread, task->td_dephash);
      }
    }
  }
}

// __kmp_free_implicit_task: Release resources associated with implicit tasks
// when these regions are destroyed
//
// thread: thread data structure corresponding to implicit task
void __kmp_free_implicit_task(kmp_info_t *thread) {
  kmp_taskdata_t *task = thread->th.th_current_task;
  if (task && task->td_dephash) {
    __kmp_dephash_free(thread, task->td_dephash);
    task->td_dephash = NULL;
  }
}
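// Worked example for the helper below (assuming val is a power of two, which
// the mask arithmetic requires): __kmp_round_up_to_val(37, 8) clears the low
// bits to get 32 and then adds 8, returning 40; a size that is already a
// multiple of val is returned unchanged.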
// Round up a size to a power of two specified by val: Used to insert padding
// between structures co-allocated using a single malloc() call
static size_t __kmp_round_up_to_val(size_t size, size_t val) {
  if (size & (val - 1)) {
    size &= ~(val - 1);
    if (size <= KMP_SIZE_T_MAX - val) {
      size += val; // Round up if there is no overflow.
    }
  }
  return size;
} // __kmp_round_up_to_val

// __kmp_task_alloc: Allocate the taskdata and task data structures for a task
//
// loc_ref: source location information
// gtid: global thread number.
// flags: include tiedness & task type (explicit vs. implicit) of the ''new''
// task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
// sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
// private vars accessed in task.
// sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
// in task.
// task_entry: Pointer to task code entry point generated by compiler.
// returns: a pointer to the allocated kmp_task_t structure (task).
kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                             kmp_tasking_flags_t *flags,
                             size_t sizeof_kmp_task_t, size_t sizeof_shareds,
                             kmp_routine_entry_t task_entry) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_team_t *team = thread->th.th_team;
  kmp_taskdata_t *parent_task = thread->th.th_current_task;
  size_t shareds_offset;

  if (UNLIKELY(!TCR_4(__kmp_init_middle)))
    __kmp_middle_initialize();

  if (flags->hidden_helper) {
    if (__kmp_enable_hidden_helper) {
      if (!TCR_4(__kmp_init_hidden_helper))
        __kmp_hidden_helper_initialize();
    } else {
      // If the hidden helper task is not enabled, reset the flag to FALSE.
      flags->hidden_helper = FALSE;
    }
  }

  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
                sizeof_shareds, task_entry));

  KMP_DEBUG_ASSERT(parent_task);
  if (parent_task->td_flags.final) {
    if (flags->merged_if0) {
    }
    flags->final = 1;
  }

  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
    // Untied task encountered causes the TSC algorithm to check entire deque of
    // the victim thread. If no untied task encountered, then checking the head
    // of the deque should be enough.
    KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
  }

  // Detachable tasks are not proxy tasks yet but could be in the future. Doing
  // the tasking setup when that happens is too late.
  if (UNLIKELY(flags->proxy == TASK_PROXY ||
               flags->detachable == TASK_DETACHABLE || flags->hidden_helper)) {
    if (flags->proxy == TASK_PROXY) {
      flags->tiedness = TASK_UNTIED;
      flags->merged_if0 = 1;
    }
    /* are we running in a sequential parallel or tskm_immediate_exec... we need
       tasking support enabled */
    if ((thread->th.th_task_team) == NULL) {
      /* This should only happen if the team is serialized
         setup a task team and propagate it to the thread */
      KMP_DEBUG_ASSERT(team->t.t_serialized);
      KA_TRACE(30,
               ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
                gtid));
      // 1 indicates setup the current team regardless of nthreads
      __kmp_task_team_setup(thread, team, 1);
      thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
    }
    kmp_task_team_t *task_team = thread->th.th_task_team;

    /* tasking must be enabled now as the task might not be pushed */
    if (!KMP_TASKING_ENABLED(task_team)) {
      KA_TRACE(
          30,
          ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
      __kmp_enable_tasking(task_team, thread);
      kmp_int32 tid = thread->th.th_info.ds.ds_tid;
      kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
      // No lock needed since only owner can allocate
      if (thread_data->td.td_deque == NULL) {
        __kmp_alloc_task_deque(thread, thread_data);
      }
    }

    if ((flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) &&
        task_team->tt.tt_found_proxy_tasks == FALSE)
      TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
    if (flags->hidden_helper &&
        task_team->tt.tt_hidden_helper_task_encountered == FALSE)
      TCW_4(task_team->tt.tt_hidden_helper_task_encountered, TRUE);
  }
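  // Memory layout produced by the single allocation below (a sketch; the
  // exact padding comes from __kmp_round_up_to_val):
  //
  //   [ kmp_taskdata_t ][ kmp_task_t + private data (sizeof_kmp_task_t) ]
  //   [ padding to pointer size ][ shareds (sizeof_shareds) ]
  //
  // shareds_offset is the byte offset of the shareds block from the start of
  // the kmp_taskdata_t, so the taskdata, the kmp_task_t handed back to the
  // compiler, and the shareds array are all released together by
  // __kmp_free_task.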
USE_FAST_MEMORY */ 1454 taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset + 1455 sizeof_shareds); 1456 #endif /* USE_FAST_MEMORY */ 1457 1458 task = KMP_TASKDATA_TO_TASK(taskdata); 1459 1460 // Make sure task & taskdata are aligned appropriately 1461 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD 1462 KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0); 1463 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0); 1464 #else 1465 KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0); 1466 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0); 1467 #endif 1468 if (sizeof_shareds > 0) { 1469 // Avoid double allocation here by combining shareds with taskdata 1470 task->shareds = &((char *)taskdata)[shareds_offset]; 1471 // Make sure shareds struct is aligned to pointer size 1472 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) == 1473 0); 1474 } else { 1475 task->shareds = NULL; 1476 } 1477 task->routine = task_entry; 1478 task->part_id = 0; // AC: Always start with 0 part id 1479 1480 taskdata->td_task_id = KMP_GEN_TASK_ID(); 1481 taskdata->td_team = thread->th.th_team; 1482 taskdata->td_alloc_thread = thread; 1483 taskdata->td_parent = parent_task; 1484 taskdata->td_level = parent_task->td_level + 1; // increment nesting level 1485 KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0); 1486 taskdata->td_ident = loc_ref; 1487 taskdata->td_taskwait_ident = NULL; 1488 taskdata->td_taskwait_counter = 0; 1489 taskdata->td_taskwait_thread = 0; 1490 KMP_DEBUG_ASSERT(taskdata->td_parent != NULL); 1491 // avoid copying icvs for proxy tasks 1492 if (flags->proxy == TASK_FULL) 1493 copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs); 1494 1495 taskdata->td_flags = *flags; 1496 taskdata->td_task_team = thread->th.th_task_team; 1497 taskdata->td_size_alloc = shareds_offset + sizeof_shareds; 1498 taskdata->td_flags.tasktype = TASK_EXPLICIT; 1499 // If it is hidden helper task, we need to set the team and task team 1500 // correspondingly. 1501 if (flags->hidden_helper) { 1502 kmp_info_t *shadow_thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)]; 1503 taskdata->td_team = shadow_thread->th.th_team; 1504 taskdata->td_task_team = shadow_thread->th.th_task_team; 1505 } 1506 1507 // GEH - TODO: fix this to copy parent task's value of tasking_ser flag 1508 taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec); 1509 1510 // GEH - TODO: fix this to copy parent task's value of team_serial flag 1511 taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0; 1512 1513 // GEH - Note we serialize the task if the team is serialized to make sure 1514 // implicit parallel region tasks are not left until program termination to 1515 // execute. Also, it helps locality to execute immediately. 
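// (Informally, and assuming the usual meaning of these flags: task_serial == 1
// means the task is executed immediately by the encountering thread and is
// never deferred to a deque, i.e. roughly
//   task_serial = parent_final || team_serial || tasking_ser || merged_if0;
// which is the expression computed below.)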
1516 1517 taskdata->td_flags.task_serial = 1518 (parent_task->td_flags.final || taskdata->td_flags.team_serial || 1519 taskdata->td_flags.tasking_ser || flags->merged_if0); 1520 1521 taskdata->td_flags.started = 0; 1522 taskdata->td_flags.executing = 0; 1523 taskdata->td_flags.complete = 0; 1524 taskdata->td_flags.freed = 0; 1525 1526 KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0); 1527 // start at one because counts current task and children 1528 KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1); 1529 taskdata->td_taskgroup = 1530 parent_task->td_taskgroup; // task inherits taskgroup from the parent task 1531 taskdata->td_dephash = NULL; 1532 taskdata->td_depnode = NULL; 1533 if (flags->tiedness == TASK_UNTIED) 1534 taskdata->td_last_tied = NULL; // will be set when the task is scheduled 1535 else 1536 taskdata->td_last_tied = taskdata; 1537 taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED; 1538 #if OMPT_SUPPORT 1539 if (UNLIKELY(ompt_enabled.enabled)) 1540 __ompt_task_init(taskdata, gtid); 1541 #endif 1542 // TODO: What would be the balance between the conditions in the function and 1543 // an atomic operation? 1544 if (__kmp_track_children_task(taskdata)) { 1545 KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks); 1546 if (parent_task->td_taskgroup) 1547 KMP_ATOMIC_INC(&parent_task->td_taskgroup->count); 1548 // Only need to keep track of allocated child tasks for explicit tasks since 1549 // implicit not deallocated 1550 if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) { 1551 KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks); 1552 } 1553 if (flags->hidden_helper) { 1554 taskdata->td_flags.task_serial = FALSE; 1555 // Increment the number of hidden helper tasks to be executed 1556 KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks); 1557 } 1558 } 1559 1560 KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n", 1561 gtid, taskdata, taskdata->td_parent)); 1562 1563 return task; 1564 } 1565 1566 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, 1567 kmp_int32 flags, size_t sizeof_kmp_task_t, 1568 size_t sizeof_shareds, 1569 kmp_routine_entry_t task_entry) { 1570 kmp_task_t *retval; 1571 kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags; 1572 __kmp_assert_valid_gtid(gtid); 1573 input_flags->native = FALSE; 1574 // __kmp_task_alloc() sets up all other runtime flags 1575 KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) " 1576 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n", 1577 gtid, loc_ref, input_flags->tiedness ? "tied " : "untied", 1578 input_flags->proxy ? "proxy" : "", 1579 input_flags->detachable ? 
"detachable" : "", sizeof_kmp_task_t, 1580 sizeof_shareds, task_entry)); 1581 1582 retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t, 1583 sizeof_shareds, task_entry); 1584 1585 KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval)); 1586 1587 return retval; 1588 } 1589 1590 kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid, 1591 kmp_int32 flags, 1592 size_t sizeof_kmp_task_t, 1593 size_t sizeof_shareds, 1594 kmp_routine_entry_t task_entry, 1595 kmp_int64 device_id) { 1596 auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags); 1597 // target task is untied defined in the specification 1598 input_flags.tiedness = TASK_UNTIED; 1599 1600 if (__kmp_enable_hidden_helper) 1601 input_flags.hidden_helper = TRUE; 1602 1603 return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t, 1604 sizeof_shareds, task_entry); 1605 } 1606 1607 /*! 1608 @ingroup TASKING 1609 @param loc_ref location of the original task directive 1610 @param gtid Global Thread ID of encountering thread 1611 @param new_task task thunk allocated by __kmpc_omp_task_alloc() for the ''new 1612 task'' 1613 @param naffins Number of affinity items 1614 @param affin_list List of affinity items 1615 @return Returns non-zero if registering affinity information was not successful. 1616 Returns 0 if registration was successful 1617 This entry registers the affinity information attached to a task with the task 1618 thunk structure kmp_taskdata_t. 1619 */ 1620 kmp_int32 1621 __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid, 1622 kmp_task_t *new_task, kmp_int32 naffins, 1623 kmp_task_affinity_info_t *affin_list) { 1624 return 0; 1625 } 1626 1627 // __kmp_invoke_task: invoke the specified task 1628 // 1629 // gtid: global thread ID of caller 1630 // task: the task to invoke 1631 // current_task: the task to resume after task invocation 1632 static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task, 1633 kmp_taskdata_t *current_task) { 1634 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 1635 kmp_info_t *thread; 1636 int discard = 0 /* false */; 1637 KA_TRACE( 1638 30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n", 1639 gtid, taskdata, current_task)); 1640 KMP_DEBUG_ASSERT(task); 1641 if (UNLIKELY(taskdata->td_flags.proxy == TASK_PROXY && 1642 taskdata->td_flags.complete == 1)) { 1643 // This is a proxy task that was already completed but it needs to run 1644 // its bottom-half finish 1645 KA_TRACE( 1646 30, 1647 ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n", 1648 gtid, taskdata)); 1649 1650 __kmp_bottom_half_finish_proxy(gtid, task); 1651 1652 KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for " 1653 "proxy task %p, resuming task %p\n", 1654 gtid, taskdata, current_task)); 1655 1656 return; 1657 } 1658 1659 #if OMPT_SUPPORT 1660 // For untied tasks, the first task executed only calls __kmpc_omp_task and 1661 // does not execute code. 1662 ompt_thread_info_t oldInfo; 1663 if (UNLIKELY(ompt_enabled.enabled)) { 1664 // Store the threads states and restore them after the task 1665 thread = __kmp_threads[gtid]; 1666 oldInfo = thread->th.ompt_thread_info; 1667 thread->th.ompt_thread_info.wait_id = 0; 1668 thread->th.ompt_thread_info.state = (thread->th.th_team_serialized) 1669 ? 
ompt_state_work_serial 1670 : ompt_state_work_parallel; 1671 taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1672 } 1673 #endif 1674 1675 // Decrement the counter of hidden helper tasks to be executed 1676 if (taskdata->td_flags.hidden_helper) { 1677 // Hidden helper tasks can only be executed by hidden helper threads 1678 KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid)); 1679 KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks); 1680 } 1681 1682 // Proxy tasks are not handled by the runtime 1683 if (taskdata->td_flags.proxy != TASK_PROXY) { 1684 __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded 1685 } 1686 1687 // TODO: cancel tasks if the parallel region has also been cancelled 1688 // TODO: check if this sequence can be hoisted above __kmp_task_start 1689 // if cancellation has been enabled for this run ... 1690 if (UNLIKELY(__kmp_omp_cancellation)) { 1691 thread = __kmp_threads[gtid]; 1692 kmp_team_t *this_team = thread->th.th_team; 1693 kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup; 1694 if ((taskgroup && taskgroup->cancel_request) || 1695 (this_team->t.t_cancel_request == cancel_parallel)) { 1696 #if OMPT_SUPPORT && OMPT_OPTIONAL 1697 ompt_data_t *task_data; 1698 if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) { 1699 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL); 1700 ompt_callbacks.ompt_callback(ompt_callback_cancel)( 1701 task_data, 1702 ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup 1703 : ompt_cancel_parallel) | 1704 ompt_cancel_discarded_task, 1705 NULL); 1706 } 1707 #endif 1708 KMP_COUNT_BLOCK(TASK_cancelled); 1709 // this task is in a cancelled taskgroup or parallel region; discard it 1710 discard = 1 /* true */; 1711 } 1712 } 1713 1714 // Invoke the task routine and pass in relevant data. 1715 // Thunks generated by gcc take a different argument list.
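// Illustrative sketch (not part of the runtime) of the two conventions
// dispatched on below, assuming the usual entry-point prototypes:
//   KMP entry:          kmp_int32 entry(kmp_int32 gtid, kmp_task_t *task);
//   GOMP-compat thunk:  void thunk(void *shareds);   // td_flags.native == 1
// The native case is only reachable when built with KMP_GOMP_COMPAT.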
1716 if (!discard) { 1717 if (taskdata->td_flags.tiedness == TASK_UNTIED) { 1718 taskdata->td_last_tied = current_task->td_last_tied; 1719 KMP_DEBUG_ASSERT(taskdata->td_last_tied); 1720 } 1721 #if KMP_STATS_ENABLED 1722 KMP_COUNT_BLOCK(TASK_executed); 1723 switch (KMP_GET_THREAD_STATE()) { 1724 case FORK_JOIN_BARRIER: 1725 KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar); 1726 break; 1727 case PLAIN_BARRIER: 1728 KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar); 1729 break; 1730 case TASKYIELD: 1731 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield); 1732 break; 1733 case TASKWAIT: 1734 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait); 1735 break; 1736 case TASKGROUP: 1737 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup); 1738 break; 1739 default: 1740 KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate); 1741 break; 1742 } 1743 #endif // KMP_STATS_ENABLED 1744 1745 // OMPT task begin 1746 #if OMPT_SUPPORT 1747 if (UNLIKELY(ompt_enabled.enabled)) 1748 __ompt_task_start(task, current_task, gtid); 1749 #endif 1750 1751 #if OMPD_SUPPORT 1752 if (ompd_state & OMPD_ENABLE_BP) 1753 ompd_bp_task_begin(); 1754 #endif 1755 1756 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1757 kmp_uint64 cur_time; 1758 kmp_int32 kmp_itt_count_task = 1759 __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial && 1760 current_task->td_flags.tasktype == TASK_IMPLICIT; 1761 if (kmp_itt_count_task) { 1762 thread = __kmp_threads[gtid]; 1763 // Time outer level explicit task on barrier for adjusting imbalance time 1764 if (thread->th.th_bar_arrive_time) 1765 cur_time = __itt_get_timestamp(); 1766 else 1767 kmp_itt_count_task = 0; // thread is not on a barrier - skip timing 1768 } 1769 KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task) 1770 #endif 1771 1772 if (task->routine != NULL) { 1773 #ifdef KMP_GOMP_COMPAT 1774 if (taskdata->td_flags.native) { 1775 ((void (*)(void *))(*(task->routine)))(task->shareds); 1776 } else 1777 #endif /* KMP_GOMP_COMPAT */ 1778 { 1779 (*(task->routine))(gtid, task); 1780 } 1781 } 1782 KMP_POP_PARTITIONED_TIMER(); 1783 1784 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1785 if (kmp_itt_count_task) { 1786 // Barrier imbalance - adjust arrive time with the task duration 1787 thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time); 1788 } 1789 KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed) 1790 KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent 1791 #endif 1792 } 1793 1794 #if OMPD_SUPPORT 1795 if (ompd_state & OMPD_ENABLE_BP) 1796 ompd_bp_task_end(); 1797 #endif 1798 1799 // Proxy tasks are not handled by the runtime 1800 if (taskdata->td_flags.proxy != TASK_PROXY) { 1801 #if OMPT_SUPPORT 1802 if (UNLIKELY(ompt_enabled.enabled)) { 1803 thread->th.ompt_thread_info = oldInfo; 1804 if (taskdata->td_flags.tiedness == TASK_TIED) { 1805 taskdata->ompt_task_info.frame.exit_frame = ompt_data_none; 1806 } 1807 __kmp_task_finish<true>(gtid, task, current_task); 1808 } else 1809 #endif 1810 __kmp_task_finish<false>(gtid, task, current_task); 1811 } 1812 1813 KA_TRACE( 1814 30, 1815 ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n", 1816 gtid, taskdata, current_task)); 1817 return; 1818 } 1819 1820 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution 1821 // 1822 // loc_ref: location of original task pragma (ignored) 1823 // gtid: Global Thread ID of encountering thread 1824 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task'' 1825 // Returns: 1826 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task 
to 1827 // be resumed later. 1828 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1829 // resumed later. 1830 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid, 1831 kmp_task_t *new_task) { 1832 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1833 1834 KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid, 1835 loc_ref, new_taskdata)); 1836 1837 #if OMPT_SUPPORT 1838 kmp_taskdata_t *parent; 1839 if (UNLIKELY(ompt_enabled.enabled)) { 1840 parent = new_taskdata->td_parent; 1841 if (ompt_enabled.ompt_callback_task_create) { 1842 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 1843 &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame), 1844 &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0, 1845 OMPT_GET_RETURN_ADDRESS(0)); 1846 } 1847 } 1848 #endif 1849 1850 /* Should we execute the new task or queue it? For now, let's just always try 1851 to queue it. If the queue fills up, then we'll execute it. */ 1852 1853 if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer 1854 { // Execute this task immediately 1855 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 1856 new_taskdata->td_flags.task_serial = 1; 1857 __kmp_invoke_task(gtid, new_task, current_task); 1858 } 1859 1860 KA_TRACE( 1861 10, 1862 ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: " 1863 "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n", 1864 gtid, loc_ref, new_taskdata)); 1865 1866 #if OMPT_SUPPORT 1867 if (UNLIKELY(ompt_enabled.enabled)) { 1868 parent->ompt_task_info.frame.enter_frame = ompt_data_none; 1869 } 1870 #endif 1871 return TASK_CURRENT_NOT_QUEUED; 1872 } 1873 1874 // __kmp_omp_task: Schedule a non-thread-switchable task for execution 1875 // 1876 // gtid: Global Thread ID of encountering thread 1877 // new_task:non-thread-switchable task thunk allocated by __kmp_omp_task_alloc() 1878 // serialize_immediate: if TRUE then if the task is executed immediately its 1879 // execution will be serialized 1880 // Returns: 1881 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1882 // be resumed later. 1883 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1884 // resumed later. 1885 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task, 1886 bool serialize_immediate) { 1887 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1888 1889 /* Should we execute the new task or queue it? For now, let's just always try 1890 to queue it. If the queue fills up, then we'll execute it. */ 1891 if (new_taskdata->td_flags.proxy == TASK_PROXY || 1892 __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer 1893 { // Execute this task immediately 1894 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 1895 if (serialize_immediate) 1896 new_taskdata->td_flags.task_serial = 1; 1897 __kmp_invoke_task(gtid, new_task, current_task); 1898 } 1899 1900 return TASK_CURRENT_NOT_QUEUED; 1901 } 1902 1903 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a 1904 // non-thread-switchable task from the parent thread only! 1905 // 1906 // loc_ref: location of original task pragma (ignored) 1907 // gtid: Global Thread ID of encountering thread 1908 // new_task: non-thread-switchable task thunk allocated by 1909 // __kmp_omp_task_alloc() 1910 // Returns: 1911 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1912 // be resumed later. 
1913 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1914 // resumed later. 1915 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid, 1916 kmp_task_t *new_task) { 1917 kmp_int32 res; 1918 KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK); 1919 1920 #if KMP_DEBUG || OMPT_SUPPORT 1921 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1922 #endif 1923 KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref, 1924 new_taskdata)); 1925 __kmp_assert_valid_gtid(gtid); 1926 1927 #if OMPT_SUPPORT 1928 kmp_taskdata_t *parent = NULL; 1929 if (UNLIKELY(ompt_enabled.enabled)) { 1930 if (!new_taskdata->td_flags.started) { 1931 OMPT_STORE_RETURN_ADDRESS(gtid); 1932 parent = new_taskdata->td_parent; 1933 if (!parent->ompt_task_info.frame.enter_frame.ptr) { 1934 parent->ompt_task_info.frame.enter_frame.ptr = 1935 OMPT_GET_FRAME_ADDRESS(0); 1936 } 1937 if (ompt_enabled.ompt_callback_task_create) { 1938 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 1939 &(parent->ompt_task_info.task_data), 1940 &(parent->ompt_task_info.frame), 1941 &(new_taskdata->ompt_task_info.task_data), 1942 ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, 1943 OMPT_LOAD_RETURN_ADDRESS(gtid)); 1944 } 1945 } else { 1946 // We are scheduling the continuation of an UNTIED task. 1947 // Scheduling back to the parent task. 1948 __ompt_task_finish(new_task, 1949 new_taskdata->ompt_task_info.scheduling_parent, 1950 ompt_task_switch); 1951 new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none; 1952 } 1953 } 1954 #endif 1955 1956 res = __kmp_omp_task(gtid, new_task, true); 1957 1958 KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning " 1959 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n", 1960 gtid, loc_ref, new_taskdata)); 1961 #if OMPT_SUPPORT 1962 if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) { 1963 parent->ompt_task_info.frame.enter_frame = ompt_data_none; 1964 } 1965 #endif 1966 return res; 1967 } 1968 1969 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule 1970 // a taskloop task with the correct OMPT return address 1971 // 1972 // loc_ref: location of original task pragma (ignored) 1973 // gtid: Global Thread ID of encountering thread 1974 // new_task: non-thread-switchable task thunk allocated by 1975 // __kmp_omp_task_alloc() 1976 // codeptr_ra: return address for OMPT callback 1977 // Returns: 1978 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1979 // be resumed later. 1980 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1981 // resumed later. 
1982 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid, 1983 kmp_task_t *new_task, void *codeptr_ra) { 1984 kmp_int32 res; 1985 KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK); 1986 1987 #if KMP_DEBUG || OMPT_SUPPORT 1988 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1989 #endif 1990 KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref, 1991 new_taskdata)); 1992 1993 #if OMPT_SUPPORT 1994 kmp_taskdata_t *parent = NULL; 1995 if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) { 1996 parent = new_taskdata->td_parent; 1997 if (!parent->ompt_task_info.frame.enter_frame.ptr) 1998 parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1999 if (ompt_enabled.ompt_callback_task_create) { 2000 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 2001 &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame), 2002 &(new_taskdata->ompt_task_info.task_data), 2003 ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, 2004 codeptr_ra); 2005 } 2006 } 2007 #endif 2008 2009 res = __kmp_omp_task(gtid, new_task, true); 2010 2011 KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning " 2012 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n", 2013 gtid, loc_ref, new_taskdata)); 2014 #if OMPT_SUPPORT 2015 if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) { 2016 parent->ompt_task_info.frame.enter_frame = ompt_data_none; 2017 } 2018 #endif 2019 return res; 2020 } 2021 2022 template <bool ompt> 2023 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid, 2024 void *frame_address, 2025 void *return_address) { 2026 kmp_taskdata_t *taskdata = nullptr; 2027 kmp_info_t *thread; 2028 int thread_finished = FALSE; 2029 KMP_SET_THREAD_STATE_BLOCK(TASKWAIT); 2030 2031 KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref)); 2032 KMP_DEBUG_ASSERT(gtid >= 0); 2033 2034 if (__kmp_tasking_mode != tskm_immediate_exec) { 2035 thread = __kmp_threads[gtid]; 2036 taskdata = thread->th.th_current_task; 2037 2038 #if OMPT_SUPPORT && OMPT_OPTIONAL 2039 ompt_data_t *my_task_data; 2040 ompt_data_t *my_parallel_data; 2041 2042 if (ompt) { 2043 my_task_data = &(taskdata->ompt_task_info.task_data); 2044 my_parallel_data = OMPT_CUR_TEAM_DATA(thread); 2045 2046 taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address; 2047 2048 if (ompt_enabled.ompt_callback_sync_region) { 2049 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 2050 ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data, 2051 my_task_data, return_address); 2052 } 2053 2054 if (ompt_enabled.ompt_callback_sync_region_wait) { 2055 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 2056 ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data, 2057 my_task_data, return_address); 2058 } 2059 } 2060 #endif // OMPT_SUPPORT && OMPT_OPTIONAL 2061 2062 // Debugger: The taskwait is active. Store location and thread encountered the 2063 // taskwait. 2064 #if USE_ITT_BUILD 2065 // Note: These values are used by ITT events as well. 
2066 #endif /* USE_ITT_BUILD */ 2067 taskdata->td_taskwait_counter += 1; 2068 taskdata->td_taskwait_ident = loc_ref; 2069 taskdata->td_taskwait_thread = gtid + 1; 2070 2071 #if USE_ITT_BUILD 2072 void *itt_sync_obj = NULL; 2073 #if USE_ITT_NOTIFY 2074 KMP_ITT_TASKWAIT_STARTING(itt_sync_obj); 2075 #endif /* USE_ITT_NOTIFY */ 2076 #endif /* USE_ITT_BUILD */ 2077 2078 bool must_wait = 2079 !taskdata->td_flags.team_serial && !taskdata->td_flags.final; 2080 2081 must_wait = must_wait || (thread->th.th_task_team != NULL && 2082 thread->th.th_task_team->tt.tt_found_proxy_tasks); 2083 // If hidden helper thread is encountered, we must enable wait here. 2084 must_wait = 2085 must_wait || 2086 (__kmp_enable_hidden_helper && thread->th.th_task_team != NULL && 2087 thread->th.th_task_team->tt.tt_hidden_helper_task_encountered); 2088 2089 if (must_wait) { 2090 kmp_flag_32<false, false> flag( 2091 RCAST(std::atomic<kmp_uint32> *, 2092 &(taskdata->td_incomplete_child_tasks)), 2093 0U); 2094 while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) { 2095 flag.execute_tasks(thread, gtid, FALSE, 2096 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 2097 __kmp_task_stealing_constraint); 2098 } 2099 } 2100 #if USE_ITT_BUILD 2101 KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj); 2102 KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children 2103 #endif /* USE_ITT_BUILD */ 2104 2105 // Debugger: The taskwait is completed. Location remains, but thread is 2106 // negated. 2107 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; 2108 2109 #if OMPT_SUPPORT && OMPT_OPTIONAL 2110 if (ompt) { 2111 if (ompt_enabled.ompt_callback_sync_region_wait) { 2112 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 2113 ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data, 2114 my_task_data, return_address); 2115 } 2116 if (ompt_enabled.ompt_callback_sync_region) { 2117 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 2118 ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data, 2119 my_task_data, return_address); 2120 } 2121 taskdata->ompt_task_info.frame.enter_frame = ompt_data_none; 2122 } 2123 #endif // OMPT_SUPPORT && OMPT_OPTIONAL 2124 2125 } 2126 2127 KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, " 2128 "returning TASK_CURRENT_NOT_QUEUED\n", 2129 gtid, taskdata)); 2130 2131 return TASK_CURRENT_NOT_QUEUED; 2132 } 2133 2134 #if OMPT_SUPPORT && OMPT_OPTIONAL 2135 OMPT_NOINLINE 2136 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid, 2137 void *frame_address, 2138 void *return_address) { 2139 return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address, 2140 return_address); 2141 } 2142 #endif // OMPT_SUPPORT && OMPT_OPTIONAL 2143 2144 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are 2145 // complete 2146 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) { 2147 #if OMPT_SUPPORT && OMPT_OPTIONAL 2148 if (UNLIKELY(ompt_enabled.enabled)) { 2149 OMPT_STORE_RETURN_ADDRESS(gtid); 2150 return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0), 2151 OMPT_LOAD_RETURN_ADDRESS(gtid)); 2152 } 2153 #endif 2154 return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL); 2155 } 2156 2157 // __kmpc_omp_taskyield: switch to a different task 2158 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) { 2159 kmp_taskdata_t *taskdata = NULL; 2160 kmp_info_t *thread; 2161 int thread_finished = FALSE; 2162 2163 KMP_COUNT_BLOCK(OMP_TASKYIELD); 
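// (taskyield is a hint: when tasking is active the yielding thread may execute
// other tasks from the team's deques before returning, as the code below does;
// otherwise the construct degenerates to bookkeeping only.)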
2164 KMP_SET_THREAD_STATE_BLOCK(TASKYIELD); 2165 2166 KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n", 2167 gtid, loc_ref, end_part)); 2168 __kmp_assert_valid_gtid(gtid); 2169 2170 if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) { 2171 thread = __kmp_threads[gtid]; 2172 taskdata = thread->th.th_current_task; 2173 // Should we model this as a task wait or not? 2174 // Debugger: The taskwait is active. Store location and thread encountered the 2175 // taskwait. 2176 #if USE_ITT_BUILD 2177 // Note: These values are used by ITT events as well. 2178 #endif /* USE_ITT_BUILD */ 2179 taskdata->td_taskwait_counter += 1; 2180 taskdata->td_taskwait_ident = loc_ref; 2181 taskdata->td_taskwait_thread = gtid + 1; 2182 2183 #if USE_ITT_BUILD 2184 void *itt_sync_obj = NULL; 2185 #if USE_ITT_NOTIFY 2186 KMP_ITT_TASKWAIT_STARTING(itt_sync_obj); 2187 #endif /* USE_ITT_NOTIFY */ 2188 #endif /* USE_ITT_BUILD */ 2189 if (!taskdata->td_flags.team_serial) { 2190 kmp_task_team_t *task_team = thread->th.th_task_team; 2191 if (task_team != NULL) { 2192 if (KMP_TASKING_ENABLED(task_team)) { 2193 #if OMPT_SUPPORT 2194 if (UNLIKELY(ompt_enabled.enabled)) 2195 thread->th.ompt_thread_info.ompt_task_yielded = 1; 2196 #endif 2197 __kmp_execute_tasks_32( 2198 thread, gtid, (kmp_flag_32<> *)NULL, FALSE, 2199 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 2200 __kmp_task_stealing_constraint); 2201 #if OMPT_SUPPORT 2202 if (UNLIKELY(ompt_enabled.enabled)) 2203 thread->th.ompt_thread_info.ompt_task_yielded = 0; 2204 #endif 2205 } 2206 } 2207 } 2208 #if USE_ITT_BUILD 2209 KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj); 2210 #endif /* USE_ITT_BUILD */ 2211 2212 // Debugger: The taskwait is completed. Location remains, but thread is 2213 // negated. 2214 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; 2215 } 2216 2217 KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, " 2218 "returning TASK_CURRENT_NOT_QUEUED\n", 2219 gtid, taskdata)); 2220 2221 return TASK_CURRENT_NOT_QUEUED; 2222 } 2223 2224 // Task Reduction implementation 2225 // 2226 // Note: initial implementation didn't take into account the possibility 2227 // to specify omp_orig for initializer of the UDR (user defined reduction). 2228 // Corrected implementation takes into account the omp_orig object. 2229 // Compiler is free to use old implementation if omp_orig is not specified. 2230 2231 /*! 2232 @ingroup BASIC_TYPES 2233 @{ 2234 */ 2235 2236 /*! 2237 Flags for special info per task reduction item. 2238 */ 2239 typedef struct kmp_taskred_flags { 2240 /*! 1 - use lazy alloc/init (e.g. big objects, #tasks < #threads) */ 2241 unsigned lazy_priv : 1; 2242 unsigned reserved31 : 31; 2243 } kmp_taskred_flags_t; 2244 2245 /*! 2246 Internal struct for reduction data item related info set up by compiler. 2247 */ 2248 typedef struct kmp_task_red_input { 2249 void *reduce_shar; /**< shared between tasks item to reduce into */ 2250 size_t reduce_size; /**< size of data item in bytes */ 2251 // three compiler-generated routines (init, fini are optional): 2252 void *reduce_init; /**< data initialization routine (single parameter) */ 2253 void *reduce_fini; /**< data finalization routine */ 2254 void *reduce_comb; /**< data combiner routine */ 2255 kmp_taskred_flags_t flags; /**< flags for additional info from compiler */ 2256 } kmp_task_red_input_t; 2257 2258 /*! 2259 Internal struct for reduction data item related info saved by the library. 
2260 */ 2261 typedef struct kmp_taskred_data { 2262 void *reduce_shar; /**< shared between tasks item to reduce into */ 2263 size_t reduce_size; /**< size of data item */ 2264 kmp_taskred_flags_t flags; /**< flags for additional info from compiler */ 2265 void *reduce_priv; /**< array of thread specific items */ 2266 void *reduce_pend; /**< end of private data for faster comparison op */ 2267 // three compiler-generated routines (init, fini are optional): 2268 void *reduce_comb; /**< data combiner routine */ 2269 void *reduce_init; /**< data initialization routine (two parameters) */ 2270 void *reduce_fini; /**< data finalization routine */ 2271 void *reduce_orig; /**< original item (can be used in UDR initializer) */ 2272 } kmp_taskred_data_t; 2273 2274 /*! 2275 Internal struct for reduction data item related info set up by compiler. 2276 2277 New interface: added reduce_orig field to provide omp_orig for UDR initializer. 2278 */ 2279 typedef struct kmp_taskred_input { 2280 void *reduce_shar; /**< shared between tasks item to reduce into */ 2281 void *reduce_orig; /**< original reduction item used for initialization */ 2282 size_t reduce_size; /**< size of data item */ 2283 // three compiler-generated routines (init, fini are optional): 2284 void *reduce_init; /**< data initialization routine (two parameters) */ 2285 void *reduce_fini; /**< data finalization routine */ 2286 void *reduce_comb; /**< data combiner routine */ 2287 kmp_taskred_flags_t flags; /**< flags for additional info from compiler */ 2288 } kmp_taskred_input_t; 2289 /*! 2290 @} 2291 */ 2292 2293 template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src); 2294 template <> 2295 void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item, 2296 kmp_task_red_input_t &src) { 2297 item.reduce_orig = NULL; 2298 } 2299 template <> 2300 void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item, 2301 kmp_taskred_input_t &src) { 2302 if (src.reduce_orig != NULL) { 2303 item.reduce_orig = src.reduce_orig; 2304 } else { 2305 item.reduce_orig = src.reduce_shar; 2306 } // non-NULL reduce_orig means new interface used 2307 } 2308 2309 template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, size_t j); 2310 template <> 2311 void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item, 2312 size_t offset) { 2313 ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset); 2314 } 2315 template <> 2316 void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item, 2317 size_t offset) { 2318 ((void (*)(void *, void *))item.reduce_init)( 2319 (char *)(item.reduce_priv) + offset, item.reduce_orig); 2320 } 2321 2322 template <typename T> 2323 void *__kmp_task_reduction_init(int gtid, int num, T *data) { 2324 __kmp_assert_valid_gtid(gtid); 2325 kmp_info_t *thread = __kmp_threads[gtid]; 2326 kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup; 2327 kmp_uint32 nth = thread->th.th_team_nproc; 2328 kmp_taskred_data_t *arr; 2329 2330 // check input data just in case 2331 KMP_ASSERT(tg != NULL); 2332 KMP_ASSERT(data != NULL); 2333 KMP_ASSERT(num > 0); 2334 if (nth == 1) { 2335 KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n", 2336 gtid, tg)); 2337 return (void *)tg; 2338 } 2339 KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n", 2340 gtid, tg, num)); 2341 arr = (kmp_taskred_data_t *)__kmp_thread_malloc( 2342 thread, num * sizeof(kmp_taskred_data_t)); 2343 for (int i = 0; i < num; ++i) { 2344 size_t size = 
data[i].reduce_size - 1; 2345 // round the size up to cache line per thread-specific item 2346 size += CACHE_LINE - size % CACHE_LINE; 2347 KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory 2348 arr[i].reduce_shar = data[i].reduce_shar; 2349 arr[i].reduce_size = size; 2350 arr[i].flags = data[i].flags; 2351 arr[i].reduce_comb = data[i].reduce_comb; 2352 arr[i].reduce_init = data[i].reduce_init; 2353 arr[i].reduce_fini = data[i].reduce_fini; 2354 __kmp_assign_orig<T>(arr[i], data[i]); 2355 if (!arr[i].flags.lazy_priv) { 2356 // allocate cache-line aligned block and fill it with zeros 2357 arr[i].reduce_priv = __kmp_allocate(nth * size); 2358 arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size; 2359 if (arr[i].reduce_init != NULL) { 2360 // initialize all thread-specific items 2361 for (size_t j = 0; j < nth; ++j) { 2362 __kmp_call_init<T>(arr[i], j * size); 2363 } 2364 } 2365 } else { 2366 // only allocate space for pointers now, 2367 // objects will be lazily allocated/initialized if/when requested 2368 // note that __kmp_allocate zeroes the allocated memory 2369 arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *)); 2370 } 2371 } 2372 tg->reduce_data = (void *)arr; 2373 tg->reduce_num_data = num; 2374 return (void *)tg; 2375 } 2376 2377 /*! 2378 @ingroup TASKING 2379 @param gtid Global thread ID 2380 @param num Number of data items to reduce 2381 @param data Array of data for reduction 2382 @return The taskgroup identifier 2383 2384 Initialize task reduction for the taskgroup. 2385 2386 Note: this entry supposes the optional compiler-generated initializer routine 2387 has single parameter - pointer to object to be initialized. That means 2388 the reduction either does not use omp_orig object, or the omp_orig is accessible 2389 without help of the runtime library. 2390 */ 2391 void *__kmpc_task_reduction_init(int gtid, int num, void *data) { 2392 return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data); 2393 } 2394 2395 /*! 2396 @ingroup TASKING 2397 @param gtid Global thread ID 2398 @param num Number of data items to reduce 2399 @param data Array of data for reduction 2400 @return The taskgroup identifier 2401 2402 Initialize task reduction for the taskgroup. 2403 2404 Note: this entry supposes the optional compiler-generated initializer routine 2405 has two parameters, pointer to object to be initialized and pointer to omp_orig 2406 */ 2407 void *__kmpc_taskred_init(int gtid, int num, void *data) { 2408 return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data); 2409 } 2410 2411 // Copy task reduction data (except for shared pointers). 2412 template <typename T> 2413 void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data, 2414 kmp_taskgroup_t *tg, void *reduce_data) { 2415 kmp_taskred_data_t *arr; 2416 KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p," 2417 " from data %p\n", 2418 thr, tg, reduce_data)); 2419 arr = (kmp_taskred_data_t *)__kmp_thread_malloc( 2420 thr, num * sizeof(kmp_taskred_data_t)); 2421 // threads will share private copies, thunk routines, sizes, flags, etc.: 2422 KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t)); 2423 for (int i = 0; i < num; ++i) { 2424 arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers 2425 } 2426 tg->reduce_data = (void *)arr; 2427 tg->reduce_num_data = num; 2428 } 2429 2430 /*! 
2431 @ingroup TASKING 2432 @param gtid Global thread ID 2433 @param tskgrp The taskgroup ID (optional) 2434 @param data Shared location of the item 2435 @return The pointer to per-thread data 2436 2437 Get thread-specific location of data item 2438 */ 2439 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) { 2440 __kmp_assert_valid_gtid(gtid); 2441 kmp_info_t *thread = __kmp_threads[gtid]; 2442 kmp_int32 nth = thread->th.th_team_nproc; 2443 if (nth == 1) 2444 return data; // nothing to do 2445 2446 kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp; 2447 if (tg == NULL) 2448 tg = thread->th.th_current_task->td_taskgroup; 2449 KMP_ASSERT(tg != NULL); 2450 kmp_taskred_data_t *arr = (kmp_taskred_data_t *)(tg->reduce_data); 2451 kmp_int32 num = tg->reduce_num_data; 2452 kmp_int32 tid = thread->th.th_info.ds.ds_tid; 2453 2454 KMP_ASSERT(data != NULL); 2455 while (tg != NULL) { 2456 for (int i = 0; i < num; ++i) { 2457 if (!arr[i].flags.lazy_priv) { 2458 if (data == arr[i].reduce_shar || 2459 (data >= arr[i].reduce_priv && data < arr[i].reduce_pend)) 2460 return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size; 2461 } else { 2462 // check shared location first 2463 void **p_priv = (void **)(arr[i].reduce_priv); 2464 if (data == arr[i].reduce_shar) 2465 goto found; 2466 // check if we get some thread specific location as parameter 2467 for (int j = 0; j < nth; ++j) 2468 if (data == p_priv[j]) 2469 goto found; 2470 continue; // not found, continue search 2471 found: 2472 if (p_priv[tid] == NULL) { 2473 // allocate thread specific object lazily 2474 p_priv[tid] = __kmp_allocate(arr[i].reduce_size); 2475 if (arr[i].reduce_init != NULL) { 2476 if (arr[i].reduce_orig != NULL) { // new interface 2477 ((void (*)(void *, void *))arr[i].reduce_init)( 2478 p_priv[tid], arr[i].reduce_orig); 2479 } else { // old interface (single parameter) 2480 ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]); 2481 } 2482 } 2483 } 2484 return p_priv[tid]; 2485 } 2486 } 2487 tg = tg->parent; 2488 arr = (kmp_taskred_data_t *)(tg->reduce_data); 2489 num = tg->reduce_num_data; 2490 } 2491 KMP_ASSERT2(0, "Unknown task reduction item"); 2492 return NULL; // ERROR, this line never executed 2493 } 2494 2495 // Finalize task reduction. 
2496 // Called from __kmpc_end_taskgroup() 2497 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) { 2498 kmp_int32 nth = th->th.th_team_nproc; 2499 KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1 2500 kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data; 2501 kmp_int32 num = tg->reduce_num_data; 2502 for (int i = 0; i < num; ++i) { 2503 void *sh_data = arr[i].reduce_shar; 2504 void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini); 2505 void (*f_comb)(void *, void *) = 2506 (void (*)(void *, void *))(arr[i].reduce_comb); 2507 if (!arr[i].flags.lazy_priv) { 2508 void *pr_data = arr[i].reduce_priv; 2509 size_t size = arr[i].reduce_size; 2510 for (int j = 0; j < nth; ++j) { 2511 void *priv_data = (char *)pr_data + j * size; 2512 f_comb(sh_data, priv_data); // combine results 2513 if (f_fini) 2514 f_fini(priv_data); // finalize if needed 2515 } 2516 } else { 2517 void **pr_data = (void **)(arr[i].reduce_priv); 2518 for (int j = 0; j < nth; ++j) { 2519 if (pr_data[j] != NULL) { 2520 f_comb(sh_data, pr_data[j]); // combine results 2521 if (f_fini) 2522 f_fini(pr_data[j]); // finalize if needed 2523 __kmp_free(pr_data[j]); 2524 } 2525 } 2526 } 2527 __kmp_free(arr[i].reduce_priv); 2528 } 2529 __kmp_thread_free(th, arr); 2530 tg->reduce_data = NULL; 2531 tg->reduce_num_data = 0; 2532 } 2533 2534 // Cleanup task reduction data for parallel or worksharing, 2535 // do not touch task private data other threads still working with. 2536 // Called from __kmpc_end_taskgroup() 2537 static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) { 2538 __kmp_thread_free(th, tg->reduce_data); 2539 tg->reduce_data = NULL; 2540 tg->reduce_num_data = 0; 2541 } 2542 2543 template <typename T> 2544 void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws, 2545 int num, T *data) { 2546 __kmp_assert_valid_gtid(gtid); 2547 kmp_info_t *thr = __kmp_threads[gtid]; 2548 kmp_int32 nth = thr->th.th_team_nproc; 2549 __kmpc_taskgroup(loc, gtid); // form new taskgroup first 2550 if (nth == 1) { 2551 KA_TRACE(10, 2552 ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n", 2553 gtid, thr->th.th_current_task->td_taskgroup)); 2554 return (void *)thr->th.th_current_task->td_taskgroup; 2555 } 2556 kmp_team_t *team = thr->th.th_team; 2557 void *reduce_data; 2558 kmp_taskgroup_t *tg; 2559 reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]); 2560 if (reduce_data == NULL && 2561 __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data, 2562 (void *)1)) { 2563 // single thread enters this block to initialize common reduction data 2564 KMP_DEBUG_ASSERT(reduce_data == NULL); 2565 // first initialize own data, then make a copy other threads can use 2566 tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data); 2567 reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t)); 2568 KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t)); 2569 // fini counters should be 0 at this point 2570 KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0); 2571 KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0); 2572 KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data); 2573 } else { 2574 while ( 2575 (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) == 2576 (void *)1) { // wait for task reduction initialization 2577 KMP_CPU_PAUSE(); 2578 } 2579 KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer 
here 2580 tg = thr->th.th_current_task->td_taskgroup; 2581 __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data); 2582 } 2583 return tg; 2584 } 2585 2586 /*! 2587 @ingroup TASKING 2588 @param loc Source location info 2589 @param gtid Global thread ID 2590 @param is_ws Is 1 if the reduction is for worksharing, 0 otherwise 2591 @param num Number of data items to reduce 2592 @param data Array of data for reduction 2593 @return The taskgroup identifier 2594 2595 Initialize task reduction for a parallel or worksharing. 2596 2597 Note: this entry supposes the optional compiler-generated initializer routine 2598 has single parameter - pointer to object to be initialized. That means 2599 the reduction either does not use omp_orig object, or the omp_orig is accessible 2600 without help of the runtime library. 2601 */ 2602 void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws, 2603 int num, void *data) { 2604 return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num, 2605 (kmp_task_red_input_t *)data); 2606 } 2607 2608 /*! 2609 @ingroup TASKING 2610 @param loc Source location info 2611 @param gtid Global thread ID 2612 @param is_ws Is 1 if the reduction is for worksharing, 0 otherwise 2613 @param num Number of data items to reduce 2614 @param data Array of data for reduction 2615 @return The taskgroup identifier 2616 2617 Initialize task reduction for a parallel or worksharing. 2618 2619 Note: this entry supposes the optional compiler-generated initializer routine 2620 has two parameters, pointer to object to be initialized and pointer to omp_orig 2621 */ 2622 void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num, 2623 void *data) { 2624 return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num, 2625 (kmp_taskred_input_t *)data); 2626 } 2627 2628 /*! 2629 @ingroup TASKING 2630 @param loc Source location info 2631 @param gtid Global thread ID 2632 @param is_ws Is 1 if the reduction is for worksharing, 0 otherwise 2633 2634 Finalize task reduction for a parallel or worksharing. 2635 */ 2636 void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) { 2637 __kmpc_end_taskgroup(loc, gtid); 2638 } 2639 2640 // __kmpc_taskgroup: Start a new taskgroup 2641 void __kmpc_taskgroup(ident_t *loc, int gtid) { 2642 __kmp_assert_valid_gtid(gtid); 2643 kmp_info_t *thread = __kmp_threads[gtid]; 2644 kmp_taskdata_t *taskdata = thread->th.th_current_task; 2645 kmp_taskgroup_t *tg_new = 2646 (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t)); 2647 KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new)); 2648 KMP_ATOMIC_ST_RLX(&tg_new->count, 0); 2649 KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq); 2650 tg_new->parent = taskdata->td_taskgroup; 2651 tg_new->reduce_data = NULL; 2652 tg_new->reduce_num_data = 0; 2653 tg_new->gomp_data = NULL; 2654 taskdata->td_taskgroup = tg_new; 2655 2656 #if OMPT_SUPPORT && OMPT_OPTIONAL 2657 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) { 2658 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2659 if (!codeptr) 2660 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2661 kmp_team_t *team = thread->th.th_team; 2662 ompt_data_t my_task_data = taskdata->ompt_task_info.task_data; 2663 // FIXME: I think this is wrong for lwt! 
2664 ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data; 2665 2666 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 2667 ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data), 2668 &(my_task_data), codeptr); 2669 } 2670 #endif 2671 } 2672 2673 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task 2674 // and its descendants are complete 2675 void __kmpc_end_taskgroup(ident_t *loc, int gtid) { 2676 __kmp_assert_valid_gtid(gtid); 2677 kmp_info_t *thread = __kmp_threads[gtid]; 2678 kmp_taskdata_t *taskdata = thread->th.th_current_task; 2679 kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup; 2680 int thread_finished = FALSE; 2681 2682 #if OMPT_SUPPORT && OMPT_OPTIONAL 2683 kmp_team_t *team; 2684 ompt_data_t my_task_data; 2685 ompt_data_t my_parallel_data; 2686 void *codeptr = nullptr; 2687 if (UNLIKELY(ompt_enabled.enabled)) { 2688 team = thread->th.th_team; 2689 my_task_data = taskdata->ompt_task_info.task_data; 2690 // FIXME: I think this is wrong for lwt! 2691 my_parallel_data = team->t.ompt_team_info.parallel_data; 2692 codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2693 if (!codeptr) 2694 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2695 } 2696 #endif 2697 2698 KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc)); 2699 KMP_DEBUG_ASSERT(taskgroup != NULL); 2700 KMP_SET_THREAD_STATE_BLOCK(TASKGROUP); 2701 2702 if (__kmp_tasking_mode != tskm_immediate_exec) { 2703 // mark task as waiting not on a barrier 2704 taskdata->td_taskwait_counter += 1; 2705 taskdata->td_taskwait_ident = loc; 2706 taskdata->td_taskwait_thread = gtid + 1; 2707 #if USE_ITT_BUILD 2708 // For ITT the taskgroup wait is similar to taskwait until we need to 2709 // distinguish them 2710 void *itt_sync_obj = NULL; 2711 #if USE_ITT_NOTIFY 2712 KMP_ITT_TASKWAIT_STARTING(itt_sync_obj); 2713 #endif /* USE_ITT_NOTIFY */ 2714 #endif /* USE_ITT_BUILD */ 2715 2716 #if OMPT_SUPPORT && OMPT_OPTIONAL 2717 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) { 2718 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 2719 ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data), 2720 &(my_task_data), codeptr); 2721 } 2722 #endif 2723 2724 if (!taskdata->td_flags.team_serial || 2725 (thread->th.th_task_team != NULL && 2726 (thread->th.th_task_team->tt.tt_found_proxy_tasks || 2727 thread->th.th_task_team->tt.tt_hidden_helper_task_encountered))) { 2728 kmp_flag_32<false, false> flag( 2729 RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 0U); 2730 while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) { 2731 flag.execute_tasks(thread, gtid, FALSE, 2732 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 2733 __kmp_task_stealing_constraint); 2734 } 2735 } 2736 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting 2737 2738 #if OMPT_SUPPORT && OMPT_OPTIONAL 2739 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) { 2740 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 2741 ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data), 2742 &(my_task_data), codeptr); 2743 } 2744 #endif 2745 2746 #if USE_ITT_BUILD 2747 KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj); 2748 KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants 2749 #endif /* USE_ITT_BUILD */ 2750 } 2751 KMP_DEBUG_ASSERT(taskgroup->count == 0); 2752 2753 if (taskgroup->reduce_data != NULL && 2754 !taskgroup->gomp_data) { // need to reduce? 
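// Three finalization paths are distinguished below, depending on whether this
// taskgroup's private copies are the ones registered in the team structure:
// - t_tg_reduce_data[0] matches: reduction attached to a parallel region; the
//   last thread to arrive finalizes, all other threads only clean their copy.
// - t_tg_reduce_data[1] matches: the same protocol for a worksharing construct.
// - otherwise: a plain taskgroup reduction, finalized unconditionally here.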
2755 int cnt; 2756 void *reduce_data; 2757 kmp_team_t *t = thread->th.th_team; 2758 kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data; 2759 // check if <priv> data of the first reduction variable shared for the team 2760 void *priv0 = arr[0].reduce_priv; 2761 if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL && 2762 ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) { 2763 // finishing task reduction on parallel 2764 cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]); 2765 if (cnt == thread->th.th_team_nproc - 1) { 2766 // we are the last thread passing __kmpc_reduction_modifier_fini() 2767 // finalize task reduction: 2768 __kmp_task_reduction_fini(thread, taskgroup); 2769 // cleanup fields in the team structure: 2770 // TODO: is relaxed store enough here (whole barrier should follow)? 2771 __kmp_thread_free(thread, reduce_data); 2772 KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL); 2773 KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0); 2774 } else { 2775 // we are not the last thread passing __kmpc_reduction_modifier_fini(), 2776 // so do not finalize reduction, just clean own copy of the data 2777 __kmp_task_reduction_clean(thread, taskgroup); 2778 } 2779 } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) != 2780 NULL && 2781 ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) { 2782 // finishing task reduction on worksharing 2783 cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]); 2784 if (cnt == thread->th.th_team_nproc - 1) { 2785 // we are the last thread passing __kmpc_reduction_modifier_fini() 2786 __kmp_task_reduction_fini(thread, taskgroup); 2787 // cleanup fields in team structure: 2788 // TODO: is relaxed store enough here (whole barrier should follow)? 2789 __kmp_thread_free(thread, reduce_data); 2790 KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL); 2791 KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0); 2792 } else { 2793 // we are not the last thread passing __kmpc_reduction_modifier_fini(), 2794 // so do not finalize reduction, just clean own copy of the data 2795 __kmp_task_reduction_clean(thread, taskgroup); 2796 } 2797 } else { 2798 // finishing task reduction on taskgroup 2799 __kmp_task_reduction_fini(thread, taskgroup); 2800 } 2801 } 2802 // Restore parent taskgroup for the current task 2803 taskdata->td_taskgroup = taskgroup->parent; 2804 __kmp_thread_free(thread, taskgroup); 2805 2806 KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n", 2807 gtid, taskdata)); 2808 2809 #if OMPT_SUPPORT && OMPT_OPTIONAL 2810 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) { 2811 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 2812 ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data), 2813 &(my_task_data), codeptr); 2814 } 2815 #endif 2816 } 2817 2818 static kmp_task_t *__kmp_get_priority_task(kmp_int32 gtid, 2819 kmp_task_team_t *task_team, 2820 kmp_int32 is_constrained) { 2821 kmp_task_t *task = NULL; 2822 kmp_taskdata_t *taskdata; 2823 kmp_taskdata_t *current; 2824 kmp_thread_data_t *thread_data; 2825 int ntasks = task_team->tt.tt_num_task_pri; 2826 if (ntasks == 0) { 2827 KA_TRACE( 2828 20, ("__kmp_get_priority_task(exit #1): T#%d No tasks to get\n", gtid)); 2829 return NULL; 2830 } 2831 do { 2832 // decrement num_tasks to "reserve" one task to get for execution 2833 if (__kmp_atomic_compare_store(&task_team->tt.tt_num_task_pri, ntasks, 2834 ntasks - 1)) 2835 break; 2836 } while (ntasks > 0); 2837 if (ntasks == 0) { 2838 KA_TRACE(20, 
("__kmp_get_priority_task(exit #2): T#%d No tasks to get\n", 2839 __kmp_get_gtid())); 2840 return NULL; 2841 } 2842 // We got a "ticket" to get a "reserved" priority task 2843 int deque_ntasks; 2844 kmp_task_pri_t *list = task_team->tt.tt_task_pri_list; 2845 do { 2846 KMP_ASSERT(list != NULL); 2847 thread_data = &list->td; 2848 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 2849 deque_ntasks = thread_data->td.td_deque_ntasks; 2850 if (deque_ntasks == 0) { 2851 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2852 KA_TRACE(20, ("__kmp_get_priority_task: T#%d No tasks to get from %p\n", 2853 __kmp_get_gtid(), thread_data)); 2854 list = list->next; 2855 } 2856 } while (deque_ntasks == 0); 2857 KMP_DEBUG_ASSERT(deque_ntasks); 2858 int target = thread_data->td.td_deque_head; 2859 current = __kmp_threads[gtid]->th.th_current_task; 2860 taskdata = thread_data->td.td_deque[target]; 2861 if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) { 2862 // Bump head pointer and Wrap. 2863 thread_data->td.td_deque_head = 2864 (target + 1) & TASK_DEQUE_MASK(thread_data->td); 2865 } else { 2866 if (!task_team->tt.tt_untied_task_encountered) { 2867 // The TSC does not allow to steal victim task 2868 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2869 KA_TRACE(20, ("__kmp_get_priority_task(exit #3): T#%d could not get task " 2870 "from %p: task_team=%p ntasks=%d head=%u tail=%u\n", 2871 gtid, thread_data, task_team, deque_ntasks, target, 2872 thread_data->td.td_deque_tail)); 2873 task_team->tt.tt_num_task_pri++; // atomic inc, restore value 2874 return NULL; 2875 } 2876 int i; 2877 // walk through the deque trying to steal any task 2878 taskdata = NULL; 2879 for (i = 1; i < deque_ntasks; ++i) { 2880 target = (target + 1) & TASK_DEQUE_MASK(thread_data->td); 2881 taskdata = thread_data->td.td_deque[target]; 2882 if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) { 2883 break; // found task to execute 2884 } else { 2885 taskdata = NULL; 2886 } 2887 } 2888 if (taskdata == NULL) { 2889 // No appropriate candidate found to execute 2890 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2891 KA_TRACE( 2892 10, ("__kmp_get_priority_task(exit #4): T#%d could not get task from " 2893 "%p: task_team=%p ntasks=%d head=%u tail=%u\n", 2894 gtid, thread_data, task_team, deque_ntasks, 2895 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2896 task_team->tt.tt_num_task_pri++; // atomic inc, restore value 2897 return NULL; 2898 } 2899 int prev = target; 2900 for (i = i + 1; i < deque_ntasks; ++i) { 2901 // shift remaining tasks in the deque left by 1 2902 target = (target + 1) & TASK_DEQUE_MASK(thread_data->td); 2903 thread_data->td.td_deque[prev] = thread_data->td.td_deque[target]; 2904 prev = target; 2905 } 2906 KMP_DEBUG_ASSERT( 2907 thread_data->td.td_deque_tail == 2908 (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(thread_data->td))); 2909 thread_data->td.td_deque_tail = target; // tail -= 1 (wrapped)) 2910 } 2911 thread_data->td.td_deque_ntasks = deque_ntasks - 1; 2912 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2913 task = KMP_TASKDATA_TO_TASK(taskdata); 2914 return task; 2915 } 2916 2917 // __kmp_remove_my_task: remove a task from my own deque 2918 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid, 2919 kmp_task_team_t *task_team, 2920 kmp_int32 is_constrained) { 2921 kmp_task_t *task; 2922 kmp_taskdata_t *taskdata; 2923 kmp_thread_data_t *thread_data; 2924 kmp_uint32 tail; 2925 
2926 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 2927 KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data != 2928 NULL); // Caller should check this condition 2929 2930 thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)]; 2931 2932 KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n", 2933 gtid, thread_data->td.td_deque_ntasks, 2934 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2935 2936 if (TCR_4(thread_data->td.td_deque_ntasks) == 0) { 2937 KA_TRACE(10, 2938 ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: " 2939 "ntasks=%d head=%u tail=%u\n", 2940 gtid, thread_data->td.td_deque_ntasks, 2941 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2942 return NULL; 2943 } 2944 2945 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 2946 2947 if (TCR_4(thread_data->td.td_deque_ntasks) == 0) { 2948 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2949 KA_TRACE(10, 2950 ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: " 2951 "ntasks=%d head=%u tail=%u\n", 2952 gtid, thread_data->td.td_deque_ntasks, 2953 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2954 return NULL; 2955 } 2956 2957 tail = (thread_data->td.td_deque_tail - 1) & 2958 TASK_DEQUE_MASK(thread_data->td); // Wrap index. 2959 taskdata = thread_data->td.td_deque[tail]; 2960 2961 if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata, 2962 thread->th.th_current_task)) { 2963 // The TSC does not allow to steal victim task 2964 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2965 KA_TRACE(10, 2966 ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: " 2967 "ntasks=%d head=%u tail=%u\n", 2968 gtid, thread_data->td.td_deque_ntasks, 2969 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2970 return NULL; 2971 } 2972 2973 thread_data->td.td_deque_tail = tail; 2974 TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1); 2975 2976 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2977 2978 KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: " 2979 "ntasks=%d head=%u tail=%u\n", 2980 gtid, taskdata, thread_data->td.td_deque_ntasks, 2981 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2982 2983 task = KMP_TASKDATA_TO_TASK(taskdata); 2984 return task; 2985 } 2986 2987 // __kmp_steal_task: remove a task from another thread's deque 2988 // Assume that calling thread has already checked existence of 2989 // task_team thread_data before calling this routine. 
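// Note on the queueing discipline: the owner pops from the tail of its own
// deque (__kmp_remove_my_task above), while a thief takes from the victim's
// head, or scans forward from the head when the task scheduling constraint
// blocks the head task. A thread therefore runs its own tasks roughly LIFO,
// while stolen tasks are taken roughly FIFO.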
2990 static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid, 2991 kmp_task_team_t *task_team, 2992 std::atomic<kmp_int32> *unfinished_threads, 2993 int *thread_finished, 2994 kmp_int32 is_constrained) { 2995 kmp_task_t *task; 2996 kmp_taskdata_t *taskdata; 2997 kmp_taskdata_t *current; 2998 kmp_thread_data_t *victim_td, *threads_data; 2999 kmp_int32 target; 3000 kmp_int32 victim_tid; 3001 3002 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 3003 3004 threads_data = task_team->tt.tt_threads_data; 3005 KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition 3006 3007 victim_tid = victim_thr->th.th_info.ds.ds_tid; 3008 victim_td = &threads_data[victim_tid]; 3009 3010 KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: " 3011 "task_team=%p ntasks=%d head=%u tail=%u\n", 3012 gtid, __kmp_gtid_from_thread(victim_thr), task_team, 3013 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head, 3014 victim_td->td.td_deque_tail)); 3015 3016 if (TCR_4(victim_td->td.td_deque_ntasks) == 0) { 3017 KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: " 3018 "task_team=%p ntasks=%d head=%u tail=%u\n", 3019 gtid, __kmp_gtid_from_thread(victim_thr), task_team, 3020 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head, 3021 victim_td->td.td_deque_tail)); 3022 return NULL; 3023 } 3024 3025 __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock); 3026 3027 int ntasks = TCR_4(victim_td->td.td_deque_ntasks); 3028 // Check again after we acquire the lock 3029 if (ntasks == 0) { 3030 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); 3031 KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: " 3032 "task_team=%p ntasks=%d head=%u tail=%u\n", 3033 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks, 3034 victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); 3035 return NULL; 3036 } 3037 3038 KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL); 3039 current = __kmp_threads[gtid]->th.th_current_task; 3040 taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head]; 3041 if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) { 3042 // Bump head pointer and Wrap. 
3043 victim_td->td.td_deque_head = 3044 (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td); 3045 } else { 3046 if (!task_team->tt.tt_untied_task_encountered) { 3047 // The TSC does not allow to steal victim task 3048 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); 3049 KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from " 3050 "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n", 3051 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks, 3052 victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); 3053 return NULL; 3054 } 3055 int i; 3056 // walk through victim's deque trying to steal any task 3057 target = victim_td->td.td_deque_head; 3058 taskdata = NULL; 3059 for (i = 1; i < ntasks; ++i) { 3060 target = (target + 1) & TASK_DEQUE_MASK(victim_td->td); 3061 taskdata = victim_td->td.td_deque[target]; 3062 if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) { 3063 break; // found victim task 3064 } else { 3065 taskdata = NULL; 3066 } 3067 } 3068 if (taskdata == NULL) { 3069 // No appropriate candidate to steal found 3070 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); 3071 KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from " 3072 "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n", 3073 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks, 3074 victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); 3075 return NULL; 3076 } 3077 int prev = target; 3078 for (i = i + 1; i < ntasks; ++i) { 3079 // shift remaining tasks in the deque left by 1 3080 target = (target + 1) & TASK_DEQUE_MASK(victim_td->td); 3081 victim_td->td.td_deque[prev] = victim_td->td.td_deque[target]; 3082 prev = target; 3083 } 3084 KMP_DEBUG_ASSERT( 3085 victim_td->td.td_deque_tail == 3086 (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td))); 3087 victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)) 3088 } 3089 if (*thread_finished) { 3090 // We need to un-mark this victim as a finished victim. This must be done 3091 // before releasing the lock, or else other threads (starting with the 3092 // primary thread victim) might be prematurely released from the barrier!!! 3093 #if KMP_DEBUG 3094 kmp_int32 count = 3095 #endif 3096 KMP_ATOMIC_INC(unfinished_threads); 3097 KA_TRACE( 3098 20, 3099 ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n", 3100 gtid, count + 1, task_team)); 3101 *thread_finished = FALSE; 3102 } 3103 TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1); 3104 3105 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); 3106 3107 KMP_COUNT_BLOCK(TASK_stolen); 3108 KA_TRACE(10, 3109 ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: " 3110 "task_team=%p ntasks=%d head=%u tail=%u\n", 3111 gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team, 3112 ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); 3113 3114 task = KMP_TASKDATA_TO_TASK(taskdata); 3115 return task; 3116 } 3117 3118 // __kmp_execute_tasks_template: Choose and execute tasks until either the 3119 // condition is statisfied (return true) or there are none left (return false). 3120 // 3121 // final_spin is TRUE if this is the spin at the release barrier. 3122 // thread_finished indicates whether the thread is finished executing all 3123 // the tasks it has on its deque, and is at the release barrier. 3124 // spinner is the location on which to spin. 3125 // spinner == NULL means only execute a single task and return. 3126 // checker is the value to check to terminate the spin. 
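// Callers normally reach this template through the typed wrappers defined
// after it, usually via a flag object's execute_tasks() method; see
// __kmp_tasking_barrier() later in this file for the pattern:
//
//   kmp_flag_32<false, false> spin_flag(spin, 0U);
//   spin_flag.execute_tasks(thread, gtid, TRUE /* final_spin */,
//                           &flag USE_ITT_BUILD_ARG(NULL), 0);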
3127 template <class C> 3128 static inline int __kmp_execute_tasks_template( 3129 kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin, 3130 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), 3131 kmp_int32 is_constrained) { 3132 kmp_task_team_t *task_team = thread->th.th_task_team; 3133 kmp_thread_data_t *threads_data; 3134 kmp_task_t *task; 3135 kmp_info_t *other_thread; 3136 kmp_taskdata_t *current_task = thread->th.th_current_task; 3137 std::atomic<kmp_int32> *unfinished_threads; 3138 kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0, 3139 tid = thread->th.th_info.ds.ds_tid; 3140 3141 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 3142 KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]); 3143 3144 if (task_team == NULL || current_task == NULL) 3145 return FALSE; 3146 3147 KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d " 3148 "*thread_finished=%d\n", 3149 gtid, final_spin, *thread_finished)); 3150 3151 thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 3152 threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data); 3153 3154 KMP_DEBUG_ASSERT(threads_data != NULL); 3155 3156 nthreads = task_team->tt.tt_nproc; 3157 unfinished_threads = &(task_team->tt.tt_unfinished_threads); 3158 KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks || 3159 task_team->tt.tt_hidden_helper_task_encountered); 3160 KMP_DEBUG_ASSERT(*unfinished_threads >= 0); 3161 3162 while (1) { // Outer loop keeps trying to find tasks in case of single thread 3163 // getting tasks from target constructs 3164 while (1) { // Inner loop to find a task and execute it 3165 task = NULL; 3166 if (task_team->tt.tt_num_task_pri) { // get priority task first 3167 task = __kmp_get_priority_task(gtid, task_team, is_constrained); 3168 } 3169 if (task == NULL && use_own_tasks) { // check own queue next 3170 task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained); 3171 } 3172 if ((task == NULL) && (nthreads > 1)) { // Steal a task finally 3173 int asleep = 1; 3174 use_own_tasks = 0; 3175 // Try to steal from the last place I stole from successfully. 3176 if (victim_tid == -2) { // haven't stolen anything yet 3177 victim_tid = threads_data[tid].td.td_deque_last_stolen; 3178 if (victim_tid != 3179 -1) // if we have a last stolen from victim, get the thread 3180 other_thread = threads_data[victim_tid].td.td_thr; 3181 } 3182 if (victim_tid != -1) { // found last victim 3183 asleep = 0; 3184 } else if (!new_victim) { // no recent steals and we haven't already 3185 // used a new victim; select a random thread 3186 do { // Find a different thread to steal work from. 3187 // Pick a random thread. Initial plan was to cycle through all the 3188 // threads, and only return if we tried to steal from every thread, 3189 // and failed. Arch says that's not such a great idea. 3190 victim_tid = __kmp_get_random(thread) % (nthreads - 1); 3191 if (victim_tid >= tid) { 3192 ++victim_tid; // Adjusts random distribution to exclude self 3193 } 3194 // Found a potential victim 3195 other_thread = threads_data[victim_tid].td.td_thr; 3196 // There is a slight chance that __kmp_enable_tasking() did not wake 3197 // up all threads waiting at the barrier. If victim is sleeping, 3198 // then wake it up. Since we were going to pay the cache miss 3199 // penalty for referencing another thread's kmp_info_t struct 3200 // anyway, 3201 // the check shouldn't cost too much performance at this point. 
In 3202 // extra barrier mode, tasks do not sleep at the separate tasking 3203 // barrier, so this isn't a problem. 3204 asleep = 0; 3205 if ((__kmp_tasking_mode == tskm_task_teams) && 3206 (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) && 3207 (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) != 3208 NULL)) { 3209 asleep = 1; 3210 __kmp_null_resume_wrapper(other_thread); 3211 // A sleeping thread should not have any tasks on it's queue. 3212 // There is a slight possibility that it resumes, steals a task 3213 // from another thread, which spawns more tasks, all in the time 3214 // that it takes this thread to check => don't write an assertion 3215 // that the victim's queue is empty. Try stealing from a 3216 // different thread. 3217 } 3218 } while (asleep); 3219 } 3220 3221 if (!asleep) { 3222 // We have a victim to try to steal from 3223 task = __kmp_steal_task(other_thread, gtid, task_team, 3224 unfinished_threads, thread_finished, 3225 is_constrained); 3226 } 3227 if (task != NULL) { // set last stolen to victim 3228 if (threads_data[tid].td.td_deque_last_stolen != victim_tid) { 3229 threads_data[tid].td.td_deque_last_stolen = victim_tid; 3230 // The pre-refactored code did not try more than 1 successful new 3231 // vicitm, unless the last one generated more local tasks; 3232 // new_victim keeps track of this 3233 new_victim = 1; 3234 } 3235 } else { // No tasks found; unset last_stolen 3236 KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1); 3237 victim_tid = -2; // no successful victim found 3238 } 3239 } 3240 3241 if (task == NULL) 3242 break; // break out of tasking loop 3243 3244 // Found a task; execute it 3245 #if USE_ITT_BUILD && USE_ITT_NOTIFY 3246 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { 3247 if (itt_sync_obj == NULL) { // we are at fork barrier where we could not 3248 // get the object reliably 3249 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 3250 } 3251 __kmp_itt_task_starting(itt_sync_obj); 3252 } 3253 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 3254 __kmp_invoke_task(gtid, task, current_task); 3255 #if USE_ITT_BUILD 3256 if (itt_sync_obj != NULL) 3257 __kmp_itt_task_finished(itt_sync_obj); 3258 #endif /* USE_ITT_BUILD */ 3259 // If this thread is only partway through the barrier and the condition is 3260 // met, then return now, so that the barrier gather/release pattern can 3261 // proceed. If this thread is in the last spin loop in the barrier, 3262 // waiting to be released, we know that the termination condition will not 3263 // be satisfied, so don't waste any cycles checking it. 3264 if (flag == NULL || (!final_spin && flag->done_check())) { 3265 KA_TRACE( 3266 15, 3267 ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n", 3268 gtid)); 3269 return TRUE; 3270 } 3271 if (thread->th.th_task_team == NULL) { 3272 break; 3273 } 3274 KMP_YIELD(__kmp_library == library_throughput); // Yield before next task 3275 // If execution of a stolen task results in more tasks being placed on our 3276 // run queue, reset use_own_tasks 3277 if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) { 3278 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned " 3279 "other tasks, restart\n", 3280 gtid)); 3281 use_own_tasks = 1; 3282 new_victim = 0; 3283 } 3284 } 3285 3286 // The task source has been exhausted. If in final spin loop of barrier, 3287 // check if termination condition is satisfied. The work queue may be empty 3288 // but there might be proxy tasks still executing. 
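    // (Proxy/detached tasks are accounted for in td_incomplete_child_tasks of
    // their parent task just like regular children, so the final_spin
    // bookkeeping below is not performed while the current task still has
    // such children outstanding.)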
    if (final_spin &&
        KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {
      // First, decrement the #unfinished threads, if that has not already been
      // done. This decrement might be to the spin location, and result in the
      // termination condition being satisfied.
      if (!*thread_finished) {
#if KMP_DEBUG
        kmp_int32 count = -1 +
#endif
            KMP_ATOMIC_DEC(unfinished_threads);
        KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
                      "unfinished_threads to %d task_team=%p\n",
                      gtid, count, task_team));
        *thread_finished = TRUE;
      }

      // It is now unsafe to reference thread->th.th_team !!!
      // Decrementing task_team->tt.tt_unfinished_threads can allow the primary
      // thread to pass through the barrier, where it might reset each thread's
      // th.th_team field for the next parallel region. If we can steal more
      // work, we know that this has not happened yet.
      if (flag != NULL && flag->done_check()) {
        KA_TRACE(
            15,
            ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
             gtid));
        return TRUE;
      }
    }

    // If this thread's task team is NULL, primary thread has recognized that
    // there are no more tasks; bail out
    if (thread->th.th_task_team == NULL) {
      KA_TRACE(15,
               ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
      return FALSE;
    }

    // Check the flag again to see if it has already been satisfied; otherwise
    // we could be trapped in an infinite loop when an if0 task depends on a
    // hidden helper task outside any parallel region. Detached tasks are not
    // impacted in this case because the only thread executing this function
    // has to execute the proxy task, so it is in another code path that has
    // the same check.
    if (flag == NULL || (!final_spin && flag->done_check())) {
      KA_TRACE(15,
               ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
                gtid));
      return TRUE;
    }

    // We could be getting tasks from target constructs; if this is the only
    // thread, keep trying to execute tasks from own queue
    if (nthreads == 1 &&
        KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks))
      use_own_tasks = 1;
    else {
      KA_TRACE(15,
               ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
      return FALSE;
    }
  }
}

template <bool C, bool S>
int __kmp_execute_tasks_32(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32<C, S> *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

template <bool C, bool S>
int __kmp_execute_tasks_64(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64<C, S> *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

template <bool C, bool S>
int __kmp_atomic_execute_tasks_64(
    kmp_info_t *thread, kmp_int32 gtid, kmp_atomic_flag_64<C, S> *flag,
    int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

int __kmp_execute_tasks_oncore(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

template int
__kmp_execute_tasks_32<false, false>(kmp_info_t *, kmp_int32,
                                     kmp_flag_32<false, false> *, int,
                                     int *USE_ITT_BUILD_ARG(void *), kmp_int32);

template int __kmp_execute_tasks_64<false, true>(kmp_info_t *, kmp_int32,
                                                 kmp_flag_64<false, true> *,
                                                 int,
                                                 int *USE_ITT_BUILD_ARG(void *),
                                                 kmp_int32);

template int __kmp_execute_tasks_64<true, false>(kmp_info_t *, kmp_int32,
                                                 kmp_flag_64<true, false> *,
                                                 int,
                                                 int *USE_ITT_BUILD_ARG(void *),
                                                 kmp_int32);

template int __kmp_atomic_execute_tasks_64<false, true>(
    kmp_info_t *, kmp_int32, kmp_atomic_flag_64<false, true> *, int,
    int *USE_ITT_BUILD_ARG(void *), kmp_int32);

template int __kmp_atomic_execute_tasks_64<true, false>(
    kmp_info_t *, kmp_int32, kmp_atomic_flag_64<true, false> *, int,
    int *USE_ITT_BUILD_ARG(void *), kmp_int32);

// __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
// next barrier so they can assist in executing enqueued tasks.
// First thread in allocates the task team atomically.
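// A sketch of user code that reaches this path (illustrative only; do_work()
// and n are placeholders): when tasking is not yet enabled for the team,
// pushing the first explicit task triggers this routine, which wakes workers
// sleeping at the barrier so they can help execute the enqueued tasks:
//
//   #pragma omp parallel
//   #pragma omp single
//   for (int i = 0; i < n; ++i) {
//   #pragma omp task firstprivate(i)
//     do_work(i);
//   }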
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
                                 kmp_info_t *this_thr) {
  kmp_thread_data_t *threads_data;
  int nthreads, i, is_init_thread;

  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
                __kmp_gtid_from_thread(this_thr)));

  KMP_DEBUG_ASSERT(task_team != NULL);
  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);

  nthreads = task_team->tt.tt_nproc;
  KMP_DEBUG_ASSERT(nthreads > 0);
  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);

  // Allocate or increase the size of threads_data if necessary
  is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);

  if (!is_init_thread) {
    // Some other thread already set up the array.
    KA_TRACE(
        20,
        ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
         __kmp_gtid_from_thread(this_thr)));
    return;
  }
  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
  KMP_DEBUG_ASSERT(threads_data != NULL);

  if (__kmp_tasking_mode == tskm_task_teams &&
      (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
    // Release any threads sleeping at the barrier, so that they can steal
    // tasks and execute them. In extra barrier mode, tasks do not sleep
    // at the separate tasking barrier, so this isn't a problem.
    for (i = 0; i < nthreads; i++) {
      void *sleep_loc;
      kmp_info_t *thread = threads_data[i].td.td_thr;

      if (i == this_thr->th.th_info.ds.ds_tid) {
        continue;
      }
      // Since we haven't locked the thread's suspend mutex at this point,
      // there is a small window where a thread might be putting itself to
      // sleep, but hasn't set the th_sleep_loc field yet.
      // To work around this, __kmp_execute_tasks_template() periodically
      // checks to see if other threads are sleeping (using the same random
      // mechanism that is used for task stealing) and awakens them if they
      // are.
      if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
          NULL) {
        KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
                      __kmp_gtid_from_thread(this_thr),
                      __kmp_gtid_from_thread(thread)));
        __kmp_null_resume_wrapper(thread);
      } else {
        KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
                      __kmp_gtid_from_thread(this_thr),
                      __kmp_gtid_from_thread(thread)));
      }
    }
  }

  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
                __kmp_gtid_from_thread(this_thr)));
}

/* // TODO: Check the comment consistency
 * Utility routines for "task teams". A task team (kmp_task_team_t) is kind of
 * like a shadow of the kmp_team_t data struct, with a different lifetime.
 * After a child thread checks into a barrier and calls __kmp_release() from
 * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
 * longer assume that the kmp_team_t structure is intact (at any moment, the
 * primary thread may exit the barrier code and free the team data structure,
 * and return the threads to the thread pool).
 *
 * This does not work with the tasking code, as the thread is still
 * expected to participate in the execution of any tasks that may have been
 * spawned by a member of the team, and the thread still needs access to each
 * thread in the team, so that it can steal work from it.
 *
 * Enter the existence of the kmp_task_team_t struct. It employs a reference
 * counting mechanism, and is allocated by the primary thread before calling
 * __kmp_<barrier_kind>_release, and then is released by the last thread to
 * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
 * of the kmp_task_team_t structs for consecutive barriers can overlap
 * (and will, unless the primary thread is the last thread to exit the barrier
 * release phase, which is not typical). The existence of such a struct is
 * useful outside the context of tasking.
 *
 * We currently use the existence of the threads array as an indicator that
 * tasks were spawned since the last barrier. If the structure is to be
 * useful outside the context of tasking, then this will have to change, but
 * not setting the field minimizes the performance impact of tasking on
 * barriers, when no explicit tasks were spawned (pushed, actually).
 */

static kmp_task_team_t *__kmp_free_task_teams =
    NULL; // Free list for task_team data structures
// Lock for task team data structures
kmp_bootstrap_lock_t __kmp_task_team_lock =
    KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);

// __kmp_alloc_task_deque:
// Allocates a task deque for a particular thread, and initializes the
// necessary data structures relating to the deque. This only happens once per
// thread per task team since task teams are recycled. No lock is needed during
// allocation since each thread allocates its own deque.
static void __kmp_alloc_task_deque(kmp_info_t *thread,
                                   kmp_thread_data_t *thread_data) {
  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
  KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);

  // Initialize last stolen task field to "none"
  thread_data->td.td_deque_last_stolen = -1;

  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
  KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
  KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);

  KE_TRACE(
      10,
      ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
       __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
  // Allocate space for task deque, and zero the deque
  // Cannot use __kmp_thread_calloc() because threads not around for
  // kmp_reap_task_team( ).
  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
      INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
}

// __kmp_free_task_deque:
// Deallocates a task deque for a particular thread. Happens at library
// deallocation, so there is no need to reset all thread data fields.
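// Note: the deque lock is still acquired below, presumably so that the task
// count is zeroed and the deque pointer freed under the same lock that
// stealers take, rather than racing with a late lookup during shutdown.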
3552 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) { 3553 if (thread_data->td.td_deque != NULL) { 3554 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 3555 TCW_4(thread_data->td.td_deque_ntasks, 0); 3556 __kmp_free(thread_data->td.td_deque); 3557 thread_data->td.td_deque = NULL; 3558 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 3559 } 3560 3561 #ifdef BUILD_TIED_TASK_STACK 3562 // GEH: Figure out what to do here for td_susp_tied_tasks 3563 if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) { 3564 __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data); 3565 } 3566 #endif // BUILD_TIED_TASK_STACK 3567 } 3568 3569 // __kmp_realloc_task_threads_data: 3570 // Allocates a threads_data array for a task team, either by allocating an 3571 // initial array or enlarging an existing array. Only the first thread to get 3572 // the lock allocs or enlarges the array and re-initializes the array elements. 3573 // That thread returns "TRUE", the rest return "FALSE". 3574 // Assumes that the new array size is given by task_team -> tt.tt_nproc. 3575 // The current size is given by task_team -> tt.tt_max_threads. 3576 static int __kmp_realloc_task_threads_data(kmp_info_t *thread, 3577 kmp_task_team_t *task_team) { 3578 kmp_thread_data_t **threads_data_p; 3579 kmp_int32 nthreads, maxthreads; 3580 int is_init_thread = FALSE; 3581 3582 if (TCR_4(task_team->tt.tt_found_tasks)) { 3583 // Already reallocated and initialized. 3584 return FALSE; 3585 } 3586 3587 threads_data_p = &task_team->tt.tt_threads_data; 3588 nthreads = task_team->tt.tt_nproc; 3589 maxthreads = task_team->tt.tt_max_threads; 3590 3591 // All threads must lock when they encounter the first task of the implicit 3592 // task region to make sure threads_data fields are (re)initialized before 3593 // used. 3594 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock); 3595 3596 if (!TCR_4(task_team->tt.tt_found_tasks)) { 3597 // first thread to enable tasking 3598 kmp_team_t *team = thread->th.th_team; 3599 int i; 3600 3601 is_init_thread = TRUE; 3602 if (maxthreads < nthreads) { 3603 3604 if (*threads_data_p != NULL) { 3605 kmp_thread_data_t *old_data = *threads_data_p; 3606 kmp_thread_data_t *new_data = NULL; 3607 3608 KE_TRACE( 3609 10, 3610 ("__kmp_realloc_task_threads_data: T#%d reallocating " 3611 "threads data for task_team %p, new_size = %d, old_size = %d\n", 3612 __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads)); 3613 // Reallocate threads_data to have more elements than current array 3614 // Cannot use __kmp_thread_realloc() because threads not around for 3615 // kmp_reap_task_team( ). Note all new array entries are initialized 3616 // to zero by __kmp_allocate(). 
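        // Only the first maxthreads entries are copied from the old array
        // below; entries [maxthreads, nthreads) keep the zero fill from
        // __kmp_allocate() until their fields are initialized further down.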
3617 new_data = (kmp_thread_data_t *)__kmp_allocate( 3618 nthreads * sizeof(kmp_thread_data_t)); 3619 // copy old data to new data 3620 KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t), 3621 (void *)old_data, maxthreads * sizeof(kmp_thread_data_t)); 3622 3623 #ifdef BUILD_TIED_TASK_STACK 3624 // GEH: Figure out if this is the right thing to do 3625 for (i = maxthreads; i < nthreads; i++) { 3626 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 3627 __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data); 3628 } 3629 #endif // BUILD_TIED_TASK_STACK 3630 // Install the new data and free the old data 3631 (*threads_data_p) = new_data; 3632 __kmp_free(old_data); 3633 } else { 3634 KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating " 3635 "threads data for task_team %p, size = %d\n", 3636 __kmp_gtid_from_thread(thread), task_team, nthreads)); 3637 // Make the initial allocate for threads_data array, and zero entries 3638 // Cannot use __kmp_thread_calloc() because threads not around for 3639 // kmp_reap_task_team( ). 3640 *threads_data_p = (kmp_thread_data_t *)__kmp_allocate( 3641 nthreads * sizeof(kmp_thread_data_t)); 3642 #ifdef BUILD_TIED_TASK_STACK 3643 // GEH: Figure out if this is the right thing to do 3644 for (i = 0; i < nthreads; i++) { 3645 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 3646 __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data); 3647 } 3648 #endif // BUILD_TIED_TASK_STACK 3649 } 3650 task_team->tt.tt_max_threads = nthreads; 3651 } else { 3652 // If array has (more than) enough elements, go ahead and use it 3653 KMP_DEBUG_ASSERT(*threads_data_p != NULL); 3654 } 3655 3656 // initialize threads_data pointers back to thread_info structures 3657 for (i = 0; i < nthreads; i++) { 3658 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 3659 thread_data->td.td_thr = team->t.t_threads[i]; 3660 3661 if (thread_data->td.td_deque_last_stolen >= nthreads) { 3662 // The last stolen field survives across teams / barrier, and the number 3663 // of threads may have changed. It's possible (likely?) that a new 3664 // parallel region will exhibit the same behavior as previous region. 3665 thread_data->td.td_deque_last_stolen = -1; 3666 } 3667 } 3668 3669 KMP_MB(); 3670 TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE); 3671 } 3672 3673 __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock); 3674 return is_init_thread; 3675 } 3676 3677 // __kmp_free_task_threads_data: 3678 // Deallocates a threads_data array for a task team, including any attached 3679 // tasking deques. Only occurs at library shutdown. 3680 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) { 3681 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock); 3682 if (task_team->tt.tt_threads_data != NULL) { 3683 int i; 3684 for (i = 0; i < task_team->tt.tt_max_threads; i++) { 3685 __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]); 3686 } 3687 __kmp_free(task_team->tt.tt_threads_data); 3688 task_team->tt.tt_threads_data = NULL; 3689 } 3690 __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock); 3691 } 3692 3693 // __kmp_free_task_pri_list: 3694 // Deallocates tasking deques used for priority tasks. 3695 // Only occurs at library shutdown. 
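// The per-priority deques live on a singly linked list (tt_task_pri_list)
// guarded by tt_task_pri_lock; each node owns one deque, which is freed here
// along with the node itself.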
3696 static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) { 3697 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock); 3698 if (task_team->tt.tt_task_pri_list != NULL) { 3699 kmp_task_pri_t *list = task_team->tt.tt_task_pri_list; 3700 while (list != NULL) { 3701 kmp_task_pri_t *next = list->next; 3702 __kmp_free_task_deque(&list->td); 3703 __kmp_free(list); 3704 list = next; 3705 } 3706 task_team->tt.tt_task_pri_list = NULL; 3707 } 3708 __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock); 3709 } 3710 3711 // __kmp_allocate_task_team: 3712 // Allocates a task team associated with a specific team, taking it from 3713 // the global task team free list if possible. Also initializes data 3714 // structures. 3715 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread, 3716 kmp_team_t *team) { 3717 kmp_task_team_t *task_team = NULL; 3718 int nthreads; 3719 3720 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n", 3721 (thread ? __kmp_gtid_from_thread(thread) : -1), team)); 3722 3723 if (TCR_PTR(__kmp_free_task_teams) != NULL) { 3724 // Take a task team from the task team pool 3725 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 3726 if (__kmp_free_task_teams != NULL) { 3727 task_team = __kmp_free_task_teams; 3728 TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next); 3729 task_team->tt.tt_next = NULL; 3730 } 3731 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 3732 } 3733 3734 if (task_team == NULL) { 3735 KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating " 3736 "task team for team %p\n", 3737 __kmp_gtid_from_thread(thread), team)); 3738 // Allocate a new task team if one is not available. Cannot use 3739 // __kmp_thread_malloc because threads not around for kmp_reap_task_team. 3740 task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t)); 3741 __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock); 3742 __kmp_init_bootstrap_lock(&task_team->tt.tt_task_pri_lock); 3743 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG 3744 // suppress race conditions detection on synchronization flags in debug mode 3745 // this helps to analyze library internals eliminating false positives 3746 __itt_suppress_mark_range( 3747 __itt_suppress_range, __itt_suppress_threading_errors, 3748 &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks)); 3749 __itt_suppress_mark_range(__itt_suppress_range, 3750 __itt_suppress_threading_errors, 3751 CCAST(kmp_uint32 *, &task_team->tt.tt_active), 3752 sizeof(task_team->tt.tt_active)); 3753 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */ 3754 // Note: __kmp_allocate zeroes returned memory, othewise we would need: 3755 // task_team->tt.tt_threads_data = NULL; 3756 // task_team->tt.tt_max_threads = 0; 3757 // task_team->tt.tt_next = NULL; 3758 } 3759 3760 TCW_4(task_team->tt.tt_found_tasks, FALSE); 3761 TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); 3762 TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE); 3763 task_team->tt.tt_nproc = nthreads = team->t.t_nproc; 3764 3765 KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads); 3766 TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE); 3767 TCW_4(task_team->tt.tt_active, TRUE); 3768 3769 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p " 3770 "unfinished_threads init'd to %d\n", 3771 (thread ? 
__kmp_gtid_from_thread(thread) : -1), task_team, 3772 KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads))); 3773 return task_team; 3774 } 3775 3776 // __kmp_free_task_team: 3777 // Frees the task team associated with a specific thread, and adds it 3778 // to the global task team free list. 3779 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) { 3780 KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n", 3781 thread ? __kmp_gtid_from_thread(thread) : -1, task_team)); 3782 3783 // Put task team back on free list 3784 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 3785 3786 KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL); 3787 task_team->tt.tt_next = __kmp_free_task_teams; 3788 TCW_PTR(__kmp_free_task_teams, task_team); 3789 3790 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 3791 } 3792 3793 // __kmp_reap_task_teams: 3794 // Free all the task teams on the task team free list. 3795 // Should only be done during library shutdown. 3796 // Cannot do anything that needs a thread structure or gtid since they are 3797 // already gone. 3798 void __kmp_reap_task_teams(void) { 3799 kmp_task_team_t *task_team; 3800 3801 if (TCR_PTR(__kmp_free_task_teams) != NULL) { 3802 // Free all task_teams on the free list 3803 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 3804 while ((task_team = __kmp_free_task_teams) != NULL) { 3805 __kmp_free_task_teams = task_team->tt.tt_next; 3806 task_team->tt.tt_next = NULL; 3807 3808 // Free threads_data if necessary 3809 if (task_team->tt.tt_threads_data != NULL) { 3810 __kmp_free_task_threads_data(task_team); 3811 } 3812 if (task_team->tt.tt_task_pri_list != NULL) { 3813 __kmp_free_task_pri_list(task_team); 3814 } 3815 __kmp_free(task_team); 3816 } 3817 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 3818 } 3819 } 3820 3821 // __kmp_wait_to_unref_task_teams: 3822 // Some threads could still be in the fork barrier release code, possibly 3823 // trying to steal tasks. Wait for each thread to unreference its task team. 3824 void __kmp_wait_to_unref_task_teams(void) { 3825 kmp_info_t *thread; 3826 kmp_uint32 spins; 3827 kmp_uint64 time; 3828 int done; 3829 3830 KMP_INIT_YIELD(spins); 3831 KMP_INIT_BACKOFF(time); 3832 3833 for (;;) { 3834 done = TRUE; 3835 3836 // TODO: GEH - this may be is wrong because some sync would be necessary 3837 // in case threads are added to the pool during the traversal. Need to 3838 // verify that lock for thread pool is held when calling this routine. 3839 for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL; 3840 thread = thread->th.th_next_pool) { 3841 #if KMP_OS_WINDOWS 3842 DWORD exit_val; 3843 #endif 3844 if (TCR_PTR(thread->th.th_task_team) == NULL) { 3845 KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n", 3846 __kmp_gtid_from_thread(thread))); 3847 continue; 3848 } 3849 #if KMP_OS_WINDOWS 3850 // TODO: GEH - add this check for Linux* OS / OS X* as well? 3851 if (!__kmp_is_thread_alive(thread, &exit_val)) { 3852 thread->th.th_task_team = NULL; 3853 continue; 3854 } 3855 #endif 3856 3857 done = FALSE; // Because th_task_team pointer is not NULL for this thread 3858 3859 KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to " 3860 "unreference task_team\n", 3861 __kmp_gtid_from_thread(thread))); 3862 3863 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 3864 void *sleep_loc; 3865 // If the thread is sleeping, awaken it. 
3866 if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) != 3867 NULL) { 3868 KA_TRACE( 3869 10, 3870 ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n", 3871 __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread))); 3872 __kmp_null_resume_wrapper(thread); 3873 } 3874 } 3875 } 3876 if (done) { 3877 break; 3878 } 3879 3880 // If oversubscribed or have waited a bit, yield. 3881 KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); 3882 } 3883 } 3884 3885 // __kmp_task_team_setup: Create a task_team for the current team, but use 3886 // an already created, unused one if it already exists. 3887 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) { 3888 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 3889 3890 // If this task_team hasn't been created yet, allocate it. It will be used in 3891 // the region after the next. 3892 // If it exists, it is the current task team and shouldn't be touched yet as 3893 // it may still be in use. 3894 if (team->t.t_task_team[this_thr->th.th_task_state] == NULL && 3895 (always || team->t.t_nproc > 1)) { 3896 team->t.t_task_team[this_thr->th.th_task_state] = 3897 __kmp_allocate_task_team(this_thr, team); 3898 KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p" 3899 " for team %d at parity=%d\n", 3900 __kmp_gtid_from_thread(this_thr), 3901 team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id, 3902 this_thr->th.th_task_state)); 3903 } 3904 3905 // After threads exit the release, they will call sync, and then point to this 3906 // other task_team; make sure it is allocated and properly initialized. As 3907 // threads spin in the barrier release phase, they will continue to use the 3908 // previous task_team struct(above), until they receive the signal to stop 3909 // checking for tasks (they can't safely reference the kmp_team_t struct, 3910 // which could be reallocated by the primary thread). No task teams are formed 3911 // for serialized teams. 
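  // The two task-team slots behave like a double buffer indexed by
  // th_task_state (0 or 1): the slot at the current parity is still in use
  // for this region, while the other slot is prepared here for the region
  // after the barrier. __kmp_task_team_sync() flips the parity once threads
  // pass the release phase.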
3912 if (team->t.t_nproc > 1) { 3913 int other_team = 1 - this_thr->th.th_task_state; 3914 KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2); 3915 if (team->t.t_task_team[other_team] == NULL) { // setup other team as well 3916 team->t.t_task_team[other_team] = 3917 __kmp_allocate_task_team(this_thr, team); 3918 KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new " 3919 "task_team %p for team %d at parity=%d\n", 3920 __kmp_gtid_from_thread(this_thr), 3921 team->t.t_task_team[other_team], team->t.t_id, other_team)); 3922 } else { // Leave the old task team struct in place for the upcoming region; 3923 // adjust as needed 3924 kmp_task_team_t *task_team = team->t.t_task_team[other_team]; 3925 if (!task_team->tt.tt_active || 3926 team->t.t_nproc != task_team->tt.tt_nproc) { 3927 TCW_4(task_team->tt.tt_nproc, team->t.t_nproc); 3928 TCW_4(task_team->tt.tt_found_tasks, FALSE); 3929 TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); 3930 TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE); 3931 KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, 3932 team->t.t_nproc); 3933 TCW_4(task_team->tt.tt_active, TRUE); 3934 } 3935 // if team size has changed, the first thread to enable tasking will 3936 // realloc threads_data if necessary 3937 KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team " 3938 "%p for team %d at parity=%d\n", 3939 __kmp_gtid_from_thread(this_thr), 3940 team->t.t_task_team[other_team], team->t.t_id, other_team)); 3941 } 3942 } 3943 3944 // For regular thread, task enabling should be called when the task is going 3945 // to be pushed to a dequeue. However, for the hidden helper thread, we need 3946 // it ahead of time so that some operations can be performed without race 3947 // condition. 3948 if (this_thr == __kmp_hidden_helper_main_thread) { 3949 for (int i = 0; i < 2; ++i) { 3950 kmp_task_team_t *task_team = team->t.t_task_team[i]; 3951 if (KMP_TASKING_ENABLED(task_team)) { 3952 continue; 3953 } 3954 __kmp_enable_tasking(task_team, this_thr); 3955 for (int j = 0; j < task_team->tt.tt_nproc; ++j) { 3956 kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[j]; 3957 if (thread_data->td.td_deque == NULL) { 3958 __kmp_alloc_task_deque(__kmp_hidden_helper_threads[j], thread_data); 3959 } 3960 } 3961 } 3962 } 3963 } 3964 3965 // __kmp_task_team_sync: Propagation of task team data from team to threads 3966 // which happens just after the release phase of a team barrier. This may be 3967 // called by any thread, but only for teams with # threads > 1. 3968 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) { 3969 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 3970 3971 // Toggle the th_task_state field, to switch which task_team this thread 3972 // refers to 3973 this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state); 3974 3975 // It is now safe to propagate the task team pointer from the team struct to 3976 // the current thread. 3977 TCW_PTR(this_thr->th.th_task_team, 3978 team->t.t_task_team[this_thr->th.th_task_state]); 3979 KA_TRACE(20, 3980 ("__kmp_task_team_sync: Thread T#%d task team switched to task_team " 3981 "%p from Team #%d (parity=%d)\n", 3982 __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team, 3983 team->t.t_id, this_thr->th.th_task_state)); 3984 } 3985 3986 // __kmp_task_team_wait: Primary thread waits for outstanding tasks after the 3987 // barrier gather phase. 
Only called by primary thread if #threads in team > 1 3988 // or if proxy tasks were created. 3989 // 3990 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off 3991 // by passing in 0 optionally as the last argument. When wait is zero, primary 3992 // thread does not wait for unfinished_threads to reach 0. 3993 void __kmp_task_team_wait( 3994 kmp_info_t *this_thr, 3995 kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) { 3996 kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state]; 3997 3998 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 3999 KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team); 4000 4001 if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) { 4002 if (wait) { 4003 KA_TRACE(20, ("__kmp_task_team_wait: Primary T#%d waiting for all tasks " 4004 "(for unfinished_threads to reach 0) on task_team = %p\n", 4005 __kmp_gtid_from_thread(this_thr), task_team)); 4006 // Worker threads may have dropped through to release phase, but could 4007 // still be executing tasks. Wait here for tasks to complete. To avoid 4008 // memory contention, only primary thread checks termination condition. 4009 kmp_flag_32<false, false> flag( 4010 RCAST(std::atomic<kmp_uint32> *, 4011 &task_team->tt.tt_unfinished_threads), 4012 0U); 4013 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 4014 } 4015 // Deactivate the old task team, so that the worker threads will stop 4016 // referencing it while spinning. 4017 KA_TRACE( 4018 20, 4019 ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: " 4020 "setting active to false, setting local and team's pointer to NULL\n", 4021 __kmp_gtid_from_thread(this_thr), task_team)); 4022 KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 || 4023 task_team->tt.tt_found_proxy_tasks == TRUE || 4024 task_team->tt.tt_hidden_helper_task_encountered == TRUE); 4025 TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE); 4026 TCW_SYNC_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE); 4027 KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0); 4028 TCW_SYNC_4(task_team->tt.tt_active, FALSE); 4029 KMP_MB(); 4030 4031 TCW_PTR(this_thr->th.th_task_team, NULL); 4032 } 4033 } 4034 4035 // __kmp_tasking_barrier: 4036 // This routine is called only when __kmp_tasking_mode == tskm_extra_barrier. 4037 // Internal function to execute all tasks prior to a regular barrier or a join 4038 // barrier. It is a full barrier itself, which unfortunately turns regular 4039 // barriers into double barriers and join barriers into 1 1/2 barriers. 4040 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) { 4041 std::atomic<kmp_uint32> *spin = RCAST( 4042 std::atomic<kmp_uint32> *, 4043 &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads); 4044 int flag = FALSE; 4045 KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier); 4046 4047 #if USE_ITT_BUILD 4048 KMP_FSYNC_SPIN_INIT(spin, NULL); 4049 #endif /* USE_ITT_BUILD */ 4050 kmp_flag_32<false, false> spin_flag(spin, 0U); 4051 while (!spin_flag.execute_tasks(thread, gtid, TRUE, 4052 &flag USE_ITT_BUILD_ARG(NULL), 0)) { 4053 #if USE_ITT_BUILD 4054 // TODO: What about itt_sync_obj?? 
4055 KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin)); 4056 #endif /* USE_ITT_BUILD */ 4057 4058 if (TCR_4(__kmp_global.g.g_done)) { 4059 if (__kmp_global.g.g_abort) 4060 __kmp_abort_thread(); 4061 break; 4062 } 4063 KMP_YIELD(TRUE); 4064 } 4065 #if USE_ITT_BUILD 4066 KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin)); 4067 #endif /* USE_ITT_BUILD */ 4068 } 4069 4070 // __kmp_give_task puts a task into a given thread queue if: 4071 // - the queue for that thread was created 4072 // - there's space in that queue 4073 // Because of this, __kmp_push_task needs to check if there's space after 4074 // getting the lock 4075 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task, 4076 kmp_int32 pass) { 4077 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 4078 kmp_task_team_t *task_team = taskdata->td_task_team; 4079 4080 KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n", 4081 taskdata, tid)); 4082 4083 // If task_team is NULL something went really bad... 4084 KMP_DEBUG_ASSERT(task_team != NULL); 4085 4086 bool result = false; 4087 kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid]; 4088 4089 if (thread_data->td.td_deque == NULL) { 4090 // There's no queue in this thread, go find another one 4091 // We're guaranteed that at least one thread has a queue 4092 KA_TRACE(30, 4093 ("__kmp_give_task: thread %d has no queue while giving task %p.\n", 4094 tid, taskdata)); 4095 return result; 4096 } 4097 4098 if (TCR_4(thread_data->td.td_deque_ntasks) >= 4099 TASK_DEQUE_SIZE(thread_data->td)) { 4100 KA_TRACE( 4101 30, 4102 ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", 4103 taskdata, tid)); 4104 4105 // if this deque is bigger than the pass ratio give a chance to another 4106 // thread 4107 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass) 4108 return result; 4109 4110 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 4111 if (TCR_4(thread_data->td.td_deque_ntasks) >= 4112 TASK_DEQUE_SIZE(thread_data->td)) { 4113 // expand deque to push the task which is not allowed to execute 4114 __kmp_realloc_task_deque(thread, thread_data); 4115 } 4116 4117 } else { 4118 4119 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 4120 4121 if (TCR_4(thread_data->td.td_deque_ntasks) >= 4122 TASK_DEQUE_SIZE(thread_data->td)) { 4123 KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to " 4124 "thread %d.\n", 4125 taskdata, tid)); 4126 4127 // if this deque is bigger than the pass ratio give a chance to another 4128 // thread 4129 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass) 4130 goto release_and_exit; 4131 4132 __kmp_realloc_task_deque(thread, thread_data); 4133 } 4134 } 4135 4136 // lock is held here, and there is space in the deque 4137 4138 thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata; 4139 // Wrap index. 
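  // Index arithmetic sketch (assuming TASK_DEQUE_MASK is td_deque_size - 1,
  // as the power-of-two deque sizes used here imply): with 256 slots a tail
  // of 255 wraps to 0, since (255 + 1) & 0xFF == 0.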
4140 thread_data->td.td_deque_tail = 4141 (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td); 4142 TCW_4(thread_data->td.td_deque_ntasks, 4143 TCR_4(thread_data->td.td_deque_ntasks) + 1); 4144 4145 result = true; 4146 KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n", 4147 taskdata, tid)); 4148 4149 release_and_exit: 4150 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 4151 4152 return result; 4153 } 4154 4155 #define PROXY_TASK_FLAG 0x40000000 4156 /* The finish of the proxy tasks is divided in two pieces: 4157 - the top half is the one that can be done from a thread outside the team 4158 - the bottom half must be run from a thread within the team 4159 4160 In order to run the bottom half the task gets queued back into one of the 4161 threads of the team. Once the td_incomplete_child_task counter of the parent 4162 is decremented the threads can leave the barriers. So, the bottom half needs 4163 to be queued before the counter is decremented. The top half is therefore 4164 divided in two parts: 4165 - things that can be run before queuing the bottom half 4166 - things that must be run after queuing the bottom half 4167 4168 This creates a second race as the bottom half can free the task before the 4169 second top half is executed. To avoid this we use the 4170 td_incomplete_child_task of the proxy task to synchronize the top and bottom 4171 half. */ 4172 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) { 4173 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT); 4174 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); 4175 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0); 4176 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0); 4177 4178 taskdata->td_flags.complete = 1; // mark the task as completed 4179 4180 if (taskdata->td_taskgroup) 4181 KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count); 4182 4183 // Create an imaginary children for this task so the bottom half cannot 4184 // release the task before we have completed the second top half 4185 KMP_ATOMIC_OR(&taskdata->td_incomplete_child_tasks, PROXY_TASK_FLAG); 4186 } 4187 4188 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) { 4189 #if KMP_DEBUG 4190 kmp_int32 children = 0; 4191 // Predecrement simulated by "- 1" calculation 4192 children = -1 + 4193 #endif 4194 KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks); 4195 KMP_DEBUG_ASSERT(children >= 0); 4196 4197 // Remove the imaginary children 4198 KMP_ATOMIC_AND(&taskdata->td_incomplete_child_tasks, ~PROXY_TASK_FLAG); 4199 } 4200 4201 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) { 4202 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 4203 kmp_info_t *thread = __kmp_threads[gtid]; 4204 4205 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); 4206 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 4207 1); // top half must run before bottom half 4208 4209 // We need to wait to make sure the top half is finished 4210 // Spinning here should be ok as this should happen quickly 4211 while ((KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) & 4212 PROXY_TASK_FLAG) > 0) 4213 ; 4214 4215 __kmp_release_deps(gtid, taskdata); 4216 __kmp_free_task_and_ancestors(gtid, taskdata, thread); 4217 } 4218 4219 /*! 
4220 @ingroup TASKING 4221 @param gtid Global Thread ID of encountering thread 4222 @param ptask Task which execution is completed 4223 4224 Execute the completion of a proxy task from a thread of that is part of the 4225 team. Run first and bottom halves directly. 4226 */ 4227 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) { 4228 KMP_DEBUG_ASSERT(ptask != NULL); 4229 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 4230 KA_TRACE( 4231 10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n", 4232 gtid, taskdata)); 4233 __kmp_assert_valid_gtid(gtid); 4234 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); 4235 4236 __kmp_first_top_half_finish_proxy(taskdata); 4237 __kmp_second_top_half_finish_proxy(taskdata); 4238 __kmp_bottom_half_finish_proxy(gtid, ptask); 4239 4240 KA_TRACE(10, 4241 ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n", 4242 gtid, taskdata)); 4243 } 4244 4245 void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start = 0) { 4246 KMP_DEBUG_ASSERT(ptask != NULL); 4247 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 4248 4249 // Enqueue task to complete bottom half completion from a thread within the 4250 // corresponding team 4251 kmp_team_t *team = taskdata->td_team; 4252 kmp_int32 nthreads = team->t.t_nproc; 4253 kmp_info_t *thread; 4254 4255 // This should be similar to start_k = __kmp_get_random( thread ) % nthreads 4256 // but we cannot use __kmp_get_random here 4257 kmp_int32 start_k = start % nthreads; 4258 kmp_int32 pass = 1; 4259 kmp_int32 k = start_k; 4260 4261 do { 4262 // For now we're just linearly trying to find a thread 4263 thread = team->t.t_threads[k]; 4264 k = (k + 1) % nthreads; 4265 4266 // we did a full pass through all the threads 4267 if (k == start_k) 4268 pass = pass << 1; 4269 4270 } while (!__kmp_give_task(thread, k, ptask, pass)); 4271 } 4272 4273 /*! 4274 @ingroup TASKING 4275 @param ptask Task which execution is completed 4276 4277 Execute the completion of a proxy task from a thread that could not belong to 4278 the team. 
4279 */ 4280 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) { 4281 KMP_DEBUG_ASSERT(ptask != NULL); 4282 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 4283 4284 KA_TRACE( 4285 10, 4286 ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n", 4287 taskdata)); 4288 4289 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); 4290 4291 __kmp_first_top_half_finish_proxy(taskdata); 4292 4293 __kmpc_give_task(ptask); 4294 4295 __kmp_second_top_half_finish_proxy(taskdata); 4296 4297 KA_TRACE( 4298 10, 4299 ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n", 4300 taskdata)); 4301 } 4302 4303 kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid, 4304 kmp_task_t *task) { 4305 kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task); 4306 if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) { 4307 td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION; 4308 td->td_allow_completion_event.ed.task = task; 4309 __kmp_init_tas_lock(&td->td_allow_completion_event.lock); 4310 } 4311 return &td->td_allow_completion_event; 4312 } 4313 4314 void __kmp_fulfill_event(kmp_event_t *event) { 4315 if (event->type == KMP_EVENT_ALLOW_COMPLETION) { 4316 kmp_task_t *ptask = event->ed.task; 4317 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 4318 bool detached = false; 4319 int gtid = __kmp_get_gtid(); 4320 4321 // The associated task might have completed or could be completing at this 4322 // point. 4323 // We need to take the lock to avoid races 4324 __kmp_acquire_tas_lock(&event->lock, gtid); 4325 if (taskdata->td_flags.proxy == TASK_PROXY) { 4326 detached = true; 4327 } else { 4328 #if OMPT_SUPPORT 4329 // The OMPT event must occur under mutual exclusion, 4330 // otherwise the tool might access ptask after free 4331 if (UNLIKELY(ompt_enabled.enabled)) 4332 __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill); 4333 #endif 4334 } 4335 event->type = KMP_EVENT_UNINITIALIZED; 4336 __kmp_release_tas_lock(&event->lock, gtid); 4337 4338 if (detached) { 4339 #if OMPT_SUPPORT 4340 // We free ptask afterwards and know the task is finished, 4341 // so locking is not necessary 4342 if (UNLIKELY(ompt_enabled.enabled)) 4343 __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill); 4344 #endif 4345 // If the task detached complete the proxy task 4346 if (gtid >= 0) { 4347 kmp_team_t *team = taskdata->td_team; 4348 kmp_info_t *thread = __kmp_get_thread(); 4349 if (thread->th.th_team == team) { 4350 __kmpc_proxy_task_completed(gtid, ptask); 4351 return; 4352 } 4353 } 4354 4355 // fallback 4356 __kmpc_proxy_task_completed_ooo(ptask); 4357 } 4358 } 4359 } 4360 4361 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task 4362 // for taskloop 4363 // 4364 // thread: allocating thread 4365 // task_src: pointer to source task to be duplicated 4366 // returns: a pointer to the allocated kmp_task_t structure (task). 
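// A sketch of user code that exercises this duplication path (illustrative
// only; chunk() is a placeholder): the compiler emits one pattern task for
// the taskloop, and the runtime clones it here once per generated chunk:
//
//   #pragma omp taskloop grainsize(64)
//   for (long i = 0; i < n; ++i)
//     chunk(i);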
4367 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) { 4368 kmp_task_t *task; 4369 kmp_taskdata_t *taskdata; 4370 kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src); 4371 kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task 4372 size_t shareds_offset; 4373 size_t task_size; 4374 4375 KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread, 4376 task_src)); 4377 KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy == 4378 TASK_FULL); // it should not be proxy task 4379 KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT); 4380 task_size = taskdata_src->td_size_alloc; 4381 4382 // Allocate a kmp_taskdata_t block and a kmp_task_t block. 4383 KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread, 4384 task_size)); 4385 #if USE_FAST_MEMORY 4386 taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size); 4387 #else 4388 taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size); 4389 #endif /* USE_FAST_MEMORY */ 4390 KMP_MEMCPY(taskdata, taskdata_src, task_size); 4391 4392 task = KMP_TASKDATA_TO_TASK(taskdata); 4393 4394 // Initialize new task (only specific fields not affected by memcpy) 4395 taskdata->td_task_id = KMP_GEN_TASK_ID(); 4396 if (task->shareds != NULL) { // need setup shareds pointer 4397 shareds_offset = (char *)task_src->shareds - (char *)taskdata_src; 4398 task->shareds = &((char *)taskdata)[shareds_offset]; 4399 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) == 4400 0); 4401 } 4402 taskdata->td_alloc_thread = thread; 4403 taskdata->td_parent = parent_task; 4404 // task inherits the taskgroup from the parent task 4405 taskdata->td_taskgroup = parent_task->td_taskgroup; 4406 // tied task needs to initialize the td_last_tied at creation, 4407 // untied one does this when it is scheduled for execution 4408 if (taskdata->td_flags.tiedness == TASK_TIED) 4409 taskdata->td_last_tied = taskdata; 4410 4411 // Only need to keep track of child task counts if team parallel and tasking 4412 // not serialized 4413 if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) { 4414 KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks); 4415 if (parent_task->td_taskgroup) 4416 KMP_ATOMIC_INC(&parent_task->td_taskgroup->count); 4417 // Only need to keep track of allocated child tasks for explicit tasks since 4418 // implicit not deallocated 4419 if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) 4420 KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks); 4421 } 4422 4423 KA_TRACE(20, 4424 ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n", 4425 thread, taskdata, taskdata->td_parent)); 4426 #if OMPT_SUPPORT 4427 if (UNLIKELY(ompt_enabled.enabled)) 4428 __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid); 4429 #endif 4430 return task; 4431 } 4432 4433 // Routine optionally generated by the compiler for setting the lastprivate flag 4434 // and calling needed constructors for private/firstprivate objects 4435 // (used to form taskloop tasks from pattern task) 4436 // Parameters: dest task, src task, lastprivate flag. 4437 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32); 4438 4439 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8); 4440 4441 // class to encapsulate manipulating loop bounds in a taskloop task. 4442 // this abstracts away the Intel vs GOMP taskloop interface for setting/getting 4443 // the loop bound variables. 
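/*
  Editor's sketch (assumptions noted, not authoritative): the class below hides
  two bound layouts. For Intel-style tasks (td_flags.native == 0) the compiler
  passes pointers to 64-bit lower/upper bounds that live inside the task
  structure itself, so only their byte offsets from the kmp_task_t pointer are
  kept. For GOMP-style tasks (td_flags.native == 1) the bounds are the first
  two 'long' values in task->shareds, and td_size_loop_bounds records
  sizeof(long) (4 or 8). The offset-based access for the Intel layout reduces
  to a helper like this (types as in this file; illustrative only):

    // the offset is position-independent, so it stays valid for task copies
    static kmp_uint64 read_bound(kmp_task_t *t, size_t offset) {
      return *(kmp_uint64 *)((char *)t + offset);
    }
*/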
4444 class kmp_taskloop_bounds_t { 4445 kmp_task_t *task; 4446 const kmp_taskdata_t *taskdata; 4447 size_t lower_offset; 4448 size_t upper_offset; 4449 4450 public: 4451 kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub) 4452 : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)), 4453 lower_offset((char *)lb - (char *)task), 4454 upper_offset((char *)ub - (char *)task) { 4455 KMP_DEBUG_ASSERT((char *)lb > (char *)_task); 4456 KMP_DEBUG_ASSERT((char *)ub > (char *)_task); 4457 } 4458 kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds) 4459 : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)), 4460 lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {} 4461 size_t get_lower_offset() const { return lower_offset; } 4462 size_t get_upper_offset() const { return upper_offset; } 4463 kmp_uint64 get_lb() const { 4464 kmp_int64 retval; 4465 #if defined(KMP_GOMP_COMPAT) 4466 // Intel task just returns the lower bound normally 4467 if (!taskdata->td_flags.native) { 4468 retval = *(kmp_int64 *)((char *)task + lower_offset); 4469 } else { 4470 // GOMP task has to take into account the sizeof(long) 4471 if (taskdata->td_size_loop_bounds == 4) { 4472 kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds); 4473 retval = (kmp_int64)*lb; 4474 } else { 4475 kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds); 4476 retval = (kmp_int64)*lb; 4477 } 4478 } 4479 #else 4480 (void)taskdata; 4481 retval = *(kmp_int64 *)((char *)task + lower_offset); 4482 #endif // defined(KMP_GOMP_COMPAT) 4483 return retval; 4484 } 4485 kmp_uint64 get_ub() const { 4486 kmp_int64 retval; 4487 #if defined(KMP_GOMP_COMPAT) 4488 // Intel task just returns the upper bound normally 4489 if (!taskdata->td_flags.native) { 4490 retval = *(kmp_int64 *)((char *)task + upper_offset); 4491 } else { 4492 // GOMP task has to take into account the sizeof(long) 4493 if (taskdata->td_size_loop_bounds == 4) { 4494 kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1; 4495 retval = (kmp_int64)*ub; 4496 } else { 4497 kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1; 4498 retval = (kmp_int64)*ub; 4499 } 4500 } 4501 #else 4502 retval = *(kmp_int64 *)((char *)task + upper_offset); 4503 #endif // defined(KMP_GOMP_COMPAT) 4504 return retval; 4505 } 4506 void set_lb(kmp_uint64 lb) { 4507 #if defined(KMP_GOMP_COMPAT) 4508 // Intel task just sets the lower bound normally 4509 if (!taskdata->td_flags.native) { 4510 *(kmp_uint64 *)((char *)task + lower_offset) = lb; 4511 } else { 4512 // GOMP task has to take into account the sizeof(long) 4513 if (taskdata->td_size_loop_bounds == 4) { 4514 kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds); 4515 *lower = (kmp_uint32)lb; 4516 } else { 4517 kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds); 4518 *lower = (kmp_uint64)lb; 4519 } 4520 } 4521 #else 4522 *(kmp_uint64 *)((char *)task + lower_offset) = lb; 4523 #endif // defined(KMP_GOMP_COMPAT) 4524 } 4525 void set_ub(kmp_uint64 ub) { 4526 #if defined(KMP_GOMP_COMPAT) 4527 // Intel task just sets the upper bound normally 4528 if (!taskdata->td_flags.native) { 4529 *(kmp_uint64 *)((char *)task + upper_offset) = ub; 4530 } else { 4531 // GOMP task has to take into account the sizeof(long) 4532 if (taskdata->td_size_loop_bounds == 4) { 4533 kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1; 4534 *upper = (kmp_uint32)ub; 4535 } else { 4536 kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1; 4537 *upper = (kmp_uint64)ub; 4538 } 4539 } 4540 #else 4541 *(kmp_uint64 *)((char *)task + 
upper_offset) = ub; 4542 #endif // defined(KMP_GOMP_COMPAT) 4543 } 4544 }; 4545 4546 // __kmp_taskloop_linear: Start tasks of the taskloop linearly 4547 // 4548 // loc Source location information 4549 // gtid Global thread ID 4550 // task Pattern task, exposes the loop iteration range 4551 // lb Pointer to loop lower bound in task structure 4552 // ub Pointer to loop upper bound in task structure 4553 // st Loop stride 4554 // ub_glob Global upper bound (used for lastprivate check) 4555 // num_tasks Number of tasks to execute 4556 // grainsize Number of loop iterations per task 4557 // extras Number of chunks with grainsize+1 iterations 4558 // last_chunk Reduction of grainsize for last task 4559 // tc Iterations count 4560 // task_dup Tasks duplication routine 4561 // codeptr_ra Return address for OMPT events 4562 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task, 4563 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, 4564 kmp_uint64 ub_glob, kmp_uint64 num_tasks, 4565 kmp_uint64 grainsize, kmp_uint64 extras, 4566 kmp_int64 last_chunk, kmp_uint64 tc, 4567 #if OMPT_SUPPORT 4568 void *codeptr_ra, 4569 #endif 4570 void *task_dup) { 4571 KMP_COUNT_BLOCK(OMP_TASKLOOP); 4572 KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling); 4573 p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; 4574 // compiler provides global bounds here 4575 kmp_taskloop_bounds_t task_bounds(task, lb, ub); 4576 kmp_uint64 lower = task_bounds.get_lb(); 4577 kmp_uint64 upper = task_bounds.get_ub(); 4578 kmp_uint64 i; 4579 kmp_info_t *thread = __kmp_threads[gtid]; 4580 kmp_taskdata_t *current_task = thread->th.th_current_task; 4581 kmp_task_t *next_task; 4582 kmp_int32 lastpriv = 0; 4583 4584 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + 4585 (last_chunk < 0 ? last_chunk : extras)); 4586 KMP_DEBUG_ASSERT(num_tasks > extras); 4587 KMP_DEBUG_ASSERT(num_tasks > 0); 4588 KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, " 4589 "extras %lld, last_chunk %lld, i=%lld,%lld(%d)%lld, dup %p\n", 4590 gtid, num_tasks, grainsize, extras, last_chunk, lower, upper, 4591 ub_glob, st, task_dup)); 4592 4593 // Launch num_tasks tasks, assign grainsize iterations each task 4594 for (i = 0; i < num_tasks; ++i) { 4595 kmp_uint64 chunk_minus_1; 4596 if (extras == 0) { 4597 chunk_minus_1 = grainsize - 1; 4598 } else { 4599 chunk_minus_1 = grainsize; 4600 --extras; // first extras iterations get bigger chunk (grainsize+1) 4601 } 4602 upper = lower + st * chunk_minus_1; 4603 if (upper > *ub) { 4604 upper = *ub; 4605 } 4606 if (i == num_tasks - 1) { 4607 // schedule the last task, set lastprivate flag if needed 4608 if (st == 1) { // most common case 4609 KMP_DEBUG_ASSERT(upper == *ub); 4610 if (upper == ub_glob) 4611 lastpriv = 1; 4612 } else if (st > 0) { // positive loop stride 4613 KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper); 4614 if ((kmp_uint64)st > ub_glob - upper) 4615 lastpriv = 1; 4616 } else { // negative loop stride 4617 KMP_DEBUG_ASSERT(upper + st < *ub); 4618 if (upper - ub_glob < (kmp_uint64)(-st)) 4619 lastpriv = 1; 4620 } 4621 } 4622 next_task = __kmp_task_dup_alloc(thread, task); // allocate new task 4623 kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task); 4624 kmp_taskloop_bounds_t next_task_bounds = 4625 kmp_taskloop_bounds_t(next_task, task_bounds); 4626 4627 // adjust task-specific bounds 4628 next_task_bounds.set_lb(lower); 4629 if (next_taskdata->td_flags.native) { 4630 next_task_bounds.set_ub(upper + (st > 0 ? 
1 : -1)); 4631 } else { 4632 next_task_bounds.set_ub(upper); 4633 } 4634 if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates, 4635 // etc. 4636 ptask_dup(next_task, task, lastpriv); 4637 KA_TRACE(40, 4638 ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, " 4639 "upper %lld stride %lld, (offsets %p %p)\n", 4640 gtid, i, next_task, lower, upper, st, 4641 next_task_bounds.get_lower_offset(), 4642 next_task_bounds.get_upper_offset())); 4643 #if OMPT_SUPPORT 4644 __kmp_omp_taskloop_task(NULL, gtid, next_task, 4645 codeptr_ra); // schedule new task 4646 #else 4647 __kmp_omp_task(gtid, next_task, true); // schedule new task 4648 #endif 4649 lower = upper + st; // adjust lower bound for the next iteration 4650 } 4651 // free the pattern task and exit 4652 __kmp_task_start(gtid, task, current_task); // make internal bookkeeping 4653 // do not execute the pattern task, just do internal bookkeeping 4654 __kmp_task_finish<false>(gtid, task, current_task); 4655 } 4656 4657 // Structure to keep taskloop parameters for auxiliary task 4658 // kept in the shareds of the task structure. 4659 typedef struct __taskloop_params { 4660 kmp_task_t *task; 4661 kmp_uint64 *lb; 4662 kmp_uint64 *ub; 4663 void *task_dup; 4664 kmp_int64 st; 4665 kmp_uint64 ub_glob; 4666 kmp_uint64 num_tasks; 4667 kmp_uint64 grainsize; 4668 kmp_uint64 extras; 4669 kmp_int64 last_chunk; 4670 kmp_uint64 tc; 4671 kmp_uint64 num_t_min; 4672 #if OMPT_SUPPORT 4673 void *codeptr_ra; 4674 #endif 4675 } __taskloop_params_t; 4676 4677 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *, 4678 kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64, 4679 kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64, 4680 kmp_uint64, 4681 #if OMPT_SUPPORT 4682 void *, 4683 #endif 4684 void *); 4685 4686 // Execute part of the taskloop submitted as a task. 
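/*
  Editor's sketch of the divide-and-conquer scheme implemented by
  __kmp_taskloop_task() and __kmp_taskloop_recur() below (simplified, types
  reduced to plain integers; illustrative only):

    static void taskloop_split(unsigned long long num_tasks,
                               unsigned long long num_t_min) {
      if (num_tasks <= num_t_min) {
        // __kmp_taskloop_linear(): create the leaf tasks one by one
        return;
      }
      unsigned long long n_tsk0 = num_tasks >> 1;     // kept by this thread
      unsigned long long n_tsk1 = num_tasks - n_tsk0; // packed into an aux task
      // the upper half (n_tsk1 tasks) is described by a __taskloop_params_t
      // stored in the shareds of an auxiliary task whose entry point is
      // __kmp_taskloop_task(), so whichever worker picks it up keeps splitting
      (void)n_tsk1;
      taskloop_split(n_tsk0, num_t_min); // recurse locally on the lower half
    }
*/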
4687 int __kmp_taskloop_task(int gtid, void *ptask) { 4688 __taskloop_params_t *p = 4689 (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds; 4690 kmp_task_t *task = p->task; 4691 kmp_uint64 *lb = p->lb; 4692 kmp_uint64 *ub = p->ub; 4693 void *task_dup = p->task_dup; 4694 // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; 4695 kmp_int64 st = p->st; 4696 kmp_uint64 ub_glob = p->ub_glob; 4697 kmp_uint64 num_tasks = p->num_tasks; 4698 kmp_uint64 grainsize = p->grainsize; 4699 kmp_uint64 extras = p->extras; 4700 kmp_int64 last_chunk = p->last_chunk; 4701 kmp_uint64 tc = p->tc; 4702 kmp_uint64 num_t_min = p->num_t_min; 4703 #if OMPT_SUPPORT 4704 void *codeptr_ra = p->codeptr_ra; 4705 #endif 4706 #if KMP_DEBUG 4707 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 4708 KMP_DEBUG_ASSERT(task != NULL); 4709 KA_TRACE(20, 4710 ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize" 4711 " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n", 4712 gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub, 4713 st, task_dup)); 4714 #endif 4715 KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min); 4716 if (num_tasks > num_t_min) 4717 __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks, 4718 grainsize, extras, last_chunk, tc, num_t_min, 4719 #if OMPT_SUPPORT 4720 codeptr_ra, 4721 #endif 4722 task_dup); 4723 else 4724 __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks, 4725 grainsize, extras, last_chunk, tc, 4726 #if OMPT_SUPPORT 4727 codeptr_ra, 4728 #endif 4729 task_dup); 4730 4731 KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid)); 4732 return 0; 4733 } 4734 4735 // Schedule part of the taskloop as a task, 4736 // execute the rest of the taskloop. 4737 // 4738 // loc Source location information 4739 // gtid Global thread ID 4740 // task Pattern task, exposes the loop iteration range 4741 // lb Pointer to loop lower bound in task structure 4742 // ub Pointer to loop upper bound in task structure 4743 // st Loop stride 4744 // ub_glob Global upper bound (used for lastprivate check) 4745 // num_tasks Number of tasks to execute 4746 // grainsize Number of loop iterations per task 4747 // extras Number of chunks with grainsize+1 iterations 4748 // last_chunk Reduction of grainsize for last task 4749 // tc Iterations count 4750 // num_t_min Threshold to launch tasks recursively 4751 // task_dup Tasks duplication routine 4752 // codeptr_ra Return address for OMPT events 4753 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task, 4754 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, 4755 kmp_uint64 ub_glob, kmp_uint64 num_tasks, 4756 kmp_uint64 grainsize, kmp_uint64 extras, 4757 kmp_int64 last_chunk, kmp_uint64 tc, 4758 kmp_uint64 num_t_min, 4759 #if OMPT_SUPPORT 4760 void *codeptr_ra, 4761 #endif 4762 void *task_dup) { 4763 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 4764 KMP_DEBUG_ASSERT(task != NULL); 4765 KMP_DEBUG_ASSERT(num_tasks > num_t_min); 4766 KA_TRACE(20, 4767 ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize" 4768 " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n", 4769 gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub, 4770 st, task_dup)); 4771 p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; 4772 kmp_uint64 lower = *lb; 4773 kmp_info_t *thread = __kmp_threads[gtid]; 4774 // kmp_taskdata_t *current_task = thread->th.th_current_task; 4775 kmp_task_t *next_task; 4776 size_t lower_offset = 4777 (char *)lb - (char *)task; // remember offset of lb in the task 
structure 4778 size_t upper_offset = 4779 (char *)ub - (char *)task; // remember offset of ub in the task structure 4780 4781 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + 4782 (last_chunk < 0 ? last_chunk : extras)); 4783 KMP_DEBUG_ASSERT(num_tasks > extras); 4784 KMP_DEBUG_ASSERT(num_tasks > 0); 4785 4786 // split the loop in two halves 4787 kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1; 4788 kmp_int64 last_chunk0 = 0, last_chunk1 = 0; 4789 kmp_uint64 gr_size0 = grainsize; 4790 kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute 4791 kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task 4792 if (last_chunk < 0) { 4793 ext0 = ext1 = 0; 4794 last_chunk1 = last_chunk; 4795 tc0 = grainsize * n_tsk0; 4796 tc1 = tc - tc0; 4797 } else if (n_tsk0 <= extras) { 4798 gr_size0++; // integrate extras into grainsize 4799 ext0 = 0; // no extra iters in 1st half 4800 ext1 = extras - n_tsk0; // remaining extras 4801 tc0 = gr_size0 * n_tsk0; 4802 tc1 = tc - tc0; 4803 } else { // n_tsk0 > extras 4804 ext1 = 0; // no extra iters in 2nd half 4805 ext0 = extras; 4806 tc1 = grainsize * n_tsk1; 4807 tc0 = tc - tc1; 4808 } 4809 ub0 = lower + st * (tc0 - 1); 4810 lb1 = ub0 + st; 4811 4812 // create pattern task for 2nd half of the loop 4813 next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task 4814 // adjust lower bound (upper bound is not changed) for the 2nd half 4815 *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1; 4816 if (ptask_dup != NULL) // construct firstprivates, etc. 4817 ptask_dup(next_task, task, 0); 4818 *ub = ub0; // adjust upper bound for the 1st half 4819 4820 // create auxiliary task for 2nd half of the loop 4821 // make sure new task has same parent task as the pattern task 4822 kmp_taskdata_t *current_task = thread->th.th_current_task; 4823 thread->th.th_current_task = taskdata->td_parent; 4824 kmp_task_t *new_task = 4825 __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *), 4826 sizeof(__taskloop_params_t), &__kmp_taskloop_task); 4827 // restore current task 4828 thread->th.th_current_task = current_task; 4829 __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds; 4830 p->task = next_task; 4831 p->lb = (kmp_uint64 *)((char *)next_task + lower_offset); 4832 p->ub = (kmp_uint64 *)((char *)next_task + upper_offset); 4833 p->task_dup = task_dup; 4834 p->st = st; 4835 p->ub_glob = ub_glob; 4836 p->num_tasks = n_tsk1; 4837 p->grainsize = grainsize; 4838 p->extras = ext1; 4839 p->last_chunk = last_chunk1; 4840 p->tc = tc1; 4841 p->num_t_min = num_t_min; 4842 #if OMPT_SUPPORT 4843 p->codeptr_ra = codeptr_ra; 4844 #endif 4845 4846 #if OMPT_SUPPORT 4847 // schedule new task with correct return address for OMPT events 4848 __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra); 4849 #else 4850 __kmp_omp_task(gtid, new_task, true); // schedule new task 4851 #endif 4852 4853 // execute the 1st half of current subrange 4854 if (n_tsk0 > num_t_min) 4855 __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0, 4856 ext0, last_chunk0, tc0, num_t_min, 4857 #if OMPT_SUPPORT 4858 codeptr_ra, 4859 #endif 4860 task_dup); 4861 else 4862 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, 4863 gr_size0, ext0, last_chunk0, tc0, 4864 #if OMPT_SUPPORT 4865 codeptr_ra, 4866 #endif 4867 task_dup); 4868 4869 KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid)); 4870 } 4871 4872 static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, 4873 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, 4874 int 
nogroup, int sched, kmp_uint64 grainsize, 4875 int modifier, void *task_dup) { 4876 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 4877 KMP_DEBUG_ASSERT(task != NULL); 4878 if (nogroup == 0) { 4879 #if OMPT_SUPPORT && OMPT_OPTIONAL 4880 OMPT_STORE_RETURN_ADDRESS(gtid); 4881 #endif 4882 __kmpc_taskgroup(loc, gtid); 4883 } 4884 4885 // ========================================================================= 4886 // calculate loop parameters 4887 kmp_taskloop_bounds_t task_bounds(task, lb, ub); 4888 kmp_uint64 tc; 4889 // compiler provides global bounds here 4890 kmp_uint64 lower = task_bounds.get_lb(); 4891 kmp_uint64 upper = task_bounds.get_ub(); 4892 kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag 4893 kmp_uint64 num_tasks = 0, extras = 0; 4894 kmp_int64 last_chunk = 4895 0; // reduce grainsize of last task by last_chunk in strict mode 4896 kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks; 4897 kmp_info_t *thread = __kmp_threads[gtid]; 4898 kmp_taskdata_t *current_task = thread->th.th_current_task; 4899 4900 KA_TRACE(20, ("__kmp_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, " 4901 "grain %llu(%d, %d), dup %p\n", 4902 gtid, taskdata, lower, upper, st, grainsize, sched, modifier, 4903 task_dup)); 4904 4905 // compute trip count 4906 if (st == 1) { // most common case 4907 tc = upper - lower + 1; 4908 } else if (st < 0) { 4909 tc = (lower - upper) / (-st) + 1; 4910 } else { // st > 0 4911 tc = (upper - lower) / st + 1; 4912 } 4913 if (tc == 0) { 4914 KA_TRACE(20, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid)); 4915 // free the pattern task and exit 4916 __kmp_task_start(gtid, task, current_task); 4917 // do not execute anything for zero-trip loop 4918 __kmp_task_finish<false>(gtid, task, current_task); 4919 return; 4920 } 4921 4922 #if OMPT_SUPPORT && OMPT_OPTIONAL 4923 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 4924 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 4925 if (ompt_enabled.ompt_callback_work) { 4926 ompt_callbacks.ompt_callback(ompt_callback_work)( 4927 ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data), 4928 &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0)); 4929 } 4930 #endif 4931 4932 if (num_tasks_min == 0) 4933 // TODO: can we choose better default heuristic? 
4934 num_tasks_min = 4935 KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE); 4936 4937 // compute num_tasks/grainsize based on the input provided 4938 switch (sched) { 4939 case 0: // no schedule clause specified, we can choose the default 4940 // let's try to schedule (team_size*10) tasks 4941 grainsize = thread->th.th_team_nproc * 10; 4942 KMP_FALLTHROUGH(); 4943 case 2: // num_tasks provided 4944 if (grainsize > tc) { 4945 num_tasks = tc; // too big num_tasks requested, adjust values 4946 grainsize = 1; 4947 extras = 0; 4948 } else { 4949 num_tasks = grainsize; 4950 grainsize = tc / num_tasks; 4951 extras = tc % num_tasks; 4952 } 4953 break; 4954 case 1: // grainsize provided 4955 if (grainsize > tc) { 4956 num_tasks = 1; 4957 grainsize = tc; // too big grainsize requested, adjust values 4958 extras = 0; 4959 } else { 4960 if (modifier) { 4961 num_tasks = (tc + grainsize - 1) / grainsize; 4962 last_chunk = tc - (num_tasks * grainsize); 4963 extras = 0; 4964 } else { 4965 num_tasks = tc / grainsize; 4966 // adjust grainsize for balanced distribution of iterations 4967 grainsize = tc / num_tasks; 4968 extras = tc % num_tasks; 4969 } 4970 } 4971 break; 4972 default: 4973 KMP_ASSERT2(0, "unknown scheduling of taskloop"); 4974 } 4975 4976 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + 4977 (last_chunk < 0 ? last_chunk : extras)); 4978 KMP_DEBUG_ASSERT(num_tasks > extras); 4979 KMP_DEBUG_ASSERT(num_tasks > 0); 4980 // ========================================================================= 4981 4982 // check if clause value first 4983 // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native) 4984 if (if_val == 0) { // if(0) specified, mark task as serial 4985 taskdata->td_flags.task_serial = 1; 4986 taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied 4987 // always start serial tasks linearly 4988 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, 4989 grainsize, extras, last_chunk, tc, 4990 #if OMPT_SUPPORT 4991 OMPT_GET_RETURN_ADDRESS(0), 4992 #endif 4993 task_dup); 4994 // !taskdata->td_flags.native => currently force linear spawning of tasks 4995 // for GOMP_taskloop 4996 } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) { 4997 KA_TRACE(20, ("__kmp_taskloop: T#%d, go recursive: tc %llu, #tasks %llu" 4998 "(%lld), grain %llu, extras %llu, last_chunk %lld\n", 4999 gtid, tc, num_tasks, num_tasks_min, grainsize, extras, 5000 last_chunk)); 5001 __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, 5002 grainsize, extras, last_chunk, tc, num_tasks_min, 5003 #if OMPT_SUPPORT 5004 OMPT_GET_RETURN_ADDRESS(0), 5005 #endif 5006 task_dup); 5007 } else { 5008 KA_TRACE(20, ("__kmp_taskloop: T#%d, go linear: tc %llu, #tasks %llu" 5009 "(%lld), grain %llu, extras %llu, last_chunk %lld\n", 5010 gtid, tc, num_tasks, num_tasks_min, grainsize, extras, 5011 last_chunk)); 5012 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, 5013 grainsize, extras, last_chunk, tc, 5014 #if OMPT_SUPPORT 5015 OMPT_GET_RETURN_ADDRESS(0), 5016 #endif 5017 task_dup); 5018 } 5019 5020 #if OMPT_SUPPORT && OMPT_OPTIONAL 5021 if (ompt_enabled.ompt_callback_work) { 5022 ompt_callbacks.ompt_callback(ompt_callback_work)( 5023 ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data), 5024 &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0)); 5025 } 5026 #endif 5027 5028 if (nogroup == 0) { 5029 #if OMPT_SUPPORT && OMPT_OPTIONAL 5030 OMPT_STORE_RETURN_ADDRESS(gtid); 5031 #endif 5032 
__kmpc_end_taskgroup(loc, gtid); 5033 } 5034 KA_TRACE(20, ("__kmp_taskloop(exit): T#%d\n", gtid)); 5035 } 5036 5037 /*! 5038 @ingroup TASKING 5039 @param loc Source location information 5040 @param gtid Global thread ID 5041 @param task Task structure 5042 @param if_val Value of the if clause 5043 @param lb Pointer to loop lower bound in task structure 5044 @param ub Pointer to loop upper bound in task structure 5045 @param st Loop stride 5046 @param nogroup Flag, 1 if nogroup clause specified, 0 otherwise 5047 @param sched Schedule specified 0/1/2 for none/grainsize/num_tasks 5048 @param grainsize Schedule value if specified 5049 @param task_dup Tasks duplication routine 5050 5051 Execute the taskloop construct. 5052 */ 5053 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, 5054 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, 5055 int sched, kmp_uint64 grainsize, void *task_dup) { 5056 __kmp_assert_valid_gtid(gtid); 5057 KA_TRACE(20, ("__kmpc_taskloop(enter): T#%d\n", gtid)); 5058 __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize, 5059 0, task_dup); 5060 KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid)); 5061 } 5062 5063 /*! 5064 @ingroup TASKING 5065 @param loc Source location information 5066 @param gtid Global thread ID 5067 @param task Task structure 5068 @param if_val Value of the if clause 5069 @param lb Pointer to loop lower bound in task structure 5070 @param ub Pointer to loop upper bound in task structure 5071 @param st Loop stride 5072 @param nogroup Flag, 1 if nogroup clause specified, 0 otherwise 5073 @param sched Schedule specified 0/1/2 for none/grainsize/num_tasks 5074 @param grainsize Schedule value if specified 5075 @param modifier Modifier 'strict' for sched, 1 if present, 0 otherwise 5076 @param task_dup Tasks duplication routine 5077 5078 Execute the taskloop construct. 5079 */ 5080 void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val, 5081 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, 5082 int nogroup, int sched, kmp_uint64 grainsize, 5083 int modifier, void *task_dup) { 5084 __kmp_assert_valid_gtid(gtid); 5085 KA_TRACE(20, ("__kmpc_taskloop_5(enter): T#%d\n", gtid)); 5086 __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize, 5087 modifier, task_dup); 5088 KA_TRACE(20, ("__kmpc_taskloop_5(exit): T#%d\n", gtid)); 5089 } 5090
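/*
  Editor's note (illustrative): a worked example of the values these entry
  points produce. For a loop of tc = 10 iterations with 4 requested tasks
  (sched = 2, grainsize argument = 4), __kmp_taskloop() computes
  grainsize = 10 / 4 = 2 and extras = 10 % 4 = 2, giving chunk sizes
  3, 3, 2, 2, which satisfies tc == num_tasks * grainsize + extras.
  With a strict grainsize of 3 (sched = 1, modifier = 1, reached through
  __kmpc_taskloop_5()), it computes num_tasks = (10 + 3 - 1) / 3 = 4 and
  last_chunk = 10 - 4 * 3 = -2, i.e. chunks of 3, 3, 3, 1.
  A user-level loop that a compiler might lower to a __kmpc_taskloop() call:

    void scale(double *a, int n) {
      #pragma omp taskloop num_tasks(4)
      for (int i = 0; i < n; ++i) // the loop body is outlined into a pattern
        a[i] *= 2.0;              // task handed to __kmpc_taskloop()
    }
*/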