1 /* 2 * kmp_tasking.cpp -- OpenMP 3.0 tasking support. 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 8 // See https://llvm.org/LICENSE.txt for license information. 9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "kmp.h" 14 #include "kmp_i18n.h" 15 #include "kmp_itt.h" 16 #include "kmp_stats.h" 17 #include "kmp_wait_release.h" 18 #include "kmp_taskdeps.h" 19 20 #if OMPT_SUPPORT 21 #include "ompt-specific.h" 22 #endif 23 24 /* forward declaration */ 25 static void __kmp_enable_tasking(kmp_task_team_t *task_team, 26 kmp_info_t *this_thr); 27 static void __kmp_alloc_task_deque(kmp_info_t *thread, 28 kmp_thread_data_t *thread_data); 29 static int __kmp_realloc_task_threads_data(kmp_info_t *thread, 30 kmp_task_team_t *task_team); 31 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask); 32 33 #ifdef BUILD_TIED_TASK_STACK 34 35 // __kmp_trace_task_stack: print the tied tasks from the task stack in order 36 // from top do bottom 37 // 38 // gtid: global thread identifier for thread containing stack 39 // thread_data: thread data for task team thread containing stack 40 // threshold: value above which the trace statement triggers 41 // location: string identifying call site of this function (for trace) 42 static void __kmp_trace_task_stack(kmp_int32 gtid, 43 kmp_thread_data_t *thread_data, 44 int threshold, char *location) { 45 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 46 kmp_taskdata_t **stack_top = task_stack->ts_top; 47 kmp_int32 entries = task_stack->ts_entries; 48 kmp_taskdata_t *tied_task; 49 50 KA_TRACE( 51 threshold, 52 ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, " 53 "first_block = %p, stack_top = %p \n", 54 location, gtid, entries, 
task_stack->ts_first_block, stack_top)); 55 56 KMP_DEBUG_ASSERT(stack_top != NULL); 57 KMP_DEBUG_ASSERT(entries > 0); 58 59 while (entries != 0) { 60 KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]); 61 // fix up ts_top if we need to pop from previous block 62 if (entries & TASK_STACK_INDEX_MASK == 0) { 63 kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top); 64 65 stack_block = stack_block->sb_prev; 66 stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE]; 67 } 68 69 // finish bookkeeping 70 stack_top--; 71 entries--; 72 73 tied_task = *stack_top; 74 75 KMP_DEBUG_ASSERT(tied_task != NULL); 76 KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED); 77 78 KA_TRACE(threshold, 79 ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, " 80 "stack_top=%p, tied_task=%p\n", 81 location, gtid, entries, stack_top, tied_task)); 82 } 83 KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]); 84 85 KA_TRACE(threshold, 86 ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n", 87 location, gtid)); 88 } 89 90 // __kmp_init_task_stack: initialize the task stack for the first time 91 // after a thread_data structure is created. 92 // It should not be necessary to do this again (assuming the stack works). 
93 // 94 // gtid: global thread identifier of calling thread 95 // thread_data: thread data for task team thread containing stack 96 static void __kmp_init_task_stack(kmp_int32 gtid, 97 kmp_thread_data_t *thread_data) { 98 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 99 kmp_stack_block_t *first_block; 100 101 // set up the first block of the stack 102 first_block = &task_stack->ts_first_block; 103 task_stack->ts_top = (kmp_taskdata_t **)first_block; 104 memset((void *)first_block, '\0', 105 TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *)); 106 107 // initialize the stack to be empty 108 task_stack->ts_entries = TASK_STACK_EMPTY; 109 first_block->sb_next = NULL; 110 first_block->sb_prev = NULL; 111 } 112 113 // __kmp_free_task_stack: free the task stack when thread_data is destroyed. 114 // 115 // gtid: global thread identifier for calling thread 116 // thread_data: thread info for thread containing stack 117 static void __kmp_free_task_stack(kmp_int32 gtid, 118 kmp_thread_data_t *thread_data) { 119 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 120 kmp_stack_block_t *stack_block = &task_stack->ts_first_block; 121 122 KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY); 123 // free from the second block of the stack 124 while (stack_block != NULL) { 125 kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL; 126 127 stack_block->sb_next = NULL; 128 stack_block->sb_prev = NULL; 129 if (stack_block != &task_stack->ts_first_block) { 130 __kmp_thread_free(thread, 131 stack_block); // free the block, if not the first 132 } 133 stack_block = next_block; 134 } 135 // initialize the stack to be empty 136 task_stack->ts_entries = 0; 137 task_stack->ts_top = NULL; 138 } 139 140 // __kmp_push_task_stack: Push the tied task onto the task stack. 141 // Grow the stack if necessary by allocating another block. 
142 // 143 // gtid: global thread identifier for calling thread 144 // thread: thread info for thread containing stack 145 // tied_task: the task to push on the stack 146 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread, 147 kmp_taskdata_t *tied_task) { 148 // GEH - need to consider what to do if tt_threads_data not allocated yet 149 kmp_thread_data_t *thread_data = 150 &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)]; 151 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 152 153 if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) { 154 return; // Don't push anything on stack if team or team tasks are serialized 155 } 156 157 KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED); 158 KMP_DEBUG_ASSERT(task_stack->ts_top != NULL); 159 160 KA_TRACE(20, 161 ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n", 162 gtid, thread, tied_task)); 163 // Store entry 164 *(task_stack->ts_top) = tied_task; 165 166 // Do bookkeeping for next push 167 task_stack->ts_top++; 168 task_stack->ts_entries++; 169 170 if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) { 171 // Find beginning of this task block 172 kmp_stack_block_t *stack_block = 173 (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE); 174 175 // Check if we already have a block 176 if (stack_block->sb_next != 177 NULL) { // reset ts_top to beginning of next block 178 task_stack->ts_top = &stack_block->sb_next->sb_block[0]; 179 } else { // Alloc new block and link it up 180 kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc( 181 thread, sizeof(kmp_stack_block_t)); 182 183 task_stack->ts_top = &new_block->sb_block[0]; 184 stack_block->sb_next = new_block; 185 new_block->sb_prev = stack_block; 186 new_block->sb_next = NULL; 187 188 KA_TRACE( 189 30, 190 ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n", 191 gtid, tied_task, new_block)); 192 } 193 } 194 KA_TRACE(20, 
("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid, 195 tied_task)); 196 } 197 198 // __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return 199 // the task, just check to make sure it matches the ending task passed in. 200 // 201 // gtid: global thread identifier for the calling thread 202 // thread: thread info structure containing stack 203 // tied_task: the task popped off the stack 204 // ending_task: the task that is ending (should match popped task) 205 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread, 206 kmp_taskdata_t *ending_task) { 207 // GEH - need to consider what to do if tt_threads_data not allocated yet 208 kmp_thread_data_t *thread_data = 209 &thread->th.th_task_team->tt_threads_data[__kmp_tid_from_gtid(gtid)]; 210 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 211 kmp_taskdata_t *tied_task; 212 213 if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) { 214 // Don't pop anything from stack if team or team tasks are serialized 215 return; 216 } 217 218 KMP_DEBUG_ASSERT(task_stack->ts_top != NULL); 219 KMP_DEBUG_ASSERT(task_stack->ts_entries > 0); 220 221 KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid, 222 thread)); 223 224 // fix up ts_top if we need to pop from previous block 225 if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) { 226 kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top); 227 228 stack_block = stack_block->sb_prev; 229 task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE]; 230 } 231 232 // finish bookkeeping 233 task_stack->ts_top--; 234 task_stack->ts_entries--; 235 236 tied_task = *(task_stack->ts_top); 237 238 KMP_DEBUG_ASSERT(tied_task != NULL); 239 KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED); 240 KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly 241 242 KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid, 
243 tied_task)); 244 return; 245 } 246 #endif /* BUILD_TIED_TASK_STACK */ 247 248 // returns 1 if new task is allowed to execute, 0 otherwise 249 // checks Task Scheduling constraint (if requested) and 250 // mutexinoutset dependencies if any 251 static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained, 252 const kmp_taskdata_t *tasknew, 253 const kmp_taskdata_t *taskcurr) { 254 if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) { 255 // Check if the candidate obeys the Task Scheduling Constraints (TSC) 256 // only descendant of all deferred tied tasks can be scheduled, checking 257 // the last one is enough, as it in turn is the descendant of all others 258 kmp_taskdata_t *current = taskcurr->td_last_tied; 259 KMP_DEBUG_ASSERT(current != NULL); 260 // check if the task is not suspended on barrier 261 if (current->td_flags.tasktype == TASK_EXPLICIT || 262 current->td_taskwait_thread > 0) { // <= 0 on barrier 263 kmp_int32 level = current->td_level; 264 kmp_taskdata_t *parent = tasknew->td_parent; 265 while (parent != current && parent->td_level > level) { 266 // check generation up to the level of the current task 267 parent = parent->td_parent; 268 KMP_DEBUG_ASSERT(parent != NULL); 269 } 270 if (parent != current) 271 return false; 272 } 273 } 274 // Check mutexinoutset dependencies, acquire locks 275 kmp_depnode_t *node = tasknew->td_depnode; 276 if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) { 277 for (int i = 0; i < node->dn.mtx_num_locks; ++i) { 278 KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL); 279 if (__kmp_test_lock(node->dn.mtx_locks[i], gtid)) 280 continue; 281 // could not get the lock, release previous locks 282 for (int j = i - 1; j >= 0; --j) 283 __kmp_release_lock(node->dn.mtx_locks[j], gtid); 284 return false; 285 } 286 // negative num_locks means all locks acquired successfully 287 node->dn.mtx_num_locks = -node->dn.mtx_num_locks; 288 } 289 return true; 290 } 291 292 // __kmp_realloc_task_deque: 293 // 
Re-allocates a task deque for a particular thread, copies the content from 294 // the old deque and adjusts the necessary data structures relating to the 295 // deque. This operation must be done with the deque_lock being held 296 static void __kmp_realloc_task_deque(kmp_info_t *thread, 297 kmp_thread_data_t *thread_data) { 298 kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td); 299 KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size); 300 kmp_int32 new_size = 2 * size; 301 302 KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to " 303 "%d] for thread_data %p\n", 304 __kmp_gtid_from_thread(thread), size, new_size, thread_data)); 305 306 kmp_taskdata_t **new_deque = 307 (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *)); 308 309 int i, j; 310 for (i = thread_data->td.td_deque_head, j = 0; j < size; 311 i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++) 312 new_deque[j] = thread_data->td.td_deque[i]; 313 314 __kmp_free(thread_data->td.td_deque); 315 316 thread_data->td.td_deque_head = 0; 317 thread_data->td.td_deque_tail = size; 318 thread_data->td.td_deque = new_deque; 319 thread_data->td.td_deque_size = new_size; 320 } 321 322 // __kmp_push_task: Add a task to the thread's deque 323 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) { 324 kmp_info_t *thread = __kmp_threads[gtid]; 325 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 326 327 // We don't need to map to shadow gtid if it is already hidden helper thread 328 if (taskdata->td_flags.hidden_helper && !KMP_HIDDEN_HELPER_THREAD(gtid)) { 329 gtid = KMP_GTID_TO_SHADOW_GTID(gtid); 330 thread = __kmp_threads[gtid]; 331 } 332 333 kmp_task_team_t *task_team = thread->th.th_task_team; 334 kmp_int32 tid = __kmp_tid_from_gtid(gtid); 335 kmp_thread_data_t *thread_data; 336 337 KA_TRACE(20, 338 ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata)); 339 340 if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) { 341 // untied 
task needs to increment counter so that the task structure is not 342 // freed prematurely 343 kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count); 344 KMP_DEBUG_USE_VAR(counter); 345 KA_TRACE( 346 20, 347 ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n", 348 gtid, counter, taskdata)); 349 } 350 351 // The first check avoids building task_team thread data if serialized 352 if (UNLIKELY(taskdata->td_flags.task_serial)) { 353 KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning " 354 "TASK_NOT_PUSHED for task %p\n", 355 gtid, taskdata)); 356 return TASK_NOT_PUSHED; 357 } 358 359 // Now that serialized tasks have returned, we can assume that we are not in 360 // immediate exec mode 361 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 362 if (UNLIKELY(!KMP_TASKING_ENABLED(task_team))) { 363 __kmp_enable_tasking(task_team, thread); 364 } 365 KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE); 366 KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL); 367 368 // Find tasking deque specific to encountering thread 369 thread_data = &task_team->tt.tt_threads_data[tid]; 370 371 // No lock needed since only owner can allocate. If the task is hidden_helper, 372 // we don't need it either because we have initialized the dequeue for hidden 373 // helper thread data. 
374 if (UNLIKELY(thread_data->td.td_deque == NULL)) { 375 __kmp_alloc_task_deque(thread, thread_data); 376 } 377 378 int locked = 0; 379 // Check if deque is full 380 if (TCR_4(thread_data->td.td_deque_ntasks) >= 381 TASK_DEQUE_SIZE(thread_data->td)) { 382 if (__kmp_enable_task_throttling && 383 __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata, 384 thread->th.th_current_task)) { 385 KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning " 386 "TASK_NOT_PUSHED for task %p\n", 387 gtid, taskdata)); 388 return TASK_NOT_PUSHED; 389 } else { 390 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 391 locked = 1; 392 if (TCR_4(thread_data->td.td_deque_ntasks) >= 393 TASK_DEQUE_SIZE(thread_data->td)) { 394 // expand deque to push the task which is not allowed to execute 395 __kmp_realloc_task_deque(thread, thread_data); 396 } 397 } 398 } 399 // Lock the deque for the task push operation 400 if (!locked) { 401 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 402 // Need to recheck as we can get a proxy task from thread outside of OpenMP 403 if (TCR_4(thread_data->td.td_deque_ntasks) >= 404 TASK_DEQUE_SIZE(thread_data->td)) { 405 if (__kmp_enable_task_throttling && 406 __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata, 407 thread->th.th_current_task)) { 408 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 409 KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; " 410 "returning TASK_NOT_PUSHED for task %p\n", 411 gtid, taskdata)); 412 return TASK_NOT_PUSHED; 413 } else { 414 // expand deque to push the task which is not allowed to execute 415 __kmp_realloc_task_deque(thread, thread_data); 416 } 417 } 418 } 419 // Must have room since no thread can add tasks but calling thread 420 KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) < 421 TASK_DEQUE_SIZE(thread_data->td)); 422 423 thread_data->td.td_deque[thread_data->td.td_deque_tail] = 424 taskdata; // Push taskdata 425 // Wrap 
index. 426 thread_data->td.td_deque_tail = 427 (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td); 428 TCW_4(thread_data->td.td_deque_ntasks, 429 TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count 430 KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self 431 KMP_FSYNC_RELEASING(taskdata); // releasing child 432 KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: " 433 "task=%p ntasks=%d head=%u tail=%u\n", 434 gtid, taskdata, thread_data->td.td_deque_ntasks, 435 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 436 437 auto hidden_helper = taskdata->td_flags.hidden_helper; 438 439 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 440 441 // Signal one worker thread to execute the task 442 if (UNLIKELY(hidden_helper)) { 443 // Wake hidden helper threads up if they're sleeping 444 __kmp_hidden_helper_worker_thread_signal(); 445 } 446 447 return TASK_SUCCESSFULLY_PUSHED; 448 } 449 450 // __kmp_pop_current_task_from_thread: set up current task from called thread 451 // when team ends 452 // 453 // this_thr: thread structure to set current_task in. 
454 void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) { 455 KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d " 456 "this_thread=%p, curtask=%p, " 457 "curtask_parent=%p\n", 458 0, this_thr, this_thr->th.th_current_task, 459 this_thr->th.th_current_task->td_parent)); 460 461 this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent; 462 463 KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d " 464 "this_thread=%p, curtask=%p, " 465 "curtask_parent=%p\n", 466 0, this_thr, this_thr->th.th_current_task, 467 this_thr->th.th_current_task->td_parent)); 468 } 469 470 // __kmp_push_current_task_to_thread: set up current task in called thread for a 471 // new team 472 // 473 // this_thr: thread structure to set up 474 // team: team for implicit task data 475 // tid: thread within team to set up 476 void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team, 477 int tid) { 478 // current task of the thread is a parent of the new just created implicit 479 // tasks of new team 480 KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p " 481 "curtask=%p " 482 "parent_task=%p\n", 483 tid, this_thr, this_thr->th.th_current_task, 484 team->t.t_implicit_task_taskdata[tid].td_parent)); 485 486 KMP_DEBUG_ASSERT(this_thr != NULL); 487 488 if (tid == 0) { 489 if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) { 490 team->t.t_implicit_task_taskdata[0].td_parent = 491 this_thr->th.th_current_task; 492 this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0]; 493 } 494 } else { 495 team->t.t_implicit_task_taskdata[tid].td_parent = 496 team->t.t_implicit_task_taskdata[0].td_parent; 497 this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid]; 498 } 499 500 KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p " 501 "curtask=%p " 502 "parent_task=%p\n", 503 tid, this_thr, this_thr->th.th_current_task, 504 
team->t.t_implicit_task_taskdata[tid].td_parent)); 505 } 506 507 // __kmp_task_start: bookkeeping for a task starting execution 508 // 509 // GTID: global thread id of calling thread 510 // task: task starting execution 511 // current_task: task suspending 512 static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task, 513 kmp_taskdata_t *current_task) { 514 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 515 kmp_info_t *thread = __kmp_threads[gtid]; 516 517 KA_TRACE(10, 518 ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n", 519 gtid, taskdata, current_task)); 520 521 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT); 522 523 // mark currently executing task as suspended 524 // TODO: GEH - make sure root team implicit task is initialized properly. 525 // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 ); 526 current_task->td_flags.executing = 0; 527 528 // Add task to stack if tied 529 #ifdef BUILD_TIED_TASK_STACK 530 if (taskdata->td_flags.tiedness == TASK_TIED) { 531 __kmp_push_task_stack(gtid, thread, taskdata); 532 } 533 #endif /* BUILD_TIED_TASK_STACK */ 534 535 // mark starting task as executing and as current task 536 thread->th.th_current_task = taskdata; 537 538 KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 || 539 taskdata->td_flags.tiedness == TASK_UNTIED); 540 KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 || 541 taskdata->td_flags.tiedness == TASK_UNTIED); 542 taskdata->td_flags.started = 1; 543 taskdata->td_flags.executing = 1; 544 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0); 545 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0); 546 547 // GEH TODO: shouldn't we pass some sort of location identifier here? 548 // APT: yes, we will pass location here. 
549 // need to store current thread state (in a thread or taskdata structure) 550 // before setting work_state, otherwise wrong state is set after end of task 551 552 KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata)); 553 554 return; 555 } 556 557 #if OMPT_SUPPORT 558 //------------------------------------------------------------------------------ 559 // __ompt_task_init: 560 // Initialize OMPT fields maintained by a task. This will only be called after 561 // ompt_start_tool, so we already know whether ompt is enabled or not. 562 563 static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) { 564 // The calls to __ompt_task_init already have the ompt_enabled condition. 565 task->ompt_task_info.task_data.value = 0; 566 task->ompt_task_info.frame.exit_frame = ompt_data_none; 567 task->ompt_task_info.frame.enter_frame = ompt_data_none; 568 task->ompt_task_info.frame.exit_frame_flags = 569 ompt_frame_runtime | ompt_frame_framepointer; 570 task->ompt_task_info.frame.enter_frame_flags = 571 ompt_frame_runtime | ompt_frame_framepointer; 572 } 573 574 // __ompt_task_start: 575 // Build and trigger task-begin event 576 static inline void __ompt_task_start(kmp_task_t *task, 577 kmp_taskdata_t *current_task, 578 kmp_int32 gtid) { 579 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 580 ompt_task_status_t status = ompt_task_switch; 581 if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) { 582 status = ompt_task_yield; 583 __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0; 584 } 585 /* let OMPT know that we're about to run this task */ 586 if (ompt_enabled.ompt_callback_task_schedule) { 587 ompt_callbacks.ompt_callback(ompt_callback_task_schedule)( 588 &(current_task->ompt_task_info.task_data), status, 589 &(taskdata->ompt_task_info.task_data)); 590 } 591 taskdata->ompt_task_info.scheduling_parent = current_task; 592 } 593 594 // __ompt_task_finish: 595 // Build and trigger final task-schedule event 596 static 
inline void __ompt_task_finish(kmp_task_t *task, 597 kmp_taskdata_t *resumed_task, 598 ompt_task_status_t status) { 599 if (ompt_enabled.ompt_callback_task_schedule) { 600 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 601 if (__kmp_omp_cancellation && taskdata->td_taskgroup && 602 taskdata->td_taskgroup->cancel_request == cancel_taskgroup) { 603 status = ompt_task_cancel; 604 } 605 606 /* let OMPT know that we're returning to the callee task */ 607 ompt_callbacks.ompt_callback(ompt_callback_task_schedule)( 608 &(taskdata->ompt_task_info.task_data), status, 609 (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL)); 610 } 611 } 612 #endif 613 614 template <bool ompt> 615 static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid, 616 kmp_task_t *task, 617 void *frame_address, 618 void *return_address) { 619 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 620 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 621 622 KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p " 623 "current_task=%p\n", 624 gtid, loc_ref, taskdata, current_task)); 625 626 if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) { 627 // untied task needs to increment counter so that the task structure is not 628 // freed prematurely 629 kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count); 630 KMP_DEBUG_USE_VAR(counter); 631 KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) " 632 "incremented for task %p\n", 633 gtid, counter, taskdata)); 634 } 635 636 taskdata->td_flags.task_serial = 637 1; // Execute this task immediately, not deferred. 
638 __kmp_task_start(gtid, task, current_task); 639 640 #if OMPT_SUPPORT 641 if (ompt) { 642 if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) { 643 current_task->ompt_task_info.frame.enter_frame.ptr = 644 taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address; 645 current_task->ompt_task_info.frame.enter_frame_flags = 646 taskdata->ompt_task_info.frame.exit_frame_flags = 647 ompt_frame_application | ompt_frame_framepointer; 648 } 649 if (ompt_enabled.ompt_callback_task_create) { 650 ompt_task_info_t *parent_info = &(current_task->ompt_task_info); 651 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 652 &(parent_info->task_data), &(parent_info->frame), 653 &(taskdata->ompt_task_info.task_data), 654 ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0, 655 return_address); 656 } 657 __ompt_task_start(task, current_task, gtid); 658 } 659 #endif // OMPT_SUPPORT 660 661 KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid, 662 loc_ref, taskdata)); 663 } 664 665 #if OMPT_SUPPORT 666 OMPT_NOINLINE 667 static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid, 668 kmp_task_t *task, 669 void *frame_address, 670 void *return_address) { 671 __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address, 672 return_address); 673 } 674 #endif // OMPT_SUPPORT 675 676 // __kmpc_omp_task_begin_if0: report that a given serialized task has started 677 // execution 678 // 679 // loc_ref: source location information; points to beginning of task block. 680 // gtid: global thread number. 681 // task: task thunk for the started task. 
682 void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid, 683 kmp_task_t *task) { 684 #if OMPT_SUPPORT 685 if (UNLIKELY(ompt_enabled.enabled)) { 686 OMPT_STORE_RETURN_ADDRESS(gtid); 687 __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task, 688 OMPT_GET_FRAME_ADDRESS(1), 689 OMPT_LOAD_RETURN_ADDRESS(gtid)); 690 return; 691 } 692 #endif 693 __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL); 694 } 695 696 #ifdef TASK_UNUSED 697 // __kmpc_omp_task_begin: report that a given task has started execution 698 // NEVER GENERATED BY COMPILER, DEPRECATED!!! 699 void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) { 700 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 701 702 KA_TRACE( 703 10, 704 ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n", 705 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task)); 706 707 __kmp_task_start(gtid, task, current_task); 708 709 KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid, 710 loc_ref, KMP_TASK_TO_TASKDATA(task))); 711 return; 712 } 713 #endif // TASK_UNUSED 714 715 // __kmp_free_task: free the current task space and the space for shareds 716 // 717 // gtid: Global thread ID of calling thread 718 // taskdata: task to free 719 // thread: thread data structure of caller 720 static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata, 721 kmp_info_t *thread) { 722 KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid, 723 taskdata)); 724 725 // Check to make sure all flags and counters have the correct values 726 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT); 727 KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0); 728 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1); 729 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0); 730 KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 || 731 taskdata->td_flags.task_serial == 1); 732 
KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0); 733 734 taskdata->td_flags.freed = 1; 735 // deallocate the taskdata and shared variable blocks associated with this task 736 #if USE_FAST_MEMORY 737 __kmp_fast_free(thread, taskdata); 738 #else /* ! USE_FAST_MEMORY */ 739 __kmp_thread_free(thread, taskdata); 740 #endif 741 KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata)); 742 } 743 744 // __kmp_free_task_and_ancestors: free the current task and ancestors without 745 // children 746 // 747 // gtid: Global thread ID of calling thread 748 // taskdata: task to free 749 // thread: thread data structure of caller 750 static void __kmp_free_task_and_ancestors(kmp_int32 gtid, 751 kmp_taskdata_t *taskdata, 752 kmp_info_t *thread) { 753 // Proxy tasks must always be allowed to free their parents 754 // because they can be run in background even in serial mode. 755 kmp_int32 team_serial = 756 (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) && 757 !taskdata->td_flags.proxy; 758 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT); 759 760 kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1; 761 KMP_DEBUG_ASSERT(children >= 0); 762 763 // Now, go up the ancestor tree to see if any ancestors can now be freed. 764 while (children == 0) { 765 kmp_taskdata_t *parent_taskdata = taskdata->td_parent; 766 767 KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete " 768 "and freeing itself\n", 769 gtid, taskdata)); 770 771 // --- Deallocate my ancestor task --- 772 __kmp_free_task(gtid, taskdata, thread); 773 774 taskdata = parent_taskdata; 775 776 if (team_serial) 777 return; 778 // Stop checking ancestors at implicit task instead of walking up ancestor 779 // tree to avoid premature deallocation of ancestors. 780 if (taskdata->td_flags.tasktype == TASK_IMPLICIT) { 781 if (taskdata->td_dephash) { // do we need to cleanup dephash? 
782 int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks); 783 kmp_tasking_flags_t flags_old = taskdata->td_flags; 784 if (children == 0 && flags_old.complete == 1) { 785 kmp_tasking_flags_t flags_new = flags_old; 786 flags_new.complete = 0; 787 if (KMP_COMPARE_AND_STORE_ACQ32( 788 RCAST(kmp_int32 *, &taskdata->td_flags), 789 *RCAST(kmp_int32 *, &flags_old), 790 *RCAST(kmp_int32 *, &flags_new))) { 791 KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans " 792 "dephash of implicit task %p\n", 793 gtid, taskdata)); 794 // cleanup dephash of finished implicit task 795 __kmp_dephash_free_entries(thread, taskdata->td_dephash); 796 } 797 } 798 } 799 return; 800 } 801 // Predecrement simulated by "- 1" calculation 802 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1; 803 KMP_DEBUG_ASSERT(children >= 0); 804 } 805 806 KA_TRACE( 807 20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; " 808 "not freeing it yet\n", 809 gtid, taskdata, children)); 810 } 811 812 // __kmp_task_finish: bookkeeping to do when a task finishes execution 813 // 814 // gtid: global thread ID for calling thread 815 // task: task to be finished 816 // resumed_task: task to be resumed. (may be NULL if task is serialized) 817 // 818 // template<ompt>: effectively ompt_enabled.enabled!=0 819 // the version with ompt=false is inlined, allowing to optimize away all ompt 820 // code in this case 821 template <bool ompt> 822 static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, 823 kmp_taskdata_t *resumed_task) { 824 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 825 kmp_info_t *thread = __kmp_threads[gtid]; 826 kmp_task_team_t *task_team = 827 thread->th.th_task_team; // might be NULL for serial teams... 
828 #if KMP_DEBUG 829 kmp_int32 children = 0; 830 #endif 831 KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming " 832 "task %p\n", 833 gtid, taskdata, resumed_task)); 834 835 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT); 836 837 // Pop task from stack if tied 838 #ifdef BUILD_TIED_TASK_STACK 839 if (taskdata->td_flags.tiedness == TASK_TIED) { 840 __kmp_pop_task_stack(gtid, thread, taskdata); 841 } 842 #endif /* BUILD_TIED_TASK_STACK */ 843 844 if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) { 845 // untied task needs to check the counter so that the task structure is not 846 // freed prematurely 847 kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1; 848 KA_TRACE( 849 20, 850 ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n", 851 gtid, counter, taskdata)); 852 if (counter > 0) { 853 // untied task is not done, to be continued possibly by other thread, do 854 // not free it now 855 if (resumed_task == NULL) { 856 KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial); 857 resumed_task = taskdata->td_parent; // In a serialized task, the resumed 858 // task is the parent 859 } 860 thread->th.th_current_task = resumed_task; // restore current_task 861 resumed_task->td_flags.executing = 1; // resume previous task 862 KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, " 863 "resuming task %p\n", 864 gtid, taskdata, resumed_task)); 865 return; 866 } 867 } 868 869 // bookkeeping for resuming task: 870 // GEH - note tasking_ser => task_serial 871 KMP_DEBUG_ASSERT( 872 (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) == 873 taskdata->td_flags.task_serial); 874 if (taskdata->td_flags.task_serial) { 875 if (resumed_task == NULL) { 876 resumed_task = taskdata->td_parent; // In a serialized task, the resumed 877 // task is the parent 878 } 879 } else { 880 KMP_DEBUG_ASSERT(resumed_task != 881 NULL); // verify that resumed task is passed as argument 882 } 883 
884 /* If the tasks' destructor thunk flag has been set, we need to invoke the 885 destructor thunk that has been generated by the compiler. The code is 886 placed here, since at this point other tasks might have been released 887 hence overlapping the destructor invocations with some other work in the 888 released tasks. The OpenMP spec is not specific on when the destructors 889 are invoked, so we should be free to choose. */ 890 if (UNLIKELY(taskdata->td_flags.destructors_thunk)) { 891 kmp_routine_entry_t destr_thunk = task->data1.destructors; 892 KMP_ASSERT(destr_thunk); 893 destr_thunk(gtid, task); 894 } 895 896 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0); 897 KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1); 898 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0); 899 900 bool detach = false; 901 if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) { 902 if (taskdata->td_allow_completion_event.type == 903 KMP_EVENT_ALLOW_COMPLETION) { 904 // event hasn't been fulfilled yet. Try to detach task. 905 __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid); 906 if (taskdata->td_allow_completion_event.type == 907 KMP_EVENT_ALLOW_COMPLETION) { 908 // task finished execution 909 KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1); 910 taskdata->td_flags.executing = 0; // suspend the finishing task 911 912 #if OMPT_SUPPORT 913 // For a detached task, which is not completed, we switch back 914 // the omp_fulfill_event signals completion 915 // locking is necessary to avoid a race with ompt_task_late_fulfill 916 if (ompt) 917 __ompt_task_finish(task, resumed_task, ompt_task_detach); 918 #endif 919 920 // no access to taskdata after this point! 921 // __kmp_fulfill_event might free taskdata at any time from now 922 923 taskdata->td_flags.proxy = TASK_PROXY; // proxify! 
924 detach = true; 925 } 926 __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid); 927 } 928 } 929 930 if (!detach) { 931 taskdata->td_flags.complete = 1; // mark the task as completed 932 933 #if OMPT_SUPPORT 934 // This is not a detached task, we are done here 935 if (ompt) 936 __ompt_task_finish(task, resumed_task, ompt_task_complete); 937 #endif 938 939 // Only need to keep track of count if team parallel and tasking not 940 // serialized, or task is detachable and event has already been fulfilled 941 if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) || 942 taskdata->td_flags.detachable == TASK_DETACHABLE || 943 taskdata->td_flags.hidden_helper) { 944 __kmp_release_deps(gtid, taskdata); 945 // Predecrement simulated by "- 1" calculation 946 #if KMP_DEBUG 947 children = -1 + 948 #endif 949 KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks); 950 KMP_DEBUG_ASSERT(children >= 0); 951 if (taskdata->td_taskgroup) 952 KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count); 953 } else if (task_team && (task_team->tt.tt_found_proxy_tasks || 954 task_team->tt.tt_hidden_helper_task_encountered)) { 955 // if we found proxy or hidden helper tasks there could exist a dependency 956 // chain with the proxy task as origin 957 __kmp_release_deps(gtid, taskdata); 958 } 959 // td_flags.executing must be marked as 0 after __kmp_release_deps has been 960 // called. Othertwise, if a task is executed immediately from the 961 // release_deps code, the flag will be reset to 1 again by this same 962 // function 963 KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1); 964 taskdata->td_flags.executing = 0; // suspend the finishing task 965 } 966 967 KA_TRACE( 968 20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n", 969 gtid, taskdata, children)); 970 971 // Free this task and then ancestor tasks if they have no children. 
972 // Restore th_current_task first as suggested by John: 973 // johnmc: if an asynchronous inquiry peers into the runtime system 974 // it doesn't see the freed task as the current task. 975 thread->th.th_current_task = resumed_task; 976 if (!detach) 977 __kmp_free_task_and_ancestors(gtid, taskdata, thread); 978 979 // TODO: GEH - make sure root team implicit task is initialized properly. 980 // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 ); 981 resumed_task->td_flags.executing = 1; // resume previous task 982 983 KA_TRACE( 984 10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n", 985 gtid, taskdata, resumed_task)); 986 987 return; 988 } 989 990 template <bool ompt> 991 static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref, 992 kmp_int32 gtid, 993 kmp_task_t *task) { 994 KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n", 995 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task))); 996 KMP_DEBUG_ASSERT(gtid >= 0); 997 // this routine will provide task to resume 998 __kmp_task_finish<ompt>(gtid, task, NULL); 999 1000 KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n", 1001 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task))); 1002 1003 #if OMPT_SUPPORT 1004 if (ompt) { 1005 ompt_frame_t *ompt_frame; 1006 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 1007 ompt_frame->enter_frame = ompt_data_none; 1008 ompt_frame->enter_frame_flags = 1009 ompt_frame_runtime | ompt_frame_framepointer; 1010 } 1011 #endif 1012 1013 return; 1014 } 1015 1016 #if OMPT_SUPPORT 1017 OMPT_NOINLINE 1018 void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid, 1019 kmp_task_t *task) { 1020 __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task); 1021 } 1022 #endif // OMPT_SUPPORT 1023 1024 // __kmpc_omp_task_complete_if0: report that a task has completed execution 1025 // 1026 // loc_ref: source location information; points to end of task block. 
// gtid: global thread number.
// task: task thunk for the completed task.
//
// Dispatches to the OMPT-instrumented variant when ompt_enabled.enabled is
// set, otherwise to the template instantiated with ompt=false.
void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *task) {
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
    return;
  }
#endif
  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
}

#ifdef TASK_UNUSED
// __kmpc_omp_task_complete: report that a task has completed execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
//
// loc_ref: source location information (only used in trace output)
// gtid: global thread number.
// task: task thunk for the completed task.
void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
                              kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));

  // Always uses the non-OMPT instantiation.
  __kmp_task_finish<false>(gtid, task,
                           NULL); // Not sure how to find task to resume

  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED

// __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
// task for a given thread
//
// loc_ref:  reference to source location of parallel region
// this_thr:  thread data structure corresponding to implicit task
// team: team for this_thr
// tid: thread id of given thread within team
// set_curr_task: TRUE if need to push current task to thread
// NOTE: Routine does not set up the implicit task ICVS.  This is assumed to
// have already been done elsewhere.
// TODO: Get better loc_ref.  Value passed in may be NULL
void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
                              kmp_team_t *team, int tid, int set_curr_task) {
  // The implicit task storage lives in the team, indexed by tid.
  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];

  // NOTE(review): the trace labels the value "reinit" but prints TRUE when
  // set_curr_task is non-zero, which the code below treats as the first-time
  // initialization path -- confirm the intended meaning of the label.
  KF_TRACE(
      10,
      ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
       tid, team, task, set_curr_task ? "TRUE" : "FALSE"));

  task->td_task_id = KMP_GEN_TASK_ID();
  task->td_team = team;
  //    task->td_parent   = NULL;  // fix for CQ230101 (broken parent task info
  //    in debugger)
  task->td_ident = loc_ref;
  task->td_taskwait_ident = NULL;
  task->td_taskwait_counter = 0;
  task->td_taskwait_thread = 0;

  task->td_flags.tiedness = TASK_TIED;
  task->td_flags.tasktype = TASK_IMPLICIT;
  task->td_flags.proxy = TASK_FULL;

  // All implicit tasks are executed immediately, not deferred
  task->td_flags.task_serial = 1;
  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  // An implicit task is marked as already started and executing.
  task->td_flags.started = 1;
  task->td_flags.executing = 1;
  task->td_flags.complete = 0;
  task->td_flags.freed = 0;

  task->td_depnode = NULL;
  task->td_last_tied = task;
  task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;

  if (set_curr_task) { // only do this init first time thread is created
    KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
    // Not used: don't need to deallocate implicit task
    KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
    task->td_taskgroup = NULL; // An implicit task does not have taskgroup
    task->td_dephash = NULL;
    __kmp_push_current_task_to_thread(this_thr, team, tid);
  } else {
    // On later calls the counters are expected to be zero already; they are
    // only checked, not reset.
    KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
    KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
  }

#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(task, tid);
#endif

  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
                team, task));
}

// __kmp_finish_implicit_task: Release resources associated to implicit tasks
// at the end of parallel regions. Some resources are kept for reuse in the next
// parallel region.
1128 // 1129 // thread: thread data structure corresponding to implicit task 1130 void __kmp_finish_implicit_task(kmp_info_t *thread) { 1131 kmp_taskdata_t *task = thread->th.th_current_task; 1132 if (task->td_dephash) { 1133 int children; 1134 task->td_flags.complete = 1; 1135 children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks); 1136 kmp_tasking_flags_t flags_old = task->td_flags; 1137 if (children == 0 && flags_old.complete == 1) { 1138 kmp_tasking_flags_t flags_new = flags_old; 1139 flags_new.complete = 0; 1140 if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags), 1141 *RCAST(kmp_int32 *, &flags_old), 1142 *RCAST(kmp_int32 *, &flags_new))) { 1143 KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans " 1144 "dephash of implicit task %p\n", 1145 thread->th.th_info.ds.ds_gtid, task)); 1146 __kmp_dephash_free_entries(thread, task->td_dephash); 1147 } 1148 } 1149 } 1150 } 1151 1152 // __kmp_free_implicit_task: Release resources associated to implicit tasks 1153 // when these are destroyed regions 1154 // 1155 // thread: thread data structure corresponding to implicit task 1156 void __kmp_free_implicit_task(kmp_info_t *thread) { 1157 kmp_taskdata_t *task = thread->th.th_current_task; 1158 if (task && task->td_dephash) { 1159 __kmp_dephash_free(thread, task->td_dephash); 1160 task->td_dephash = NULL; 1161 } 1162 } 1163 1164 // Round up a size to a power of two specified by val: Used to insert padding 1165 // between structures co-allocated using a single malloc() call 1166 static size_t __kmp_round_up_to_val(size_t size, size_t val) { 1167 if (size & (val - 1)) { 1168 size &= ~(val - 1); 1169 if (size <= KMP_SIZE_T_MAX - val) { 1170 size += val; // Round up if there is no overflow. 1171 } 1172 } 1173 return size; 1174 } // __kmp_round_up_to_va 1175 1176 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task 1177 // 1178 // loc_ref: source location information 1179 // gtid: global thread number. 
1180 // flags: include tiedness & task type (explicit vs. implicit) of the ''new'' 1181 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine. 1182 // sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including 1183 // private vars accessed in task. 1184 // sizeof_shareds: Size in bytes of array of pointers to shared vars accessed 1185 // in task. 1186 // task_entry: Pointer to task code entry point generated by compiler. 1187 // returns: a pointer to the allocated kmp_task_t structure (task). 1188 kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, 1189 kmp_tasking_flags_t *flags, 1190 size_t sizeof_kmp_task_t, size_t sizeof_shareds, 1191 kmp_routine_entry_t task_entry) { 1192 kmp_task_t *task; 1193 kmp_taskdata_t *taskdata; 1194 kmp_info_t *thread = __kmp_threads[gtid]; 1195 kmp_info_t *encountering_thread = thread; 1196 kmp_team_t *team = thread->th.th_team; 1197 kmp_taskdata_t *parent_task = thread->th.th_current_task; 1198 size_t shareds_offset; 1199 1200 if (UNLIKELY(!TCR_4(__kmp_init_middle))) 1201 __kmp_middle_initialize(); 1202 1203 if (flags->hidden_helper) { 1204 if (__kmp_enable_hidden_helper) { 1205 if (!TCR_4(__kmp_init_hidden_helper)) 1206 __kmp_hidden_helper_initialize(); 1207 1208 // For a hidden helper task encountered by a regular thread, we will push 1209 // the task to the (gtid%__kmp_hidden_helper_threads_num)-th hidden helper 1210 // thread. 1211 if (!KMP_HIDDEN_HELPER_THREAD(gtid)) { 1212 thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)]; 1213 // We don't change the parent-child relation for hidden helper task as 1214 // we need that to do per-task-region synchronization. 1215 } 1216 } else { 1217 // If the hidden helper task is not enabled, reset the flag to FALSE. 
1218 flags->hidden_helper = FALSE; 1219 } 1220 } 1221 1222 KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) " 1223 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n", 1224 gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t, 1225 sizeof_shareds, task_entry)); 1226 1227 KMP_DEBUG_ASSERT(parent_task); 1228 if (parent_task->td_flags.final) { 1229 if (flags->merged_if0) { 1230 } 1231 flags->final = 1; 1232 } 1233 1234 if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) { 1235 // Untied task encountered causes the TSC algorithm to check entire deque of 1236 // the victim thread. If no untied task encountered, then checking the head 1237 // of the deque should be enough. 1238 KMP_CHECK_UPDATE( 1239 encountering_thread->th.th_task_team->tt.tt_untied_task_encountered, 1); 1240 } 1241 1242 // Detachable tasks are not proxy tasks yet but could be in the future. Doing 1243 // the tasking setup 1244 // when that happens is too late. 1245 if (UNLIKELY(flags->proxy == TASK_PROXY || 1246 flags->detachable == TASK_DETACHABLE || flags->hidden_helper)) { 1247 if (flags->proxy == TASK_PROXY) { 1248 flags->tiedness = TASK_UNTIED; 1249 flags->merged_if0 = 1; 1250 } 1251 /* are we running in a sequential parallel or tskm_immediate_exec... 
we need 1252 tasking support enabled */ 1253 if ((encountering_thread->th.th_task_team) == NULL) { 1254 /* This should only happen if the team is serialized 1255 setup a task team and propagate it to the thread */ 1256 KMP_DEBUG_ASSERT(team->t.t_serialized); 1257 KA_TRACE(30, 1258 ("T#%d creating task team in __kmp_task_alloc for proxy task\n", 1259 gtid)); 1260 __kmp_task_team_setup( 1261 encountering_thread, team, 1262 1); // 1 indicates setup the current team regardless of nthreads 1263 encountering_thread->th.th_task_team = 1264 team->t.t_task_team[encountering_thread->th.th_task_state]; 1265 } 1266 kmp_task_team_t *task_team = encountering_thread->th.th_task_team; 1267 1268 /* tasking must be enabled now as the task might not be pushed */ 1269 if (!KMP_TASKING_ENABLED(task_team)) { 1270 KA_TRACE( 1271 30, 1272 ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid)); 1273 __kmp_enable_tasking(task_team, encountering_thread); 1274 kmp_int32 tid = encountering_thread->th.th_info.ds.ds_tid; 1275 kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid]; 1276 // No lock needed since only owner can allocate 1277 if (thread_data->td.td_deque == NULL) { 1278 __kmp_alloc_task_deque(encountering_thread, thread_data); 1279 } 1280 } 1281 1282 if ((flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) && 1283 task_team->tt.tt_found_proxy_tasks == FALSE) 1284 TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE); 1285 if (flags->hidden_helper && 1286 task_team->tt.tt_hidden_helper_task_encountered == FALSE) 1287 TCW_4(task_team->tt.tt_hidden_helper_task_encountered, TRUE); 1288 } 1289 1290 // Calculate shared structure offset including padding after kmp_task_t struct 1291 // to align pointers in shared struct 1292 shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t; 1293 shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *)); 1294 1295 // Allocate a kmp_taskdata_t block and a kmp_task_t block. 
1296 KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid, 1297 shareds_offset)); 1298 KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid, 1299 sizeof_shareds)); 1300 1301 // Avoid double allocation here by combining shareds with taskdata 1302 #if USE_FAST_MEMORY 1303 taskdata = (kmp_taskdata_t *)__kmp_fast_allocate( 1304 encountering_thread, shareds_offset + sizeof_shareds); 1305 #else /* ! USE_FAST_MEMORY */ 1306 taskdata = (kmp_taskdata_t *)__kmp_thread_malloc( 1307 encountering_thread, shareds_offset + sizeof_shareds); 1308 #endif /* USE_FAST_MEMORY */ 1309 1310 task = KMP_TASKDATA_TO_TASK(taskdata); 1311 1312 // Make sure task & taskdata are aligned appropriately 1313 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD 1314 KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0); 1315 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0); 1316 #else 1317 KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0); 1318 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0); 1319 #endif 1320 if (sizeof_shareds > 0) { 1321 // Avoid double allocation here by combining shareds with taskdata 1322 task->shareds = &((char *)taskdata)[shareds_offset]; 1323 // Make sure shareds struct is aligned to pointer size 1324 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) == 1325 0); 1326 } else { 1327 task->shareds = NULL; 1328 } 1329 task->routine = task_entry; 1330 task->part_id = 0; // AC: Always start with 0 part id 1331 1332 taskdata->td_task_id = KMP_GEN_TASK_ID(); 1333 taskdata->td_team = thread->th.th_team; 1334 taskdata->td_alloc_thread = encountering_thread; 1335 taskdata->td_parent = parent_task; 1336 taskdata->td_level = parent_task->td_level + 1; // increment nesting level 1337 KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0); 1338 taskdata->td_ident = loc_ref; 1339 taskdata->td_taskwait_ident = NULL; 1340 taskdata->td_taskwait_counter = 0; 
1341 taskdata->td_taskwait_thread = 0; 1342 KMP_DEBUG_ASSERT(taskdata->td_parent != NULL); 1343 // avoid copying icvs for proxy tasks 1344 if (flags->proxy == TASK_FULL) 1345 copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs); 1346 1347 taskdata->td_flags = *flags; 1348 taskdata->encountering_gtid = gtid; 1349 taskdata->td_task_team = thread->th.th_task_team; 1350 taskdata->td_size_alloc = shareds_offset + sizeof_shareds; 1351 taskdata->td_flags.tasktype = TASK_EXPLICIT; 1352 1353 // GEH - TODO: fix this to copy parent task's value of tasking_ser flag 1354 taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec); 1355 1356 // GEH - TODO: fix this to copy parent task's value of team_serial flag 1357 taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0; 1358 1359 // GEH - Note we serialize the task if the team is serialized to make sure 1360 // implicit parallel region tasks are not left until program termination to 1361 // execute. Also, it helps locality to execute immediately. 
1362 1363 taskdata->td_flags.task_serial = 1364 (parent_task->td_flags.final || taskdata->td_flags.team_serial || 1365 taskdata->td_flags.tasking_ser || flags->merged_if0); 1366 1367 taskdata->td_flags.started = 0; 1368 taskdata->td_flags.executing = 0; 1369 taskdata->td_flags.complete = 0; 1370 taskdata->td_flags.freed = 0; 1371 1372 KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0); 1373 // start at one because counts current task and children 1374 KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1); 1375 taskdata->td_taskgroup = 1376 parent_task->td_taskgroup; // task inherits taskgroup from the parent task 1377 taskdata->td_dephash = NULL; 1378 taskdata->td_depnode = NULL; 1379 if (flags->tiedness == TASK_UNTIED) 1380 taskdata->td_last_tied = NULL; // will be set when the task is scheduled 1381 else 1382 taskdata->td_last_tied = taskdata; 1383 taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED; 1384 #if OMPT_SUPPORT 1385 if (UNLIKELY(ompt_enabled.enabled)) 1386 __ompt_task_init(taskdata, gtid); 1387 #endif 1388 // Only need to keep track of child task counts if team parallel and tasking 1389 // not serialized or if it is a proxy or detachable or hidden helper task 1390 if (flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE || 1391 flags->hidden_helper || 1392 !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) { 1393 KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks); 1394 if (parent_task->td_taskgroup) 1395 KMP_ATOMIC_INC(&parent_task->td_taskgroup->count); 1396 // Only need to keep track of allocated child tasks for explicit tasks since 1397 // implicit not deallocated 1398 if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) { 1399 KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks); 1400 } 1401 if (flags->hidden_helper) { 1402 taskdata->td_flags.task_serial = FALSE; 1403 // Increment the number of hidden helper tasks to be executed 1404 
KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks); 1405 } 1406 } 1407 1408 KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n", 1409 gtid, taskdata, taskdata->td_parent)); 1410 1411 return task; 1412 } 1413 1414 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, 1415 kmp_int32 flags, size_t sizeof_kmp_task_t, 1416 size_t sizeof_shareds, 1417 kmp_routine_entry_t task_entry) { 1418 kmp_task_t *retval; 1419 kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags; 1420 __kmp_assert_valid_gtid(gtid); 1421 input_flags->native = FALSE; 1422 // __kmp_task_alloc() sets up all other runtime flags 1423 KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) " 1424 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n", 1425 gtid, loc_ref, input_flags->tiedness ? "tied " : "untied", 1426 input_flags->proxy ? "proxy" : "", 1427 input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t, 1428 sizeof_shareds, task_entry)); 1429 1430 retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t, 1431 sizeof_shareds, task_entry); 1432 1433 KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval)); 1434 1435 return retval; 1436 } 1437 1438 kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid, 1439 kmp_int32 flags, 1440 size_t sizeof_kmp_task_t, 1441 size_t sizeof_shareds, 1442 kmp_routine_entry_t task_entry, 1443 kmp_int64 device_id) { 1444 if (__kmp_enable_hidden_helper) { 1445 auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags); 1446 input_flags.hidden_helper = TRUE; 1447 } 1448 1449 return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t, 1450 sizeof_shareds, task_entry); 1451 } 1452 1453 /*! 
1454 @ingroup TASKING 1455 @param loc_ref location of the original task directive 1456 @param gtid Global Thread ID of encountering thread 1457 @param new_task task thunk allocated by __kmpc_omp_task_alloc() for the ''new 1458 task'' 1459 @param naffins Number of affinity items 1460 @param affin_list List of affinity items 1461 @return Returns non-zero if registering affinity information was not successful. 1462 Returns 0 if registration was successful 1463 This entry registers the affinity information attached to a task with the task 1464 thunk structure kmp_taskdata_t. 1465 */ 1466 kmp_int32 1467 __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid, 1468 kmp_task_t *new_task, kmp_int32 naffins, 1469 kmp_task_affinity_info_t *affin_list) { 1470 return 0; 1471 } 1472 1473 // __kmp_invoke_task: invoke the specified task 1474 // 1475 // gtid: global thread ID of caller 1476 // task: the task to invoke 1477 // current_task: the task to resume after task invocation 1478 static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task, 1479 kmp_taskdata_t *current_task) { 1480 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 1481 kmp_info_t *thread; 1482 int discard = 0 /* false */; 1483 KA_TRACE( 1484 30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n", 1485 gtid, taskdata, current_task)); 1486 KMP_DEBUG_ASSERT(task); 1487 if (UNLIKELY(taskdata->td_flags.proxy == TASK_PROXY && 1488 taskdata->td_flags.complete == 1)) { 1489 // This is a proxy task that was already completed but it needs to run 1490 // its bottom-half finish 1491 KA_TRACE( 1492 30, 1493 ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n", 1494 gtid, taskdata)); 1495 1496 __kmp_bottom_half_finish_proxy(gtid, task); 1497 1498 KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for " 1499 "proxy task %p, resuming task %p\n", 1500 gtid, taskdata, current_task)); 1501 1502 return; 1503 } 1504 1505 #if OMPT_SUPPORT 1506 // For untied 
tasks, the first task executed only calls __kmpc_omp_task and 1507 // does not execute code. 1508 ompt_thread_info_t oldInfo; 1509 if (UNLIKELY(ompt_enabled.enabled)) { 1510 // Store the threads states and restore them after the task 1511 thread = __kmp_threads[gtid]; 1512 oldInfo = thread->th.ompt_thread_info; 1513 thread->th.ompt_thread_info.wait_id = 0; 1514 thread->th.ompt_thread_info.state = (thread->th.th_team_serialized) 1515 ? ompt_state_work_serial 1516 : ompt_state_work_parallel; 1517 taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1518 } 1519 #endif 1520 1521 // Decreament the counter of hidden helper tasks to be executed 1522 if (taskdata->td_flags.hidden_helper) { 1523 // Hidden helper tasks can only be executed by hidden helper threads 1524 KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid)); 1525 KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks); 1526 } 1527 1528 // Proxy tasks are not handled by the runtime 1529 if (taskdata->td_flags.proxy != TASK_PROXY) { 1530 __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded 1531 } 1532 1533 // TODO: cancel tasks if the parallel region has also been cancelled 1534 // TODO: check if this sequence can be hoisted above __kmp_task_start 1535 // if cancellation has been enabled for this run ... 1536 if (UNLIKELY(__kmp_omp_cancellation)) { 1537 thread = __kmp_threads[gtid]; 1538 kmp_team_t *this_team = thread->th.th_team; 1539 kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup; 1540 if ((taskgroup && taskgroup->cancel_request) || 1541 (this_team->t.t_cancel_request == cancel_parallel)) { 1542 #if OMPT_SUPPORT && OMPT_OPTIONAL 1543 ompt_data_t *task_data; 1544 if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) { 1545 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL); 1546 ompt_callbacks.ompt_callback(ompt_callback_cancel)( 1547 task_data, 1548 ((taskgroup && taskgroup->cancel_request) ? 
ompt_cancel_taskgroup 1549 : ompt_cancel_parallel) | 1550 ompt_cancel_discarded_task, 1551 NULL); 1552 } 1553 #endif 1554 KMP_COUNT_BLOCK(TASK_cancelled); 1555 // this task belongs to a task group and we need to cancel it 1556 discard = 1 /* true */; 1557 } 1558 } 1559 1560 // Invoke the task routine and pass in relevant data. 1561 // Thunks generated by gcc take a different argument list. 1562 if (!discard) { 1563 if (taskdata->td_flags.tiedness == TASK_UNTIED) { 1564 taskdata->td_last_tied = current_task->td_last_tied; 1565 KMP_DEBUG_ASSERT(taskdata->td_last_tied); 1566 } 1567 #if KMP_STATS_ENABLED 1568 KMP_COUNT_BLOCK(TASK_executed); 1569 switch (KMP_GET_THREAD_STATE()) { 1570 case FORK_JOIN_BARRIER: 1571 KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar); 1572 break; 1573 case PLAIN_BARRIER: 1574 KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar); 1575 break; 1576 case TASKYIELD: 1577 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield); 1578 break; 1579 case TASKWAIT: 1580 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait); 1581 break; 1582 case TASKGROUP: 1583 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup); 1584 break; 1585 default: 1586 KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate); 1587 break; 1588 } 1589 #endif // KMP_STATS_ENABLED 1590 1591 // OMPT task begin 1592 #if OMPT_SUPPORT 1593 if (UNLIKELY(ompt_enabled.enabled)) 1594 __ompt_task_start(task, current_task, gtid); 1595 #endif 1596 1597 #if OMPD_SUPPORT 1598 if (ompd_state & OMPD_ENABLE_BP) 1599 ompd_bp_task_begin(); 1600 #endif 1601 1602 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1603 kmp_uint64 cur_time; 1604 kmp_int32 kmp_itt_count_task = 1605 __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial && 1606 current_task->td_flags.tasktype == TASK_IMPLICIT; 1607 if (kmp_itt_count_task) { 1608 thread = __kmp_threads[gtid]; 1609 // Time outer level explicit task on barrier for adjusting imbalance time 1610 if (thread->th.th_bar_arrive_time) 1611 cur_time = __itt_get_timestamp(); 1612 else 1613 kmp_itt_count_task = 
0; // thread is not on a barrier - skip timing 1614 } 1615 KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task) 1616 #endif 1617 1618 #ifdef KMP_GOMP_COMPAT 1619 if (taskdata->td_flags.native) { 1620 ((void (*)(void *))(*(task->routine)))(task->shareds); 1621 } else 1622 #endif /* KMP_GOMP_COMPAT */ 1623 { 1624 (*(task->routine))(gtid, task); 1625 } 1626 KMP_POP_PARTITIONED_TIMER(); 1627 1628 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1629 if (kmp_itt_count_task) { 1630 // Barrier imbalance - adjust arrive time with the task duration 1631 thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time); 1632 } 1633 KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed) 1634 KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent 1635 #endif 1636 } 1637 1638 #if OMPD_SUPPORT 1639 if (ompd_state & OMPD_ENABLE_BP) 1640 ompd_bp_task_end(); 1641 #endif 1642 1643 // Proxy tasks are not handled by the runtime 1644 if (taskdata->td_flags.proxy != TASK_PROXY) { 1645 #if OMPT_SUPPORT 1646 if (UNLIKELY(ompt_enabled.enabled)) { 1647 thread->th.ompt_thread_info = oldInfo; 1648 if (taskdata->td_flags.tiedness == TASK_TIED) { 1649 taskdata->ompt_task_info.frame.exit_frame = ompt_data_none; 1650 } 1651 __kmp_task_finish<true>(gtid, task, current_task); 1652 } else 1653 #endif 1654 __kmp_task_finish<false>(gtid, task, current_task); 1655 } 1656 1657 KA_TRACE( 1658 30, 1659 ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n", 1660 gtid, taskdata, current_task)); 1661 return; 1662 } 1663 1664 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution 1665 // 1666 // loc_ref: location of original task pragma (ignored) 1667 // gtid: Global Thread ID of encountering thread 1668 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task'' 1669 // Returns: 1670 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1671 // be resumed later. 
1672 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1673 // resumed later. 1674 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid, 1675 kmp_task_t *new_task) { 1676 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1677 1678 KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid, 1679 loc_ref, new_taskdata)); 1680 1681 #if OMPT_SUPPORT 1682 kmp_taskdata_t *parent; 1683 if (UNLIKELY(ompt_enabled.enabled)) { 1684 parent = new_taskdata->td_parent; 1685 if (ompt_enabled.ompt_callback_task_create) { 1686 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 1687 &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame), 1688 &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0, 1689 OMPT_GET_RETURN_ADDRESS(0)); 1690 } 1691 } 1692 #endif 1693 1694 /* Should we execute the new task or queue it? For now, let's just always try 1695 to queue it. If the queue fills up, then we'll execute it. */ 1696 1697 if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer 1698 { // Execute this task immediately 1699 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 1700 new_taskdata->td_flags.task_serial = 1; 1701 __kmp_invoke_task(gtid, new_task, current_task); 1702 } 1703 1704 KA_TRACE( 1705 10, 1706 ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: " 1707 "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n", 1708 gtid, loc_ref, new_taskdata)); 1709 1710 #if OMPT_SUPPORT 1711 if (UNLIKELY(ompt_enabled.enabled)) { 1712 parent->ompt_task_info.frame.enter_frame = ompt_data_none; 1713 } 1714 #endif 1715 return TASK_CURRENT_NOT_QUEUED; 1716 } 1717 1718 // __kmp_omp_task: Schedule a non-thread-switchable task for execution 1719 // 1720 // gtid: Global Thread ID of encountering thread 1721 // new_task:non-thread-switchable task thunk allocated by __kmp_omp_task_alloc() 1722 // serialize_immediate: if TRUE then if the task is executed 
immediately its 1723 // execution will be serialized 1724 // Returns: 1725 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1726 // be resumed later. 1727 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1728 // resumed later. 1729 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task, 1730 bool serialize_immediate) { 1731 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1732 1733 /* Should we execute the new task or queue it? For now, let's just always try 1734 to queue it. If the queue fills up, then we'll execute it. */ 1735 if (new_taskdata->td_flags.proxy == TASK_PROXY || 1736 __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer 1737 { // Execute this task immediately 1738 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 1739 if (serialize_immediate) 1740 new_taskdata->td_flags.task_serial = 1; 1741 __kmp_invoke_task(gtid, new_task, current_task); 1742 } 1743 1744 return TASK_CURRENT_NOT_QUEUED; 1745 } 1746 1747 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a 1748 // non-thread-switchable task from the parent thread only! 1749 // 1750 // loc_ref: location of original task pragma (ignored) 1751 // gtid: Global Thread ID of encountering thread 1752 // new_task: non-thread-switchable task thunk allocated by 1753 // __kmp_omp_task_alloc() 1754 // Returns: 1755 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1756 // be resumed later. 1757 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1758 // resumed later. 
1759 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid, 1760 kmp_task_t *new_task) { 1761 kmp_int32 res; 1762 KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK); 1763 1764 #if KMP_DEBUG || OMPT_SUPPORT 1765 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1766 #endif 1767 KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref, 1768 new_taskdata)); 1769 __kmp_assert_valid_gtid(gtid); 1770 1771 #if OMPT_SUPPORT 1772 kmp_taskdata_t *parent = NULL; 1773 if (UNLIKELY(ompt_enabled.enabled)) { 1774 if (!new_taskdata->td_flags.started) { 1775 OMPT_STORE_RETURN_ADDRESS(gtid); 1776 parent = new_taskdata->td_parent; 1777 if (!parent->ompt_task_info.frame.enter_frame.ptr) { 1778 parent->ompt_task_info.frame.enter_frame.ptr = 1779 OMPT_GET_FRAME_ADDRESS(0); 1780 } 1781 if (ompt_enabled.ompt_callback_task_create) { 1782 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 1783 &(parent->ompt_task_info.task_data), 1784 &(parent->ompt_task_info.frame), 1785 &(new_taskdata->ompt_task_info.task_data), 1786 ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, 1787 OMPT_LOAD_RETURN_ADDRESS(gtid)); 1788 } 1789 } else { 1790 // We are scheduling the continuation of an UNTIED task. 1791 // Scheduling back to the parent task. 
1792 __ompt_task_finish(new_task, 1793 new_taskdata->ompt_task_info.scheduling_parent, 1794 ompt_task_switch); 1795 new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none; 1796 } 1797 } 1798 #endif 1799 1800 res = __kmp_omp_task(gtid, new_task, true); 1801 1802 KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning " 1803 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n", 1804 gtid, loc_ref, new_taskdata)); 1805 #if OMPT_SUPPORT 1806 if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) { 1807 parent->ompt_task_info.frame.enter_frame = ompt_data_none; 1808 } 1809 #endif 1810 return res; 1811 } 1812 1813 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule 1814 // a taskloop task with the correct OMPT return address 1815 // 1816 // loc_ref: location of original task pragma (ignored) 1817 // gtid: Global Thread ID of encountering thread 1818 // new_task: non-thread-switchable task thunk allocated by 1819 // __kmp_omp_task_alloc() 1820 // codeptr_ra: return address for OMPT callback 1821 // Returns: 1822 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1823 // be resumed later. 1824 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1825 // resumed later. 
1826 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid, 1827 kmp_task_t *new_task, void *codeptr_ra) { 1828 kmp_int32 res; 1829 KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK); 1830 1831 #if KMP_DEBUG || OMPT_SUPPORT 1832 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1833 #endif 1834 KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref, 1835 new_taskdata)); 1836 1837 #if OMPT_SUPPORT 1838 kmp_taskdata_t *parent = NULL; 1839 if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) { 1840 parent = new_taskdata->td_parent; 1841 if (!parent->ompt_task_info.frame.enter_frame.ptr) 1842 parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1843 if (ompt_enabled.ompt_callback_task_create) { 1844 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 1845 &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame), 1846 &(new_taskdata->ompt_task_info.task_data), 1847 ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, 1848 codeptr_ra); 1849 } 1850 } 1851 #endif 1852 1853 res = __kmp_omp_task(gtid, new_task, true); 1854 1855 KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning " 1856 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n", 1857 gtid, loc_ref, new_taskdata)); 1858 #if OMPT_SUPPORT 1859 if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) { 1860 parent->ompt_task_info.frame.enter_frame = ompt_data_none; 1861 } 1862 #endif 1863 return res; 1864 } 1865 1866 template <bool ompt> 1867 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid, 1868 void *frame_address, 1869 void *return_address) { 1870 kmp_taskdata_t *taskdata = nullptr; 1871 kmp_info_t *thread; 1872 int thread_finished = FALSE; 1873 KMP_SET_THREAD_STATE_BLOCK(TASKWAIT); 1874 1875 KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref)); 1876 KMP_DEBUG_ASSERT(gtid >= 0); 1877 1878 if (__kmp_tasking_mode != tskm_immediate_exec) { 1879 thread = __kmp_threads[gtid]; 
1880 taskdata = thread->th.th_current_task; 1881 1882 #if OMPT_SUPPORT && OMPT_OPTIONAL 1883 ompt_data_t *my_task_data; 1884 ompt_data_t *my_parallel_data; 1885 1886 if (ompt) { 1887 my_task_data = &(taskdata->ompt_task_info.task_data); 1888 my_parallel_data = OMPT_CUR_TEAM_DATA(thread); 1889 1890 taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address; 1891 1892 if (ompt_enabled.ompt_callback_sync_region) { 1893 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 1894 ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data, 1895 my_task_data, return_address); 1896 } 1897 1898 if (ompt_enabled.ompt_callback_sync_region_wait) { 1899 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 1900 ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data, 1901 my_task_data, return_address); 1902 } 1903 } 1904 #endif // OMPT_SUPPORT && OMPT_OPTIONAL 1905 1906 // Debugger: The taskwait is active. Store location and thread encountered the 1907 // taskwait. 1908 #if USE_ITT_BUILD 1909 // Note: These values are used by ITT events as well. 1910 #endif /* USE_ITT_BUILD */ 1911 taskdata->td_taskwait_counter += 1; 1912 taskdata->td_taskwait_ident = loc_ref; 1913 taskdata->td_taskwait_thread = gtid + 1; 1914 1915 #if USE_ITT_BUILD 1916 void *itt_sync_obj = NULL; 1917 #if USE_ITT_NOTIFY 1918 KMP_ITT_TASKWAIT_STARTING(itt_sync_obj); 1919 #endif /* USE_ITT_NOTIFY */ 1920 #endif /* USE_ITT_BUILD */ 1921 1922 bool must_wait = 1923 !taskdata->td_flags.team_serial && !taskdata->td_flags.final; 1924 1925 must_wait = must_wait || (thread->th.th_task_team != NULL && 1926 thread->th.th_task_team->tt.tt_found_proxy_tasks); 1927 // If hidden helper thread is encountered, we must enable wait here. 
1928 must_wait = 1929 must_wait || 1930 (__kmp_enable_hidden_helper && thread->th.th_task_team != NULL && 1931 thread->th.th_task_team->tt.tt_hidden_helper_task_encountered); 1932 1933 if (must_wait) { 1934 kmp_flag_32<false, false> flag( 1935 RCAST(std::atomic<kmp_uint32> *, 1936 &(taskdata->td_incomplete_child_tasks)), 1937 0U); 1938 while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) { 1939 flag.execute_tasks(thread, gtid, FALSE, 1940 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 1941 __kmp_task_stealing_constraint); 1942 } 1943 } 1944 #if USE_ITT_BUILD 1945 KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj); 1946 KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children 1947 #endif /* USE_ITT_BUILD */ 1948 1949 // Debugger: The taskwait is completed. Location remains, but thread is 1950 // negated. 1951 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; 1952 1953 #if OMPT_SUPPORT && OMPT_OPTIONAL 1954 if (ompt) { 1955 if (ompt_enabled.ompt_callback_sync_region_wait) { 1956 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 1957 ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data, 1958 my_task_data, return_address); 1959 } 1960 if (ompt_enabled.ompt_callback_sync_region) { 1961 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 1962 ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data, 1963 my_task_data, return_address); 1964 } 1965 taskdata->ompt_task_info.frame.enter_frame = ompt_data_none; 1966 } 1967 #endif // OMPT_SUPPORT && OMPT_OPTIONAL 1968 1969 } 1970 1971 KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, " 1972 "returning TASK_CURRENT_NOT_QUEUED\n", 1973 gtid, taskdata)); 1974 1975 return TASK_CURRENT_NOT_QUEUED; 1976 } 1977 1978 #if OMPT_SUPPORT && OMPT_OPTIONAL 1979 OMPT_NOINLINE 1980 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid, 1981 void *frame_address, 1982 void *return_address) { 1983 return 
__kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address, 1984 return_address); 1985 } 1986 #endif // OMPT_SUPPORT && OMPT_OPTIONAL 1987 1988 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are 1989 // complete 1990 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) { 1991 #if OMPT_SUPPORT && OMPT_OPTIONAL 1992 if (UNLIKELY(ompt_enabled.enabled)) { 1993 OMPT_STORE_RETURN_ADDRESS(gtid); 1994 return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0), 1995 OMPT_LOAD_RETURN_ADDRESS(gtid)); 1996 } 1997 #endif 1998 return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL); 1999 } 2000 2001 // __kmpc_omp_taskyield: switch to a different task 2002 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) { 2003 kmp_taskdata_t *taskdata = NULL; 2004 kmp_info_t *thread; 2005 int thread_finished = FALSE; 2006 2007 KMP_COUNT_BLOCK(OMP_TASKYIELD); 2008 KMP_SET_THREAD_STATE_BLOCK(TASKYIELD); 2009 2010 KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n", 2011 gtid, loc_ref, end_part)); 2012 __kmp_assert_valid_gtid(gtid); 2013 2014 if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) { 2015 thread = __kmp_threads[gtid]; 2016 taskdata = thread->th.th_current_task; 2017 // Should we model this as a task wait or not? 2018 // Debugger: The taskwait is active. Store location and thread encountered the 2019 // taskwait. 2020 #if USE_ITT_BUILD 2021 // Note: These values are used by ITT events as well. 
2022 #endif /* USE_ITT_BUILD */ 2023 taskdata->td_taskwait_counter += 1; 2024 taskdata->td_taskwait_ident = loc_ref; 2025 taskdata->td_taskwait_thread = gtid + 1; 2026 2027 #if USE_ITT_BUILD 2028 void *itt_sync_obj = NULL; 2029 #if USE_ITT_NOTIFY 2030 KMP_ITT_TASKWAIT_STARTING(itt_sync_obj); 2031 #endif /* USE_ITT_NOTIFY */ 2032 #endif /* USE_ITT_BUILD */ 2033 if (!taskdata->td_flags.team_serial) { 2034 kmp_task_team_t *task_team = thread->th.th_task_team; 2035 if (task_team != NULL) { 2036 if (KMP_TASKING_ENABLED(task_team)) { 2037 #if OMPT_SUPPORT 2038 if (UNLIKELY(ompt_enabled.enabled)) 2039 thread->th.ompt_thread_info.ompt_task_yielded = 1; 2040 #endif 2041 __kmp_execute_tasks_32( 2042 thread, gtid, (kmp_flag_32<> *)NULL, FALSE, 2043 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 2044 __kmp_task_stealing_constraint); 2045 #if OMPT_SUPPORT 2046 if (UNLIKELY(ompt_enabled.enabled)) 2047 thread->th.ompt_thread_info.ompt_task_yielded = 0; 2048 #endif 2049 } 2050 } 2051 } 2052 #if USE_ITT_BUILD 2053 KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj); 2054 #endif /* USE_ITT_BUILD */ 2055 2056 // Debugger: The taskwait is completed. Location remains, but thread is 2057 // negated. 2058 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; 2059 } 2060 2061 KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, " 2062 "returning TASK_CURRENT_NOT_QUEUED\n", 2063 gtid, taskdata)); 2064 2065 return TASK_CURRENT_NOT_QUEUED; 2066 } 2067 2068 // Task Reduction implementation 2069 // 2070 // Note: initial implementation didn't take into account the possibility 2071 // to specify omp_orig for initializer of the UDR (user defined reduction). 2072 // Corrected implementation takes into account the omp_orig object. 2073 // Compiler is free to use old implementation if omp_orig is not specified. 2074 2075 /*! 2076 @ingroup BASIC_TYPES 2077 @{ 2078 */ 2079 2080 /*! 2081 Flags for special info per task reduction item. 2082 */ 2083 typedef struct kmp_taskred_flags { 2084 /*! 
1 - use lazy alloc/init (e.g. big objects, #tasks < #threads) */ 2085 unsigned lazy_priv : 1; 2086 unsigned reserved31 : 31; 2087 } kmp_taskred_flags_t; 2088 2089 /*! 2090 Internal struct for reduction data item related info set up by compiler. 2091 */ 2092 typedef struct kmp_task_red_input { 2093 void *reduce_shar; /**< shared between tasks item to reduce into */ 2094 size_t reduce_size; /**< size of data item in bytes */ 2095 // three compiler-generated routines (init, fini are optional): 2096 void *reduce_init; /**< data initialization routine (single parameter) */ 2097 void *reduce_fini; /**< data finalization routine */ 2098 void *reduce_comb; /**< data combiner routine */ 2099 kmp_taskred_flags_t flags; /**< flags for additional info from compiler */ 2100 } kmp_task_red_input_t; 2101 2102 /*! 2103 Internal struct for reduction data item related info saved by the library. 2104 */ 2105 typedef struct kmp_taskred_data { 2106 void *reduce_shar; /**< shared between tasks item to reduce into */ 2107 size_t reduce_size; /**< size of data item */ 2108 kmp_taskred_flags_t flags; /**< flags for additional info from compiler */ 2109 void *reduce_priv; /**< array of thread specific items */ 2110 void *reduce_pend; /**< end of private data for faster comparison op */ 2111 // three compiler-generated routines (init, fini are optional): 2112 void *reduce_comb; /**< data combiner routine */ 2113 void *reduce_init; /**< data initialization routine (two parameters) */ 2114 void *reduce_fini; /**< data finalization routine */ 2115 void *reduce_orig; /**< original item (can be used in UDR initializer) */ 2116 } kmp_taskred_data_t; 2117 2118 /*! 2119 Internal struct for reduction data item related info set up by compiler. 2120 2121 New interface: added reduce_orig field to provide omp_orig for UDR initializer. 
2122 */ 2123 typedef struct kmp_taskred_input { 2124 void *reduce_shar; /**< shared between tasks item to reduce into */ 2125 void *reduce_orig; /**< original reduction item used for initialization */ 2126 size_t reduce_size; /**< size of data item */ 2127 // three compiler-generated routines (init, fini are optional): 2128 void *reduce_init; /**< data initialization routine (two parameters) */ 2129 void *reduce_fini; /**< data finalization routine */ 2130 void *reduce_comb; /**< data combiner routine */ 2131 kmp_taskred_flags_t flags; /**< flags for additional info from compiler */ 2132 } kmp_taskred_input_t; 2133 /*! 2134 @} 2135 */ 2136 2137 template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src); 2138 template <> 2139 void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item, 2140 kmp_task_red_input_t &src) { 2141 item.reduce_orig = NULL; 2142 } 2143 template <> 2144 void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item, 2145 kmp_taskred_input_t &src) { 2146 if (src.reduce_orig != NULL) { 2147 item.reduce_orig = src.reduce_orig; 2148 } else { 2149 item.reduce_orig = src.reduce_shar; 2150 } // non-NULL reduce_orig means new interface used 2151 } 2152 2153 template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, size_t j); 2154 template <> 2155 void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item, 2156 size_t offset) { 2157 ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset); 2158 } 2159 template <> 2160 void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item, 2161 size_t offset) { 2162 ((void (*)(void *, void *))item.reduce_init)( 2163 (char *)(item.reduce_priv) + offset, item.reduce_orig); 2164 } 2165 2166 template <typename T> 2167 void *__kmp_task_reduction_init(int gtid, int num, T *data) { 2168 __kmp_assert_valid_gtid(gtid); 2169 kmp_info_t *thread = __kmp_threads[gtid]; 2170 kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup; 2171 
kmp_uint32 nth = thread->th.th_team_nproc; 2172 kmp_taskred_data_t *arr; 2173 2174 // check input data just in case 2175 KMP_ASSERT(tg != NULL); 2176 KMP_ASSERT(data != NULL); 2177 KMP_ASSERT(num > 0); 2178 if (nth == 1) { 2179 KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n", 2180 gtid, tg)); 2181 return (void *)tg; 2182 } 2183 KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n", 2184 gtid, tg, num)); 2185 arr = (kmp_taskred_data_t *)__kmp_thread_malloc( 2186 thread, num * sizeof(kmp_taskred_data_t)); 2187 for (int i = 0; i < num; ++i) { 2188 size_t size = data[i].reduce_size - 1; 2189 // round the size up to cache line per thread-specific item 2190 size += CACHE_LINE - size % CACHE_LINE; 2191 KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory 2192 arr[i].reduce_shar = data[i].reduce_shar; 2193 arr[i].reduce_size = size; 2194 arr[i].flags = data[i].flags; 2195 arr[i].reduce_comb = data[i].reduce_comb; 2196 arr[i].reduce_init = data[i].reduce_init; 2197 arr[i].reduce_fini = data[i].reduce_fini; 2198 __kmp_assign_orig<T>(arr[i], data[i]); 2199 if (!arr[i].flags.lazy_priv) { 2200 // allocate cache-line aligned block and fill it with zeros 2201 arr[i].reduce_priv = __kmp_allocate(nth * size); 2202 arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size; 2203 if (arr[i].reduce_init != NULL) { 2204 // initialize all thread-specific items 2205 for (size_t j = 0; j < nth; ++j) { 2206 __kmp_call_init<T>(arr[i], j * size); 2207 } 2208 } 2209 } else { 2210 // only allocate space for pointers now, 2211 // objects will be lazily allocated/initialized if/when requested 2212 // note that __kmp_allocate zeroes the allocated memory 2213 arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *)); 2214 } 2215 } 2216 tg->reduce_data = (void *)arr; 2217 tg->reduce_num_data = num; 2218 return (void *)tg; 2219 } 2220 2221 /*! 
2222 @ingroup TASKING 2223 @param gtid Global thread ID 2224 @param num Number of data items to reduce 2225 @param data Array of data for reduction 2226 @return The taskgroup identifier 2227 2228 Initialize task reduction for the taskgroup. 2229 2230 Note: this entry supposes the optional compiler-generated initializer routine 2231 has single parameter - pointer to object to be initialized. That means 2232 the reduction either does not use omp_orig object, or the omp_orig is accessible 2233 without help of the runtime library. 2234 */ 2235 void *__kmpc_task_reduction_init(int gtid, int num, void *data) { 2236 return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data); 2237 } 2238 2239 /*! 2240 @ingroup TASKING 2241 @param gtid Global thread ID 2242 @param num Number of data items to reduce 2243 @param data Array of data for reduction 2244 @return The taskgroup identifier 2245 2246 Initialize task reduction for the taskgroup. 2247 2248 Note: this entry supposes the optional compiler-generated initializer routine 2249 has two parameters, pointer to object to be initialized and pointer to omp_orig 2250 */ 2251 void *__kmpc_taskred_init(int gtid, int num, void *data) { 2252 return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data); 2253 } 2254 2255 // Copy task reduction data (except for shared pointers). 
2256 template <typename T> 2257 void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data, 2258 kmp_taskgroup_t *tg, void *reduce_data) { 2259 kmp_taskred_data_t *arr; 2260 KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p," 2261 " from data %p\n", 2262 thr, tg, reduce_data)); 2263 arr = (kmp_taskred_data_t *)__kmp_thread_malloc( 2264 thr, num * sizeof(kmp_taskred_data_t)); 2265 // threads will share private copies, thunk routines, sizes, flags, etc.: 2266 KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t)); 2267 for (int i = 0; i < num; ++i) { 2268 arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers 2269 } 2270 tg->reduce_data = (void *)arr; 2271 tg->reduce_num_data = num; 2272 } 2273 2274 /*! 2275 @ingroup TASKING 2276 @param gtid Global thread ID 2277 @param tskgrp The taskgroup ID (optional) 2278 @param data Shared location of the item 2279 @return The pointer to per-thread data 2280 2281 Get thread-specific location of data item 2282 */ 2283 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) { 2284 __kmp_assert_valid_gtid(gtid); 2285 kmp_info_t *thread = __kmp_threads[gtid]; 2286 kmp_int32 nth = thread->th.th_team_nproc; 2287 if (nth == 1) 2288 return data; // nothing to do 2289 2290 kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp; 2291 if (tg == NULL) 2292 tg = thread->th.th_current_task->td_taskgroup; 2293 KMP_ASSERT(tg != NULL); 2294 kmp_taskred_data_t *arr = (kmp_taskred_data_t *)(tg->reduce_data); 2295 kmp_int32 num = tg->reduce_num_data; 2296 kmp_int32 tid = thread->th.th_info.ds.ds_tid; 2297 2298 KMP_ASSERT(data != NULL); 2299 while (tg != NULL) { 2300 for (int i = 0; i < num; ++i) { 2301 if (!arr[i].flags.lazy_priv) { 2302 if (data == arr[i].reduce_shar || 2303 (data >= arr[i].reduce_priv && data < arr[i].reduce_pend)) 2304 return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size; 2305 } else { 2306 // check shared location first 2307 void **p_priv = (void 
**)(arr[i].reduce_priv); 2308 if (data == arr[i].reduce_shar) 2309 goto found; 2310 // check if we get some thread specific location as parameter 2311 for (int j = 0; j < nth; ++j) 2312 if (data == p_priv[j]) 2313 goto found; 2314 continue; // not found, continue search 2315 found: 2316 if (p_priv[tid] == NULL) { 2317 // allocate thread specific object lazily 2318 p_priv[tid] = __kmp_allocate(arr[i].reduce_size); 2319 if (arr[i].reduce_init != NULL) { 2320 if (arr[i].reduce_orig != NULL) { // new interface 2321 ((void (*)(void *, void *))arr[i].reduce_init)( 2322 p_priv[tid], arr[i].reduce_orig); 2323 } else { // old interface (single parameter) 2324 ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]); 2325 } 2326 } 2327 } 2328 return p_priv[tid]; 2329 } 2330 } 2331 tg = tg->parent; 2332 arr = (kmp_taskred_data_t *)(tg->reduce_data); 2333 num = tg->reduce_num_data; 2334 } 2335 KMP_ASSERT2(0, "Unknown task reduction item"); 2336 return NULL; // ERROR, this line never executed 2337 } 2338 2339 // Finalize task reduction. 
2340 // Called from __kmpc_end_taskgroup() 2341 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) { 2342 kmp_int32 nth = th->th.th_team_nproc; 2343 KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1 2344 kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data; 2345 kmp_int32 num = tg->reduce_num_data; 2346 for (int i = 0; i < num; ++i) { 2347 void *sh_data = arr[i].reduce_shar; 2348 void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini); 2349 void (*f_comb)(void *, void *) = 2350 (void (*)(void *, void *))(arr[i].reduce_comb); 2351 if (!arr[i].flags.lazy_priv) { 2352 void *pr_data = arr[i].reduce_priv; 2353 size_t size = arr[i].reduce_size; 2354 for (int j = 0; j < nth; ++j) { 2355 void *priv_data = (char *)pr_data + j * size; 2356 f_comb(sh_data, priv_data); // combine results 2357 if (f_fini) 2358 f_fini(priv_data); // finalize if needed 2359 } 2360 } else { 2361 void **pr_data = (void **)(arr[i].reduce_priv); 2362 for (int j = 0; j < nth; ++j) { 2363 if (pr_data[j] != NULL) { 2364 f_comb(sh_data, pr_data[j]); // combine results 2365 if (f_fini) 2366 f_fini(pr_data[j]); // finalize if needed 2367 __kmp_free(pr_data[j]); 2368 } 2369 } 2370 } 2371 __kmp_free(arr[i].reduce_priv); 2372 } 2373 __kmp_thread_free(th, arr); 2374 tg->reduce_data = NULL; 2375 tg->reduce_num_data = 0; 2376 } 2377 2378 // Cleanup task reduction data for parallel or worksharing, 2379 // do not touch task private data other threads still working with. 
2380 // Called from __kmpc_end_taskgroup() 2381 static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) { 2382 __kmp_thread_free(th, tg->reduce_data); 2383 tg->reduce_data = NULL; 2384 tg->reduce_num_data = 0; 2385 } 2386 2387 template <typename T> 2388 void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws, 2389 int num, T *data) { 2390 __kmp_assert_valid_gtid(gtid); 2391 kmp_info_t *thr = __kmp_threads[gtid]; 2392 kmp_int32 nth = thr->th.th_team_nproc; 2393 __kmpc_taskgroup(loc, gtid); // form new taskgroup first 2394 if (nth == 1) { 2395 KA_TRACE(10, 2396 ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n", 2397 gtid, thr->th.th_current_task->td_taskgroup)); 2398 return (void *)thr->th.th_current_task->td_taskgroup; 2399 } 2400 kmp_team_t *team = thr->th.th_team; 2401 void *reduce_data; 2402 kmp_taskgroup_t *tg; 2403 reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]); 2404 if (reduce_data == NULL && 2405 __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data, 2406 (void *)1)) { 2407 // single thread enters this block to initialize common reduction data 2408 KMP_DEBUG_ASSERT(reduce_data == NULL); 2409 // first initialize own data, then make a copy other threads can use 2410 tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data); 2411 reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t)); 2412 KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t)); 2413 // fini counters should be 0 at this point 2414 KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0); 2415 KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0); 2416 KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data); 2417 } else { 2418 while ( 2419 (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) == 2420 (void *)1) { // wait for task reduction initialization 2421 KMP_CPU_PAUSE(); 2422 } 2423 
KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here 2424 tg = thr->th.th_current_task->td_taskgroup; 2425 __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data); 2426 } 2427 return tg; 2428 } 2429 2430 /*! 2431 @ingroup TASKING 2432 @param loc Source location info 2433 @param gtid Global thread ID 2434 @param is_ws Is 1 if the reduction is for worksharing, 0 otherwise 2435 @param num Number of data items to reduce 2436 @param data Array of data for reduction 2437 @return The taskgroup identifier 2438 2439 Initialize task reduction for a parallel or worksharing. 2440 2441 Note: this entry supposes the optional compiler-generated initializer routine 2442 has single parameter - pointer to object to be initialized. That means 2443 the reduction either does not use omp_orig object, or the omp_orig is accessible 2444 without help of the runtime library. 2445 */ 2446 void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws, 2447 int num, void *data) { 2448 return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num, 2449 (kmp_task_red_input_t *)data); 2450 } 2451 2452 /*! 2453 @ingroup TASKING 2454 @param loc Source location info 2455 @param gtid Global thread ID 2456 @param is_ws Is 1 if the reduction is for worksharing, 0 otherwise 2457 @param num Number of data items to reduce 2458 @param data Array of data for reduction 2459 @return The taskgroup identifier 2460 2461 Initialize task reduction for a parallel or worksharing. 2462 2463 Note: this entry supposes the optional compiler-generated initializer routine 2464 has two parameters, pointer to object to be initialized and pointer to omp_orig 2465 */ 2466 void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num, 2467 void *data) { 2468 return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num, 2469 (kmp_taskred_input_t *)data); 2470 } 2471 2472 /*! 
@ingroup TASKING
@param loc       Source location info
@param gtid      Global thread ID
@param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise

Finalize task reduction for a parallel or worksharing.

This entry only closes the taskgroup via __kmpc_end_taskgroup(); is_ws is not
used in the body.
*/
void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
  __kmpc_end_taskgroup(loc, gtid);
}

// __kmpc_taskgroup: Start a new taskgroup
//
// loc:  source location, used for tracing only in this function
// gtid: Global Thread ID of encountering thread
//
// Allocates a taskgroup descriptor, initializes it, links it to the current
// task's previous taskgroup as parent, and makes it the current taskgroup.
void __kmpc_taskgroup(ident_t *loc, int gtid) {
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = thread->th.th_current_task;
  kmp_taskgroup_t *tg_new =
      (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
  KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
  // Fresh group: zero count, no cancellation request, no reduction data.
  KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
  KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
  tg_new->parent = taskdata->td_taskgroup;
  tg_new->reduce_data = NULL;
  tg_new->reduce_num_data = 0;
  tg_new->gomp_data = NULL;
  taskdata->td_taskgroup = tg_new;

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
    // Prefer the stored return address; fall back to this frame's own.
    void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
    if (!codeptr)
      codeptr = OMPT_GET_RETURN_ADDRESS(0);
    kmp_team_t *team = thread->th.th_team;
    // The callback receives pointers to these local copies.
    ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
    // FIXME: I think this is wrong for lwt!
    ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;

    ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
        ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
        &(my_task_data), codeptr);
  }
#endif
}

// __kmpc_end_taskgroup: Wait until all tasks generated by the current task
// and its descendants are complete
void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = thread->th.th_current_task;
  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
  int thread_finished = FALSE;

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_team_t *team;
  ompt_data_t my_task_data;
  ompt_data_t my_parallel_data;
  void *codeptr = nullptr;
  if (UNLIKELY(ompt_enabled.enabled)) {
    team = thread->th.th_team;
    my_task_data = taskdata->ompt_task_info.task_data;
    // FIXME: I think this is wrong for lwt!
2535 my_parallel_data = team->t.ompt_team_info.parallel_data; 2536 codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2537 if (!codeptr) 2538 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2539 } 2540 #endif 2541 2542 KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc)); 2543 KMP_DEBUG_ASSERT(taskgroup != NULL); 2544 KMP_SET_THREAD_STATE_BLOCK(TASKGROUP); 2545 2546 if (__kmp_tasking_mode != tskm_immediate_exec) { 2547 // mark task as waiting not on a barrier 2548 taskdata->td_taskwait_counter += 1; 2549 taskdata->td_taskwait_ident = loc; 2550 taskdata->td_taskwait_thread = gtid + 1; 2551 #if USE_ITT_BUILD 2552 // For ITT the taskgroup wait is similar to taskwait until we need to 2553 // distinguish them 2554 void *itt_sync_obj = NULL; 2555 #if USE_ITT_NOTIFY 2556 KMP_ITT_TASKWAIT_STARTING(itt_sync_obj); 2557 #endif /* USE_ITT_NOTIFY */ 2558 #endif /* USE_ITT_BUILD */ 2559 2560 #if OMPT_SUPPORT && OMPT_OPTIONAL 2561 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) { 2562 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 2563 ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data), 2564 &(my_task_data), codeptr); 2565 } 2566 #endif 2567 2568 if (!taskdata->td_flags.team_serial || 2569 (thread->th.th_task_team != NULL && 2570 (thread->th.th_task_team->tt.tt_found_proxy_tasks || 2571 thread->th.th_task_team->tt.tt_hidden_helper_task_encountered))) { 2572 kmp_flag_32<false, false> flag( 2573 RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 0U); 2574 while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) { 2575 flag.execute_tasks(thread, gtid, FALSE, 2576 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 2577 __kmp_task_stealing_constraint); 2578 } 2579 } 2580 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting 2581 2582 #if OMPT_SUPPORT && OMPT_OPTIONAL 2583 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) { 2584 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 2585 
ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data), 2586 &(my_task_data), codeptr); 2587 } 2588 #endif 2589 2590 #if USE_ITT_BUILD 2591 KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj); 2592 KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants 2593 #endif /* USE_ITT_BUILD */ 2594 } 2595 KMP_DEBUG_ASSERT(taskgroup->count == 0); 2596 2597 if (taskgroup->reduce_data != NULL && 2598 !taskgroup->gomp_data) { // need to reduce? 2599 int cnt; 2600 void *reduce_data; 2601 kmp_team_t *t = thread->th.th_team; 2602 kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data; 2603 // check if <priv> data of the first reduction variable shared for the team 2604 void *priv0 = arr[0].reduce_priv; 2605 if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL && 2606 ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) { 2607 // finishing task reduction on parallel 2608 cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]); 2609 if (cnt == thread->th.th_team_nproc - 1) { 2610 // we are the last thread passing __kmpc_reduction_modifier_fini() 2611 // finalize task reduction: 2612 __kmp_task_reduction_fini(thread, taskgroup); 2613 // cleanup fields in the team structure: 2614 // TODO: is relaxed store enough here (whole barrier should follow)? 
2615 __kmp_thread_free(thread, reduce_data); 2616 KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL); 2617 KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0); 2618 } else { 2619 // we are not the last thread passing __kmpc_reduction_modifier_fini(), 2620 // so do not finalize reduction, just clean own copy of the data 2621 __kmp_task_reduction_clean(thread, taskgroup); 2622 } 2623 } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) != 2624 NULL && 2625 ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) { 2626 // finishing task reduction on worksharing 2627 cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]); 2628 if (cnt == thread->th.th_team_nproc - 1) { 2629 // we are the last thread passing __kmpc_reduction_modifier_fini() 2630 __kmp_task_reduction_fini(thread, taskgroup); 2631 // cleanup fields in team structure: 2632 // TODO: is relaxed store enough here (whole barrier should follow)? 2633 __kmp_thread_free(thread, reduce_data); 2634 KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL); 2635 KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0); 2636 } else { 2637 // we are not the last thread passing __kmpc_reduction_modifier_fini(), 2638 // so do not finalize reduction, just clean own copy of the data 2639 __kmp_task_reduction_clean(thread, taskgroup); 2640 } 2641 } else { 2642 // finishing task reduction on taskgroup 2643 __kmp_task_reduction_fini(thread, taskgroup); 2644 } 2645 } 2646 // Restore parent taskgroup for the current task 2647 taskdata->td_taskgroup = taskgroup->parent; 2648 __kmp_thread_free(thread, taskgroup); 2649 2650 KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n", 2651 gtid, taskdata)); 2652 2653 #if OMPT_SUPPORT && OMPT_OPTIONAL 2654 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) { 2655 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 2656 ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data), 2657 &(my_task_data), codeptr); 2658 } 2659 #endif 2660 } 2661 2662 // 
__kmp_remove_my_task: remove a task from my own deque 2663 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid, 2664 kmp_task_team_t *task_team, 2665 kmp_int32 is_constrained) { 2666 kmp_task_t *task; 2667 kmp_taskdata_t *taskdata; 2668 kmp_thread_data_t *thread_data; 2669 kmp_uint32 tail; 2670 2671 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 2672 KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data != 2673 NULL); // Caller should check this condition 2674 2675 thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)]; 2676 2677 KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n", 2678 gtid, thread_data->td.td_deque_ntasks, 2679 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2680 2681 if (TCR_4(thread_data->td.td_deque_ntasks) == 0) { 2682 KA_TRACE(10, 2683 ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: " 2684 "ntasks=%d head=%u tail=%u\n", 2685 gtid, thread_data->td.td_deque_ntasks, 2686 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2687 return NULL; 2688 } 2689 2690 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 2691 2692 if (TCR_4(thread_data->td.td_deque_ntasks) == 0) { 2693 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2694 KA_TRACE(10, 2695 ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: " 2696 "ntasks=%d head=%u tail=%u\n", 2697 gtid, thread_data->td.td_deque_ntasks, 2698 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2699 return NULL; 2700 } 2701 2702 tail = (thread_data->td.td_deque_tail - 1) & 2703 TASK_DEQUE_MASK(thread_data->td); // Wrap index. 
2704 taskdata = thread_data->td.td_deque[tail]; 2705 2706 if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata, 2707 thread->th.th_current_task)) { 2708 // The TSC does not allow to steal victim task 2709 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2710 KA_TRACE(10, 2711 ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: " 2712 "ntasks=%d head=%u tail=%u\n", 2713 gtid, thread_data->td.td_deque_ntasks, 2714 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2715 return NULL; 2716 } 2717 2718 thread_data->td.td_deque_tail = tail; 2719 TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1); 2720 2721 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2722 2723 KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: " 2724 "ntasks=%d head=%u tail=%u\n", 2725 gtid, taskdata, thread_data->td.td_deque_ntasks, 2726 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2727 2728 task = KMP_TASKDATA_TO_TASK(taskdata); 2729 return task; 2730 } 2731 2732 // __kmp_steal_task: remove a task from another thread's deque 2733 // Assume that calling thread has already checked existence of 2734 // task_team thread_data before calling this routine. 
// victim_thr:         thread whose deque is inspected
// gtid:               global thread id of the stealing (calling) thread
// task_team:          task team that owns the per-thread deques
// unfinished_threads: counter that is re-incremented when a thread that had
//                     marked itself finished successfully steals a task
// thread_finished:    in/out flag of the calling thread; reset to FALSE on a
//                     successful steal if it was set
// is_constrained:     forwarded unchanged to __kmp_task_is_allowed()
//
// Returns the stolen task, or NULL if the victim's deque is empty or holds no
// task that __kmp_task_is_allowed() accepts.
static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
                                    kmp_task_team_t *task_team,
                                    std::atomic<kmp_int32> *unfinished_threads,
                                    int *thread_finished,
                                    kmp_int32 is_constrained) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_taskdata_t *current;
  kmp_thread_data_t *victim_td, *threads_data;
  kmp_int32 target;
  kmp_int32 victim_tid;

  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);

  threads_data = task_team->tt.tt_threads_data;
  KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition

  victim_tid = victim_thr->th.th_info.ds.ds_tid;
  victim_td = &threads_data[victim_tid];

  KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
                "task_team=%p ntasks=%d head=%u tail=%u\n",
                gtid, __kmp_gtid_from_thread(victim_thr), task_team,
                victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
                victim_td->td.td_deque_tail));

  // Unlocked peek: skip the lock entirely when the victim looks empty
  if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
    KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
                  "task_team=%p ntasks=%d head=%u tail=%u\n",
                  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
                  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
                  victim_td->td.td_deque_tail));
    return NULL;
  }

  __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);

  int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
  // Check again after we acquire the lock
  if (ntasks == 0) {
    __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
    KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
                  "task_team=%p ntasks=%d head=%u tail=%u\n",
                  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
                  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
    return NULL;
  }

  KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
  current = __kmp_threads[gtid]->th.th_current_task;
  // First choice: the task at the head of the victim's deque
  taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
    // Bump head pointer and Wrap.
    victim_td->td.td_deque_head =
        (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
  } else {
    // The head task is not allowed. Other entries are only considered when
    // the task team has encountered untied tasks.
    if (!task_team->tt.tt_untied_task_encountered) {
      // The TSC does not allow to steal victim task
      __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
      KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
                    "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
                    gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
                    victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
      return NULL;
    }
    int i;
    // walk through victim's deque trying to steal any task
    // (the head itself was already rejected, so start at the entry after it)
    target = victim_td->td.td_deque_head;
    taskdata = NULL;
    for (i = 1; i < ntasks; ++i) {
      target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
      taskdata = victim_td->td.td_deque[target];
      if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
        break; // found victim task
      } else {
        taskdata = NULL;
      }
    }
    if (taskdata == NULL) {
      // No appropriate candidate to steal found
      __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
      KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
                    "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
                    gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
                    victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
      return NULL;
    }
    // Close the hole left at `target`: every later entry moves one slot
    // towards the head; `i` continues from the position where the search
    // stopped.
    int prev = target;
    for (i = i + 1; i < ntasks; ++i) {
      // shift remaining tasks in the deque left by 1
      target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
      victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
      prev = target;
    }
    // After the shift `target` is the last previously occupied slot, i.e. the
    // slot just before the old tail; it becomes the new tail.
    KMP_DEBUG_ASSERT(
        victim_td->td.td_deque_tail ==
        (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
    victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped))
  }
  if (*thread_finished) {
    // We need to un-mark this victim as a finished victim.  This must be done
    // before releasing the lock, or else other threads (starting with the
    // primary thread victim) might be prematurely released from the barrier!!!
    // Note: the declaration of `count` only exists in debug builds; in release
    // builds the statement below is just the atomic increment.
#if KMP_DEBUG
    kmp_int32 count =
#endif
        KMP_ATOMIC_INC(unfinished_threads);
    KA_TRACE(
        20,
        ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
         gtid, count + 1, task_team));
    *thread_finished = FALSE;
  }
  // Publish the new task count (value sampled under the lock, minus one)
  TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);

  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);

  KMP_COUNT_BLOCK(TASK_stolen);
  KA_TRACE(10,
           ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
            "task_team=%p ntasks=%d head=%u tail=%u\n",
            gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
            ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));

  task = KMP_TASKDATA_TO_TASK(taskdata);
  return task;
}

// __kmp_execute_tasks_template: Choose and execute tasks until either the
// condition is satisfied (return true) or there are none left (return false).
//
// final_spin is TRUE if this is the spin at the release barrier.
// thread_finished indicates whether the thread is finished executing all
// the tasks it has on its deque, and is at the release barrier.
// spinner is the location on which to spin.
// spinner == NULL means only execute a single task and return.
// checker is the value to check to terminate the spin.
2872 template <class C> 2873 static inline int __kmp_execute_tasks_template( 2874 kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin, 2875 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), 2876 kmp_int32 is_constrained) { 2877 kmp_task_team_t *task_team = thread->th.th_task_team; 2878 kmp_thread_data_t *threads_data; 2879 kmp_task_t *task; 2880 kmp_info_t *other_thread; 2881 kmp_taskdata_t *current_task = thread->th.th_current_task; 2882 std::atomic<kmp_int32> *unfinished_threads; 2883 kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0, 2884 tid = thread->th.th_info.ds.ds_tid; 2885 2886 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 2887 KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]); 2888 2889 if (task_team == NULL || current_task == NULL) 2890 return FALSE; 2891 2892 KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d " 2893 "*thread_finished=%d\n", 2894 gtid, final_spin, *thread_finished)); 2895 2896 thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 2897 threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data); 2898 2899 KMP_DEBUG_ASSERT(threads_data != NULL); 2900 2901 nthreads = task_team->tt.tt_nproc; 2902 unfinished_threads = &(task_team->tt.tt_unfinished_threads); 2903 KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks || 2904 task_team->tt.tt_hidden_helper_task_encountered); 2905 KMP_DEBUG_ASSERT(*unfinished_threads >= 0); 2906 2907 while (1) { // Outer loop keeps trying to find tasks in case of single thread 2908 // getting tasks from target constructs 2909 while (1) { // Inner loop to find a task and execute it 2910 task = NULL; 2911 if (use_own_tasks) { // check on own queue first 2912 task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained); 2913 } 2914 if ((task == NULL) && (nthreads > 1)) { // Steal a task 2915 int asleep = 1; 2916 use_own_tasks = 0; 2917 // Try to steal from the last place I stole from successfully. 
2918 if (victim_tid == -2) { // haven't stolen anything yet 2919 victim_tid = threads_data[tid].td.td_deque_last_stolen; 2920 if (victim_tid != 2921 -1) // if we have a last stolen from victim, get the thread 2922 other_thread = threads_data[victim_tid].td.td_thr; 2923 } 2924 if (victim_tid != -1) { // found last victim 2925 asleep = 0; 2926 } else if (!new_victim) { // no recent steals and we haven't already 2927 // used a new victim; select a random thread 2928 do { // Find a different thread to steal work from. 2929 // Pick a random thread. Initial plan was to cycle through all the 2930 // threads, and only return if we tried to steal from every thread, 2931 // and failed. Arch says that's not such a great idea. 2932 victim_tid = __kmp_get_random(thread) % (nthreads - 1); 2933 if (victim_tid >= tid) { 2934 ++victim_tid; // Adjusts random distribution to exclude self 2935 } 2936 // Found a potential victim 2937 other_thread = threads_data[victim_tid].td.td_thr; 2938 // There is a slight chance that __kmp_enable_tasking() did not wake 2939 // up all threads waiting at the barrier. If victim is sleeping, 2940 // then wake it up. Since we were going to pay the cache miss 2941 // penalty for referencing another thread's kmp_info_t struct 2942 // anyway, 2943 // the check shouldn't cost too much performance at this point. In 2944 // extra barrier mode, tasks do not sleep at the separate tasking 2945 // barrier, so this isn't a problem. 2946 asleep = 0; 2947 if ((__kmp_tasking_mode == tskm_task_teams) && 2948 (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) && 2949 (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) != 2950 NULL)) { 2951 asleep = 1; 2952 __kmp_null_resume_wrapper(other_thread); 2953 // A sleeping thread should not have any tasks on it's queue. 
2954 // There is a slight possibility that it resumes, steals a task 2955 // from another thread, which spawns more tasks, all in the time 2956 // that it takes this thread to check => don't write an assertion 2957 // that the victim's queue is empty. Try stealing from a 2958 // different thread. 2959 } 2960 } while (asleep); 2961 } 2962 2963 if (!asleep) { 2964 // We have a victim to try to steal from 2965 task = __kmp_steal_task(other_thread, gtid, task_team, 2966 unfinished_threads, thread_finished, 2967 is_constrained); 2968 } 2969 if (task != NULL) { // set last stolen to victim 2970 if (threads_data[tid].td.td_deque_last_stolen != victim_tid) { 2971 threads_data[tid].td.td_deque_last_stolen = victim_tid; 2972 // The pre-refactored code did not try more than 1 successful new 2973 // vicitm, unless the last one generated more local tasks; 2974 // new_victim keeps track of this 2975 new_victim = 1; 2976 } 2977 } else { // No tasks found; unset last_stolen 2978 KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1); 2979 victim_tid = -2; // no successful victim found 2980 } 2981 } 2982 2983 if (task == NULL) 2984 break; // break out of tasking loop 2985 2986 // Found a task; execute it 2987 #if USE_ITT_BUILD && USE_ITT_NOTIFY 2988 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { 2989 if (itt_sync_obj == NULL) { // we are at fork barrier where we could not 2990 // get the object reliably 2991 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 2992 } 2993 __kmp_itt_task_starting(itt_sync_obj); 2994 } 2995 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 2996 __kmp_invoke_task(gtid, task, current_task); 2997 #if USE_ITT_BUILD 2998 if (itt_sync_obj != NULL) 2999 __kmp_itt_task_finished(itt_sync_obj); 3000 #endif /* USE_ITT_BUILD */ 3001 // If this thread is only partway through the barrier and the condition is 3002 // met, then return now, so that the barrier gather/release pattern can 3003 // proceed. 
If this thread is in the last spin loop in the barrier, 3004 // waiting to be released, we know that the termination condition will not 3005 // be satisfied, so don't waste any cycles checking it. 3006 if (flag == NULL || (!final_spin && flag->done_check())) { 3007 KA_TRACE( 3008 15, 3009 ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n", 3010 gtid)); 3011 return TRUE; 3012 } 3013 if (thread->th.th_task_team == NULL) { 3014 break; 3015 } 3016 KMP_YIELD(__kmp_library == library_throughput); // Yield before next task 3017 // If execution of a stolen task results in more tasks being placed on our 3018 // run queue, reset use_own_tasks 3019 if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) { 3020 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned " 3021 "other tasks, restart\n", 3022 gtid)); 3023 use_own_tasks = 1; 3024 new_victim = 0; 3025 } 3026 } 3027 3028 // The task source has been exhausted. If in final spin loop of barrier, 3029 // check if termination condition is satisfied. The work queue may be empty 3030 // but there might be proxy tasks still executing. 3031 if (final_spin && 3032 KMP_ATOMIC_LD_ACQ(¤t_task->td_incomplete_child_tasks) == 0) { 3033 // First, decrement the #unfinished threads, if that has not already been 3034 // done. This decrement might be to the spin location, and result in the 3035 // termination condition being satisfied. 3036 if (!*thread_finished) { 3037 #if KMP_DEBUG 3038 kmp_int32 count = -1 + 3039 #endif 3040 KMP_ATOMIC_DEC(unfinished_threads); 3041 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec " 3042 "unfinished_threads to %d task_team=%p\n", 3043 gtid, count, task_team)); 3044 *thread_finished = TRUE; 3045 } 3046 3047 // It is now unsafe to reference thread->th.th_team !!! 
3048 // Decrementing task_team->tt.tt_unfinished_threads can allow the primary 3049 // thread to pass through the barrier, where it might reset each thread's 3050 // th.th_team field for the next parallel region. If we can steal more 3051 // work, we know that this has not happened yet. 3052 if (flag != NULL && flag->done_check()) { 3053 KA_TRACE( 3054 15, 3055 ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n", 3056 gtid)); 3057 return TRUE; 3058 } 3059 } 3060 3061 // If this thread's task team is NULL, primary thread has recognized that 3062 // there are no more tasks; bail out 3063 if (thread->th.th_task_team == NULL) { 3064 KA_TRACE(15, 3065 ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid)); 3066 return FALSE; 3067 } 3068 3069 // We could be getting tasks from target constructs; if this is the only 3070 // thread, keep trying to execute tasks from own queue 3071 if (nthreads == 1 && 3072 KMP_ATOMIC_LD_ACQ(¤t_task->td_incomplete_child_tasks)) 3073 use_own_tasks = 1; 3074 else { 3075 KA_TRACE(15, 3076 ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid)); 3077 return FALSE; 3078 } 3079 } 3080 } 3081 3082 template <bool C, bool S> 3083 int __kmp_execute_tasks_32( 3084 kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32<C, S> *flag, int final_spin, 3085 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), 3086 kmp_int32 is_constrained) { 3087 return __kmp_execute_tasks_template( 3088 thread, gtid, flag, final_spin, 3089 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); 3090 } 3091 3092 template <bool C, bool S> 3093 int __kmp_execute_tasks_64( 3094 kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64<C, S> *flag, int final_spin, 3095 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), 3096 kmp_int32 is_constrained) { 3097 return __kmp_execute_tasks_template( 3098 thread, gtid, flag, final_spin, 3099 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); 3100 } 3101 3102 template <bool C, bool S> 
3103 int __kmp_atomic_execute_tasks_64( 3104 kmp_info_t *thread, kmp_int32 gtid, kmp_atomic_flag_64<C, S> *flag, 3105 int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), 3106 kmp_int32 is_constrained) { 3107 return __kmp_execute_tasks_template( 3108 thread, gtid, flag, final_spin, 3109 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); 3110 } 3111 3112 int __kmp_execute_tasks_oncore( 3113 kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin, 3114 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), 3115 kmp_int32 is_constrained) { 3116 return __kmp_execute_tasks_template( 3117 thread, gtid, flag, final_spin, 3118 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); 3119 } 3120 3121 template int 3122 __kmp_execute_tasks_32<false, false>(kmp_info_t *, kmp_int32, 3123 kmp_flag_32<false, false> *, int, 3124 int *USE_ITT_BUILD_ARG(void *), kmp_int32); 3125 3126 template int __kmp_execute_tasks_64<false, true>(kmp_info_t *, kmp_int32, 3127 kmp_flag_64<false, true> *, 3128 int, 3129 int *USE_ITT_BUILD_ARG(void *), 3130 kmp_int32); 3131 3132 template int __kmp_execute_tasks_64<true, false>(kmp_info_t *, kmp_int32, 3133 kmp_flag_64<true, false> *, 3134 int, 3135 int *USE_ITT_BUILD_ARG(void *), 3136 kmp_int32); 3137 3138 template int __kmp_atomic_execute_tasks_64<false, true>( 3139 kmp_info_t *, kmp_int32, kmp_atomic_flag_64<false, true> *, int, 3140 int *USE_ITT_BUILD_ARG(void *), kmp_int32); 3141 3142 template int __kmp_atomic_execute_tasks_64<true, false>( 3143 kmp_info_t *, kmp_int32, kmp_atomic_flag_64<true, false> *, int, 3144 int *USE_ITT_BUILD_ARG(void *), kmp_int32); 3145 3146 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the 3147 // next barrier so they can assist in executing enqueued tasks. 3148 // First thread in allocates the task team atomically. 
3149 static void __kmp_enable_tasking(kmp_task_team_t *task_team, 3150 kmp_info_t *this_thr) { 3151 kmp_thread_data_t *threads_data; 3152 int nthreads, i, is_init_thread; 3153 3154 KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n", 3155 __kmp_gtid_from_thread(this_thr))); 3156 3157 KMP_DEBUG_ASSERT(task_team != NULL); 3158 KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL); 3159 3160 nthreads = task_team->tt.tt_nproc; 3161 KMP_DEBUG_ASSERT(nthreads > 0); 3162 KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc); 3163 3164 // Allocate or increase the size of threads_data if necessary 3165 is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team); 3166 3167 if (!is_init_thread) { 3168 // Some other thread already set up the array. 3169 KA_TRACE( 3170 20, 3171 ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n", 3172 __kmp_gtid_from_thread(this_thr))); 3173 return; 3174 } 3175 threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data); 3176 KMP_DEBUG_ASSERT(threads_data != NULL); 3177 3178 if (__kmp_tasking_mode == tskm_task_teams && 3179 (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) { 3180 // Release any threads sleeping at the barrier, so that they can steal 3181 // tasks and execute them. In extra barrier mode, tasks do not sleep 3182 // at the separate tasking barrier, so this isn't a problem. 3183 for (i = 0; i < nthreads; i++) { 3184 void *sleep_loc; 3185 kmp_info_t *thread = threads_data[i].td.td_thr; 3186 3187 if (i == this_thr->th.th_info.ds.ds_tid) { 3188 continue; 3189 } 3190 // Since we haven't locked the thread's suspend mutex lock at this 3191 // point, there is a small window where a thread might be putting 3192 // itself to sleep, but hasn't set the th_sleep_loc field yet. 3193 // To work around this, __kmp_execute_tasks_template() periodically checks 3194 // see if other threads are sleeping (using the same random mechanism that 3195 // is used for task stealing) and awakens them if they are. 
3196 if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) != 3197 NULL) { 3198 KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n", 3199 __kmp_gtid_from_thread(this_thr), 3200 __kmp_gtid_from_thread(thread))); 3201 __kmp_null_resume_wrapper(thread); 3202 } else { 3203 KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n", 3204 __kmp_gtid_from_thread(this_thr), 3205 __kmp_gtid_from_thread(thread))); 3206 } 3207 } 3208 } 3209 3210 KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n", 3211 __kmp_gtid_from_thread(this_thr))); 3212 } 3213 3214 /* // TODO: Check the comment consistency 3215 * Utility routines for "task teams". A task team (kmp_task_t) is kind of 3216 * like a shadow of the kmp_team_t data struct, with a different lifetime. 3217 * After a child * thread checks into a barrier and calls __kmp_release() from 3218 * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no 3219 * longer assume that the kmp_team_t structure is intact (at any moment, the 3220 * primary thread may exit the barrier code and free the team data structure, 3221 * and return the threads to the thread pool). 3222 * 3223 * This does not work with the tasking code, as the thread is still 3224 * expected to participate in the execution of any tasks that may have been 3225 * spawned my a member of the team, and the thread still needs access to all 3226 * to each thread in the team, so that it can steal work from it. 3227 * 3228 * Enter the existence of the kmp_task_team_t struct. It employs a reference 3229 * counting mechanism, and is allocated by the primary thread before calling 3230 * __kmp_<barrier_kind>_release, and then is release by the last thread to 3231 * exit __kmp_<barrier_kind>_release at the next barrier. I.e. 
the lifetimes 3232 * of the kmp_task_team_t structs for consecutive barriers can overlap 3233 * (and will, unless the primary thread is the last thread to exit the barrier 3234 * release phase, which is not typical). The existence of such a struct is 3235 * useful outside the context of tasking. 3236 * 3237 * We currently use the existence of the threads array as an indicator that 3238 * tasks were spawned since the last barrier. If the structure is to be 3239 * useful outside the context of tasking, then this will have to change, but 3240 * not setting the field minimizes the performance impact of tasking on 3241 * barriers, when no explicit tasks were spawned (pushed, actually). 3242 */ 3243 3244 static kmp_task_team_t *__kmp_free_task_teams = 3245 NULL; // Free list for task_team data structures 3246 // Lock for task team data structures 3247 kmp_bootstrap_lock_t __kmp_task_team_lock = 3248 KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock); 3249 3250 // __kmp_alloc_task_deque: 3251 // Allocates a task deque for a particular thread, and initialize the necessary 3252 // data structures relating to the deque. This only happens once per thread 3253 // per task team since task teams are recycled. No lock is needed during 3254 // allocation since each thread allocates its own deque. 
3255 static void __kmp_alloc_task_deque(kmp_info_t *thread, 3256 kmp_thread_data_t *thread_data) { 3257 __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock); 3258 KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL); 3259 3260 // Initialize last stolen task field to "none" 3261 thread_data->td.td_deque_last_stolen = -1; 3262 3263 KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0); 3264 KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0); 3265 KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0); 3266 3267 KE_TRACE( 3268 10, 3269 ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n", 3270 __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data)); 3271 // Allocate space for task deque, and zero the deque 3272 // Cannot use __kmp_thread_calloc() because threads not around for 3273 // kmp_reap_task_team( ). 3274 thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate( 3275 INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *)); 3276 thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE; 3277 } 3278 3279 // __kmp_free_task_deque: 3280 // Deallocates a task deque for a particular thread. Happens at library 3281 // deallocation so don't need to reset all thread data fields. 
3282 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) { 3283 if (thread_data->td.td_deque != NULL) { 3284 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 3285 TCW_4(thread_data->td.td_deque_ntasks, 0); 3286 __kmp_free(thread_data->td.td_deque); 3287 thread_data->td.td_deque = NULL; 3288 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 3289 } 3290 3291 #ifdef BUILD_TIED_TASK_STACK 3292 // GEH: Figure out what to do here for td_susp_tied_tasks 3293 if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) { 3294 __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data); 3295 } 3296 #endif // BUILD_TIED_TASK_STACK 3297 } 3298 3299 // __kmp_realloc_task_threads_data: 3300 // Allocates a threads_data array for a task team, either by allocating an 3301 // initial array or enlarging an existing array. Only the first thread to get 3302 // the lock allocs or enlarges the array and re-initializes the array elements. 3303 // That thread returns "TRUE", the rest return "FALSE". 3304 // Assumes that the new array size is given by task_team -> tt.tt_nproc. 3305 // The current size is given by task_team -> tt.tt_max_threads. 3306 static int __kmp_realloc_task_threads_data(kmp_info_t *thread, 3307 kmp_task_team_t *task_team) { 3308 kmp_thread_data_t **threads_data_p; 3309 kmp_int32 nthreads, maxthreads; 3310 int is_init_thread = FALSE; 3311 3312 if (TCR_4(task_team->tt.tt_found_tasks)) { 3313 // Already reallocated and initialized. 3314 return FALSE; 3315 } 3316 3317 threads_data_p = &task_team->tt.tt_threads_data; 3318 nthreads = task_team->tt.tt_nproc; 3319 maxthreads = task_team->tt.tt_max_threads; 3320 3321 // All threads must lock when they encounter the first task of the implicit 3322 // task region to make sure threads_data fields are (re)initialized before 3323 // used. 
3324 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock); 3325 3326 if (!TCR_4(task_team->tt.tt_found_tasks)) { 3327 // first thread to enable tasking 3328 kmp_team_t *team = thread->th.th_team; 3329 int i; 3330 3331 is_init_thread = TRUE; 3332 if (maxthreads < nthreads) { 3333 3334 if (*threads_data_p != NULL) { 3335 kmp_thread_data_t *old_data = *threads_data_p; 3336 kmp_thread_data_t *new_data = NULL; 3337 3338 KE_TRACE( 3339 10, 3340 ("__kmp_realloc_task_threads_data: T#%d reallocating " 3341 "threads data for task_team %p, new_size = %d, old_size = %d\n", 3342 __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads)); 3343 // Reallocate threads_data to have more elements than current array 3344 // Cannot use __kmp_thread_realloc() because threads not around for 3345 // kmp_reap_task_team( ). Note all new array entries are initialized 3346 // to zero by __kmp_allocate(). 3347 new_data = (kmp_thread_data_t *)__kmp_allocate( 3348 nthreads * sizeof(kmp_thread_data_t)); 3349 // copy old data to new data 3350 KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t), 3351 (void *)old_data, maxthreads * sizeof(kmp_thread_data_t)); 3352 3353 #ifdef BUILD_TIED_TASK_STACK 3354 // GEH: Figure out if this is the right thing to do 3355 for (i = maxthreads; i < nthreads; i++) { 3356 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 3357 __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data); 3358 } 3359 #endif // BUILD_TIED_TASK_STACK 3360 // Install the new data and free the old data 3361 (*threads_data_p) = new_data; 3362 __kmp_free(old_data); 3363 } else { 3364 KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating " 3365 "threads data for task_team %p, size = %d\n", 3366 __kmp_gtid_from_thread(thread), task_team, nthreads)); 3367 // Make the initial allocate for threads_data array, and zero entries 3368 // Cannot use __kmp_thread_calloc() because threads not around for 3369 // kmp_reap_task_team( ). 
3370 *threads_data_p = (kmp_thread_data_t *)__kmp_allocate( 3371 nthreads * sizeof(kmp_thread_data_t)); 3372 #ifdef BUILD_TIED_TASK_STACK 3373 // GEH: Figure out if this is the right thing to do 3374 for (i = 0; i < nthreads; i++) { 3375 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 3376 __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data); 3377 } 3378 #endif // BUILD_TIED_TASK_STACK 3379 } 3380 task_team->tt.tt_max_threads = nthreads; 3381 } else { 3382 // If array has (more than) enough elements, go ahead and use it 3383 KMP_DEBUG_ASSERT(*threads_data_p != NULL); 3384 } 3385 3386 // initialize threads_data pointers back to thread_info structures 3387 for (i = 0; i < nthreads; i++) { 3388 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 3389 thread_data->td.td_thr = team->t.t_threads[i]; 3390 3391 if (thread_data->td.td_deque_last_stolen >= nthreads) { 3392 // The last stolen field survives across teams / barrier, and the number 3393 // of threads may have changed. It's possible (likely?) that a new 3394 // parallel region will exhibit the same behavior as previous region. 3395 thread_data->td.td_deque_last_stolen = -1; 3396 } 3397 } 3398 3399 KMP_MB(); 3400 TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE); 3401 } 3402 3403 __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock); 3404 return is_init_thread; 3405 } 3406 3407 // __kmp_free_task_threads_data: 3408 // Deallocates a threads_data array for a task team, including any attached 3409 // tasking deques. Only occurs at library shutdown. 
3410 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) { 3411 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock); 3412 if (task_team->tt.tt_threads_data != NULL) { 3413 int i; 3414 for (i = 0; i < task_team->tt.tt_max_threads; i++) { 3415 __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]); 3416 } 3417 __kmp_free(task_team->tt.tt_threads_data); 3418 task_team->tt.tt_threads_data = NULL; 3419 } 3420 __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock); 3421 } 3422 3423 // __kmp_allocate_task_team: 3424 // Allocates a task team associated with a specific team, taking it from 3425 // the global task team free list if possible. Also initializes data 3426 // structures. 3427 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread, 3428 kmp_team_t *team) { 3429 kmp_task_team_t *task_team = NULL; 3430 int nthreads; 3431 3432 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n", 3433 (thread ? __kmp_gtid_from_thread(thread) : -1), team)); 3434 3435 if (TCR_PTR(__kmp_free_task_teams) != NULL) { 3436 // Take a task team from the task team pool 3437 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 3438 if (__kmp_free_task_teams != NULL) { 3439 task_team = __kmp_free_task_teams; 3440 TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next); 3441 task_team->tt.tt_next = NULL; 3442 } 3443 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 3444 } 3445 3446 if (task_team == NULL) { 3447 KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating " 3448 "task team for team %p\n", 3449 __kmp_gtid_from_thread(thread), team)); 3450 // Allocate a new task team if one is not available. Cannot use 3451 // __kmp_thread_malloc because threads not around for kmp_reap_task_team. 
3452 task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t)); 3453 __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock); 3454 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG 3455 // suppress race conditions detection on synchronization flags in debug mode 3456 // this helps to analyze library internals eliminating false positives 3457 __itt_suppress_mark_range( 3458 __itt_suppress_range, __itt_suppress_threading_errors, 3459 &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks)); 3460 __itt_suppress_mark_range(__itt_suppress_range, 3461 __itt_suppress_threading_errors, 3462 CCAST(kmp_uint32 *, &task_team->tt.tt_active), 3463 sizeof(task_team->tt.tt_active)); 3464 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */ 3465 // Note: __kmp_allocate zeroes returned memory, othewise we would need: 3466 // task_team->tt.tt_threads_data = NULL; 3467 // task_team->tt.tt_max_threads = 0; 3468 // task_team->tt.tt_next = NULL; 3469 } 3470 3471 TCW_4(task_team->tt.tt_found_tasks, FALSE); 3472 TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); 3473 task_team->tt.tt_nproc = nthreads = team->t.t_nproc; 3474 3475 KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads); 3476 TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE); 3477 TCW_4(task_team->tt.tt_active, TRUE); 3478 3479 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p " 3480 "unfinished_threads init'd to %d\n", 3481 (thread ? __kmp_gtid_from_thread(thread) : -1), task_team, 3482 KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads))); 3483 return task_team; 3484 } 3485 3486 // __kmp_free_task_team: 3487 // Frees the task team associated with a specific thread, and adds it 3488 // to the global task team free list. 3489 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) { 3490 KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n", 3491 thread ? 
__kmp_gtid_from_thread(thread) : -1, task_team)); 3492 3493 // Put task team back on free list 3494 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 3495 3496 KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL); 3497 task_team->tt.tt_next = __kmp_free_task_teams; 3498 TCW_PTR(__kmp_free_task_teams, task_team); 3499 3500 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 3501 } 3502 3503 // __kmp_reap_task_teams: 3504 // Free all the task teams on the task team free list. 3505 // Should only be done during library shutdown. 3506 // Cannot do anything that needs a thread structure or gtid since they are 3507 // already gone. 3508 void __kmp_reap_task_teams(void) { 3509 kmp_task_team_t *task_team; 3510 3511 if (TCR_PTR(__kmp_free_task_teams) != NULL) { 3512 // Free all task_teams on the free list 3513 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 3514 while ((task_team = __kmp_free_task_teams) != NULL) { 3515 __kmp_free_task_teams = task_team->tt.tt_next; 3516 task_team->tt.tt_next = NULL; 3517 3518 // Free threads_data if necessary 3519 if (task_team->tt.tt_threads_data != NULL) { 3520 __kmp_free_task_threads_data(task_team); 3521 } 3522 __kmp_free(task_team); 3523 } 3524 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 3525 } 3526 } 3527 3528 // __kmp_wait_to_unref_task_teams: 3529 // Some threads could still be in the fork barrier release code, possibly 3530 // trying to steal tasks. Wait for each thread to unreference its task team. 3531 void __kmp_wait_to_unref_task_teams(void) { 3532 kmp_info_t *thread; 3533 kmp_uint32 spins; 3534 int done; 3535 3536 KMP_INIT_YIELD(spins); 3537 3538 for (;;) { 3539 done = TRUE; 3540 3541 // TODO: GEH - this may be is wrong because some sync would be necessary 3542 // in case threads are added to the pool during the traversal. Need to 3543 // verify that lock for thread pool is held when calling this routine. 
3544 for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL; 3545 thread = thread->th.th_next_pool) { 3546 #if KMP_OS_WINDOWS 3547 DWORD exit_val; 3548 #endif 3549 if (TCR_PTR(thread->th.th_task_team) == NULL) { 3550 KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n", 3551 __kmp_gtid_from_thread(thread))); 3552 continue; 3553 } 3554 #if KMP_OS_WINDOWS 3555 // TODO: GEH - add this check for Linux* OS / OS X* as well? 3556 if (!__kmp_is_thread_alive(thread, &exit_val)) { 3557 thread->th.th_task_team = NULL; 3558 continue; 3559 } 3560 #endif 3561 3562 done = FALSE; // Because th_task_team pointer is not NULL for this thread 3563 3564 KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to " 3565 "unreference task_team\n", 3566 __kmp_gtid_from_thread(thread))); 3567 3568 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 3569 void *sleep_loc; 3570 // If the thread is sleeping, awaken it. 3571 if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) != 3572 NULL) { 3573 KA_TRACE( 3574 10, 3575 ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n", 3576 __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread))); 3577 __kmp_null_resume_wrapper(thread); 3578 } 3579 } 3580 } 3581 if (done) { 3582 break; 3583 } 3584 3585 // If oversubscribed or have waited a bit, yield. 3586 KMP_YIELD_OVERSUB_ELSE_SPIN(spins); 3587 } 3588 } 3589 3590 // __kmp_task_team_setup: Create a task_team for the current team, but use 3591 // an already created, unused one if it already exists. 3592 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) { 3593 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 3594 3595 // If this task_team hasn't been created yet, allocate it. It will be used in 3596 // the region after the next. 3597 // If it exists, it is the current task team and shouldn't be touched yet as 3598 // it may still be in use. 
3599 if (team->t.t_task_team[this_thr->th.th_task_state] == NULL && 3600 (always || team->t.t_nproc > 1)) { 3601 team->t.t_task_team[this_thr->th.th_task_state] = 3602 __kmp_allocate_task_team(this_thr, team); 3603 KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p" 3604 " for team %d at parity=%d\n", 3605 __kmp_gtid_from_thread(this_thr), 3606 team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id, 3607 this_thr->th.th_task_state)); 3608 } 3609 3610 // After threads exit the release, they will call sync, and then point to this 3611 // other task_team; make sure it is allocated and properly initialized. As 3612 // threads spin in the barrier release phase, they will continue to use the 3613 // previous task_team struct(above), until they receive the signal to stop 3614 // checking for tasks (they can't safely reference the kmp_team_t struct, 3615 // which could be reallocated by the primary thread). No task teams are formed 3616 // for serialized teams. 3617 if (team->t.t_nproc > 1) { 3618 int other_team = 1 - this_thr->th.th_task_state; 3619 KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2); 3620 if (team->t.t_task_team[other_team] == NULL) { // setup other team as well 3621 team->t.t_task_team[other_team] = 3622 __kmp_allocate_task_team(this_thr, team); 3623 KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new " 3624 "task_team %p for team %d at parity=%d\n", 3625 __kmp_gtid_from_thread(this_thr), 3626 team->t.t_task_team[other_team], team->t.t_id, other_team)); 3627 } else { // Leave the old task team struct in place for the upcoming region; 3628 // adjust as needed 3629 kmp_task_team_t *task_team = team->t.t_task_team[other_team]; 3630 if (!task_team->tt.tt_active || 3631 team->t.t_nproc != task_team->tt.tt_nproc) { 3632 TCW_4(task_team->tt.tt_nproc, team->t.t_nproc); 3633 TCW_4(task_team->tt.tt_found_tasks, FALSE); 3634 TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); 3635 
KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, 3636 team->t.t_nproc); 3637 TCW_4(task_team->tt.tt_active, TRUE); 3638 } 3639 // if team size has changed, the first thread to enable tasking will 3640 // realloc threads_data if necessary 3641 KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team " 3642 "%p for team %d at parity=%d\n", 3643 __kmp_gtid_from_thread(this_thr), 3644 team->t.t_task_team[other_team], team->t.t_id, other_team)); 3645 } 3646 } 3647 3648 // For regular thread, task enabling should be called when the task is going 3649 // to be pushed to a dequeue. However, for the hidden helper thread, we need 3650 // it ahead of time so that some operations can be performed without race 3651 // condition. 3652 if (this_thr == __kmp_hidden_helper_main_thread) { 3653 for (int i = 0; i < 2; ++i) { 3654 kmp_task_team_t *task_team = team->t.t_task_team[i]; 3655 if (KMP_TASKING_ENABLED(task_team)) { 3656 continue; 3657 } 3658 __kmp_enable_tasking(task_team, this_thr); 3659 for (int j = 0; j < task_team->tt.tt_nproc; ++j) { 3660 kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[j]; 3661 if (thread_data->td.td_deque == NULL) { 3662 __kmp_alloc_task_deque(__kmp_hidden_helper_threads[j], thread_data); 3663 } 3664 } 3665 } 3666 } 3667 } 3668 3669 // __kmp_task_team_sync: Propagation of task team data from team to threads 3670 // which happens just after the release phase of a team barrier. This may be 3671 // called by any thread, but only for teams with # threads > 1. 3672 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) { 3673 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 3674 3675 // Toggle the th_task_state field, to switch which task_team this thread 3676 // refers to 3677 this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state); 3678 3679 // It is now safe to propagate the task team pointer from the team struct to 3680 // the current thread. 
3681 TCW_PTR(this_thr->th.th_task_team, 3682 team->t.t_task_team[this_thr->th.th_task_state]); 3683 KA_TRACE(20, 3684 ("__kmp_task_team_sync: Thread T#%d task team switched to task_team " 3685 "%p from Team #%d (parity=%d)\n", 3686 __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team, 3687 team->t.t_id, this_thr->th.th_task_state)); 3688 } 3689 3690 // __kmp_task_team_wait: Primary thread waits for outstanding tasks after the 3691 // barrier gather phase. Only called by primary thread if #threads in team > 1 3692 // or if proxy tasks were created. 3693 // 3694 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off 3695 // by passing in 0 optionally as the last argument. When wait is zero, primary 3696 // thread does not wait for unfinished_threads to reach 0. 3697 void __kmp_task_team_wait( 3698 kmp_info_t *this_thr, 3699 kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) { 3700 kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state]; 3701 3702 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 3703 KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team); 3704 3705 if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) { 3706 if (wait) { 3707 KA_TRACE(20, ("__kmp_task_team_wait: Primary T#%d waiting for all tasks " 3708 "(for unfinished_threads to reach 0) on task_team = %p\n", 3709 __kmp_gtid_from_thread(this_thr), task_team)); 3710 // Worker threads may have dropped through to release phase, but could 3711 // still be executing tasks. Wait here for tasks to complete. To avoid 3712 // memory contention, only primary thread checks termination condition. 3713 kmp_flag_32<false, false> flag( 3714 RCAST(std::atomic<kmp_uint32> *, 3715 &task_team->tt.tt_unfinished_threads), 3716 0U); 3717 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 3718 } 3719 // Deactivate the old task team, so that the worker threads will stop 3720 // referencing it while spinning. 
3721 KA_TRACE( 3722 20, 3723 ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: " 3724 "setting active to false, setting local and team's pointer to NULL\n", 3725 __kmp_gtid_from_thread(this_thr), task_team)); 3726 KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 || 3727 task_team->tt.tt_found_proxy_tasks == TRUE); 3728 TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE); 3729 KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0); 3730 TCW_SYNC_4(task_team->tt.tt_active, FALSE); 3731 KMP_MB(); 3732 3733 TCW_PTR(this_thr->th.th_task_team, NULL); 3734 } 3735 } 3736 3737 // __kmp_tasking_barrier: 3738 // This routine is called only when __kmp_tasking_mode == tskm_extra_barrier. 3739 // Internal function to execute all tasks prior to a regular barrier or a join 3740 // barrier. It is a full barrier itself, which unfortunately turns regular 3741 // barriers into double barriers and join barriers into 1 1/2 barriers. 3742 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) { 3743 std::atomic<kmp_uint32> *spin = RCAST( 3744 std::atomic<kmp_uint32> *, 3745 &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads); 3746 int flag = FALSE; 3747 KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier); 3748 3749 #if USE_ITT_BUILD 3750 KMP_FSYNC_SPIN_INIT(spin, NULL); 3751 #endif /* USE_ITT_BUILD */ 3752 kmp_flag_32<false, false> spin_flag(spin, 0U); 3753 while (!spin_flag.execute_tasks(thread, gtid, TRUE, 3754 &flag USE_ITT_BUILD_ARG(NULL), 0)) { 3755 #if USE_ITT_BUILD 3756 // TODO: What about itt_sync_obj?? 
3757 KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin)); 3758 #endif /* USE_ITT_BUILD */ 3759 3760 if (TCR_4(__kmp_global.g.g_done)) { 3761 if (__kmp_global.g.g_abort) 3762 __kmp_abort_thread(); 3763 break; 3764 } 3765 KMP_YIELD(TRUE); 3766 } 3767 #if USE_ITT_BUILD 3768 KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin)); 3769 #endif /* USE_ITT_BUILD */ 3770 } 3771 3772 // __kmp_give_task puts a task into a given thread queue if: 3773 // - the queue for that thread was created 3774 // - there's space in that queue 3775 // Because of this, __kmp_push_task needs to check if there's space after 3776 // getting the lock 3777 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task, 3778 kmp_int32 pass) { 3779 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 3780 kmp_task_team_t *task_team = taskdata->td_task_team; 3781 3782 KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n", 3783 taskdata, tid)); 3784 3785 // If task_team is NULL something went really bad... 
3786 KMP_DEBUG_ASSERT(task_team != NULL); 3787 3788 bool result = false; 3789 kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid]; 3790 3791 if (thread_data->td.td_deque == NULL) { 3792 // There's no queue in this thread, go find another one 3793 // We're guaranteed that at least one thread has a queue 3794 KA_TRACE(30, 3795 ("__kmp_give_task: thread %d has no queue while giving task %p.\n", 3796 tid, taskdata)); 3797 return result; 3798 } 3799 3800 if (TCR_4(thread_data->td.td_deque_ntasks) >= 3801 TASK_DEQUE_SIZE(thread_data->td)) { 3802 KA_TRACE( 3803 30, 3804 ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", 3805 taskdata, tid)); 3806 3807 // if this deque is bigger than the pass ratio give a chance to another 3808 // thread 3809 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass) 3810 return result; 3811 3812 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 3813 if (TCR_4(thread_data->td.td_deque_ntasks) >= 3814 TASK_DEQUE_SIZE(thread_data->td)) { 3815 // expand deque to push the task which is not allowed to execute 3816 __kmp_realloc_task_deque(thread, thread_data); 3817 } 3818 3819 } else { 3820 3821 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 3822 3823 if (TCR_4(thread_data->td.td_deque_ntasks) >= 3824 TASK_DEQUE_SIZE(thread_data->td)) { 3825 KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to " 3826 "thread %d.\n", 3827 taskdata, tid)); 3828 3829 // if this deque is bigger than the pass ratio give a chance to another 3830 // thread 3831 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass) 3832 goto release_and_exit; 3833 3834 __kmp_realloc_task_deque(thread, thread_data); 3835 } 3836 } 3837 3838 // lock is held here, and there is space in the deque 3839 3840 thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata; 3841 // Wrap index. 
3842 thread_data->td.td_deque_tail = 3843 (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td); 3844 TCW_4(thread_data->td.td_deque_ntasks, 3845 TCR_4(thread_data->td.td_deque_ntasks) + 1); 3846 3847 result = true; 3848 KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n", 3849 taskdata, tid)); 3850 3851 release_and_exit: 3852 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 3853 3854 return result; 3855 } 3856 3857 #define PROXY_TASK_FLAG 0x40000000 3858 /* The finish of the proxy tasks is divided in two pieces: 3859 - the top half is the one that can be done from a thread outside the team 3860 - the bottom half must be run from a thread within the team 3861 3862 In order to run the bottom half the task gets queued back into one of the 3863 threads of the team. Once the td_incomplete_child_task counter of the parent 3864 is decremented the threads can leave the barriers. So, the bottom half needs 3865 to be queued before the counter is decremented. The top half is therefore 3866 divided in two parts: 3867 - things that can be run before queuing the bottom half 3868 - things that must be run after queuing the bottom half 3869 3870 This creates a second race as the bottom half can free the task before the 3871 second top half is executed. To avoid this we use the 3872 td_incomplete_child_task of the proxy task to synchronize the top and bottom 3873 half. 
*/
// First part of the top half of proxy-task completion.
// Marks the task complete, drops its contribution to the taskgroup count (if
// it belongs to one), and sets PROXY_TASK_FLAG in the task's own
// td_incomplete_child_tasks so the bottom half has something to wait on.
static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  taskdata->td_flags.complete = 1; // mark the task as completed

  if (taskdata->td_taskgroup)
    KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);

  // Create an imaginary child for this task so the bottom half cannot
  // release the task before we have completed the second top half
  KMP_ATOMIC_OR(&taskdata->td_incomplete_child_tasks, PROXY_TASK_FLAG);
}

// Second part of the top half of proxy-task completion.
// Decrements the parent's td_incomplete_child_tasks and then clears
// PROXY_TASK_FLAG on the task itself, which releases a bottom half spinning
// on that flag.
static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
  // The decrement below is a single statement in every build: in debug builds
  // the "children = -1 +" prefix is spliced in front of it by the
  // preprocessor so the result can be checked by the assertion.
#if KMP_DEBUG
  kmp_int32 children = 0;
  // Predecrement simulated by "- 1" calculation
  // (presumably KMP_ATOMIC_DEC yields the value before the decrement --
  // confirm against its definition)
  children = -1 +
#endif
      KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
  KMP_DEBUG_ASSERT(children >= 0);

  // Remove the imaginary child
  KMP_ATOMIC_AND(&taskdata->td_incomplete_child_tasks, ~PROXY_TASK_FLAG);
}

// Bottom half of proxy-task completion.
// gtid:  global thread id of the executing thread (indexes __kmp_threads)
// ptask: the proxy task being finished
// Waits until PROXY_TASK_FLAG has been cleared from the task's
// td_incomplete_child_tasks, then releases the task's dependences and frees
// the task and its ancestors.
static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
  kmp_info_t *thread = __kmp_threads[gtid];

  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
                   1); // top half must run before bottom half

  // We need to wait to make sure the top half is finished
  // Spinning here should be ok as this should happen quickly
  while ((KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) &
          PROXY_TASK_FLAG) > 0)
    ;

  __kmp_release_deps(gtid, taskdata);
  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
}

/*!
3922 @ingroup TASKING 3923 @param gtid Global Thread ID of encountering thread 3924 @param ptask Task which execution is completed 3925 3926 Execute the completion of a proxy task from a thread of that is part of the 3927 team. Run first and bottom halves directly. 3928 */ 3929 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) { 3930 KMP_DEBUG_ASSERT(ptask != NULL); 3931 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 3932 KA_TRACE( 3933 10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n", 3934 gtid, taskdata)); 3935 __kmp_assert_valid_gtid(gtid); 3936 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); 3937 3938 __kmp_first_top_half_finish_proxy(taskdata); 3939 __kmp_second_top_half_finish_proxy(taskdata); 3940 __kmp_bottom_half_finish_proxy(gtid, ptask); 3941 3942 KA_TRACE(10, 3943 ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n", 3944 gtid, taskdata)); 3945 } 3946 3947 void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start = 0) { 3948 KMP_DEBUG_ASSERT(ptask != NULL); 3949 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 3950 3951 // Enqueue task to complete bottom half completion from a thread within the 3952 // corresponding team 3953 kmp_team_t *team = taskdata->td_team; 3954 kmp_int32 nthreads = team->t.t_nproc; 3955 kmp_info_t *thread; 3956 3957 // This should be similar to start_k = __kmp_get_random( thread ) % nthreads 3958 // but we cannot use __kmp_get_random here 3959 kmp_int32 start_k = start; 3960 kmp_int32 pass = 1; 3961 kmp_int32 k = start_k; 3962 3963 do { 3964 // For now we're just linearly trying to find a thread 3965 thread = team->t.t_threads[k]; 3966 k = (k + 1) % nthreads; 3967 3968 // we did a full pass through all the threads 3969 if (k == start_k) 3970 pass = pass << 1; 3971 3972 } while (!__kmp_give_task(thread, k, ptask, pass)); 3973 } 3974 3975 /*! 
3976 @ingroup TASKING 3977 @param ptask Task which execution is completed 3978 3979 Execute the completion of a proxy task from a thread that could not belong to 3980 the team. 3981 */ 3982 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) { 3983 KMP_DEBUG_ASSERT(ptask != NULL); 3984 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 3985 3986 KA_TRACE( 3987 10, 3988 ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n", 3989 taskdata)); 3990 3991 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); 3992 3993 __kmp_first_top_half_finish_proxy(taskdata); 3994 3995 __kmpc_give_task(ptask); 3996 3997 __kmp_second_top_half_finish_proxy(taskdata); 3998 3999 KA_TRACE( 4000 10, 4001 ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n", 4002 taskdata)); 4003 } 4004 4005 kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid, 4006 kmp_task_t *task) { 4007 kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task); 4008 if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) { 4009 td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION; 4010 td->td_allow_completion_event.ed.task = task; 4011 __kmp_init_tas_lock(&td->td_allow_completion_event.lock); 4012 } 4013 return &td->td_allow_completion_event; 4014 } 4015 4016 void __kmp_fulfill_event(kmp_event_t *event) { 4017 if (event->type == KMP_EVENT_ALLOW_COMPLETION) { 4018 kmp_task_t *ptask = event->ed.task; 4019 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 4020 bool detached = false; 4021 int gtid = __kmp_get_gtid(); 4022 4023 // The associated task might have completed or could be completing at this 4024 // point. 
4025 // We need to take the lock to avoid races 4026 __kmp_acquire_tas_lock(&event->lock, gtid); 4027 if (taskdata->td_flags.proxy == TASK_PROXY) { 4028 detached = true; 4029 } else { 4030 #if OMPT_SUPPORT 4031 // The OMPT event must occur under mutual exclusion, 4032 // otherwise the tool might access ptask after free 4033 if (UNLIKELY(ompt_enabled.enabled)) 4034 __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill); 4035 #endif 4036 } 4037 event->type = KMP_EVENT_UNINITIALIZED; 4038 __kmp_release_tas_lock(&event->lock, gtid); 4039 4040 if (detached) { 4041 #if OMPT_SUPPORT 4042 // We free ptask afterwards and know the task is finished, 4043 // so locking is not necessary 4044 if (UNLIKELY(ompt_enabled.enabled)) 4045 __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill); 4046 #endif 4047 // If the task detached complete the proxy task 4048 if (gtid >= 0) { 4049 kmp_team_t *team = taskdata->td_team; 4050 kmp_info_t *thread = __kmp_get_thread(); 4051 if (thread->th.th_team == team) { 4052 __kmpc_proxy_task_completed(gtid, ptask); 4053 return; 4054 } 4055 } 4056 4057 // fallback 4058 __kmpc_proxy_task_completed_ooo(ptask); 4059 } 4060 } 4061 } 4062 4063 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task 4064 // for taskloop 4065 // 4066 // thread: allocating thread 4067 // task_src: pointer to source task to be duplicated 4068 // returns: a pointer to the allocated kmp_task_t structure (task). 
4069 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) { 4070 kmp_task_t *task; 4071 kmp_taskdata_t *taskdata; 4072 kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src); 4073 kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task 4074 size_t shareds_offset; 4075 size_t task_size; 4076 4077 KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread, 4078 task_src)); 4079 KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy == 4080 TASK_FULL); // it should not be proxy task 4081 KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT); 4082 task_size = taskdata_src->td_size_alloc; 4083 4084 // Allocate a kmp_taskdata_t block and a kmp_task_t block. 4085 KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread, 4086 task_size)); 4087 #if USE_FAST_MEMORY 4088 taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size); 4089 #else 4090 taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size); 4091 #endif /* USE_FAST_MEMORY */ 4092 KMP_MEMCPY(taskdata, taskdata_src, task_size); 4093 4094 task = KMP_TASKDATA_TO_TASK(taskdata); 4095 4096 // Initialize new task (only specific fields not affected by memcpy) 4097 taskdata->td_task_id = KMP_GEN_TASK_ID(); 4098 if (task->shareds != NULL) { // need setup shareds pointer 4099 shareds_offset = (char *)task_src->shareds - (char *)taskdata_src; 4100 task->shareds = &((char *)taskdata)[shareds_offset]; 4101 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) == 4102 0); 4103 } 4104 taskdata->td_alloc_thread = thread; 4105 taskdata->td_parent = parent_task; 4106 // task inherits the taskgroup from the parent task 4107 taskdata->td_taskgroup = parent_task->td_taskgroup; 4108 // tied task needs to initialize the td_last_tied at creation, 4109 // untied one does this when it is scheduled for execution 4110 if (taskdata->td_flags.tiedness == TASK_TIED) 4111 taskdata->td_last_tied = taskdata; 4112 4113 // Only 
need to keep track of child task counts if team parallel and tasking 4114 // not serialized 4115 if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) { 4116 KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks); 4117 if (parent_task->td_taskgroup) 4118 KMP_ATOMIC_INC(&parent_task->td_taskgroup->count); 4119 // Only need to keep track of allocated child tasks for explicit tasks since 4120 // implicit not deallocated 4121 if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) 4122 KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks); 4123 } 4124 4125 KA_TRACE(20, 4126 ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n", 4127 thread, taskdata, taskdata->td_parent)); 4128 #if OMPT_SUPPORT 4129 if (UNLIKELY(ompt_enabled.enabled)) 4130 __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid); 4131 #endif 4132 return task; 4133 } 4134 4135 // Routine optionally generated by the compiler for setting the lastprivate flag 4136 // and calling needed constructors for private/firstprivate objects 4137 // (used to form taskloop tasks from pattern task) 4138 // Parameters: dest task, src task, lastprivate flag. 4139 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32); 4140 4141 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8); 4142 4143 // class to encapsulate manipulating loop bounds in a taskloop task. 4144 // this abstracts away the Intel vs GOMP taskloop interface for setting/getting 4145 // the loop bound variables. 
4146 class kmp_taskloop_bounds_t { 4147 kmp_task_t *task; 4148 const kmp_taskdata_t *taskdata; 4149 size_t lower_offset; 4150 size_t upper_offset; 4151 4152 public: 4153 kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub) 4154 : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)), 4155 lower_offset((char *)lb - (char *)task), 4156 upper_offset((char *)ub - (char *)task) { 4157 KMP_DEBUG_ASSERT((char *)lb > (char *)_task); 4158 KMP_DEBUG_ASSERT((char *)ub > (char *)_task); 4159 } 4160 kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds) 4161 : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)), 4162 lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {} 4163 size_t get_lower_offset() const { return lower_offset; } 4164 size_t get_upper_offset() const { return upper_offset; } 4165 kmp_uint64 get_lb() const { 4166 kmp_int64 retval; 4167 #if defined(KMP_GOMP_COMPAT) 4168 // Intel task just returns the lower bound normally 4169 if (!taskdata->td_flags.native) { 4170 retval = *(kmp_int64 *)((char *)task + lower_offset); 4171 } else { 4172 // GOMP task has to take into account the sizeof(long) 4173 if (taskdata->td_size_loop_bounds == 4) { 4174 kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds); 4175 retval = (kmp_int64)*lb; 4176 } else { 4177 kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds); 4178 retval = (kmp_int64)*lb; 4179 } 4180 } 4181 #else 4182 (void)taskdata; 4183 retval = *(kmp_int64 *)((char *)task + lower_offset); 4184 #endif // defined(KMP_GOMP_COMPAT) 4185 return retval; 4186 } 4187 kmp_uint64 get_ub() const { 4188 kmp_int64 retval; 4189 #if defined(KMP_GOMP_COMPAT) 4190 // Intel task just returns the upper bound normally 4191 if (!taskdata->td_flags.native) { 4192 retval = *(kmp_int64 *)((char *)task + upper_offset); 4193 } else { 4194 // GOMP task has to take into account the sizeof(long) 4195 if (taskdata->td_size_loop_bounds == 4) { 4196 kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1; 
4197 retval = (kmp_int64)*ub; 4198 } else { 4199 kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1; 4200 retval = (kmp_int64)*ub; 4201 } 4202 } 4203 #else 4204 retval = *(kmp_int64 *)((char *)task + upper_offset); 4205 #endif // defined(KMP_GOMP_COMPAT) 4206 return retval; 4207 } 4208 void set_lb(kmp_uint64 lb) { 4209 #if defined(KMP_GOMP_COMPAT) 4210 // Intel task just sets the lower bound normally 4211 if (!taskdata->td_flags.native) { 4212 *(kmp_uint64 *)((char *)task + lower_offset) = lb; 4213 } else { 4214 // GOMP task has to take into account the sizeof(long) 4215 if (taskdata->td_size_loop_bounds == 4) { 4216 kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds); 4217 *lower = (kmp_uint32)lb; 4218 } else { 4219 kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds); 4220 *lower = (kmp_uint64)lb; 4221 } 4222 } 4223 #else 4224 *(kmp_uint64 *)((char *)task + lower_offset) = lb; 4225 #endif // defined(KMP_GOMP_COMPAT) 4226 } 4227 void set_ub(kmp_uint64 ub) { 4228 #if defined(KMP_GOMP_COMPAT) 4229 // Intel task just sets the upper bound normally 4230 if (!taskdata->td_flags.native) { 4231 *(kmp_uint64 *)((char *)task + upper_offset) = ub; 4232 } else { 4233 // GOMP task has to take into account the sizeof(long) 4234 if (taskdata->td_size_loop_bounds == 4) { 4235 kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1; 4236 *upper = (kmp_uint32)ub; 4237 } else { 4238 kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1; 4239 *upper = (kmp_uint64)ub; 4240 } 4241 } 4242 #else 4243 *(kmp_uint64 *)((char *)task + upper_offset) = ub; 4244 #endif // defined(KMP_GOMP_COMPAT) 4245 } 4246 }; 4247 4248 // __kmp_taskloop_linear: Start tasks of the taskloop linearly 4249 // 4250 // loc Source location information 4251 // gtid Global thread ID 4252 // task Pattern task, exposes the loop iteration range 4253 // lb Pointer to loop lower bound in task structure 4254 // ub Pointer to loop upper bound in task structure 4255 // st Loop stride 4256 // ub_glob Global 
upper bound (used for lastprivate check) 4257 // num_tasks Number of tasks to execute 4258 // grainsize Number of loop iterations per task 4259 // extras Number of chunks with grainsize+1 iterations 4260 // last_chunk Reduction of grainsize for last task 4261 // tc Iterations count 4262 // task_dup Tasks duplication routine 4263 // codeptr_ra Return address for OMPT events 4264 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task, 4265 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, 4266 kmp_uint64 ub_glob, kmp_uint64 num_tasks, 4267 kmp_uint64 grainsize, kmp_uint64 extras, 4268 kmp_int64 last_chunk, kmp_uint64 tc, 4269 #if OMPT_SUPPORT 4270 void *codeptr_ra, 4271 #endif 4272 void *task_dup) { 4273 KMP_COUNT_BLOCK(OMP_TASKLOOP); 4274 KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling); 4275 p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; 4276 // compiler provides global bounds here 4277 kmp_taskloop_bounds_t task_bounds(task, lb, ub); 4278 kmp_uint64 lower = task_bounds.get_lb(); 4279 kmp_uint64 upper = task_bounds.get_ub(); 4280 kmp_uint64 i; 4281 kmp_info_t *thread = __kmp_threads[gtid]; 4282 kmp_taskdata_t *current_task = thread->th.th_current_task; 4283 kmp_task_t *next_task; 4284 kmp_int32 lastpriv = 0; 4285 4286 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + 4287 (last_chunk < 0 ? 
last_chunk : extras)); 4288 KMP_DEBUG_ASSERT(num_tasks > extras); 4289 KMP_DEBUG_ASSERT(num_tasks > 0); 4290 KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, " 4291 "extras %lld, last_chunk %lld, i=%lld,%lld(%d)%lld, dup %p\n", 4292 gtid, num_tasks, grainsize, extras, last_chunk, lower, upper, 4293 ub_glob, st, task_dup)); 4294 4295 // Launch num_tasks tasks, assign grainsize iterations each task 4296 for (i = 0; i < num_tasks; ++i) { 4297 kmp_uint64 chunk_minus_1; 4298 if (extras == 0) { 4299 chunk_minus_1 = grainsize - 1; 4300 } else { 4301 chunk_minus_1 = grainsize; 4302 --extras; // first extras iterations get bigger chunk (grainsize+1) 4303 } 4304 upper = lower + st * chunk_minus_1; 4305 if (upper > *ub) { 4306 upper = *ub; 4307 } 4308 if (i == num_tasks - 1) { 4309 // schedule the last task, set lastprivate flag if needed 4310 if (st == 1) { // most common case 4311 KMP_DEBUG_ASSERT(upper == *ub); 4312 if (upper == ub_glob) 4313 lastpriv = 1; 4314 } else if (st > 0) { // positive loop stride 4315 KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper); 4316 if ((kmp_uint64)st > ub_glob - upper) 4317 lastpriv = 1; 4318 } else { // negative loop stride 4319 KMP_DEBUG_ASSERT(upper + st < *ub); 4320 if (upper - ub_glob < (kmp_uint64)(-st)) 4321 lastpriv = 1; 4322 } 4323 } 4324 next_task = __kmp_task_dup_alloc(thread, task); // allocate new task 4325 kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task); 4326 kmp_taskloop_bounds_t next_task_bounds = 4327 kmp_taskloop_bounds_t(next_task, task_bounds); 4328 4329 // adjust task-specific bounds 4330 next_task_bounds.set_lb(lower); 4331 if (next_taskdata->td_flags.native) { 4332 next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1)); 4333 } else { 4334 next_task_bounds.set_ub(upper); 4335 } 4336 if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates, 4337 // etc. 
4338 ptask_dup(next_task, task, lastpriv); 4339 KA_TRACE(40, 4340 ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, " 4341 "upper %lld stride %lld, (offsets %p %p)\n", 4342 gtid, i, next_task, lower, upper, st, 4343 next_task_bounds.get_lower_offset(), 4344 next_task_bounds.get_upper_offset())); 4345 #if OMPT_SUPPORT 4346 __kmp_omp_taskloop_task(NULL, gtid, next_task, 4347 codeptr_ra); // schedule new task 4348 #else 4349 __kmp_omp_task(gtid, next_task, true); // schedule new task 4350 #endif 4351 lower = upper + st; // adjust lower bound for the next iteration 4352 } 4353 // free the pattern task and exit 4354 __kmp_task_start(gtid, task, current_task); // make internal bookkeeping 4355 // do not execute the pattern task, just do internal bookkeeping 4356 __kmp_task_finish<false>(gtid, task, current_task); 4357 } 4358 4359 // Structure to keep taskloop parameters for auxiliary task 4360 // kept in the shareds of the task structure. 4361 typedef struct __taskloop_params { 4362 kmp_task_t *task; 4363 kmp_uint64 *lb; 4364 kmp_uint64 *ub; 4365 void *task_dup; 4366 kmp_int64 st; 4367 kmp_uint64 ub_glob; 4368 kmp_uint64 num_tasks; 4369 kmp_uint64 grainsize; 4370 kmp_uint64 extras; 4371 kmp_int64 last_chunk; 4372 kmp_uint64 tc; 4373 kmp_uint64 num_t_min; 4374 #if OMPT_SUPPORT 4375 void *codeptr_ra; 4376 #endif 4377 } __taskloop_params_t; 4378 4379 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *, 4380 kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64, 4381 kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64, 4382 kmp_uint64, 4383 #if OMPT_SUPPORT 4384 void *, 4385 #endif 4386 void *); 4387 4388 // Execute part of the taskloop submitted as a task. 
4389 int __kmp_taskloop_task(int gtid, void *ptask) { 4390 __taskloop_params_t *p = 4391 (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds; 4392 kmp_task_t *task = p->task; 4393 kmp_uint64 *lb = p->lb; 4394 kmp_uint64 *ub = p->ub; 4395 void *task_dup = p->task_dup; 4396 // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; 4397 kmp_int64 st = p->st; 4398 kmp_uint64 ub_glob = p->ub_glob; 4399 kmp_uint64 num_tasks = p->num_tasks; 4400 kmp_uint64 grainsize = p->grainsize; 4401 kmp_uint64 extras = p->extras; 4402 kmp_int64 last_chunk = p->last_chunk; 4403 kmp_uint64 tc = p->tc; 4404 kmp_uint64 num_t_min = p->num_t_min; 4405 #if OMPT_SUPPORT 4406 void *codeptr_ra = p->codeptr_ra; 4407 #endif 4408 #if KMP_DEBUG 4409 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 4410 KMP_DEBUG_ASSERT(task != NULL); 4411 KA_TRACE(20, 4412 ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize" 4413 " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n", 4414 gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub, 4415 st, task_dup)); 4416 #endif 4417 KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min); 4418 if (num_tasks > num_t_min) 4419 __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks, 4420 grainsize, extras, last_chunk, tc, num_t_min, 4421 #if OMPT_SUPPORT 4422 codeptr_ra, 4423 #endif 4424 task_dup); 4425 else 4426 __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks, 4427 grainsize, extras, last_chunk, tc, 4428 #if OMPT_SUPPORT 4429 codeptr_ra, 4430 #endif 4431 task_dup); 4432 4433 KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid)); 4434 return 0; 4435 } 4436 4437 // Schedule part of the taskloop as a task, 4438 // execute the rest of the taskloop. 
4439 // 4440 // loc Source location information 4441 // gtid Global thread ID 4442 // task Pattern task, exposes the loop iteration range 4443 // lb Pointer to loop lower bound in task structure 4444 // ub Pointer to loop upper bound in task structure 4445 // st Loop stride 4446 // ub_glob Global upper bound (used for lastprivate check) 4447 // num_tasks Number of tasks to execute 4448 // grainsize Number of loop iterations per task 4449 // extras Number of chunks with grainsize+1 iterations 4450 // last_chunk Reduction of grainsize for last task 4451 // tc Iterations count 4452 // num_t_min Threshold to launch tasks recursively 4453 // task_dup Tasks duplication routine 4454 // codeptr_ra Return address for OMPT events 4455 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task, 4456 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, 4457 kmp_uint64 ub_glob, kmp_uint64 num_tasks, 4458 kmp_uint64 grainsize, kmp_uint64 extras, 4459 kmp_int64 last_chunk, kmp_uint64 tc, 4460 kmp_uint64 num_t_min, 4461 #if OMPT_SUPPORT 4462 void *codeptr_ra, 4463 #endif 4464 void *task_dup) { 4465 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 4466 KMP_DEBUG_ASSERT(task != NULL); 4467 KMP_DEBUG_ASSERT(num_tasks > num_t_min); 4468 KA_TRACE(20, 4469 ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize" 4470 " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n", 4471 gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub, 4472 st, task_dup)); 4473 p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; 4474 kmp_uint64 lower = *lb; 4475 kmp_info_t *thread = __kmp_threads[gtid]; 4476 // kmp_taskdata_t *current_task = thread->th.th_current_task; 4477 kmp_task_t *next_task; 4478 size_t lower_offset = 4479 (char *)lb - (char *)task; // remember offset of lb in the task structure 4480 size_t upper_offset = 4481 (char *)ub - (char *)task; // remember offset of ub in the task structure 4482 4483 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + 4484 (last_chunk 
< 0 ? last_chunk : extras)); 4485 KMP_DEBUG_ASSERT(num_tasks > extras); 4486 KMP_DEBUG_ASSERT(num_tasks > 0); 4487 4488 // split the loop in two halves 4489 kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1; 4490 kmp_int64 last_chunk0 = 0, last_chunk1 = 0; 4491 kmp_uint64 gr_size0 = grainsize; 4492 kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute 4493 kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task 4494 if (last_chunk < 0) { 4495 ext0 = ext1 = 0; 4496 last_chunk1 = last_chunk; 4497 tc0 = grainsize * n_tsk0; 4498 tc1 = tc - tc0; 4499 } else if (n_tsk0 <= extras) { 4500 gr_size0++; // integrate extras into grainsize 4501 ext0 = 0; // no extra iters in 1st half 4502 ext1 = extras - n_tsk0; // remaining extras 4503 tc0 = gr_size0 * n_tsk0; 4504 tc1 = tc - tc0; 4505 } else { // n_tsk0 > extras 4506 ext1 = 0; // no extra iters in 2nd half 4507 ext0 = extras; 4508 tc1 = grainsize * n_tsk1; 4509 tc0 = tc - tc1; 4510 } 4511 ub0 = lower + st * (tc0 - 1); 4512 lb1 = ub0 + st; 4513 4514 // create pattern task for 2nd half of the loop 4515 next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task 4516 // adjust lower bound (upper bound is not changed) for the 2nd half 4517 *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1; 4518 if (ptask_dup != NULL) // construct firstprivates, etc. 
4519 ptask_dup(next_task, task, 0); 4520 *ub = ub0; // adjust upper bound for the 1st half 4521 4522 // create auxiliary task for 2nd half of the loop 4523 // make sure new task has same parent task as the pattern task 4524 kmp_taskdata_t *current_task = thread->th.th_current_task; 4525 thread->th.th_current_task = taskdata->td_parent; 4526 kmp_task_t *new_task = 4527 __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *), 4528 sizeof(__taskloop_params_t), &__kmp_taskloop_task); 4529 // restore current task 4530 thread->th.th_current_task = current_task; 4531 __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds; 4532 p->task = next_task; 4533 p->lb = (kmp_uint64 *)((char *)next_task + lower_offset); 4534 p->ub = (kmp_uint64 *)((char *)next_task + upper_offset); 4535 p->task_dup = task_dup; 4536 p->st = st; 4537 p->ub_glob = ub_glob; 4538 p->num_tasks = n_tsk1; 4539 p->grainsize = grainsize; 4540 p->extras = ext1; 4541 p->last_chunk = last_chunk1; 4542 p->tc = tc1; 4543 p->num_t_min = num_t_min; 4544 #if OMPT_SUPPORT 4545 p->codeptr_ra = codeptr_ra; 4546 #endif 4547 4548 #if OMPT_SUPPORT 4549 // schedule new task with correct return address for OMPT events 4550 __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra); 4551 #else 4552 __kmp_omp_task(gtid, new_task, true); // schedule new task 4553 #endif 4554 4555 // execute the 1st half of current subrange 4556 if (n_tsk0 > num_t_min) 4557 __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0, 4558 ext0, last_chunk0, tc0, num_t_min, 4559 #if OMPT_SUPPORT 4560 codeptr_ra, 4561 #endif 4562 task_dup); 4563 else 4564 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, 4565 gr_size0, ext0, last_chunk0, tc0, 4566 #if OMPT_SUPPORT 4567 codeptr_ra, 4568 #endif 4569 task_dup); 4570 4571 KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid)); 4572 } 4573 4574 static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, 4575 kmp_uint64 *lb, kmp_uint64 
*ub, kmp_int64 st, 4576 int nogroup, int sched, kmp_uint64 grainsize, 4577 int modifier, void *task_dup) { 4578 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 4579 KMP_DEBUG_ASSERT(task != NULL); 4580 if (nogroup == 0) { 4581 #if OMPT_SUPPORT && OMPT_OPTIONAL 4582 OMPT_STORE_RETURN_ADDRESS(gtid); 4583 #endif 4584 __kmpc_taskgroup(loc, gtid); 4585 } 4586 4587 // ========================================================================= 4588 // calculate loop parameters 4589 kmp_taskloop_bounds_t task_bounds(task, lb, ub); 4590 kmp_uint64 tc; 4591 // compiler provides global bounds here 4592 kmp_uint64 lower = task_bounds.get_lb(); 4593 kmp_uint64 upper = task_bounds.get_ub(); 4594 kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag 4595 kmp_uint64 num_tasks = 0, extras = 0; 4596 kmp_int64 last_chunk = 4597 0; // reduce grainsize of last task by last_chunk in strict mode 4598 kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks; 4599 kmp_info_t *thread = __kmp_threads[gtid]; 4600 kmp_taskdata_t *current_task = thread->th.th_current_task; 4601 4602 KA_TRACE(20, ("__kmp_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, " 4603 "grain %llu(%d, %d), dup %p\n", 4604 gtid, taskdata, lower, upper, st, grainsize, sched, modifier, 4605 task_dup)); 4606 4607 // compute trip count 4608 if (st == 1) { // most common case 4609 tc = upper - lower + 1; 4610 } else if (st < 0) { 4611 tc = (lower - upper) / (-st) + 1; 4612 } else { // st > 0 4613 tc = (upper - lower) / st + 1; 4614 } 4615 if (tc == 0) { 4616 KA_TRACE(20, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid)); 4617 // free the pattern task and exit 4618 __kmp_task_start(gtid, task, current_task); 4619 // do not execute anything for zero-trip loop 4620 __kmp_task_finish<false>(gtid, task, current_task); 4621 return; 4622 } 4623 4624 #if OMPT_SUPPORT && OMPT_OPTIONAL 4625 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 4626 ompt_task_info_t *task_info = 
__ompt_get_task_info_object(0); 4627 if (ompt_enabled.ompt_callback_work) { 4628 ompt_callbacks.ompt_callback(ompt_callback_work)( 4629 ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data), 4630 &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0)); 4631 } 4632 #endif 4633 4634 if (num_tasks_min == 0) 4635 // TODO: can we choose better default heuristic? 4636 num_tasks_min = 4637 KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE); 4638 4639 // compute num_tasks/grainsize based on the input provided 4640 switch (sched) { 4641 case 0: // no schedule clause specified, we can choose the default 4642 // let's try to schedule (team_size*10) tasks 4643 grainsize = thread->th.th_team_nproc * 10; 4644 KMP_FALLTHROUGH(); 4645 case 2: // num_tasks provided 4646 if (grainsize > tc) { 4647 num_tasks = tc; // too big num_tasks requested, adjust values 4648 grainsize = 1; 4649 extras = 0; 4650 } else { 4651 num_tasks = grainsize; 4652 grainsize = tc / num_tasks; 4653 extras = tc % num_tasks; 4654 } 4655 break; 4656 case 1: // grainsize provided 4657 if (grainsize > tc) { 4658 num_tasks = 1; 4659 grainsize = tc; // too big grainsize requested, adjust values 4660 extras = 0; 4661 } else { 4662 if (modifier) { 4663 num_tasks = (tc + grainsize - 1) / grainsize; 4664 last_chunk = tc - (num_tasks * grainsize); 4665 extras = 0; 4666 } else { 4667 num_tasks = tc / grainsize; 4668 // adjust grainsize for balanced distribution of iterations 4669 grainsize = tc / num_tasks; 4670 extras = tc % num_tasks; 4671 } 4672 } 4673 break; 4674 default: 4675 KMP_ASSERT2(0, "unknown scheduling of taskloop"); 4676 } 4677 4678 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + 4679 (last_chunk < 0 ? 
last_chunk : extras)); 4680 KMP_DEBUG_ASSERT(num_tasks > extras); 4681 KMP_DEBUG_ASSERT(num_tasks > 0); 4682 // ========================================================================= 4683 4684 // check if clause value first 4685 // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native) 4686 if (if_val == 0) { // if(0) specified, mark task as serial 4687 taskdata->td_flags.task_serial = 1; 4688 taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied 4689 // always start serial tasks linearly 4690 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, 4691 grainsize, extras, last_chunk, tc, 4692 #if OMPT_SUPPORT 4693 OMPT_GET_RETURN_ADDRESS(0), 4694 #endif 4695 task_dup); 4696 // !taskdata->td_flags.native => currently force linear spawning of tasks 4697 // for GOMP_taskloop 4698 } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) { 4699 KA_TRACE(20, ("__kmp_taskloop: T#%d, go recursive: tc %llu, #tasks %llu" 4700 "(%lld), grain %llu, extras %llu, last_chunk %lld\n", 4701 gtid, tc, num_tasks, num_tasks_min, grainsize, extras, 4702 last_chunk)); 4703 __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, 4704 grainsize, extras, last_chunk, tc, num_tasks_min, 4705 #if OMPT_SUPPORT 4706 OMPT_GET_RETURN_ADDRESS(0), 4707 #endif 4708 task_dup); 4709 } else { 4710 KA_TRACE(20, ("__kmp_taskloop: T#%d, go linear: tc %llu, #tasks %llu" 4711 "(%lld), grain %llu, extras %llu, last_chunk %lld\n", 4712 gtid, tc, num_tasks, num_tasks_min, grainsize, extras, 4713 last_chunk)); 4714 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, 4715 grainsize, extras, last_chunk, tc, 4716 #if OMPT_SUPPORT 4717 OMPT_GET_RETURN_ADDRESS(0), 4718 #endif 4719 task_dup); 4720 } 4721 4722 #if OMPT_SUPPORT && OMPT_OPTIONAL 4723 if (ompt_enabled.ompt_callback_work) { 4724 ompt_callbacks.ompt_callback(ompt_callback_work)( 4725 ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data), 4726 
&(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0)); 4727 } 4728 #endif 4729 4730 if (nogroup == 0) { 4731 #if OMPT_SUPPORT && OMPT_OPTIONAL 4732 OMPT_STORE_RETURN_ADDRESS(gtid); 4733 #endif 4734 __kmpc_end_taskgroup(loc, gtid); 4735 } 4736 KA_TRACE(20, ("__kmp_taskloop(exit): T#%d\n", gtid)); 4737 } 4738 4739 /*! 4740 @ingroup TASKING 4741 @param loc Source location information 4742 @param gtid Global thread ID 4743 @param task Task structure 4744 @param if_val Value of the if clause 4745 @param lb Pointer to loop lower bound in task structure 4746 @param ub Pointer to loop upper bound in task structure 4747 @param st Loop stride 4748 @param nogroup Flag, 1 if nogroup clause specified, 0 otherwise 4749 @param sched Schedule specified 0/1/2 for none/grainsize/num_tasks 4750 @param grainsize Schedule value if specified 4751 @param task_dup Tasks duplication routine 4752 4753 Execute the taskloop construct. 4754 */ 4755 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, 4756 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, 4757 int sched, kmp_uint64 grainsize, void *task_dup) { 4758 __kmp_assert_valid_gtid(gtid); 4759 KA_TRACE(20, ("__kmpc_taskloop(enter): T#%d\n", gtid)); 4760 __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize, 4761 0, task_dup); 4762 KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid)); 4763 } 4764 4765 /*! 
@ingroup TASKING
@param loc       Source location information
@param gtid      Global thread ID
@param task      Task structure
@param if_val    Value of the if clause
@param lb        Pointer to loop lower bound in task structure
@param ub        Pointer to loop upper bound in task structure
@param st        Loop stride
@param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
@param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
@param grainsize Schedule value if specified
@param modifier  Modifier 'strict' for sched, 1 if present, 0 otherwise
@param task_dup  Tasks duplication routine

Execute the taskloop construct.
*/
void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
                       kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                       int nogroup, int sched, kmp_uint64 grainsize,
                       int modifier, void *task_dup) {
  // Validate the caller-supplied global thread id before using it.
  __kmp_assert_valid_gtid(gtid);
  KA_TRACE(20, ("__kmpc_taskloop_5(enter): T#%d\n", gtid));
  // All arguments, including modifier, are forwarded unchanged to the common
  // implementation.
  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
                 modifier, task_dup);
  KA_TRACE(20, ("__kmpc_taskloop_5(exit): T#%d\n", gtid));
}