/*
 * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_wait_release.h"
#include "kmp_taskdeps.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

#include "tsan_annotations.h"

/* forward declaration */
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
                                 kmp_info_t *this_thr);
static void __kmp_alloc_task_deque(kmp_info_t *thread,
                                   kmp_thread_data_t *thread_data);
static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
                                           kmp_task_team_t *task_team);
static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);

#ifdef BUILD_TIED_TASK_STACK

// __kmp_trace_task_stack: print the tied tasks from the task stack in order
// from top to bottom
//
// gtid: global thread identifier for thread containing stack
// thread_data: thread data for task team thread containing stack
// threshold: value above which the trace statement triggers
// location: string identifying call site of this function (for trace)
static void __kmp_trace_task_stack(kmp_int32 gtid,
                                   kmp_thread_data_t *thread_data,
                                   int threshold, char *location) {
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_taskdata_t **stack_top = task_stack->ts_top;
  kmp_int32 entries = task_stack->ts_entries;
  kmp_taskdata_t *tied_task;

  KA_TRACE(
      threshold,
      ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
       "first_block = %p, stack_top = %p \n",
       location, gtid, entries, task_stack->ts_first_block, stack_top));

  KMP_DEBUG_ASSERT(stack_top != NULL);
  KMP_DEBUG_ASSERT(entries > 0);

  while (entries != 0) {
    KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
    // fix up ts_top if we need to pop from previous block
    if ((entries & TASK_STACK_INDEX_MASK) == 0) {
      kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);

      stack_block = stack_block->sb_prev;
      stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
    }

    // finish bookkeeping
    stack_top--;
    entries--;

    tied_task = *stack_top;

    KMP_DEBUG_ASSERT(tied_task != NULL);
    KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);

    KA_TRACE(threshold,
             ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, "
              "stack_top=%p, tied_task=%p\n",
              location, gtid, entries, stack_top, tied_task));
  }
  KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);

  KA_TRACE(threshold,
           ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
            location, gtid));
}

// __kmp_init_task_stack: initialize the task stack for the first time
// after a thread_data structure is created.
// It should not be necessary to do this again (assuming the stack works).
//
// gtid: global thread identifier of calling thread
// thread_data: thread data for task team thread containing stack
static void __kmp_init_task_stack(kmp_int32 gtid,
                                  kmp_thread_data_t *thread_data) {
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_stack_block_t *first_block;

  // set up the first block of the stack
  first_block = &task_stack->ts_first_block;
  task_stack->ts_top = (kmp_taskdata_t **)first_block;
  memset((void *)first_block, '\0',
         TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));

  // initialize the stack to be empty
  task_stack->ts_entries = TASK_STACK_EMPTY;
  first_block->sb_next = NULL;
  first_block->sb_prev = NULL;
}

// __kmp_free_task_stack: free the task stack when thread_data is destroyed.
//
// gtid: global thread identifier for calling thread
// thread_data: thread info for thread containing stack
static void __kmp_free_task_stack(kmp_int32 gtid,
                                  kmp_thread_data_t *thread_data) {
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_stack_block_t *stack_block = &task_stack->ts_first_block;

  KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
  // free from the second block of the stack
  while (stack_block != NULL) {
    kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;

    stack_block->sb_next = NULL;
    stack_block->sb_prev = NULL;
    if (stack_block != &task_stack->ts_first_block) {
      __kmp_thread_free(thread,
                        stack_block); // free the block, if not the first
    }
    stack_block = next_block;
  }
  // initialize the stack to be empty
  task_stack->ts_entries = 0;
  task_stack->ts_top = NULL;
}

// __kmp_push_task_stack: Push the tied task onto the task stack.
// Grow the stack if necessary by allocating another block.
//
// gtid: global thread identifier for calling thread
// thread: thread info for thread containing stack
// tied_task: the task to push on the stack
static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
                                  kmp_taskdata_t *tied_task) {
  // GEH - need to consider what to do if tt_threads_data not allocated yet
  kmp_thread_data_t *thread_data =
      &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;

  if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
    return; // Don't push anything on stack if team or team tasks are serialized
  }

  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);

  KA_TRACE(20,
           ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
            gtid, thread, tied_task));
  // Store entry
  *(task_stack->ts_top) = tied_task;

  // Do bookkeeping for next push
  task_stack->ts_top++;
  task_stack->ts_entries++;

  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
    // Find beginning of this task block
    kmp_stack_block_t *stack_block =
        (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);

    // Check if we already have a block
    if (stack_block->sb_next !=
        NULL) { // reset ts_top to beginning of next block
      task_stack->ts_top = &stack_block->sb_next->sb_block[0];
    } else { // Alloc new block and link it up
      kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
          thread, sizeof(kmp_stack_block_t));

      task_stack->ts_top = &new_block->sb_block[0];
      stack_block->sb_next = new_block;
      new_block->sb_prev = stack_block;
      new_block->sb_next = NULL;

      KA_TRACE(
          30,
          ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
           gtid, tied_task, new_block));
    }
  }
  KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
                tied_task));
}

// __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return
// the task, just check to make sure it matches the ending task passed in.
//
// gtid: global thread identifier for the calling thread
// thread: thread info structure containing stack
// tied_task: the task popped off the stack
// ending_task: the task that is ending (should match popped task)
static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
                                 kmp_taskdata_t *ending_task) {
  // GEH - need to consider what to do if tt_threads_data not allocated yet
  kmp_thread_data_t *thread_data =
      &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_taskdata_t *tied_task;

  if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
    // Don't pop anything from stack if team or team tasks are serialized
    return;
  }

  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
  KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);

  KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
                thread));

  // fix up ts_top if we need to pop from previous block
  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
    kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);

    stack_block = stack_block->sb_prev;
    task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
  }

  // finish bookkeeping
  task_stack->ts_top--;
  task_stack->ts_entries--;

  tied_task = *(task_stack->ts_top);

  KMP_DEBUG_ASSERT(tied_task != NULL);
  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
  KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly

  KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
                tied_task));
  return;
}
#endif /* BUILD_TIED_TASK_STACK */

// returns 1 if new task is allowed to execute, 0 otherwise
// checks Task Scheduling constraint (if requested) and
// mutexinoutset dependencies if any
static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
                                  const kmp_taskdata_t *tasknew,
                                  const kmp_taskdata_t *taskcurr) {
  if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
    // Check if the candidate obeys the Task Scheduling Constraints (TSC)
    // only descendant of all deferred tied tasks can be scheduled, checking
    // the last one is enough, as it in turn is the descendant of all others
    kmp_taskdata_t *current = taskcurr->td_last_tied;
    KMP_DEBUG_ASSERT(current != NULL);
    // check if the task is not suspended on barrier
    if (current->td_flags.tasktype == TASK_EXPLICIT ||
        current->td_taskwait_thread > 0) { // <= 0 on barrier
      kmp_int32 level = current->td_level;
      kmp_taskdata_t *parent = tasknew->td_parent;
      while (parent != current && parent->td_level > level) {
        // check generation up to the level of the current task
        parent = parent->td_parent;
        KMP_DEBUG_ASSERT(parent != NULL);
      }
      if (parent != current)
        return false;
    }
  }
  // Check mutexinoutset dependencies, acquire locks
  kmp_depnode_t *node = tasknew->td_depnode;
  if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
    for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
      KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
      if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
        continue;
      // could not get the lock, release previous locks
      for (int j = i - 1; j >= 0; --j)
        __kmp_release_lock(node->dn.mtx_locks[j], gtid);
      return false;
    }
    // negative num_locks means all locks acquired successfully
    node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
  }
  return true;
}

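// Example (sketch, with hypothetical tasks T0..T3): if the last deferred tied
// task recorded in taskcurr->td_last_tied is T1, a new tied task T2 whose
// td_parent chain reaches T1 satisfies the Task Scheduling Constraint and
// __kmp_task_is_allowed() returns true, whereas a tied task T3 hanging off
// T1's parent T0 stops the ancestor walk at T0 and is rejected.
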
// __kmp_realloc_task_deque:
// Re-allocates a task deque for a particular thread, copies the content from
// the old deque and adjusts the necessary data structures relating to the
// deque. This operation must be done with the deque_lock being held
static void __kmp_realloc_task_deque(kmp_info_t *thread,
                                     kmp_thread_data_t *thread_data) {
  kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
  kmp_int32 new_size = 2 * size;

  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
                "%d] for thread_data %p\n",
                __kmp_gtid_from_thread(thread), size, new_size, thread_data));

  kmp_taskdata_t **new_deque =
      (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));

  int i, j;
  for (i = thread_data->td.td_deque_head, j = 0; j < size;
       i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
    new_deque[j] = thread_data->td.td_deque[i];

  __kmp_free(thread_data->td.td_deque);

  thread_data->td.td_deque_head = 0;
  thread_data->td.td_deque_tail = size;
  thread_data->td.td_deque = new_deque;
  thread_data->td.td_deque_size = new_size;
}

// __kmp_push_task: Add a task to the thread's deque
static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);

  if (taskdata->td_flags.hidden_helper) {
    gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
    thread = __kmp_threads[gtid];
  }

  kmp_task_team_t *task_team = thread->th.th_task_team;
  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
  kmp_thread_data_t *thread_data;

  KA_TRACE(20,
           ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));

  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(
        20,
        ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
         gtid, counter, taskdata));
  }

  // The first check avoids building task_team thread data if serialized
  if (UNLIKELY(taskdata->td_flags.task_serial)) {
    KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
                  "TASK_NOT_PUSHED for task %p\n",
                  gtid, taskdata));
    return TASK_NOT_PUSHED;
  }

  // Now that serialized tasks have returned, we can assume that we are not in
  // immediate exec mode
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  if (UNLIKELY(!KMP_TASKING_ENABLED(task_team))) {
    __kmp_enable_tasking(task_team, thread);
  }
  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);

  // Find tasking deque specific to encountering thread
  thread_data = &task_team->tt.tt_threads_data[tid];

  // No lock needed since only owner can allocate. If the task is hidden_helper,
  // we don't need it either because we have initialized the deque for hidden
  // helper thread data.
  if (UNLIKELY(thread_data->td.td_deque == NULL)) {
    __kmp_alloc_task_deque(thread, thread_data);
  }

  int locked = 0;
  // Check if deque is full
  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    if (__kmp_enable_task_throttling &&
        __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                              thread->th.th_current_task)) {
      KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
                    "TASK_NOT_PUSHED for task %p\n",
                    gtid, taskdata));
      return TASK_NOT_PUSHED;
    } else {
      __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
      locked = 1;
      if (TCR_4(thread_data->td.td_deque_ntasks) >=
          TASK_DEQUE_SIZE(thread_data->td)) {
        // expand deque to push the task which is not allowed to execute
        __kmp_realloc_task_deque(thread, thread_data);
      }
    }
  }
  // Lock the deque for the task push operation
  if (!locked) {
    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
    // Need to recheck as we can get a proxy task from thread outside of OpenMP
    if (TCR_4(thread_data->td.td_deque_ntasks) >=
        TASK_DEQUE_SIZE(thread_data->td)) {
      if (__kmp_enable_task_throttling &&
          __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                                thread->th.th_current_task)) {
        __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
        KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
                      "returning TASK_NOT_PUSHED for task %p\n",
                      gtid, taskdata));
        return TASK_NOT_PUSHED;
      } else {
        // expand deque to push the task which is not allowed to execute
        __kmp_realloc_task_deque(thread, thread_data);
      }
    }
  }
  // Must have room since no thread can add tasks but calling thread
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
                   TASK_DEQUE_SIZE(thread_data->td));

  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
      taskdata; // Push taskdata
  // Wrap index.
  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
  KMP_FSYNC_RELEASING(taskdata); // releasing child
  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
                "task=%p ntasks=%d head=%u tail=%u\n",
                gtid, taskdata, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));

  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  // Signal one worker thread to execute the task
  if (taskdata->td_flags.hidden_helper) {
    // Wake hidden helper threads up if they're sleeping
    __kmp_hidden_helper_worker_thread_signal();
  }

  return TASK_SUCCESSFULLY_PUSHED;
}

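// Worked example (hypothetical deque size of 8, so TASK_DEQUE_MASK == 7): with
// head == 5 and tail == 7, pushing one task stores it at slot 7 and advances
// tail to (7 + 1) & 7 == 0, i.e. the tail simply wraps around the circular
// buffer; td_deque_ntasks, not the head/tail distance, is what the full-deque
// checks above compare against TASK_DEQUE_SIZE.
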
// __kmp_pop_current_task_from_thread: set up current task from called thread
// when team ends
//
// this_thr: thread structure to set current_task in.
void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));

  this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;

  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));
}

// __kmp_push_current_task_to_thread: set up current task in called thread for a
// new team
//
// this_thr: thread structure to set up
// team: team for implicit task data
// tid: thread within team to set up
void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
                                       int tid) {
  // the current task of the thread is a parent of the newly created implicit
  // tasks of the new team
  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));

  KMP_DEBUG_ASSERT(this_thr != NULL);

  if (tid == 0) {
    if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
      team->t.t_implicit_task_taskdata[0].td_parent =
          this_thr->th.th_current_task;
      this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
    }
  } else {
    team->t.t_implicit_task_taskdata[tid].td_parent =
        team->t.t_implicit_task_taskdata[0].td_parent;
    this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
  }

  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));
}

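// Example (hypothetical team of 4 threads): the primary thread's previous
// current task becomes td_parent of implicit task 0, and threads 1..3 copy
// that same parent into their own implicit tasks, so every implicit task of
// the new team reports the enclosing task as its parent.
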
// __kmp_task_start: bookkeeping for a task starting execution
//
// GTID: global thread id of calling thread
// task: task starting execution
// current_task: task suspending
static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
                             kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];

  KA_TRACE(10,
           ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
            gtid, taskdata, current_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  // mark currently executing task as suspended
  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
  current_task->td_flags.executing = 0;

  // Add task to stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_push_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  // mark starting task as executing and as current task
  thread->th.th_current_task = taskdata;

  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  taskdata->td_flags.started = 1;
  taskdata->td_flags.executing = 1;
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  // GEH TODO: shouldn't we pass some sort of location identifier here?
  // APT: yes, we will pass location here.
  // need to store current thread state (in a thread or taskdata structure)
  // before setting work_state, otherwise wrong state is set after end of task

  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));

  return;
}

#if OMPT_SUPPORT
//------------------------------------------------------------------------------
// __ompt_task_init:
// Initialize OMPT fields maintained by a task. This will only be called after
// ompt_start_tool, so we already know whether ompt is enabled or not.

static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
  // The calls to __ompt_task_init already have the ompt_enabled condition.
  task->ompt_task_info.task_data.value = 0;
  task->ompt_task_info.frame.exit_frame = ompt_data_none;
  task->ompt_task_info.frame.enter_frame = ompt_data_none;
  task->ompt_task_info.frame.exit_frame_flags =
      ompt_frame_runtime | ompt_frame_framepointer;
  task->ompt_task_info.frame.enter_frame_flags =
      ompt_frame_runtime | ompt_frame_framepointer;
}

// __ompt_task_start:
// Build and trigger task-begin event
static inline void __ompt_task_start(kmp_task_t *task,
                                     kmp_taskdata_t *current_task,
                                     kmp_int32 gtid) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  ompt_task_status_t status = ompt_task_switch;
  if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
    status = ompt_task_yield;
    __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
  }
  /* let OMPT know that we're about to run this task */
  if (ompt_enabled.ompt_callback_task_schedule) {
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        &(current_task->ompt_task_info.task_data), status,
        &(taskdata->ompt_task_info.task_data));
  }
  taskdata->ompt_task_info.scheduling_parent = current_task;
}

// __ompt_task_finish:
// Build and trigger final task-schedule event
static inline void __ompt_task_finish(kmp_task_t *task,
                                      kmp_taskdata_t *resumed_task,
                                      ompt_task_status_t status) {
  if (ompt_enabled.ompt_callback_task_schedule) {
    kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
    if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
        taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
      status = ompt_task_cancel;
    }

    /* let OMPT know that we're returning to the callee task */
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        &(taskdata->ompt_task_info.task_data), status,
        (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
  }
}
#endif

template <bool ompt>
static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
                                               kmp_task_t *task,
                                               void *frame_address,
                                               void *return_address) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
                "current_task=%p\n",
                gtid, loc_ref, taskdata, current_task));

  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
                  "incremented for task %p\n",
                  gtid, counter, taskdata));
  }

  taskdata->td_flags.task_serial =
      1; // Execute this task immediately, not deferred.
  __kmp_task_start(gtid, task, current_task);

#if OMPT_SUPPORT
  if (ompt) {
    if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
      current_task->ompt_task_info.frame.enter_frame.ptr =
          taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
      current_task->ompt_task_info.frame.enter_frame_flags =
          taskdata->ompt_task_info.frame.exit_frame_flags =
              ompt_frame_application | ompt_frame_framepointer;
    }
    if (ompt_enabled.ompt_callback_task_create) {
      ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
          &(parent_info->task_data), &(parent_info->frame),
          &(taskdata->ompt_task_info.task_data),
          ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
          return_address);
    }
    __ompt_task_start(task, current_task, gtid);
  }
#endif // OMPT_SUPPORT

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, taskdata));
}

#if OMPT_SUPPORT
OMPT_NOINLINE
static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                           kmp_task_t *task,
                                           void *frame_address,
                                           void *return_address) {
  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
                                           return_address);
}
#endif // OMPT_SUPPORT

// __kmpc_omp_task_begin_if0: report that a given serialized task has started
// execution
//
// loc_ref: source location information; points to beginning of task block.
// gtid: global thread number.
// task: task thunk for the started task.
void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
                               kmp_task_t *task) {
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
    __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
                                   OMPT_GET_FRAME_ADDRESS(1),
                                   OMPT_LOAD_RETURN_ADDRESS(gtid));
    return;
  }
#endif
  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
}

#ifdef TASK_UNUSED
// __kmpc_omp_task_begin: report that a given task has started execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(
      10,
      ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
       gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));

  __kmp_task_start(gtid, task, current_task);

  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED

// __kmp_free_task: free the current task space and the space for shareds
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
                            kmp_info_t *thread) {
  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
                taskdata));

  // Check to make sure all flags and counters have the correct values
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
                   taskdata->td_flags.task_serial == 1);
  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);

  taskdata->td_flags.freed = 1;
  ANNOTATE_HAPPENS_BEFORE(taskdata);
  // deallocate the taskdata and shared variable blocks associated with this task
#if USE_FAST_MEMORY
  __kmp_fast_free(thread, taskdata);
#else /* ! USE_FAST_MEMORY */
  __kmp_thread_free(thread, taskdata);
#endif
  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
}

// __kmp_free_task_and_ancestors: free the current task and ancestors without
// children
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
                                          kmp_taskdata_t *taskdata,
                                          kmp_info_t *thread) {
  // Proxy tasks must always be allowed to free their parents
  // because they can be run in background even in serial mode.
  kmp_int32 team_serial =
      (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
      !taskdata->td_flags.proxy;
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
  KMP_DEBUG_ASSERT(children >= 0);

  // Now, go up the ancestor tree to see if any ancestors can now be freed.
  while (children == 0) {
    kmp_taskdata_t *parent_taskdata = taskdata->td_parent;

    KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
                  "and freeing itself\n",
                  gtid, taskdata));

    // --- Deallocate my ancestor task ---
    __kmp_free_task(gtid, taskdata, thread);

    taskdata = parent_taskdata;

    if (team_serial)
      return;
    // Stop checking ancestors at implicit task instead of walking up ancestor
    // tree to avoid premature deallocation of ancestors.
    if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
      if (taskdata->td_dephash) { // do we need to cleanup dephash?
        int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
        kmp_tasking_flags_t flags_old = taskdata->td_flags;
        if (children == 0 && flags_old.complete == 1) {
          kmp_tasking_flags_t flags_new = flags_old;
          flags_new.complete = 0;
          if (KMP_COMPARE_AND_STORE_ACQ32(
                  RCAST(kmp_int32 *, &taskdata->td_flags),
                  *RCAST(kmp_int32 *, &flags_old),
                  *RCAST(kmp_int32 *, &flags_new))) {
            KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
                           "dephash of implicit task %p\n",
                           gtid, taskdata));
            // cleanup dephash of finished implicit task
            __kmp_dephash_free_entries(thread, taskdata->td_dephash);
          }
        }
      }
      return;
    }
    // Predecrement simulated by "- 1" calculation
    children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
    KMP_DEBUG_ASSERT(children >= 0);
  }

  KA_TRACE(
      20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
           "not freeing it yet\n",
           gtid, taskdata, children));
}

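// Worked example (hypothetical explicit tasks P and C, where C is P's only
// child): P's td_allocated_child_tasks starts at 1 for P itself and is bumped
// to 2 when C is allocated; when C finishes, __kmp_free_task_and_ancestors
// drops C's own count to 0 and frees C, then drops P's count back to 1, so P
// stays allocated until its own completion later brings the count to 0.
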
// __kmp_task_finish: bookkeeping to do when a task finishes execution
//
// gtid: global thread ID for calling thread
// task: task to be finished
// resumed_task: task to be resumed. (may be NULL if task is serialized)
//
// template<ompt>: effectively ompt_enabled.enabled!=0
// the version with ompt=false is inlined, allowing to optimize away all ompt
// code in this case
template <bool ompt>
static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *resumed_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_task_team_t *task_team =
      thread->th.th_task_team; // might be NULL for serial teams...
  kmp_int32 children = 0;

  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
                "task %p\n",
                gtid, taskdata, resumed_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  // Pop task from stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_pop_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
    // untied task needs to check the counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
    KA_TRACE(
        20,
        ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
         gtid, counter, taskdata));
    if (counter > 0) {
      // untied task is not done, to be continued possibly by other thread, do
      // not free it now
      if (resumed_task == NULL) {
        KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
        resumed_task = taskdata->td_parent; // In a serialized task, the resumed
        // task is the parent
      }
      thread->th.th_current_task = resumed_task; // restore current_task
      resumed_task->td_flags.executing = 1; // resume previous task
      KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
                    "resuming task %p\n",
                    gtid, taskdata, resumed_task));
      return;
    }
  }

  // bookkeeping for resuming task:
  // GEH - note tasking_ser => task_serial
  KMP_DEBUG_ASSERT(
      (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
      taskdata->td_flags.task_serial);
  if (taskdata->td_flags.task_serial) {
    if (resumed_task == NULL) {
      resumed_task = taskdata->td_parent; // In a serialized task, the resumed
      // task is the parent
    }
  } else {
    KMP_DEBUG_ASSERT(resumed_task !=
                     NULL); // verify that resumed task is passed as argument
  }

  /* If the task's destructor thunk flag has been set, we need to invoke the
     destructor thunk that has been generated by the compiler. The code is
     placed here, since at this point other tasks might have been released
     hence overlapping the destructor invocations with some other work in the
     released tasks. The OpenMP spec is not specific on when the destructors
     are invoked, so we should be free to choose. */
  if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
    kmp_routine_entry_t destr_thunk = task->data1.destructors;
    KMP_ASSERT(destr_thunk);
    destr_thunk(gtid, task);
  }

  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  bool detach = false;
  if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) {
    if (taskdata->td_allow_completion_event.type ==
        KMP_EVENT_ALLOW_COMPLETION) {
      // event hasn't been fulfilled yet. Try to detach task.
      __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
      if (taskdata->td_allow_completion_event.type ==
          KMP_EVENT_ALLOW_COMPLETION) {
        // task finished execution
        KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
        taskdata->td_flags.executing = 0; // suspend the finishing task

#if OMPT_SUPPORT
        // For a detached task, which is not completed, we switch back
        // the omp_fulfill_event signals completion
        // locking is necessary to avoid a race with ompt_task_late_fulfill
        if (ompt)
          __ompt_task_finish(task, resumed_task, ompt_task_detach);
#endif

        // no access to taskdata after this point!
        // __kmp_fulfill_event might free taskdata at any time from now

        taskdata->td_flags.proxy = TASK_PROXY; // proxify!
        detach = true;
      }
      __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
    }
  }

  if (!detach) {
    taskdata->td_flags.complete = 1; // mark the task as completed

#if OMPT_SUPPORT
    // This is not a detached task, we are done here
    if (ompt)
      __ompt_task_finish(task, resumed_task, ompt_task_complete);
#endif

    // Only need to keep track of count if team parallel and tasking not
    // serialized, or task is detachable and event has already been fulfilled
    if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
        taskdata->td_flags.detachable == TASK_DETACHABLE ||
        taskdata->td_flags.hidden_helper) {
      // Predecrement simulated by "- 1" calculation
      children =
          KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
      KMP_DEBUG_ASSERT(children >= 0);
      if (taskdata->td_taskgroup)
        KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
      __kmp_release_deps(gtid, taskdata);
    } else if (task_team && task_team->tt.tt_found_proxy_tasks) {
      // if we found proxy tasks there could exist a dependency chain
      // with the proxy task as origin
      __kmp_release_deps(gtid, taskdata);
    }
    // td_flags.executing must be marked as 0 after __kmp_release_deps has been
    // called. Otherwise, if a task is executed immediately from the
    // release_deps code, the flag will be reset to 1 again by this same
    // function
    KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
    taskdata->td_flags.executing = 0; // suspend the finishing task
  }

  KA_TRACE(
      20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
           gtid, taskdata, children));

  // Free this task and then ancestor tasks if they have no children.
  // Restore th_current_task first as suggested by John:
  // johnmc: if an asynchronous inquiry peers into the runtime system
  // it doesn't see the freed task as the current task.
  thread->th.th_current_task = resumed_task;
  if (!detach)
    __kmp_free_task_and_ancestors(gtid, taskdata, thread);

  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
  resumed_task->td_flags.executing = 1; // resume previous task

  KA_TRACE(
      10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
           gtid, taskdata, resumed_task));

  return;
}

template <bool ompt>
static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
                                                  kmp_int32 gtid,
                                                  kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
  KMP_DEBUG_ASSERT(gtid >= 0);
  // this routine will provide task to resume
  __kmp_task_finish<ompt>(gtid, task, NULL);

  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));

#if OMPT_SUPPORT
  if (ompt) {
    ompt_frame_t *ompt_frame;
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    ompt_frame->enter_frame = ompt_data_none;
    ompt_frame->enter_frame_flags =
        ompt_frame_runtime | ompt_frame_framepointer;
  }
#endif

  return;
}

#if OMPT_SUPPORT
OMPT_NOINLINE
void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                       kmp_task_t *task) {
  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
}
#endif // OMPT_SUPPORT

// __kmpc_omp_task_complete_if0: report that a task has completed execution
//
// loc_ref: source location information; points to end of task block.
// gtid: global thread number.
// task: task thunk for the completed task.
void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *task) {
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
    return;
  }
#endif
  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
}

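// Usage sketch (roughly what a compiler could emit for "#pragma omp task
// if(0)"; the outlined routine task_entry and the thunk new_task obtained
// from __kmpc_omp_task_alloc() are hypothetical names):
//
//   __kmpc_omp_task_begin_if0(&loc, gtid, new_task);
//   task_entry(gtid, new_task); // run the task body in place, undeferred
//   __kmpc_omp_task_complete_if0(&loc, gtid, new_task);
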
#ifdef TASK_UNUSED
// __kmpc_omp_task_complete: report that a task has completed execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
                              kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));

  __kmp_task_finish<false>(gtid, task,
                           NULL); // Not sure how to find task to resume

  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED

// __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
// task for a given thread
//
// loc_ref: reference to source location of parallel region
// this_thr: thread data structure corresponding to implicit task
// team: team for this_thr
// tid: thread id of given thread within team
// set_curr_task: TRUE if need to push current task to thread
// NOTE: Routine does not set up the implicit task ICVS. This is assumed to
// have already been done elsewhere.
// TODO: Get better loc_ref. Value passed in may be NULL
void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
                              kmp_team_t *team, int tid, int set_curr_task) {
  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];

  KF_TRACE(
      10,
      ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
       tid, team, task, set_curr_task ? "TRUE" : "FALSE"));

  task->td_task_id = KMP_GEN_TASK_ID();
  task->td_team = team;
  // task->td_parent = NULL; // fix for CQ230101 (broken parent task info
  // in debugger)
  task->td_ident = loc_ref;
  task->td_taskwait_ident = NULL;
  task->td_taskwait_counter = 0;
  task->td_taskwait_thread = 0;

  task->td_flags.tiedness = TASK_TIED;
  task->td_flags.tasktype = TASK_IMPLICIT;
  task->td_flags.proxy = TASK_FULL;

  // All implicit tasks are executed immediately, not deferred
  task->td_flags.task_serial = 1;
  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  task->td_flags.started = 1;
  task->td_flags.executing = 1;
  task->td_flags.complete = 0;
  task->td_flags.freed = 0;

  task->td_depnode = NULL;
  task->td_last_tied = task;
  task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;

  if (set_curr_task) { // only do this init first time thread is created
    KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
    // Not used: don't need to deallocate implicit task
    KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
    task->td_taskgroup = NULL; // An implicit task does not have taskgroup
    task->td_dephash = NULL;
    __kmp_push_current_task_to_thread(this_thr, team, tid);
  } else {
    KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
    KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
  }

#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(task, tid);
#endif

  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
                team, task));
}

// __kmp_finish_implicit_task: Release resources associated with implicit tasks
// at the end of parallel regions. Some resources are kept for reuse in the next
// parallel region.
//
// thread: thread data structure corresponding to implicit task
void __kmp_finish_implicit_task(kmp_info_t *thread) {
  kmp_taskdata_t *task = thread->th.th_current_task;
  if (task->td_dephash) {
    int children;
    task->td_flags.complete = 1;
    children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
    kmp_tasking_flags_t flags_old = task->td_flags;
    if (children == 0 && flags_old.complete == 1) {
      kmp_tasking_flags_t flags_new = flags_old;
      flags_new.complete = 0;
      if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
                                      *RCAST(kmp_int32 *, &flags_old),
                                      *RCAST(kmp_int32 *, &flags_new))) {
        KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
                       "dephash of implicit task %p\n",
                       thread->th.th_info.ds.ds_gtid, task));
        __kmp_dephash_free_entries(thread, task->td_dephash);
      }
    }
  }
}

// __kmp_free_implicit_task: Release resources associated with implicit tasks
// when these regions are destroyed
//
// thread: thread data structure corresponding to implicit task
void __kmp_free_implicit_task(kmp_info_t *thread) {
  kmp_taskdata_t *task = thread->th.th_current_task;
  if (task && task->td_dephash) {
    __kmp_dephash_free(thread, task->td_dephash);
    task->td_dephash = NULL;
  }
}

// Round up a size to a power of two specified by val: Used to insert padding
// between structures co-allocated using a single malloc() call
static size_t __kmp_round_up_to_val(size_t size, size_t val) {
  if (size & (val - 1)) {
    size &= ~(val - 1);
    if (size <= KMP_SIZE_T_MAX - val) {
      size += val; // Round up if there is no overflow.
    }
  }
  return size;
} // __kmp_round_up_to_val

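// Worked example: __kmp_round_up_to_val(13, 8) clears the low bits to get 8
// and then adds val, returning 16; a size that is already a multiple of val,
// e.g. __kmp_round_up_to_val(16, 8), is returned unchanged.
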
// __kmp_task_alloc: Allocate the taskdata and task data structures for a task
//
// loc_ref: source location information
// gtid: global thread number.
// flags: include tiedness & task type (explicit vs. implicit) of the ''new''
// task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
// sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
// private vars accessed in task.
// sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
// in task.
// task_entry: Pointer to task code entry point generated by compiler.
// returns: a pointer to the allocated kmp_task_t structure (task).
kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                             kmp_tasking_flags_t *flags,
                             size_t sizeof_kmp_task_t, size_t sizeof_shareds,
                             kmp_routine_entry_t task_entry) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_info_t *encountering_thread = thread;
  kmp_team_t *team = thread->th.th_team;
  kmp_taskdata_t *parent_task = thread->th.th_current_task;
  size_t shareds_offset;

  if (UNLIKELY(!TCR_4(__kmp_init_middle)))
    __kmp_middle_initialize();

  if (flags->hidden_helper) {
    if (__kmp_enable_hidden_helper) {
      if (!TCR_4(__kmp_init_hidden_helper))
        __kmp_hidden_helper_initialize();

      // For a hidden helper task encountered by a regular thread, we will push
      // the task to the (gtid%__kmp_hidden_helper_threads_num)-th hidden helper
      // thread.
      if (!KMP_HIDDEN_HELPER_THREAD(gtid)) {
        thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
        // We don't change the parent-child relation for hidden helper task as
        // we need that to do per-task-region synchronization.
      }
    } else {
      // If the hidden helper task is not enabled, reset the flag to FALSE.
      flags->hidden_helper = FALSE;
    }
  }

  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
                sizeof_shareds, task_entry));

  if (parent_task->td_flags.final) {
    if (flags->merged_if0) {
    }
    flags->final = 1;
  }

  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
    // Untied task encountered causes the TSC algorithm to check entire deque of
    // the victim thread. If no untied task encountered, then checking the head
    // of the deque should be enough.
    KMP_CHECK_UPDATE(
        encountering_thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
  }

  // Detachable tasks are not proxy tasks yet but could be in the future. Doing
  // the tasking setup when that happens is too late.
  if (UNLIKELY(flags->proxy == TASK_PROXY ||
               flags->detachable == TASK_DETACHABLE || flags->hidden_helper)) {
    if (flags->proxy == TASK_PROXY) {
      flags->tiedness = TASK_UNTIED;
      flags->merged_if0 = 1;
    }
    /* are we running in a sequential parallel or tskm_immediate_exec... we need
       tasking support enabled */
    if ((encountering_thread->th.th_task_team) == NULL) {
      /* This should only happen if the team is serialized
          setup a task team and propagate it to the thread */
      KMP_DEBUG_ASSERT(team->t.t_serialized);
      KA_TRACE(30,
               ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
                gtid));
      __kmp_task_team_setup(
          encountering_thread, team,
          1); // 1 indicates setup the current team regardless of nthreads
      encountering_thread->th.th_task_team =
          team->t.t_task_team[encountering_thread->th.th_task_state];
    }
    kmp_task_team_t *task_team = encountering_thread->th.th_task_team;

    /* tasking must be enabled now as the task might not be pushed */
    if (!KMP_TASKING_ENABLED(task_team)) {
      KA_TRACE(
          30,
          ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
      __kmp_enable_tasking(task_team, encountering_thread);
      kmp_int32 tid = encountering_thread->th.th_info.ds.ds_tid;
      kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
      // No lock needed since only owner can allocate
      if (thread_data->td.td_deque == NULL) {
        __kmp_alloc_task_deque(encountering_thread, thread_data);
      }
    }

    if ((flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) &&
        task_team->tt.tt_found_proxy_tasks == FALSE)
      TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
    if (flags->hidden_helper &&
        task_team->tt.tt_hidden_helper_task_encountered == FALSE)
      TCW_4(task_team->tt.tt_hidden_helper_task_encountered, TRUE);
  }

  // Calculate shared structure offset including padding after kmp_task_t struct
  // to align pointers in shared struct
  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));

  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
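  // Resulting single allocation, sketched below (offsets are illustrative;
  // shareds_offset has already been rounded up to sizeof(void *)):
  //
  //   +----------------+---------------------------+------------------+
  //   | kmp_taskdata_t | kmp_task_t + private vars | shareds pointers |
  //   +----------------+---------------------------+------------------+
  //   ^ taskdata       ^ task = KMP_TASKDATA_TO_TASK(taskdata)
  //                                                ^ task->shareds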
  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
                shareds_offset));
  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
                sizeof_shareds));

  // Avoid double allocation here by combining shareds with taskdata
#if USE_FAST_MEMORY
  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(
      encountering_thread, shareds_offset + sizeof_shareds);
#else /* ! USE_FAST_MEMORY */
  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(
      encountering_thread, shareds_offset + sizeof_shareds);
#endif /* USE_FAST_MEMORY */
  ANNOTATE_HAPPENS_AFTER(taskdata);

  task = KMP_TASKDATA_TO_TASK(taskdata);

  // Make sure task & taskdata are aligned appropriately
#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
#else
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
#endif
  if (sizeof_shareds > 0) {
    // Avoid double allocation here by combining shareds with taskdata
    task->shareds = &((char *)taskdata)[shareds_offset];
    // Make sure shareds struct is aligned to pointer size
    KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
                     0);
  } else {
    task->shareds = NULL;
  }
  task->routine = task_entry;
  task->part_id = 0; // AC: Always start with 0 part id

  taskdata->td_task_id = KMP_GEN_TASK_ID();
  taskdata->td_team = thread->th.th_team;
  taskdata->td_alloc_thread = encountering_thread;
  taskdata->td_parent = parent_task;
  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
  taskdata->td_ident = loc_ref;
  taskdata->td_taskwait_ident = NULL;
  taskdata->td_taskwait_counter = 0;
  taskdata->td_taskwait_thread = 0;
  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
  // avoid copying icvs for proxy tasks
  if (flags->proxy == TASK_FULL)
    copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);

  taskdata->td_flags.tiedness = flags->tiedness;
  taskdata->td_flags.final = flags->final;
  taskdata->td_flags.merged_if0 = flags->merged_if0;
  taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
  taskdata->td_flags.proxy = flags->proxy;
  taskdata->td_flags.detachable = flags->detachable;
  taskdata->td_flags.hidden_helper = flags->hidden_helper;
  taskdata->encountering_gtid = gtid;
  taskdata->td_task_team = thread->th.th_task_team;
  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
  taskdata->td_flags.tasktype = TASK_EXPLICIT;

  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);

  // GEH - TODO: fix this to copy parent task's value of team_serial flag
  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  // GEH - Note we serialize the task if the team is serialized to make sure
  // implicit parallel region tasks are not left until program termination to
  // execute. Also, it helps locality to execute immediately.

  taskdata->td_flags.task_serial =
      (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
       taskdata->td_flags.tasking_ser || flags->merged_if0);

  taskdata->td_flags.started = 0;
  taskdata->td_flags.executing = 0;
  taskdata->td_flags.complete = 0;
  taskdata->td_flags.freed = 0;

  taskdata->td_flags.native = flags->native;

  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
  // start at one because counts current task and children
  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
  taskdata->td_taskgroup =
      parent_task->td_taskgroup; // task inherits taskgroup from the parent task
  taskdata->td_dephash = NULL;
  taskdata->td_depnode = NULL;
  if (flags->tiedness == TASK_UNTIED)
    taskdata->td_last_tied = NULL; // will be set when the task is scheduled
  else
    taskdata->td_last_tied = taskdata;
  taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(taskdata, gtid);
#endif
  // Only need to keep track of child task counts if team parallel and tasking
  // not serialized or if it is a proxy or detachable or hidden helper task
  if (flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE ||
      flags->hidden_helper ||
      !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
    KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
    if (parent_task->td_taskgroup)
      KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
    // Only need to keep track of allocated child tasks for explicit tasks since
    // implicit not deallocated
    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
      KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
    }
  }

  if (flags->hidden_helper) {
    taskdata->td_flags.task_serial = FALSE;
    // Increment the number of hidden helper tasks to be executed
    KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks);
  }

  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
                gtid, taskdata, taskdata->td_parent));
  ANNOTATE_HAPPENS_BEFORE(task);

  return task;
}

kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_int32 flags, size_t sizeof_kmp_task_t,
                                  size_t sizeof_shareds,
                                  kmp_routine_entry_t task_entry) {
  kmp_task_t *retval;
  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
  __kmp_assert_valid_gtid(gtid);
  input_flags->native = FALSE;
  // __kmp_task_alloc() sets up all other runtime flags
  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
                input_flags->proxy ? "proxy" : "",
"detachable" : "", sizeof_kmp_task_t, 1434 sizeof_shareds, task_entry)); 1435 1436 retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t, 1437 sizeof_shareds, task_entry); 1438 1439 KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval)); 1440 1441 return retval; 1442 } 1443 1444 kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid, 1445 kmp_int32 flags, 1446 size_t sizeof_kmp_task_t, 1447 size_t sizeof_shareds, 1448 kmp_routine_entry_t task_entry, 1449 kmp_int64 device_id) { 1450 if (__kmp_enable_hidden_helper) { 1451 auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags); 1452 input_flags.hidden_helper = TRUE; 1453 } 1454 1455 return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t, 1456 sizeof_shareds, task_entry); 1457 } 1458 1459 /*! 1460 @ingroup TASKING 1461 @param loc_ref location of the original task directive 1462 @param gtid Global Thread ID of encountering thread 1463 @param new_task task thunk allocated by __kmpc_omp_task_alloc() for the ''new 1464 task'' 1465 @param naffins Number of affinity items 1466 @param affin_list List of affinity items 1467 @return Returns non-zero if registering affinity information was not successful. 1468 Returns 0 if registration was successful 1469 This entry registers the affinity information attached to a task with the task 1470 thunk structure kmp_taskdata_t. 1471 */ 1472 kmp_int32 1473 __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid, 1474 kmp_task_t *new_task, kmp_int32 naffins, 1475 kmp_task_affinity_info_t *affin_list) { 1476 return 0; 1477 } 1478 1479 // __kmp_invoke_task: invoke the specified task 1480 // 1481 // gtid: global thread ID of caller 1482 // task: the task to invoke 1483 // current_task: the task to resume after task invocation 1484 static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task, 1485 kmp_taskdata_t *current_task) { 1486 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 1487 kmp_info_t *thread; 1488 int discard = 0 /* false */; 1489 KA_TRACE( 1490 30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n", 1491 gtid, taskdata, current_task)); 1492 KMP_DEBUG_ASSERT(task); 1493 if (UNLIKELY(taskdata->td_flags.proxy == TASK_PROXY && 1494 taskdata->td_flags.complete == 1)) { 1495 // This is a proxy task that was already completed but it needs to run 1496 // its bottom-half finish 1497 KA_TRACE( 1498 30, 1499 ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n", 1500 gtid, taskdata)); 1501 1502 __kmp_bottom_half_finish_proxy(gtid, task); 1503 1504 KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for " 1505 "proxy task %p, resuming task %p\n", 1506 gtid, taskdata, current_task)); 1507 1508 return; 1509 } 1510 1511 #if OMPT_SUPPORT 1512 // For untied tasks, the first task executed only calls __kmpc_omp_task and 1513 // does not execute code. 1514 ompt_thread_info_t oldInfo; 1515 if (UNLIKELY(ompt_enabled.enabled)) { 1516 // Store the threads states and restore them after the task 1517 thread = __kmp_threads[gtid]; 1518 oldInfo = thread->th.ompt_thread_info; 1519 thread->th.ompt_thread_info.wait_id = 0; 1520 thread->th.ompt_thread_info.state = (thread->th.th_team_serialized) 1521 ? 
ompt_state_work_serial 1522 : ompt_state_work_parallel; 1523 taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1524 } 1525 #endif 1526 1527 // Decrement the counter of hidden helper tasks to be executed 1528 if (taskdata->td_flags.hidden_helper) { 1529 // Hidden helper tasks can only be executed by hidden helper threads 1530 KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid)); 1531 KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks); 1532 } 1533 1534 // Proxy tasks are not handled by the runtime 1535 if (taskdata->td_flags.proxy != TASK_PROXY) { 1536 ANNOTATE_HAPPENS_AFTER(task); 1537 __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded 1538 } 1539 1540 // TODO: cancel tasks if the parallel region has also been cancelled 1541 // TODO: check if this sequence can be hoisted above __kmp_task_start 1542 // if cancellation has been enabled for this run ... 1543 if (UNLIKELY(__kmp_omp_cancellation)) { 1544 thread = __kmp_threads[gtid]; 1545 kmp_team_t *this_team = thread->th.th_team; 1546 kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup; 1547 if ((taskgroup && taskgroup->cancel_request) || 1548 (this_team->t.t_cancel_request == cancel_parallel)) { 1549 #if OMPT_SUPPORT && OMPT_OPTIONAL 1550 ompt_data_t *task_data; 1551 if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) { 1552 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL); 1553 ompt_callbacks.ompt_callback(ompt_callback_cancel)( 1554 task_data, 1555 ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup 1556 : ompt_cancel_parallel) | 1557 ompt_cancel_discarded_task, 1558 NULL); 1559 } 1560 #endif 1561 KMP_COUNT_BLOCK(TASK_cancelled); 1562 // this task belongs to a task group and we need to cancel it 1563 discard = 1 /* true */; 1564 } 1565 } 1566 1567 // Invoke the task routine and pass in relevant data. 1568 // Thunks generated by gcc take a different argument list.
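// Illustrative entry-point signatures assumed by the two invocation forms
// further below (names are for explanation only):
//   KMP/clang-style thunk:   kmp_int32 task_entry(kmp_int32 gtid, kmp_task_t *task);
//   GOMP-compat ("native"):  void task_entry(void *shareds);
// The native form receives only the shareds pointer, which is why it is
// invoked through a cast in the KMP_GOMP_COMPAT branch.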
1569 if (!discard) { 1570 if (taskdata->td_flags.tiedness == TASK_UNTIED) { 1571 taskdata->td_last_tied = current_task->td_last_tied; 1572 KMP_DEBUG_ASSERT(taskdata->td_last_tied); 1573 } 1574 #if KMP_STATS_ENABLED 1575 KMP_COUNT_BLOCK(TASK_executed); 1576 switch (KMP_GET_THREAD_STATE()) { 1577 case FORK_JOIN_BARRIER: 1578 KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar); 1579 break; 1580 case PLAIN_BARRIER: 1581 KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar); 1582 break; 1583 case TASKYIELD: 1584 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield); 1585 break; 1586 case TASKWAIT: 1587 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait); 1588 break; 1589 case TASKGROUP: 1590 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup); 1591 break; 1592 default: 1593 KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate); 1594 break; 1595 } 1596 #endif // KMP_STATS_ENABLED 1597 1598 // OMPT task begin 1599 #if OMPT_SUPPORT 1600 if (UNLIKELY(ompt_enabled.enabled)) 1601 __ompt_task_start(task, current_task, gtid); 1602 #endif 1603 1604 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1605 kmp_uint64 cur_time; 1606 kmp_int32 kmp_itt_count_task = 1607 __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial && 1608 current_task->td_flags.tasktype == TASK_IMPLICIT; 1609 if (kmp_itt_count_task) { 1610 thread = __kmp_threads[gtid]; 1611 // Time outer level explicit task on barrier for adjusting imbalance time 1612 if (thread->th.th_bar_arrive_time) 1613 cur_time = __itt_get_timestamp(); 1614 else 1615 kmp_itt_count_task = 0; // thread is not on a barrier - skip timing 1616 } 1617 KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task) 1618 #endif 1619 1620 #ifdef KMP_GOMP_COMPAT 1621 if (taskdata->td_flags.native) { 1622 ((void (*)(void *))(*(task->routine)))(task->shareds); 1623 } else 1624 #endif /* KMP_GOMP_COMPAT */ 1625 { 1626 (*(task->routine))(gtid, task); 1627 } 1628 KMP_POP_PARTITIONED_TIMER(); 1629 1630 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1631 if (kmp_itt_count_task) { 1632 // Barrier imbalance - adjust arrive time with the task duration 1633 thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time); 1634 } 1635 KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed) 1636 KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent 1637 #endif 1638 } 1639 1640 // Proxy tasks are not handled by the runtime 1641 if (taskdata->td_flags.proxy != TASK_PROXY) { 1642 ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent); 1643 #if OMPT_SUPPORT 1644 if (UNLIKELY(ompt_enabled.enabled)) { 1645 thread->th.ompt_thread_info = oldInfo; 1646 if (taskdata->td_flags.tiedness == TASK_TIED) { 1647 taskdata->ompt_task_info.frame.exit_frame = ompt_data_none; 1648 } 1649 __kmp_task_finish<true>(gtid, task, current_task); 1650 } else 1651 #endif 1652 __kmp_task_finish<false>(gtid, task, current_task); 1653 } 1654 1655 KA_TRACE( 1656 30, 1657 ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n", 1658 gtid, taskdata, current_task)); 1659 return; 1660 } 1661 1662 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution 1663 // 1664 // loc_ref: location of original task pragma (ignored) 1665 // gtid: Global Thread ID of encountering thread 1666 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task'' 1667 // Returns: 1668 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1669 // be resumed later. 1670 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1671 // resumed later. 
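// For reference, a rough sketch of the sequence a compiler might emit for a
// plain "#pragma omp task" using the entry points in this file (illustrative
// only; real codegen packs flags, firstprivates and shareds differently):
//
//   kmp_task_t *t = __kmpc_omp_task_alloc(&loc, gtid, /*flags=*/1 /* tied */,
//                                         sizeof(kmp_task_t) + priv_size,
//                                         shareds_size, &task_entry);
//   /* ... copy captured variables into t->shareds / the private part ... */
//   __kmpc_omp_task(&loc, gtid, t);
//
// __kmpc_omp_task_parts below is the thread-switchable variant of the same
// scheduling step.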
1672 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid, 1673 kmp_task_t *new_task) { 1674 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1675 1676 KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid, 1677 loc_ref, new_taskdata)); 1678 1679 #if OMPT_SUPPORT 1680 kmp_taskdata_t *parent; 1681 if (UNLIKELY(ompt_enabled.enabled)) { 1682 parent = new_taskdata->td_parent; 1683 if (ompt_enabled.ompt_callback_task_create) { 1684 ompt_data_t task_data = ompt_data_none; 1685 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 1686 parent ? &(parent->ompt_task_info.task_data) : &task_data, 1687 parent ? &(parent->ompt_task_info.frame) : NULL, 1688 &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0, 1689 OMPT_GET_RETURN_ADDRESS(0)); 1690 } 1691 } 1692 #endif 1693 1694 /* Should we execute the new task or queue it? For now, let's just always try 1695 to queue it. If the queue fills up, then we'll execute it. */ 1696 1697 if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer 1698 { // Execute this task immediately 1699 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 1700 new_taskdata->td_flags.task_serial = 1; 1701 __kmp_invoke_task(gtid, new_task, current_task); 1702 } 1703 1704 KA_TRACE( 1705 10, 1706 ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: " 1707 "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n", 1708 gtid, loc_ref, new_taskdata)); 1709 1710 ANNOTATE_HAPPENS_BEFORE(new_task); 1711 #if OMPT_SUPPORT 1712 if (UNLIKELY(ompt_enabled.enabled)) { 1713 parent->ompt_task_info.frame.enter_frame = ompt_data_none; 1714 } 1715 #endif 1716 return TASK_CURRENT_NOT_QUEUED; 1717 } 1718 1719 // __kmp_omp_task: Schedule a non-thread-switchable task for execution 1720 // 1721 // gtid: Global Thread ID of encountering thread 1722 // new_task:non-thread-switchable task thunk allocated by __kmp_omp_task_alloc() 1723 // serialize_immediate: if TRUE then if the task is executed immediately its 1724 // execution will be serialized 1725 // Returns: 1726 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1727 // be resumed later. 1728 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1729 // resumed later. 1730 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task, 1731 bool serialize_immediate) { 1732 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1733 1734 /* Should we execute the new task or queue it? For now, let's just always try 1735 to queue it. If the queue fills up, then we'll execute it. */ 1736 if (new_taskdata->td_flags.proxy == TASK_PROXY || 1737 __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer 1738 { // Execute this task immediately 1739 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 1740 if (serialize_immediate) 1741 new_taskdata->td_flags.task_serial = 1; 1742 __kmp_invoke_task(gtid, new_task, current_task); 1743 } 1744 1745 ANNOTATE_HAPPENS_BEFORE(new_task); 1746 return TASK_CURRENT_NOT_QUEUED; 1747 } 1748 1749 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a 1750 // non-thread-switchable task from the parent thread only! 
1751 // 1752 // loc_ref: location of original task pragma (ignored) 1753 // gtid: Global Thread ID of encountering thread 1754 // new_task: non-thread-switchable task thunk allocated by 1755 // __kmp_omp_task_alloc() 1756 // Returns: 1757 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1758 // be resumed later. 1759 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1760 // resumed later. 1761 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid, 1762 kmp_task_t *new_task) { 1763 kmp_int32 res; 1764 KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK); 1765 1766 #if KMP_DEBUG || OMPT_SUPPORT 1767 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1768 #endif 1769 KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref, 1770 new_taskdata)); 1771 __kmp_assert_valid_gtid(gtid); 1772 1773 #if OMPT_SUPPORT 1774 kmp_taskdata_t *parent = NULL; 1775 if (UNLIKELY(ompt_enabled.enabled)) { 1776 if (!new_taskdata->td_flags.started) { 1777 OMPT_STORE_RETURN_ADDRESS(gtid); 1778 parent = new_taskdata->td_parent; 1779 if (!parent->ompt_task_info.frame.enter_frame.ptr) { 1780 parent->ompt_task_info.frame.enter_frame.ptr = 1781 OMPT_GET_FRAME_ADDRESS(0); 1782 } 1783 if (ompt_enabled.ompt_callback_task_create) { 1784 ompt_data_t task_data = ompt_data_none; 1785 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 1786 parent ? &(parent->ompt_task_info.task_data) : &task_data, 1787 parent ? &(parent->ompt_task_info.frame) : NULL, 1788 &(new_taskdata->ompt_task_info.task_data), 1789 ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, 1790 OMPT_LOAD_RETURN_ADDRESS(gtid)); 1791 } 1792 } else { 1793 // We are scheduling the continuation of an UNTIED task. 1794 // Scheduling back to the parent task. 1795 __ompt_task_finish(new_task, 1796 new_taskdata->ompt_task_info.scheduling_parent, 1797 ompt_task_switch); 1798 new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none; 1799 } 1800 } 1801 #endif 1802 1803 res = __kmp_omp_task(gtid, new_task, true); 1804 1805 KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning " 1806 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n", 1807 gtid, loc_ref, new_taskdata)); 1808 #if OMPT_SUPPORT 1809 if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) { 1810 parent->ompt_task_info.frame.enter_frame = ompt_data_none; 1811 } 1812 #endif 1813 return res; 1814 } 1815 1816 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule 1817 // a taskloop task with the correct OMPT return address 1818 // 1819 // loc_ref: location of original task pragma (ignored) 1820 // gtid: Global Thread ID of encountering thread 1821 // new_task: non-thread-switchable task thunk allocated by 1822 // __kmp_omp_task_alloc() 1823 // codeptr_ra: return address for OMPT callback 1824 // Returns: 1825 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1826 // be resumed later. 1827 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1828 // resumed later. 
1829 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid, 1830 kmp_task_t *new_task, void *codeptr_ra) { 1831 kmp_int32 res; 1832 KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK); 1833 1834 #if KMP_DEBUG || OMPT_SUPPORT 1835 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1836 #endif 1837 KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref, 1838 new_taskdata)); 1839 1840 #if OMPT_SUPPORT 1841 kmp_taskdata_t *parent = NULL; 1842 if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) { 1843 parent = new_taskdata->td_parent; 1844 if (!parent->ompt_task_info.frame.enter_frame.ptr) 1845 parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1846 if (ompt_enabled.ompt_callback_task_create) { 1847 ompt_data_t task_data = ompt_data_none; 1848 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 1849 parent ? &(parent->ompt_task_info.task_data) : &task_data, 1850 parent ? &(parent->ompt_task_info.frame) : NULL, 1851 &(new_taskdata->ompt_task_info.task_data), 1852 ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, 1853 codeptr_ra); 1854 } 1855 } 1856 #endif 1857 1858 res = __kmp_omp_task(gtid, new_task, true); 1859 1860 KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning " 1861 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n", 1862 gtid, loc_ref, new_taskdata)); 1863 #if OMPT_SUPPORT 1864 if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) { 1865 parent->ompt_task_info.frame.enter_frame = ompt_data_none; 1866 } 1867 #endif 1868 return res; 1869 } 1870 1871 template <bool ompt> 1872 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid, 1873 void *frame_address, 1874 void *return_address) { 1875 kmp_taskdata_t *taskdata; 1876 kmp_info_t *thread; 1877 int thread_finished = FALSE; 1878 KMP_SET_THREAD_STATE_BLOCK(TASKWAIT); 1879 1880 KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref)); 1881 KMP_DEBUG_ASSERT(gtid >= 0); 1882 1883 if (__kmp_tasking_mode != tskm_immediate_exec) { 1884 thread = __kmp_threads[gtid]; 1885 taskdata = thread->th.th_current_task; 1886 1887 #if OMPT_SUPPORT && OMPT_OPTIONAL 1888 ompt_data_t *my_task_data; 1889 ompt_data_t *my_parallel_data; 1890 1891 if (ompt) { 1892 my_task_data = &(taskdata->ompt_task_info.task_data); 1893 my_parallel_data = OMPT_CUR_TEAM_DATA(thread); 1894 1895 taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address; 1896 1897 if (ompt_enabled.ompt_callback_sync_region) { 1898 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 1899 ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data, 1900 my_task_data, return_address); 1901 } 1902 1903 if (ompt_enabled.ompt_callback_sync_region_wait) { 1904 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 1905 ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data, 1906 my_task_data, return_address); 1907 } 1908 } 1909 #endif // OMPT_SUPPORT && OMPT_OPTIONAL 1910 1911 // Debugger: The taskwait is active. Store location and thread encountered the 1912 // taskwait. 1913 #if USE_ITT_BUILD 1914 // Note: These values are used by ITT events as well. 
1915 #endif /* USE_ITT_BUILD */ 1916 taskdata->td_taskwait_counter += 1; 1917 taskdata->td_taskwait_ident = loc_ref; 1918 taskdata->td_taskwait_thread = gtid + 1; 1919 1920 #if USE_ITT_BUILD 1921 void *itt_sync_obj = NULL; 1922 #if USE_ITT_NOTIFY 1923 KMP_ITT_TASKWAIT_STARTING(itt_sync_obj); 1924 #endif /* USE_ITT_NOTIFY */ 1925 #endif /* USE_ITT_BUILD */ 1926 1927 bool must_wait = 1928 !taskdata->td_flags.team_serial && !taskdata->td_flags.final; 1929 1930 must_wait = must_wait || (thread->th.th_task_team != NULL && 1931 thread->th.th_task_team->tt.tt_found_proxy_tasks); 1932 // If hidden helper thread is encountered, we must enable wait here. 1933 must_wait = 1934 must_wait || 1935 (__kmp_enable_hidden_helper && thread->th.th_task_team != NULL && 1936 thread->th.th_task_team->tt.tt_hidden_helper_task_encountered); 1937 1938 if (must_wait) { 1939 kmp_flag_32<false, false> flag( 1940 RCAST(std::atomic<kmp_uint32> *, 1941 &(taskdata->td_incomplete_child_tasks)), 1942 0U); 1943 while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) { 1944 flag.execute_tasks(thread, gtid, FALSE, 1945 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 1946 __kmp_task_stealing_constraint); 1947 } 1948 } 1949 #if USE_ITT_BUILD 1950 KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj); 1951 KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children 1952 #endif /* USE_ITT_BUILD */ 1953 1954 // Debugger: The taskwait is completed. Location remains, but thread is 1955 // negated. 1956 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; 1957 1958 #if OMPT_SUPPORT && OMPT_OPTIONAL 1959 if (ompt) { 1960 if (ompt_enabled.ompt_callback_sync_region_wait) { 1961 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 1962 ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data, 1963 my_task_data, return_address); 1964 } 1965 if (ompt_enabled.ompt_callback_sync_region) { 1966 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 1967 ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data, 1968 my_task_data, return_address); 1969 } 1970 taskdata->ompt_task_info.frame.enter_frame = ompt_data_none; 1971 } 1972 #endif // OMPT_SUPPORT && OMPT_OPTIONAL 1973 1974 ANNOTATE_HAPPENS_AFTER(taskdata); 1975 } 1976 1977 KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, " 1978 "returning TASK_CURRENT_NOT_QUEUED\n", 1979 gtid, taskdata)); 1980 1981 return TASK_CURRENT_NOT_QUEUED; 1982 } 1983 1984 #if OMPT_SUPPORT && OMPT_OPTIONAL 1985 OMPT_NOINLINE 1986 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid, 1987 void *frame_address, 1988 void *return_address) { 1989 return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address, 1990 return_address); 1991 } 1992 #endif // OMPT_SUPPORT && OMPT_OPTIONAL 1993 1994 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are 1995 // complete 1996 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) { 1997 #if OMPT_SUPPORT && OMPT_OPTIONAL 1998 if (UNLIKELY(ompt_enabled.enabled)) { 1999 OMPT_STORE_RETURN_ADDRESS(gtid); 2000 return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0), 2001 OMPT_LOAD_RETURN_ADDRESS(gtid)); 2002 } 2003 #endif 2004 return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL); 2005 } 2006 2007 // __kmpc_omp_taskyield: switch to a different task 2008 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) { 2009 kmp_taskdata_t *taskdata; 2010 kmp_info_t *thread; 2011 int thread_finished = FALSE; 2012 2013 
KMP_COUNT_BLOCK(OMP_TASKYIELD); 2014 KMP_SET_THREAD_STATE_BLOCK(TASKYIELD); 2015 2016 KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n", 2017 gtid, loc_ref, end_part)); 2018 __kmp_assert_valid_gtid(gtid); 2019 2020 if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) { 2021 thread = __kmp_threads[gtid]; 2022 taskdata = thread->th.th_current_task; 2023 // Should we model this as a task wait or not? 2024 // Debugger: The taskwait is active. Store location and thread encountered the 2025 // taskwait. 2026 #if USE_ITT_BUILD 2027 // Note: These values are used by ITT events as well. 2028 #endif /* USE_ITT_BUILD */ 2029 taskdata->td_taskwait_counter += 1; 2030 taskdata->td_taskwait_ident = loc_ref; 2031 taskdata->td_taskwait_thread = gtid + 1; 2032 2033 #if USE_ITT_BUILD 2034 void *itt_sync_obj = NULL; 2035 #if USE_ITT_NOTIFY 2036 KMP_ITT_TASKWAIT_STARTING(itt_sync_obj); 2037 #endif /* USE_ITT_NOTIFY */ 2038 #endif /* USE_ITT_BUILD */ 2039 if (!taskdata->td_flags.team_serial) { 2040 kmp_task_team_t *task_team = thread->th.th_task_team; 2041 if (task_team != NULL) { 2042 if (KMP_TASKING_ENABLED(task_team)) { 2043 #if OMPT_SUPPORT 2044 if (UNLIKELY(ompt_enabled.enabled)) 2045 thread->th.ompt_thread_info.ompt_task_yielded = 1; 2046 #endif 2047 __kmp_execute_tasks_32( 2048 thread, gtid, (kmp_flag_32<> *)NULL, FALSE, 2049 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 2050 __kmp_task_stealing_constraint); 2051 #if OMPT_SUPPORT 2052 if (UNLIKELY(ompt_enabled.enabled)) 2053 thread->th.ompt_thread_info.ompt_task_yielded = 0; 2054 #endif 2055 } 2056 } 2057 } 2058 #if USE_ITT_BUILD 2059 KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj); 2060 #endif /* USE_ITT_BUILD */ 2061 2062 // Debugger: The taskwait is completed. Location remains, but thread is 2063 // negated. 2064 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; 2065 } 2066 2067 KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, " 2068 "returning TASK_CURRENT_NOT_QUEUED\n", 2069 gtid, taskdata)); 2070 2071 return TASK_CURRENT_NOT_QUEUED; 2072 } 2073 2074 // Task Reduction implementation 2075 // 2076 // Note: initial implementation didn't take into account the possibility 2077 // to specify omp_orig for initializer of the UDR (user defined reduction). 2078 // Corrected implementation takes into account the omp_orig object. 2079 // Compiler is free to use old implementation if omp_orig is not specified. 2080 2081 /*! 2082 @ingroup BASIC_TYPES 2083 @{ 2084 */ 2085 2086 /*! 2087 Flags for special info per task reduction item. 2088 */ 2089 typedef struct kmp_taskred_flags { 2090 /*! 1 - use lazy alloc/init (e.g. big objects, #tasks < #threads) */ 2091 unsigned lazy_priv : 1; 2092 unsigned reserved31 : 31; 2093 } kmp_taskred_flags_t; 2094 2095 /*! 2096 Internal struct for reduction data item related info set up by compiler. 2097 */ 2098 typedef struct kmp_task_red_input { 2099 void *reduce_shar; /**< shared between tasks item to reduce into */ 2100 size_t reduce_size; /**< size of data item in bytes */ 2101 // three compiler-generated routines (init, fini are optional): 2102 void *reduce_init; /**< data initialization routine (single parameter) */ 2103 void *reduce_fini; /**< data finalization routine */ 2104 void *reduce_comb; /**< data combiner routine */ 2105 kmp_taskred_flags_t flags; /**< flags for additional info from compiler */ 2106 } kmp_task_red_input_t; 2107 2108 /*! 2109 Internal struct for reduction data item related info saved by the library. 
2110 */ 2111 typedef struct kmp_taskred_data { 2112 void *reduce_shar; /**< shared between tasks item to reduce into */ 2113 size_t reduce_size; /**< size of data item */ 2114 kmp_taskred_flags_t flags; /**< flags for additional info from compiler */ 2115 void *reduce_priv; /**< array of thread specific items */ 2116 void *reduce_pend; /**< end of private data for faster comparison op */ 2117 // three compiler-generated routines (init, fini are optional): 2118 void *reduce_comb; /**< data combiner routine */ 2119 void *reduce_init; /**< data initialization routine (two parameters) */ 2120 void *reduce_fini; /**< data finalization routine */ 2121 void *reduce_orig; /**< original item (can be used in UDR initializer) */ 2122 } kmp_taskred_data_t; 2123 2124 /*! 2125 Internal struct for reduction data item related info set up by compiler. 2126 2127 New interface: added reduce_orig field to provide omp_orig for UDR initializer. 2128 */ 2129 typedef struct kmp_taskred_input { 2130 void *reduce_shar; /**< shared between tasks item to reduce into */ 2131 void *reduce_orig; /**< original reduction item used for initialization */ 2132 size_t reduce_size; /**< size of data item */ 2133 // three compiler-generated routines (init, fini are optional): 2134 void *reduce_init; /**< data initialization routine (two parameters) */ 2135 void *reduce_fini; /**< data finalization routine */ 2136 void *reduce_comb; /**< data combiner routine */ 2137 kmp_taskred_flags_t flags; /**< flags for additional info from compiler */ 2138 } kmp_taskred_input_t; 2139 /*! 2140 @} 2141 */ 2142 2143 template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src); 2144 template <> 2145 void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item, 2146 kmp_task_red_input_t &src) { 2147 item.reduce_orig = NULL; 2148 } 2149 template <> 2150 void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item, 2151 kmp_taskred_input_t &src) { 2152 if (src.reduce_orig != NULL) { 2153 item.reduce_orig = src.reduce_orig; 2154 } else { 2155 item.reduce_orig = src.reduce_shar; 2156 } // non-NULL reduce_orig means new interface used 2157 } 2158 2159 template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, size_t j); 2160 template <> 2161 void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item, 2162 size_t offset) { 2163 ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset); 2164 } 2165 template <> 2166 void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item, 2167 size_t offset) { 2168 ((void (*)(void *, void *))item.reduce_init)( 2169 (char *)(item.reduce_priv) + offset, item.reduce_orig); 2170 } 2171 2172 template <typename T> 2173 void *__kmp_task_reduction_init(int gtid, int num, T *data) { 2174 __kmp_assert_valid_gtid(gtid); 2175 kmp_info_t *thread = __kmp_threads[gtid]; 2176 kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup; 2177 kmp_uint32 nth = thread->th.th_team_nproc; 2178 kmp_taskred_data_t *arr; 2179 2180 // check input data just in case 2181 KMP_ASSERT(tg != NULL); 2182 KMP_ASSERT(data != NULL); 2183 KMP_ASSERT(num > 0); 2184 if (nth == 1) { 2185 KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n", 2186 gtid, tg)); 2187 return (void *)tg; 2188 } 2189 KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n", 2190 gtid, tg, num)); 2191 arr = (kmp_taskred_data_t *)__kmp_thread_malloc( 2192 thread, num * sizeof(kmp_taskred_data_t)); 2193 for (int i = 0; i < num; ++i) { 2194 size_t size = 
data[i].reduce_size - 1; 2195 // round the size up to cache line per thread-specific item 2196 size += CACHE_LINE - size % CACHE_LINE; 2197 KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory 2198 arr[i].reduce_shar = data[i].reduce_shar; 2199 arr[i].reduce_size = size; 2200 arr[i].flags = data[i].flags; 2201 arr[i].reduce_comb = data[i].reduce_comb; 2202 arr[i].reduce_init = data[i].reduce_init; 2203 arr[i].reduce_fini = data[i].reduce_fini; 2204 __kmp_assign_orig<T>(arr[i], data[i]); 2205 if (!arr[i].flags.lazy_priv) { 2206 // allocate cache-line aligned block and fill it with zeros 2207 arr[i].reduce_priv = __kmp_allocate(nth * size); 2208 arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size; 2209 if (arr[i].reduce_init != NULL) { 2210 // initialize all thread-specific items 2211 for (size_t j = 0; j < nth; ++j) { 2212 __kmp_call_init<T>(arr[i], j * size); 2213 } 2214 } 2215 } else { 2216 // only allocate space for pointers now, 2217 // objects will be lazily allocated/initialized if/when requested 2218 // note that __kmp_allocate zeroes the allocated memory 2219 arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *)); 2220 } 2221 } 2222 tg->reduce_data = (void *)arr; 2223 tg->reduce_num_data = num; 2224 return (void *)tg; 2225 } 2226 2227 /*! 2228 @ingroup TASKING 2229 @param gtid Global thread ID 2230 @param num Number of data items to reduce 2231 @param data Array of data for reduction 2232 @return The taskgroup identifier 2233 2234 Initialize task reduction for the taskgroup. 2235 2236 Note: this entry supposes the optional compiler-generated initializer routine 2237 has single parameter - pointer to object to be initialized. That means 2238 the reduction either does not use omp_orig object, or the omp_orig is accessible 2239 without help of the runtime library. 2240 */ 2241 void *__kmpc_task_reduction_init(int gtid, int num, void *data) { 2242 return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data); 2243 } 2244 2245 /*! 2246 @ingroup TASKING 2247 @param gtid Global thread ID 2248 @param num Number of data items to reduce 2249 @param data Array of data for reduction 2250 @return The taskgroup identifier 2251 2252 Initialize task reduction for the taskgroup. 2253 2254 Note: this entry supposes the optional compiler-generated initializer routine 2255 has two parameters, pointer to object to be initialized and pointer to omp_orig 2256 */ 2257 void *__kmpc_taskred_init(int gtid, int num, void *data) { 2258 return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data); 2259 } 2260 2261 // Copy task reduction data (except for shared pointers). 2262 template <typename T> 2263 void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data, 2264 kmp_taskgroup_t *tg, void *reduce_data) { 2265 kmp_taskred_data_t *arr; 2266 KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p," 2267 " from data %p\n", 2268 thr, tg, reduce_data)); 2269 arr = (kmp_taskred_data_t *)__kmp_thread_malloc( 2270 thr, num * sizeof(kmp_taskred_data_t)); 2271 // threads will share private copies, thunk routines, sizes, flags, etc.: 2272 KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t)); 2273 for (int i = 0; i < num; ++i) { 2274 arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers 2275 } 2276 tg->reduce_data = (void *)arr; 2277 tg->reduce_num_data = num; 2278 } 2279 2280 /*! 
2281 @ingroup TASKING 2282 @param gtid Global thread ID 2283 @param tskgrp The taskgroup ID (optional) 2284 @param data Shared location of the item 2285 @return The pointer to per-thread data 2286 2287 Get thread-specific location of data item 2288 */ 2289 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) { 2290 __kmp_assert_valid_gtid(gtid); 2291 kmp_info_t *thread = __kmp_threads[gtid]; 2292 kmp_int32 nth = thread->th.th_team_nproc; 2293 if (nth == 1) 2294 return data; // nothing to do 2295 2296 kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp; 2297 if (tg == NULL) 2298 tg = thread->th.th_current_task->td_taskgroup; 2299 KMP_ASSERT(tg != NULL); 2300 kmp_taskred_data_t *arr = (kmp_taskred_data_t *)(tg->reduce_data); 2301 kmp_int32 num = tg->reduce_num_data; 2302 kmp_int32 tid = thread->th.th_info.ds.ds_tid; 2303 2304 KMP_ASSERT(data != NULL); 2305 while (tg != NULL) { 2306 for (int i = 0; i < num; ++i) { 2307 if (!arr[i].flags.lazy_priv) { 2308 if (data == arr[i].reduce_shar || 2309 (data >= arr[i].reduce_priv && data < arr[i].reduce_pend)) 2310 return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size; 2311 } else { 2312 // check shared location first 2313 void **p_priv = (void **)(arr[i].reduce_priv); 2314 if (data == arr[i].reduce_shar) 2315 goto found; 2316 // check if we get some thread specific location as parameter 2317 for (int j = 0; j < nth; ++j) 2318 if (data == p_priv[j]) 2319 goto found; 2320 continue; // not found, continue search 2321 found: 2322 if (p_priv[tid] == NULL) { 2323 // allocate thread specific object lazily 2324 p_priv[tid] = __kmp_allocate(arr[i].reduce_size); 2325 if (arr[i].reduce_init != NULL) { 2326 if (arr[i].reduce_orig != NULL) { // new interface 2327 ((void (*)(void *, void *))arr[i].reduce_init)( 2328 p_priv[tid], arr[i].reduce_orig); 2329 } else { // old interface (single parameter) 2330 ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]); 2331 } 2332 } 2333 } 2334 return p_priv[tid]; 2335 } 2336 } 2337 tg = tg->parent; 2338 arr = (kmp_taskred_data_t *)(tg->reduce_data); 2339 num = tg->reduce_num_data; 2340 } 2341 KMP_ASSERT2(0, "Unknown task reduction item"); 2342 return NULL; // ERROR, this line never executed 2343 } 2344 2345 // Finalize task reduction. 
2346 // Called from __kmpc_end_taskgroup() 2347 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) { 2348 kmp_int32 nth = th->th.th_team_nproc; 2349 KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1 2350 kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data; 2351 kmp_int32 num = tg->reduce_num_data; 2352 for (int i = 0; i < num; ++i) { 2353 void *sh_data = arr[i].reduce_shar; 2354 void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini); 2355 void (*f_comb)(void *, void *) = 2356 (void (*)(void *, void *))(arr[i].reduce_comb); 2357 if (!arr[i].flags.lazy_priv) { 2358 void *pr_data = arr[i].reduce_priv; 2359 size_t size = arr[i].reduce_size; 2360 for (int j = 0; j < nth; ++j) { 2361 void *priv_data = (char *)pr_data + j * size; 2362 f_comb(sh_data, priv_data); // combine results 2363 if (f_fini) 2364 f_fini(priv_data); // finalize if needed 2365 } 2366 } else { 2367 void **pr_data = (void **)(arr[i].reduce_priv); 2368 for (int j = 0; j < nth; ++j) { 2369 if (pr_data[j] != NULL) { 2370 f_comb(sh_data, pr_data[j]); // combine results 2371 if (f_fini) 2372 f_fini(pr_data[j]); // finalize if needed 2373 __kmp_free(pr_data[j]); 2374 } 2375 } 2376 } 2377 __kmp_free(arr[i].reduce_priv); 2378 } 2379 __kmp_thread_free(th, arr); 2380 tg->reduce_data = NULL; 2381 tg->reduce_num_data = 0; 2382 } 2383 2384 // Cleanup task reduction data for parallel or worksharing, 2385 // do not touch task private data other threads still working with. 2386 // Called from __kmpc_end_taskgroup() 2387 static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) { 2388 __kmp_thread_free(th, tg->reduce_data); 2389 tg->reduce_data = NULL; 2390 tg->reduce_num_data = 0; 2391 } 2392 2393 template <typename T> 2394 void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws, 2395 int num, T *data) { 2396 __kmp_assert_valid_gtid(gtid); 2397 kmp_info_t *thr = __kmp_threads[gtid]; 2398 kmp_int32 nth = thr->th.th_team_nproc; 2399 __kmpc_taskgroup(loc, gtid); // form new taskgroup first 2400 if (nth == 1) { 2401 KA_TRACE(10, 2402 ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n", 2403 gtid, thr->th.th_current_task->td_taskgroup)); 2404 return (void *)thr->th.th_current_task->td_taskgroup; 2405 } 2406 kmp_team_t *team = thr->th.th_team; 2407 void *reduce_data; 2408 kmp_taskgroup_t *tg; 2409 reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]); 2410 if (reduce_data == NULL && 2411 __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data, 2412 (void *)1)) { 2413 // single thread enters this block to initialize common reduction data 2414 KMP_DEBUG_ASSERT(reduce_data == NULL); 2415 // first initialize own data, then make a copy other threads can use 2416 tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data); 2417 reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t)); 2418 KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t)); 2419 // fini counters should be 0 at this point 2420 KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0); 2421 KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0); 2422 KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data); 2423 } else { 2424 while ( 2425 (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) == 2426 (void *)1) { // wait for task reduction initialization 2427 KMP_CPU_PAUSE(); 2428 } 2429 KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer 
here 2430 tg = thr->th.th_current_task->td_taskgroup; 2431 __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data); 2432 } 2433 return tg; 2434 } 2435 2436 /*! 2437 @ingroup TASKING 2438 @param loc Source location info 2439 @param gtid Global thread ID 2440 @param is_ws Is 1 if the reduction is for worksharing, 0 otherwise 2441 @param num Number of data items to reduce 2442 @param data Array of data for reduction 2443 @return The taskgroup identifier 2444 2445 Initialize task reduction for a parallel or worksharing. 2446 2447 Note: this entry supposes the optional compiler-generated initializer routine 2448 has single parameter - pointer to object to be initialized. That means 2449 the reduction either does not use omp_orig object, or the omp_orig is accessible 2450 without help of the runtime library. 2451 */ 2452 void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws, 2453 int num, void *data) { 2454 return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num, 2455 (kmp_task_red_input_t *)data); 2456 } 2457 2458 /*! 2459 @ingroup TASKING 2460 @param loc Source location info 2461 @param gtid Global thread ID 2462 @param is_ws Is 1 if the reduction is for worksharing, 0 otherwise 2463 @param num Number of data items to reduce 2464 @param data Array of data for reduction 2465 @return The taskgroup identifier 2466 2467 Initialize task reduction for a parallel or worksharing. 2468 2469 Note: this entry supposes the optional compiler-generated initializer routine 2470 has two parameters, pointer to object to be initialized and pointer to omp_orig 2471 */ 2472 void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num, 2473 void *data) { 2474 return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num, 2475 (kmp_taskred_input_t *)data); 2476 } 2477 2478 /*! 2479 @ingroup TASKING 2480 @param loc Source location info 2481 @param gtid Global thread ID 2482 @param is_ws Is 1 if the reduction is for worksharing, 0 otherwise 2483 2484 Finalize task reduction for a parallel or worksharing. 2485 */ 2486 void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) { 2487 __kmpc_end_taskgroup(loc, gtid); 2488 } 2489 2490 // __kmpc_taskgroup: Start a new taskgroup 2491 void __kmpc_taskgroup(ident_t *loc, int gtid) { 2492 __kmp_assert_valid_gtid(gtid); 2493 kmp_info_t *thread = __kmp_threads[gtid]; 2494 kmp_taskdata_t *taskdata = thread->th.th_current_task; 2495 kmp_taskgroup_t *tg_new = 2496 (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t)); 2497 KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new)); 2498 KMP_ATOMIC_ST_RLX(&tg_new->count, 0); 2499 KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq); 2500 tg_new->parent = taskdata->td_taskgroup; 2501 tg_new->reduce_data = NULL; 2502 tg_new->reduce_num_data = 0; 2503 taskdata->td_taskgroup = tg_new; 2504 2505 #if OMPT_SUPPORT && OMPT_OPTIONAL 2506 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) { 2507 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2508 if (!codeptr) 2509 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2510 kmp_team_t *team = thread->th.th_team; 2511 ompt_data_t my_task_data = taskdata->ompt_task_info.task_data; 2512 // FIXME: I think this is wrong for lwt! 
2513 ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data; 2514 2515 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 2516 ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data), 2517 &(my_task_data), codeptr); 2518 } 2519 #endif 2520 } 2521 2522 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task 2523 // and its descendants are complete 2524 void __kmpc_end_taskgroup(ident_t *loc, int gtid) { 2525 __kmp_assert_valid_gtid(gtid); 2526 kmp_info_t *thread = __kmp_threads[gtid]; 2527 kmp_taskdata_t *taskdata = thread->th.th_current_task; 2528 kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup; 2529 int thread_finished = FALSE; 2530 2531 #if OMPT_SUPPORT && OMPT_OPTIONAL 2532 kmp_team_t *team; 2533 ompt_data_t my_task_data; 2534 ompt_data_t my_parallel_data; 2535 void *codeptr; 2536 if (UNLIKELY(ompt_enabled.enabled)) { 2537 team = thread->th.th_team; 2538 my_task_data = taskdata->ompt_task_info.task_data; 2539 // FIXME: I think this is wrong for lwt! 2540 my_parallel_data = team->t.ompt_team_info.parallel_data; 2541 codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2542 if (!codeptr) 2543 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2544 } 2545 #endif 2546 2547 KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc)); 2548 KMP_DEBUG_ASSERT(taskgroup != NULL); 2549 KMP_SET_THREAD_STATE_BLOCK(TASKGROUP); 2550 2551 if (__kmp_tasking_mode != tskm_immediate_exec) { 2552 // mark task as waiting not on a barrier 2553 taskdata->td_taskwait_counter += 1; 2554 taskdata->td_taskwait_ident = loc; 2555 taskdata->td_taskwait_thread = gtid + 1; 2556 #if USE_ITT_BUILD 2557 // For ITT the taskgroup wait is similar to taskwait until we need to 2558 // distinguish them 2559 void *itt_sync_obj = NULL; 2560 #if USE_ITT_NOTIFY 2561 KMP_ITT_TASKWAIT_STARTING(itt_sync_obj); 2562 #endif /* USE_ITT_NOTIFY */ 2563 #endif /* USE_ITT_BUILD */ 2564 2565 #if OMPT_SUPPORT && OMPT_OPTIONAL 2566 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) { 2567 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 2568 ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data), 2569 &(my_task_data), codeptr); 2570 } 2571 #endif 2572 2573 if (!taskdata->td_flags.team_serial || 2574 (thread->th.th_task_team != NULL && 2575 thread->th.th_task_team->tt.tt_found_proxy_tasks)) { 2576 kmp_flag_32<false, false> flag( 2577 RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 0U); 2578 while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) { 2579 flag.execute_tasks(thread, gtid, FALSE, 2580 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 2581 __kmp_task_stealing_constraint); 2582 } 2583 } 2584 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting 2585 2586 #if OMPT_SUPPORT && OMPT_OPTIONAL 2587 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) { 2588 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 2589 ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data), 2590 &(my_task_data), codeptr); 2591 } 2592 #endif 2593 2594 #if USE_ITT_BUILD 2595 KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj); 2596 KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants 2597 #endif /* USE_ITT_BUILD */ 2598 } 2599 KMP_DEBUG_ASSERT(taskgroup->count == 0); 2600 2601 if (taskgroup->reduce_data != NULL) { // need to reduce? 
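    // Three finalization paths follow: (1) reduce_data was installed by a
    // parallel-scope modifier (t_tg_reduce_data[0]), (2) by a worksharing-scope
    // modifier (t_tg_reduce_data[1]), or (3) it belongs to a plain taskgroup
    // reduction. For (1) and (2) only the last thread arriving here finalizes
    // and frees the shared copy; the other threads just clean their own copy.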
2602 int cnt; 2603 void *reduce_data; 2604 kmp_team_t *t = thread->th.th_team; 2605 kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data; 2606 // check if <priv> data of the first reduction variable shared for the team 2607 void *priv0 = arr[0].reduce_priv; 2608 if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL && 2609 ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) { 2610 // finishing task reduction on parallel 2611 cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]); 2612 if (cnt == thread->th.th_team_nproc - 1) { 2613 // we are the last thread passing __kmpc_reduction_modifier_fini() 2614 // finalize task reduction: 2615 __kmp_task_reduction_fini(thread, taskgroup); 2616 // cleanup fields in the team structure: 2617 // TODO: is relaxed store enough here (whole barrier should follow)? 2618 __kmp_thread_free(thread, reduce_data); 2619 KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL); 2620 KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0); 2621 } else { 2622 // we are not the last thread passing __kmpc_reduction_modifier_fini(), 2623 // so do not finalize reduction, just clean own copy of the data 2624 __kmp_task_reduction_clean(thread, taskgroup); 2625 } 2626 } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) != 2627 NULL && 2628 ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) { 2629 // finishing task reduction on worksharing 2630 cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]); 2631 if (cnt == thread->th.th_team_nproc - 1) { 2632 // we are the last thread passing __kmpc_reduction_modifier_fini() 2633 __kmp_task_reduction_fini(thread, taskgroup); 2634 // cleanup fields in team structure: 2635 // TODO: is relaxed store enough here (whole barrier should follow)? 2636 __kmp_thread_free(thread, reduce_data); 2637 KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL); 2638 KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0); 2639 } else { 2640 // we are not the last thread passing __kmpc_reduction_modifier_fini(), 2641 // so do not finalize reduction, just clean own copy of the data 2642 __kmp_task_reduction_clean(thread, taskgroup); 2643 } 2644 } else { 2645 // finishing task reduction on taskgroup 2646 __kmp_task_reduction_fini(thread, taskgroup); 2647 } 2648 } 2649 // Restore parent taskgroup for the current task 2650 taskdata->td_taskgroup = taskgroup->parent; 2651 __kmp_thread_free(thread, taskgroup); 2652 2653 KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n", 2654 gtid, taskdata)); 2655 ANNOTATE_HAPPENS_AFTER(taskdata); 2656 2657 #if OMPT_SUPPORT && OMPT_OPTIONAL 2658 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) { 2659 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 2660 ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data), 2661 &(my_task_data), codeptr); 2662 } 2663 #endif 2664 } 2665 2666 // __kmp_remove_my_task: remove a task from my own deque 2667 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid, 2668 kmp_task_team_t *task_team, 2669 kmp_int32 is_constrained) { 2670 kmp_task_t *task; 2671 kmp_taskdata_t *taskdata; 2672 kmp_thread_data_t *thread_data; 2673 kmp_uint32 tail; 2674 2675 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 2676 KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data != 2677 NULL); // Caller should check this condition 2678 2679 thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)]; 2680 2681 KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n", 2682 gtid, 
thread_data->td.td_deque_ntasks, 2683 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2684 2685 if (TCR_4(thread_data->td.td_deque_ntasks) == 0) { 2686 KA_TRACE(10, 2687 ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: " 2688 "ntasks=%d head=%u tail=%u\n", 2689 gtid, thread_data->td.td_deque_ntasks, 2690 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2691 return NULL; 2692 } 2693 2694 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 2695 2696 if (TCR_4(thread_data->td.td_deque_ntasks) == 0) { 2697 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2698 KA_TRACE(10, 2699 ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: " 2700 "ntasks=%d head=%u tail=%u\n", 2701 gtid, thread_data->td.td_deque_ntasks, 2702 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2703 return NULL; 2704 } 2705 2706 tail = (thread_data->td.td_deque_tail - 1) & 2707 TASK_DEQUE_MASK(thread_data->td); // Wrap index. 2708 taskdata = thread_data->td.td_deque[tail]; 2709 2710 if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata, 2711 thread->th.th_current_task)) { 2712 // The TSC does not allow to steal victim task 2713 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2714 KA_TRACE(10, 2715 ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: " 2716 "ntasks=%d head=%u tail=%u\n", 2717 gtid, thread_data->td.td_deque_ntasks, 2718 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2719 return NULL; 2720 } 2721 2722 thread_data->td.td_deque_tail = tail; 2723 TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1); 2724 2725 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2726 2727 KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: " 2728 "ntasks=%d head=%u tail=%u\n", 2729 gtid, taskdata, thread_data->td.td_deque_ntasks, 2730 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2731 2732 task = KMP_TASKDATA_TO_TASK(taskdata); 2733 return task; 2734 } 2735 2736 // __kmp_steal_task: remove a task from another thread's deque 2737 // Assume that calling thread has already checked existence of 2738 // task_team thread_data before calling this routine. 
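// The deques follow the usual work-stealing discipline: the owning thread pops
// from the tail (__kmp_remove_my_task above), while thieves take from the head
// here; both operations are performed under the victim's td_deque_lock.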
2739 static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid, 2740 kmp_task_team_t *task_team, 2741 std::atomic<kmp_int32> *unfinished_threads, 2742 int *thread_finished, 2743 kmp_int32 is_constrained) { 2744 kmp_task_t *task; 2745 kmp_taskdata_t *taskdata; 2746 kmp_taskdata_t *current; 2747 kmp_thread_data_t *victim_td, *threads_data; 2748 kmp_int32 target; 2749 kmp_int32 victim_tid; 2750 2751 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 2752 2753 threads_data = task_team->tt.tt_threads_data; 2754 KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition 2755 2756 victim_tid = victim_thr->th.th_info.ds.ds_tid; 2757 victim_td = &threads_data[victim_tid]; 2758 2759 KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: " 2760 "task_team=%p ntasks=%d head=%u tail=%u\n", 2761 gtid, __kmp_gtid_from_thread(victim_thr), task_team, 2762 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head, 2763 victim_td->td.td_deque_tail)); 2764 2765 if (TCR_4(victim_td->td.td_deque_ntasks) == 0) { 2766 KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: " 2767 "task_team=%p ntasks=%d head=%u tail=%u\n", 2768 gtid, __kmp_gtid_from_thread(victim_thr), task_team, 2769 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head, 2770 victim_td->td.td_deque_tail)); 2771 return NULL; 2772 } 2773 2774 __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock); 2775 2776 int ntasks = TCR_4(victim_td->td.td_deque_ntasks); 2777 // Check again after we acquire the lock 2778 if (ntasks == 0) { 2779 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); 2780 KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: " 2781 "task_team=%p ntasks=%d head=%u tail=%u\n", 2782 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks, 2783 victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); 2784 return NULL; 2785 } 2786 2787 KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL); 2788 current = __kmp_threads[gtid]->th.th_current_task; 2789 taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head]; 2790 if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) { 2791 // Bump head pointer and Wrap. 
2792 victim_td->td.td_deque_head = 2793 (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td); 2794 } else { 2795 if (!task_team->tt.tt_untied_task_encountered) { 2796 // The TSC does not allow to steal victim task 2797 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); 2798 KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from " 2799 "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n", 2800 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks, 2801 victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); 2802 return NULL; 2803 } 2804 int i; 2805 // walk through victim's deque trying to steal any task 2806 target = victim_td->td.td_deque_head; 2807 taskdata = NULL; 2808 for (i = 1; i < ntasks; ++i) { 2809 target = (target + 1) & TASK_DEQUE_MASK(victim_td->td); 2810 taskdata = victim_td->td.td_deque[target]; 2811 if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) { 2812 break; // found victim task 2813 } else { 2814 taskdata = NULL; 2815 } 2816 } 2817 if (taskdata == NULL) { 2818 // No appropriate candidate to steal found 2819 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); 2820 KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from " 2821 "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n", 2822 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks, 2823 victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); 2824 return NULL; 2825 } 2826 int prev = target; 2827 for (i = i + 1; i < ntasks; ++i) { 2828 // shift remaining tasks in the deque left by 1 2829 target = (target + 1) & TASK_DEQUE_MASK(victim_td->td); 2830 victim_td->td.td_deque[prev] = victim_td->td.td_deque[target]; 2831 prev = target; 2832 } 2833 KMP_DEBUG_ASSERT( 2834 victim_td->td.td_deque_tail == 2835 (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td))); 2836 victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped) 2837 } 2838 if (*thread_finished) { 2839 // We need to un-mark this victim as a finished victim. This must be done 2840 // before releasing the lock, or else other threads (starting with the 2841 // master victim) might be prematurely released from the barrier!!! 2842 kmp_int32 count; 2843 2844 count = KMP_ATOMIC_INC(unfinished_threads); 2845 2846 KA_TRACE( 2847 20, 2848 ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n", 2849 gtid, count + 1, task_team)); 2850 2851 *thread_finished = FALSE; 2852 } 2853 TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1); 2854 2855 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); 2856 2857 KMP_COUNT_BLOCK(TASK_stolen); 2858 KA_TRACE(10, 2859 ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: " 2860 "task_team=%p ntasks=%d head=%u tail=%u\n", 2861 gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team, 2862 ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); 2863 2864 task = KMP_TASKDATA_TO_TASK(taskdata); 2865 return task; 2866 } 2867 2868 // __kmp_execute_tasks_template: Choose and execute tasks until either the 2869 // condition is satisfied (return true) or there are none left (return false). 2870 // 2871 // final_spin is TRUE if this is the spin at the release barrier. 2872 // thread_finished indicates whether the thread is finished executing all 2873 // the tasks it has on its deque, and is at the release barrier. 2874 // spinner is the location on which to spin. 2875 // spinner == NULL means only execute a single task and return. 2876 // checker is the value to check to terminate the spin.
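// Illustrative call pattern (adapted from __kmpc_omp_taskwait_template above):
// a waiter wraps the counter it is draining in a flag and keeps executing
// tasks until the flag's condition is met, e.g.
//
//   kmp_flag_32<false, false> flag(
//       RCAST(std::atomic<kmp_uint32> *, &counter), 0U);
//   while (KMP_ATOMIC_LD_ACQ(&counter) != 0)
//     flag.execute_tasks(thread, gtid, FALSE,
//                        &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
//                        __kmp_task_stealing_constraint);
//
// where "counter" stands for e.g. td_incomplete_child_tasks or a taskgroup's
// count field.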
2877 template <class C> 2878 static inline int __kmp_execute_tasks_template( 2879 kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin, 2880 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), 2881 kmp_int32 is_constrained) { 2882 kmp_task_team_t *task_team = thread->th.th_task_team; 2883 kmp_thread_data_t *threads_data; 2884 kmp_task_t *task; 2885 kmp_info_t *other_thread; 2886 kmp_taskdata_t *current_task = thread->th.th_current_task; 2887 std::atomic<kmp_int32> *unfinished_threads; 2888 kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0, 2889 tid = thread->th.th_info.ds.ds_tid; 2890 2891 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 2892 KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]); 2893 2894 if (task_team == NULL || current_task == NULL) 2895 return FALSE; 2896 2897 KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d " 2898 "*thread_finished=%d\n", 2899 gtid, final_spin, *thread_finished)); 2900 2901 thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 2902 threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data); 2903 2904 KMP_DEBUG_ASSERT(threads_data != NULL); 2905 2906 nthreads = task_team->tt.tt_nproc; 2907 unfinished_threads = &(task_team->tt.tt_unfinished_threads); 2908 KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks || 2909 task_team->tt.tt_hidden_helper_task_encountered); 2910 KMP_DEBUG_ASSERT(*unfinished_threads >= 0); 2911 2912 while (1) { // Outer loop keeps trying to find tasks in case of single thread 2913 // getting tasks from target constructs 2914 while (1) { // Inner loop to find a task and execute it 2915 task = NULL; 2916 if (use_own_tasks) { // check on own queue first 2917 task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained); 2918 } 2919 if ((task == NULL) && (nthreads > 1)) { // Steal a task 2920 int asleep = 1; 2921 use_own_tasks = 0; 2922 // Try to steal from the last place I stole from successfully. 2923 if (victim_tid == -2) { // haven't stolen anything yet 2924 victim_tid = threads_data[tid].td.td_deque_last_stolen; 2925 if (victim_tid != 2926 -1) // if we have a last stolen from victim, get the thread 2927 other_thread = threads_data[victim_tid].td.td_thr; 2928 } 2929 if (victim_tid != -1) { // found last victim 2930 asleep = 0; 2931 } else if (!new_victim) { // no recent steals and we haven't already 2932 // used a new victim; select a random thread 2933 do { // Find a different thread to steal work from. 2934 // Pick a random thread. Initial plan was to cycle through all the 2935 // threads, and only return if we tried to steal from every thread, 2936 // and failed. Arch says that's not such a great idea. 2937 victim_tid = __kmp_get_random(thread) % (nthreads - 1); 2938 if (victim_tid >= tid) { 2939 ++victim_tid; // Adjusts random distribution to exclude self 2940 } 2941 // Found a potential victim 2942 other_thread = threads_data[victim_tid].td.td_thr; 2943 // There is a slight chance that __kmp_enable_tasking() did not wake 2944 // up all threads waiting at the barrier. If victim is sleeping, 2945 // then wake it up. Since we were going to pay the cache miss 2946 // penalty for referencing another thread's kmp_info_t struct 2947 // anyway, 2948 // the check shouldn't cost too much performance at this point. In 2949 // extra barrier mode, tasks do not sleep at the separate tasking 2950 // barrier, so this isn't a problem. 
2951 asleep = 0;
2952 if ((__kmp_tasking_mode == tskm_task_teams) &&
2953 (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
2954 (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
2955 NULL)) {
2956 asleep = 1;
2957 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread),
2958 other_thread->th.th_sleep_loc);
2959 // A sleeping thread should not have any tasks on its queue.
2960 // There is a slight possibility that it resumes, steals a task
2961 // from another thread, which spawns more tasks, all in the time
2962 // that it takes this thread to check => don't write an assertion
2963 // that the victim's queue is empty. Try stealing from a
2964 // different thread.
2965 }
2966 } while (asleep);
2967 }
2968
2969 if (!asleep) {
2970 // We have a victim to try to steal from
2971 task = __kmp_steal_task(other_thread, gtid, task_team,
2972 unfinished_threads, thread_finished,
2973 is_constrained);
2974 }
2975 if (task != NULL) { // set last stolen to victim
2976 if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
2977 threads_data[tid].td.td_deque_last_stolen = victim_tid;
2978 // The pre-refactored code did not try more than 1 successful new
2979 // victim, unless the last one generated more local tasks;
2980 // new_victim keeps track of this
2981 new_victim = 1;
2982 }
2983 } else { // No tasks found; unset last_stolen
2984 KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
2985 victim_tid = -2; // no successful victim found
2986 }
2987 }
2988
2989 if (task == NULL)
2990 break; // break out of tasking loop
2991
2992 // Found a task; execute it
2993 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2994 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2995 if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
2996 // get the object reliably
2997 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2998 }
2999 __kmp_itt_task_starting(itt_sync_obj);
3000 }
3001 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
3002 __kmp_invoke_task(gtid, task, current_task);
3003 #if USE_ITT_BUILD
3004 if (itt_sync_obj != NULL)
3005 __kmp_itt_task_finished(itt_sync_obj);
3006 #endif /* USE_ITT_BUILD */
3007 // If this thread is only partway through the barrier and the condition is
3008 // met, then return now, so that the barrier gather/release pattern can
3009 // proceed. If this thread is in the last spin loop in the barrier,
3010 // waiting to be released, we know that the termination condition will not
3011 // be satisfied, so don't waste any cycles checking it.
3012 if (flag == NULL || (!final_spin && flag->done_check())) {
3013 KA_TRACE(
3014 15,
3015 ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3016 gtid));
3017 return TRUE;
3018 }
3019 if (thread->th.th_task_team == NULL) {
3020 break;
3021 }
3022 KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
3023 // If execution of a stolen task results in more tasks being placed on our
3024 // run queue, reset use_own_tasks
3025 if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
3026 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
3027 "other tasks, restart\n",
3028 gtid));
3029 use_own_tasks = 1;
3030 new_victim = 0;
3031 }
3032 }
3033
3034 // The task source has been exhausted. If in final spin loop of barrier,
3035 // check if termination condition is satisfied. The work queue may be empty
3036 // but there might be proxy tasks still executing.
3037 if (final_spin &&
3038 KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {
3039 // First, decrement the #unfinished threads, if that has not already been
3040 // done. This decrement might be to the spin location, and result in the
3041 // termination condition being satisfied.
3042 if (!*thread_finished) {
3043 kmp_int32 count;
3044
3045 count = KMP_ATOMIC_DEC(unfinished_threads) - 1;
3046 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
3047 "unfinished_threads to %d task_team=%p\n",
3048 gtid, count, task_team));
3049 *thread_finished = TRUE;
3050 }
3051
3052 // It is now unsafe to reference thread->th.th_team !!!
3053 // Decrementing task_team->tt.tt_unfinished_threads can allow the master
3054 // thread to pass through the barrier, where it might reset each thread's
3055 // th.th_team field for the next parallel region. If we can steal more
3056 // work, we know that this has not happened yet.
3057 if (flag != NULL && flag->done_check()) {
3058 KA_TRACE(
3059 15,
3060 ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3061 gtid));
3062 return TRUE;
3063 }
3064 }
3065
3066 // If this thread's task team is NULL, master has recognized that there are
3067 // no more tasks; bail out
3068 if (thread->th.th_task_team == NULL) {
3069 KA_TRACE(15,
3070 ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
3071 return FALSE;
3072 }
3073
3074 // We could be getting tasks from target constructs; if this is the only
3075 // thread, keep trying to execute tasks from own queue
3076 if (nthreads == 1 &&
3077 KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks))
3078 use_own_tasks = 1;
3079 else {
3080 KA_TRACE(15,
3081 ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
3082 return FALSE;
3083 }
3084 }
3085 }
3086
3087 template <bool C, bool S>
3088 int __kmp_execute_tasks_32(
3089 kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32<C, S> *flag, int final_spin,
3090 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3091 kmp_int32 is_constrained) {
3092 return __kmp_execute_tasks_template(
3093 thread, gtid, flag, final_spin,
3094 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3095 }
3096
3097 template <bool C, bool S>
3098 int __kmp_execute_tasks_64(
3099 kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64<C, S> *flag, int final_spin,
3100 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3101 kmp_int32 is_constrained) {
3102 return __kmp_execute_tasks_template(
3103 thread, gtid, flag, final_spin,
3104 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3105 }
3106
3107 int __kmp_execute_tasks_oncore(
3108 kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
3109 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3110 kmp_int32 is_constrained) {
3111 return __kmp_execute_tasks_template(
3112 thread, gtid, flag, final_spin,
3113 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3114 }
3115
3116 template int
3117 __kmp_execute_tasks_32<false, false>(kmp_info_t *, kmp_int32,
3118 kmp_flag_32<false, false> *, int,
3119 int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3120
3121 template int __kmp_execute_tasks_64<false, true>(kmp_info_t *, kmp_int32,
3122 kmp_flag_64<false, true> *,
3123 int,
3124 int *USE_ITT_BUILD_ARG(void *),
3125 kmp_int32);
3126
3127 template int __kmp_execute_tasks_64<true, false>(kmp_info_t *, kmp_int32,
3128 kmp_flag_64<true, false> *,
3129 int,
3130 int *USE_ITT_BUILD_ARG(void *),
3131 kmp_int32);
3132
3133 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
3134 // next barrier so they can assist in executing enqueued tasks.
3135 // First thread in allocates the task team atomically.
3136 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
3137 kmp_info_t *this_thr) {
3138 kmp_thread_data_t *threads_data;
3139 int nthreads, i, is_init_thread;
3140
3141 KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
3142 __kmp_gtid_from_thread(this_thr)));
3143
3144 KMP_DEBUG_ASSERT(task_team != NULL);
3145 KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
3146
3147 nthreads = task_team->tt.tt_nproc;
3148 KMP_DEBUG_ASSERT(nthreads > 0);
3149 KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
3150
3151 // Allocate or increase the size of threads_data if necessary
3152 is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
3153
3154 if (!is_init_thread) {
3155 // Some other thread already set up the array.
3156 KA_TRACE(
3157 20,
3158 ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
3159 __kmp_gtid_from_thread(this_thr)));
3160 return;
3161 }
3162 threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3163 KMP_DEBUG_ASSERT(threads_data != NULL);
3164
3165 if (__kmp_tasking_mode == tskm_task_teams &&
3166 (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
3167 // Release any threads sleeping at the barrier, so that they can steal
3168 // tasks and execute them. In extra barrier mode, tasks do not sleep
3169 // at the separate tasking barrier, so this isn't a problem.
3170 for (i = 0; i < nthreads; i++) {
3171 volatile void *sleep_loc;
3172 kmp_info_t *thread = threads_data[i].td.td_thr;
3173
3174 if (i == this_thr->th.th_info.ds.ds_tid) {
3175 continue;
3176 }
3177 // Since we haven't locked the thread's suspend mutex lock at this
3178 // point, there is a small window where a thread might be putting
3179 // itself to sleep, but hasn't set the th_sleep_loc field yet.
3180 // To work around this, __kmp_execute_tasks_template() periodically checks
3181 // to see if other threads are sleeping (using the same random mechanism that
3182 // is used for task stealing) and awakens them if they are.
3183 if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3184 NULL) {
3185 KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
3186 __kmp_gtid_from_thread(this_thr),
3187 __kmp_gtid_from_thread(thread)));
3188 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
3189 } else {
3190 KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
3191 __kmp_gtid_from_thread(this_thr),
3192 __kmp_gtid_from_thread(thread)));
3193 }
3194 }
3195 }
3196
3197 KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
3198 __kmp_gtid_from_thread(this_thr)));
3199 }
3200
3201 /* // TODO: Check the comment consistency
3202 * Utility routines for "task teams". A task team (kmp_task_team_t) is kind of
3203 * like a shadow of the kmp_team_t data struct, with a different lifetime.
3204 * After a child thread checks into a barrier and calls __kmp_release() from
3205 * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
3206 * longer assume that the kmp_team_t structure is intact (at any moment, the
3207 * master thread may exit the barrier code and free the team data structure,
3208 * and return the threads to the thread pool).
3209 *
3210 * This does not work with the tasking code, as the thread is still
3211 * expected to participate in the execution of any tasks that may have been
3212 * spawned by a member of the team, and the thread still needs access to
3213 * each thread in the team, so that it can steal work from it.
3214 *
3215 * Enter the existence of the kmp_task_team_t struct. It employs a reference
3216 * counting mechanism, and is allocated by the master thread before calling
3217 * __kmp_<barrier_kind>_release, and then is released by the last thread to
3218 * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
3219 * of the kmp_task_team_t structs for consecutive barriers can overlap
3220 * (and will, unless the master thread is the last thread to exit the barrier
3221 * release phase, which is not typical). The existence of such a struct is
3222 * useful outside the context of tasking.
3223 *
3224 * We currently use the existence of the threads array as an indicator that
3225 * tasks were spawned since the last barrier. If the structure is to be
3226 * useful outside the context of tasking, then this will have to change, but
3227 * not setting the field minimizes the performance impact of tasking on
3228 * barriers, when no explicit tasks were spawned (pushed, actually).
3229 */
3230
3231 static kmp_task_team_t *__kmp_free_task_teams =
3232 NULL; // Free list for task_team data structures
3233 // Lock for task team data structures
3234 kmp_bootstrap_lock_t __kmp_task_team_lock =
3235 KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
3236
3237 // __kmp_alloc_task_deque:
3238 // Allocates a task deque for a particular thread, and initializes the necessary
3239 // data structures relating to the deque. This only happens once per thread
3240 // per task team since task teams are recycled. No lock is needed during
3241 // allocation since each thread allocates its own deque.
3242 static void __kmp_alloc_task_deque(kmp_info_t *thread,
3243 kmp_thread_data_t *thread_data) {
3244 __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
3245 KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
3246
3247 // Initialize last stolen task field to "none"
3248 thread_data->td.td_deque_last_stolen = -1;
3249
3250 KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
3251 KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
3252 KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
3253
3254 KE_TRACE(
3255 10,
3256 ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
3257 __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
3258 // Allocate space for task deque, and zero the deque
3259 // Cannot use __kmp_thread_calloc() because threads not around for
3260 // kmp_reap_task_team( ).
3261 thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
3262 INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
3263 thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
3264 }
3265
3266 // __kmp_free_task_deque:
3267 // Deallocates a task deque for a particular thread. Happens at library
3268 // deallocation so we don't need to reset all thread data fields.
3269 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) { 3270 if (thread_data->td.td_deque != NULL) { 3271 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 3272 TCW_4(thread_data->td.td_deque_ntasks, 0); 3273 __kmp_free(thread_data->td.td_deque); 3274 thread_data->td.td_deque = NULL; 3275 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 3276 } 3277 3278 #ifdef BUILD_TIED_TASK_STACK 3279 // GEH: Figure out what to do here for td_susp_tied_tasks 3280 if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) { 3281 __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data); 3282 } 3283 #endif // BUILD_TIED_TASK_STACK 3284 } 3285 3286 // __kmp_realloc_task_threads_data: 3287 // Allocates a threads_data array for a task team, either by allocating an 3288 // initial array or enlarging an existing array. Only the first thread to get 3289 // the lock allocs or enlarges the array and re-initializes the array elements. 3290 // That thread returns "TRUE", the rest return "FALSE". 3291 // Assumes that the new array size is given by task_team -> tt.tt_nproc. 3292 // The current size is given by task_team -> tt.tt_max_threads. 3293 static int __kmp_realloc_task_threads_data(kmp_info_t *thread, 3294 kmp_task_team_t *task_team) { 3295 kmp_thread_data_t **threads_data_p; 3296 kmp_int32 nthreads, maxthreads; 3297 int is_init_thread = FALSE; 3298 3299 if (TCR_4(task_team->tt.tt_found_tasks)) { 3300 // Already reallocated and initialized. 3301 return FALSE; 3302 } 3303 3304 threads_data_p = &task_team->tt.tt_threads_data; 3305 nthreads = task_team->tt.tt_nproc; 3306 maxthreads = task_team->tt.tt_max_threads; 3307 3308 // All threads must lock when they encounter the first task of the implicit 3309 // task region to make sure threads_data fields are (re)initialized before 3310 // used. 3311 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock); 3312 3313 if (!TCR_4(task_team->tt.tt_found_tasks)) { 3314 // first thread to enable tasking 3315 kmp_team_t *team = thread->th.th_team; 3316 int i; 3317 3318 is_init_thread = TRUE; 3319 if (maxthreads < nthreads) { 3320 3321 if (*threads_data_p != NULL) { 3322 kmp_thread_data_t *old_data = *threads_data_p; 3323 kmp_thread_data_t *new_data = NULL; 3324 3325 KE_TRACE( 3326 10, 3327 ("__kmp_realloc_task_threads_data: T#%d reallocating " 3328 "threads data for task_team %p, new_size = %d, old_size = %d\n", 3329 __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads)); 3330 // Reallocate threads_data to have more elements than current array 3331 // Cannot use __kmp_thread_realloc() because threads not around for 3332 // kmp_reap_task_team( ). Note all new array entries are initialized 3333 // to zero by __kmp_allocate(). 
3334 new_data = (kmp_thread_data_t *)__kmp_allocate( 3335 nthreads * sizeof(kmp_thread_data_t)); 3336 // copy old data to new data 3337 KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t), 3338 (void *)old_data, maxthreads * sizeof(kmp_thread_data_t)); 3339 3340 #ifdef BUILD_TIED_TASK_STACK 3341 // GEH: Figure out if this is the right thing to do 3342 for (i = maxthreads; i < nthreads; i++) { 3343 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 3344 __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data); 3345 } 3346 #endif // BUILD_TIED_TASK_STACK 3347 // Install the new data and free the old data 3348 (*threads_data_p) = new_data; 3349 __kmp_free(old_data); 3350 } else { 3351 KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating " 3352 "threads data for task_team %p, size = %d\n", 3353 __kmp_gtid_from_thread(thread), task_team, nthreads)); 3354 // Make the initial allocate for threads_data array, and zero entries 3355 // Cannot use __kmp_thread_calloc() because threads not around for 3356 // kmp_reap_task_team( ). 3357 ANNOTATE_IGNORE_WRITES_BEGIN(); 3358 *threads_data_p = (kmp_thread_data_t *)__kmp_allocate( 3359 nthreads * sizeof(kmp_thread_data_t)); 3360 ANNOTATE_IGNORE_WRITES_END(); 3361 #ifdef BUILD_TIED_TASK_STACK 3362 // GEH: Figure out if this is the right thing to do 3363 for (i = 0; i < nthreads; i++) { 3364 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 3365 __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data); 3366 } 3367 #endif // BUILD_TIED_TASK_STACK 3368 } 3369 task_team->tt.tt_max_threads = nthreads; 3370 } else { 3371 // If array has (more than) enough elements, go ahead and use it 3372 KMP_DEBUG_ASSERT(*threads_data_p != NULL); 3373 } 3374 3375 // initialize threads_data pointers back to thread_info structures 3376 for (i = 0; i < nthreads; i++) { 3377 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 3378 thread_data->td.td_thr = team->t.t_threads[i]; 3379 3380 if (thread_data->td.td_deque_last_stolen >= nthreads) { 3381 // The last stolen field survives across teams / barrier, and the number 3382 // of threads may have changed. It's possible (likely?) that a new 3383 // parallel region will exhibit the same behavior as previous region. 3384 thread_data->td.td_deque_last_stolen = -1; 3385 } 3386 } 3387 3388 KMP_MB(); 3389 TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE); 3390 } 3391 3392 __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock); 3393 return is_init_thread; 3394 } 3395 3396 // __kmp_free_task_threads_data: 3397 // Deallocates a threads_data array for a task team, including any attached 3398 // tasking deques. Only occurs at library shutdown. 3399 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) { 3400 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock); 3401 if (task_team->tt.tt_threads_data != NULL) { 3402 int i; 3403 for (i = 0; i < task_team->tt.tt_max_threads; i++) { 3404 __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]); 3405 } 3406 __kmp_free(task_team->tt.tt_threads_data); 3407 task_team->tt.tt_threads_data = NULL; 3408 } 3409 __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock); 3410 } 3411 3412 // __kmp_allocate_task_team: 3413 // Allocates a task team associated with a specific team, taking it from 3414 // the global task team free list if possible. Also initializes data 3415 // structures. 
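//
// Recycling sketch (illustrative): a task team taken from the
// __kmp_free_task_teams list keeps its tt_threads_data array and
// tt_max_threads value; only the flags and counters below are reset, and
// __kmp_realloc_task_threads_data() grows the array lazily when needed.
//
//   region 1: 8 threads  -> threads_data allocated for 8, tt_max_threads = 8
//   barrier  : task team parked on the free list by __kmp_free_task_team()
//   region 2: 4 threads  -> same array reused, no reallocation
//   region 3: 16 threads -> array grown to 16 on the first task encountered
//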
3416 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread, 3417 kmp_team_t *team) { 3418 kmp_task_team_t *task_team = NULL; 3419 int nthreads; 3420 3421 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n", 3422 (thread ? __kmp_gtid_from_thread(thread) : -1), team)); 3423 3424 if (TCR_PTR(__kmp_free_task_teams) != NULL) { 3425 // Take a task team from the task team pool 3426 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 3427 if (__kmp_free_task_teams != NULL) { 3428 task_team = __kmp_free_task_teams; 3429 TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next); 3430 task_team->tt.tt_next = NULL; 3431 } 3432 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 3433 } 3434 3435 if (task_team == NULL) { 3436 KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating " 3437 "task team for team %p\n", 3438 __kmp_gtid_from_thread(thread), team)); 3439 // Allocate a new task team if one is not available. Cannot use 3440 // __kmp_thread_malloc because threads not around for kmp_reap_task_team. 3441 task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t)); 3442 __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock); 3443 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG 3444 // suppress race conditions detection on synchronization flags in debug mode 3445 // this helps to analyze library internals eliminating false positives 3446 __itt_suppress_mark_range( 3447 __itt_suppress_range, __itt_suppress_threading_errors, 3448 &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks)); 3449 __itt_suppress_mark_range(__itt_suppress_range, 3450 __itt_suppress_threading_errors, 3451 CCAST(kmp_uint32 *, &task_team->tt.tt_active), 3452 sizeof(task_team->tt.tt_active)); 3453 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */ 3454 // Note: __kmp_allocate zeroes returned memory, othewise we would need: 3455 // task_team->tt.tt_threads_data = NULL; 3456 // task_team->tt.tt_max_threads = 0; 3457 // task_team->tt.tt_next = NULL; 3458 } 3459 3460 TCW_4(task_team->tt.tt_found_tasks, FALSE); 3461 TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); 3462 task_team->tt.tt_nproc = nthreads = team->t.t_nproc; 3463 3464 KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads); 3465 TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE); 3466 TCW_4(task_team->tt.tt_active, TRUE); 3467 3468 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p " 3469 "unfinished_threads init'd to %d\n", 3470 (thread ? __kmp_gtid_from_thread(thread) : -1), task_team, 3471 KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads))); 3472 return task_team; 3473 } 3474 3475 // __kmp_free_task_team: 3476 // Frees the task team associated with a specific thread, and adds it 3477 // to the global task team free list. 3478 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) { 3479 KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n", 3480 thread ? __kmp_gtid_from_thread(thread) : -1, task_team)); 3481 3482 // Put task team back on free list 3483 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 3484 3485 KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL); 3486 task_team->tt.tt_next = __kmp_free_task_teams; 3487 TCW_PTR(__kmp_free_task_teams, task_team); 3488 3489 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 3490 } 3491 3492 // __kmp_reap_task_teams: 3493 // Free all the task teams on the task team free list. 3494 // Should only be done during library shutdown. 
3495 // Cannot do anything that needs a thread structure or gtid since they are 3496 // already gone. 3497 void __kmp_reap_task_teams(void) { 3498 kmp_task_team_t *task_team; 3499 3500 if (TCR_PTR(__kmp_free_task_teams) != NULL) { 3501 // Free all task_teams on the free list 3502 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 3503 while ((task_team = __kmp_free_task_teams) != NULL) { 3504 __kmp_free_task_teams = task_team->tt.tt_next; 3505 task_team->tt.tt_next = NULL; 3506 3507 // Free threads_data if necessary 3508 if (task_team->tt.tt_threads_data != NULL) { 3509 __kmp_free_task_threads_data(task_team); 3510 } 3511 __kmp_free(task_team); 3512 } 3513 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 3514 } 3515 } 3516 3517 // __kmp_wait_to_unref_task_teams: 3518 // Some threads could still be in the fork barrier release code, possibly 3519 // trying to steal tasks. Wait for each thread to unreference its task team. 3520 void __kmp_wait_to_unref_task_teams(void) { 3521 kmp_info_t *thread; 3522 kmp_uint32 spins; 3523 int done; 3524 3525 KMP_INIT_YIELD(spins); 3526 3527 for (;;) { 3528 done = TRUE; 3529 3530 // TODO: GEH - this may be is wrong because some sync would be necessary 3531 // in case threads are added to the pool during the traversal. Need to 3532 // verify that lock for thread pool is held when calling this routine. 3533 for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL; 3534 thread = thread->th.th_next_pool) { 3535 #if KMP_OS_WINDOWS 3536 DWORD exit_val; 3537 #endif 3538 if (TCR_PTR(thread->th.th_task_team) == NULL) { 3539 KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n", 3540 __kmp_gtid_from_thread(thread))); 3541 continue; 3542 } 3543 #if KMP_OS_WINDOWS 3544 // TODO: GEH - add this check for Linux* OS / OS X* as well? 3545 if (!__kmp_is_thread_alive(thread, &exit_val)) { 3546 thread->th.th_task_team = NULL; 3547 continue; 3548 } 3549 #endif 3550 3551 done = FALSE; // Because th_task_team pointer is not NULL for this thread 3552 3553 KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to " 3554 "unreference task_team\n", 3555 __kmp_gtid_from_thread(thread))); 3556 3557 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 3558 volatile void *sleep_loc; 3559 // If the thread is sleeping, awaken it. 3560 if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) != 3561 NULL) { 3562 KA_TRACE( 3563 10, 3564 ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n", 3565 __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread))); 3566 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc); 3567 } 3568 } 3569 } 3570 if (done) { 3571 break; 3572 } 3573 3574 // If oversubscribed or have waited a bit, yield. 3575 KMP_YIELD_OVERSUB_ELSE_SPIN(spins); 3576 } 3577 } 3578 3579 // __kmp_task_team_setup: Create a task_team for the current team, but use 3580 // an already created, unused one if it already exists. 3581 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) { 3582 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 3583 3584 // If this task_team hasn't been created yet, allocate it. It will be used in 3585 // the region after the next. 3586 // If it exists, it is the current task team and shouldn't be touched yet as 3587 // it may still be in use. 
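//
// Parity sketch (illustrative): each team keeps two task_team slots,
// t_task_team[0] and t_task_team[1], indexed by the thread's th_task_state,
// which __kmp_task_team_sync() toggles (0 -> 1 -> 0 -> ...) after each
// barrier release. While the workers drain t_task_team[parity] during the
// release phase, the master prepares t_task_team[1 - parity] here for the
// next region, so the two structures leapfrog each other across consecutive
// barriers.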
3588 if (team->t.t_task_team[this_thr->th.th_task_state] == NULL && 3589 (always || team->t.t_nproc > 1)) { 3590 team->t.t_task_team[this_thr->th.th_task_state] = 3591 __kmp_allocate_task_team(this_thr, team); 3592 KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p " 3593 "for team %d at parity=%d\n", 3594 __kmp_gtid_from_thread(this_thr), 3595 team->t.t_task_team[this_thr->th.th_task_state], 3596 ((team != NULL) ? team->t.t_id : -1), 3597 this_thr->th.th_task_state)); 3598 } 3599 3600 // After threads exit the release, they will call sync, and then point to this 3601 // other task_team; make sure it is allocated and properly initialized. As 3602 // threads spin in the barrier release phase, they will continue to use the 3603 // previous task_team struct(above), until they receive the signal to stop 3604 // checking for tasks (they can't safely reference the kmp_team_t struct, 3605 // which could be reallocated by the master thread). No task teams are formed 3606 // for serialized teams. 3607 if (team->t.t_nproc > 1) { 3608 int other_team = 1 - this_thr->th.th_task_state; 3609 if (team->t.t_task_team[other_team] == NULL) { // setup other team as well 3610 team->t.t_task_team[other_team] = 3611 __kmp_allocate_task_team(this_thr, team); 3612 KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new " 3613 "task_team %p for team %d at parity=%d\n", 3614 __kmp_gtid_from_thread(this_thr), 3615 team->t.t_task_team[other_team], 3616 ((team != NULL) ? team->t.t_id : -1), other_team)); 3617 } else { // Leave the old task team struct in place for the upcoming region; 3618 // adjust as needed 3619 kmp_task_team_t *task_team = team->t.t_task_team[other_team]; 3620 if (!task_team->tt.tt_active || 3621 team->t.t_nproc != task_team->tt.tt_nproc) { 3622 TCW_4(task_team->tt.tt_nproc, team->t.t_nproc); 3623 TCW_4(task_team->tt.tt_found_tasks, FALSE); 3624 TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); 3625 KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, 3626 team->t.t_nproc); 3627 TCW_4(task_team->tt.tt_active, TRUE); 3628 } 3629 // if team size has changed, the first thread to enable tasking will 3630 // realloc threads_data if necessary 3631 KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team " 3632 "%p for team %d at parity=%d\n", 3633 __kmp_gtid_from_thread(this_thr), 3634 team->t.t_task_team[other_team], 3635 ((team != NULL) ? team->t.t_id : -1), other_team)); 3636 } 3637 } 3638 3639 // For regular thread, task enabling should be called when the task is going 3640 // to be pushed to a dequeue. However, for the hidden helper thread, we need 3641 // it ahead of time so that some operations can be performed without race 3642 // condition. 3643 if (this_thr == __kmp_hidden_helper_main_thread) { 3644 for (int i = 0; i < 2; ++i) { 3645 kmp_task_team_t *task_team = team->t.t_task_team[i]; 3646 if (KMP_TASKING_ENABLED(task_team)) { 3647 continue; 3648 } 3649 __kmp_enable_tasking(task_team, this_thr); 3650 for (int j = 0; j < task_team->tt.tt_nproc; ++j) { 3651 kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[j]; 3652 if (thread_data->td.td_deque == NULL) { 3653 __kmp_alloc_task_deque(__kmp_hidden_helper_threads[j], thread_data); 3654 } 3655 } 3656 } 3657 } 3658 } 3659 3660 // __kmp_task_team_sync: Propagation of task team data from team to threads 3661 // which happens just after the release phase of a team barrier. This may be 3662 // called by any thread, but only for teams with # threads > 1. 
3663 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) { 3664 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 3665 3666 // Toggle the th_task_state field, to switch which task_team this thread 3667 // refers to 3668 this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state); 3669 3670 // It is now safe to propagate the task team pointer from the team struct to 3671 // the current thread. 3672 TCW_PTR(this_thr->th.th_task_team, 3673 team->t.t_task_team[this_thr->th.th_task_state]); 3674 KA_TRACE(20, 3675 ("__kmp_task_team_sync: Thread T#%d task team switched to task_team " 3676 "%p from Team #%d (parity=%d)\n", 3677 __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team, 3678 ((team != NULL) ? team->t.t_id : -1), this_thr->th.th_task_state)); 3679 } 3680 3681 // __kmp_task_team_wait: Master thread waits for outstanding tasks after the 3682 // barrier gather phase. Only called by master thread if #threads in team > 1 or 3683 // if proxy tasks were created. 3684 // 3685 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off 3686 // by passing in 0 optionally as the last argument. When wait is zero, master 3687 // thread does not wait for unfinished_threads to reach 0. 3688 void __kmp_task_team_wait( 3689 kmp_info_t *this_thr, 3690 kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) { 3691 kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state]; 3692 3693 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 3694 KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team); 3695 3696 if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) { 3697 if (wait) { 3698 KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks " 3699 "(for unfinished_threads to reach 0) on task_team = %p\n", 3700 __kmp_gtid_from_thread(this_thr), task_team)); 3701 // Worker threads may have dropped through to release phase, but could 3702 // still be executing tasks. Wait here for tasks to complete. To avoid 3703 // memory contention, only master thread checks termination condition. 3704 kmp_flag_32<false, false> flag( 3705 RCAST(std::atomic<kmp_uint32> *, 3706 &task_team->tt.tt_unfinished_threads), 3707 0U); 3708 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 3709 } 3710 // Deactivate the old task team, so that the worker threads will stop 3711 // referencing it while spinning. 3712 KA_TRACE( 3713 20, 3714 ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: " 3715 "setting active to false, setting local and team's pointer to NULL\n", 3716 __kmp_gtid_from_thread(this_thr), task_team)); 3717 KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 || 3718 task_team->tt.tt_found_proxy_tasks == TRUE); 3719 TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE); 3720 KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0); 3721 TCW_SYNC_4(task_team->tt.tt_active, FALSE); 3722 KMP_MB(); 3723 3724 TCW_PTR(this_thr->th.th_task_team, NULL); 3725 } 3726 } 3727 3728 // __kmp_tasking_barrier: 3729 // This routine is called only when __kmp_tasking_mode == tskm_extra_barrier. 3730 // Internal function to execute all tasks prior to a regular barrier or a join 3731 // barrier. It is a full barrier itself, which unfortunately turns regular 3732 // barriers into double barriers and join barriers into 1 1/2 barriers. 
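//
// The spin object used below is the task team's tt_unfinished_threads
// counter: the kmp_flag_32 wrapper reports done_check() once it reaches 0,
// and execute_tasks() drains the deques while waiting. Conceptually (a
// simplified sketch, not the actual control flow):
//
//   while (unfinished_threads != 0)
//     run any task that can be found, otherwise yield;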
3733 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) { 3734 std::atomic<kmp_uint32> *spin = RCAST( 3735 std::atomic<kmp_uint32> *, 3736 &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads); 3737 int flag = FALSE; 3738 KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier); 3739 3740 #if USE_ITT_BUILD 3741 KMP_FSYNC_SPIN_INIT(spin, NULL); 3742 #endif /* USE_ITT_BUILD */ 3743 kmp_flag_32<false, false> spin_flag(spin, 0U); 3744 while (!spin_flag.execute_tasks(thread, gtid, TRUE, 3745 &flag USE_ITT_BUILD_ARG(NULL), 0)) { 3746 #if USE_ITT_BUILD 3747 // TODO: What about itt_sync_obj?? 3748 KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin)); 3749 #endif /* USE_ITT_BUILD */ 3750 3751 if (TCR_4(__kmp_global.g.g_done)) { 3752 if (__kmp_global.g.g_abort) 3753 __kmp_abort_thread(); 3754 break; 3755 } 3756 KMP_YIELD(TRUE); 3757 } 3758 #if USE_ITT_BUILD 3759 KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin)); 3760 #endif /* USE_ITT_BUILD */ 3761 } 3762 3763 // __kmp_give_task puts a task into a given thread queue if: 3764 // - the queue for that thread was created 3765 // - there's space in that queue 3766 // Because of this, __kmp_push_task needs to check if there's space after 3767 // getting the lock 3768 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task, 3769 kmp_int32 pass) { 3770 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 3771 kmp_task_team_t *task_team = taskdata->td_task_team; 3772 3773 KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n", 3774 taskdata, tid)); 3775 3776 // If task_team is NULL something went really bad... 3777 KMP_DEBUG_ASSERT(task_team != NULL); 3778 3779 bool result = false; 3780 kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid]; 3781 3782 if (thread_data->td.td_deque == NULL) { 3783 // There's no queue in this thread, go find another one 3784 // We're guaranteed that at least one thread has a queue 3785 KA_TRACE(30, 3786 ("__kmp_give_task: thread %d has no queue while giving task %p.\n", 3787 tid, taskdata)); 3788 return result; 3789 } 3790 3791 if (TCR_4(thread_data->td.td_deque_ntasks) >= 3792 TASK_DEQUE_SIZE(thread_data->td)) { 3793 KA_TRACE( 3794 30, 3795 ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", 3796 taskdata, tid)); 3797 3798 // if this deque is bigger than the pass ratio give a chance to another 3799 // thread 3800 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass) 3801 return result; 3802 3803 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 3804 if (TCR_4(thread_data->td.td_deque_ntasks) >= 3805 TASK_DEQUE_SIZE(thread_data->td)) { 3806 // expand deque to push the task which is not allowed to execute 3807 __kmp_realloc_task_deque(thread, thread_data); 3808 } 3809 3810 } else { 3811 3812 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 3813 3814 if (TCR_4(thread_data->td.td_deque_ntasks) >= 3815 TASK_DEQUE_SIZE(thread_data->td)) { 3816 KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to " 3817 "thread %d.\n", 3818 taskdata, tid)); 3819 3820 // if this deque is bigger than the pass ratio give a chance to another 3821 // thread 3822 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass) 3823 goto release_and_exit; 3824 3825 __kmp_realloc_task_deque(thread, thread_data); 3826 } 3827 } 3828 3829 // lock is held here, and there is space in the deque 3830 3831 thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata; 3832 // Wrap index. 
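// (Illustrative, assuming the deque size is a power of two: e.g. with a size
// of 256 the mask is 0xff, so a tail of 255 wraps to (255 + 1) & 0xff == 0.)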
3833 thread_data->td.td_deque_tail =
3834 (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
3835 TCW_4(thread_data->td.td_deque_ntasks,
3836 TCR_4(thread_data->td.td_deque_ntasks) + 1);
3837
3838 result = true;
3839 KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
3840 taskdata, tid));
3841
3842 release_and_exit:
3843 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3844
3845 return result;
3846 }
3847
3848 /* The finish of the proxy tasks is divided in two pieces:
3849 - the top half is the one that can be done from a thread outside the team
3850 - the bottom half must be run from a thread within the team
3851
3852 In order to run the bottom half the task gets queued back into one of the
3853 threads of the team. Once the td_incomplete_child_tasks counter of the parent
3854 is decremented the threads can leave the barriers. So, the bottom half needs
3855 to be queued before the counter is decremented. The top half is therefore
3856 divided in two parts:
3857 - things that can be run before queuing the bottom half
3858 - things that must be run after queuing the bottom half
3859
3860 This creates a second race as the bottom half can free the task before the
3861 second top half is executed. To avoid this we use the
3862 td_incomplete_child_tasks of the proxy task to synchronize the top and bottom
3863 half. */
3864 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3865 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
3866 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3867 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
3868 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
3869
3870 taskdata->td_flags.complete = 1; // mark the task as completed
3871
3872 if (taskdata->td_taskgroup)
3873 KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
3874
3875 // Create an imaginary child for this task so the bottom half cannot
3876 // release the task before we have completed the second top half
3877 KMP_ATOMIC_INC(&taskdata->td_incomplete_child_tasks);
3878 }
3879
3880 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3881 kmp_int32 children = 0;
3882
3883 // Predecrement simulated by "- 1" calculation
3884 children =
3885 KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
3886 KMP_DEBUG_ASSERT(children >= 0);
3887
3888 // Remove the imaginary child
3889 KMP_ATOMIC_DEC(&taskdata->td_incomplete_child_tasks);
3890 }
3891
3892 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
3893 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3894 kmp_info_t *thread = __kmp_threads[gtid];
3895
3896 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3897 KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
3898 1); // top half must run before bottom half
3899
3900 // We need to wait to make sure the top half is finished
3901 // Spinning here should be ok as this should happen quickly
3902 while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) > 0)
3903 ;
3904
3905 __kmp_release_deps(gtid, taskdata);
3906 __kmp_free_task_and_ancestors(gtid, taskdata, thread);
3907 }
3908
3909 /*!
3910 @ingroup TASKING
3911 @param gtid Global Thread ID of encountering thread
3912 @param ptask Task which execution is completed
3913
3914 Execute the completion of a proxy task from a thread that is part of the
3915 team. Run the top and bottom halves directly.
3916 */ 3917 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) { 3918 KMP_DEBUG_ASSERT(ptask != NULL); 3919 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 3920 KA_TRACE( 3921 10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n", 3922 gtid, taskdata)); 3923 __kmp_assert_valid_gtid(gtid); 3924 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); 3925 3926 __kmp_first_top_half_finish_proxy(taskdata); 3927 __kmp_second_top_half_finish_proxy(taskdata); 3928 __kmp_bottom_half_finish_proxy(gtid, ptask); 3929 3930 KA_TRACE(10, 3931 ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n", 3932 gtid, taskdata)); 3933 } 3934 3935 /*! 3936 @ingroup TASKING 3937 @param ptask Task which execution is completed 3938 3939 Execute the completion of a proxy task from a thread that could not belong to 3940 the team. 3941 */ 3942 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) { 3943 KMP_DEBUG_ASSERT(ptask != NULL); 3944 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 3945 3946 KA_TRACE( 3947 10, 3948 ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n", 3949 taskdata)); 3950 3951 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); 3952 3953 __kmp_first_top_half_finish_proxy(taskdata); 3954 3955 // Enqueue task to complete bottom half completion from a thread within the 3956 // corresponding team 3957 kmp_team_t *team = taskdata->td_team; 3958 kmp_int32 nthreads = team->t.t_nproc; 3959 kmp_info_t *thread; 3960 3961 // This should be similar to start_k = __kmp_get_random( thread ) % nthreads 3962 // but we cannot use __kmp_get_random here 3963 kmp_int32 start_k = 0; 3964 kmp_int32 pass = 1; 3965 kmp_int32 k = start_k; 3966 3967 do { 3968 // For now we're just linearly trying to find a thread 3969 thread = team->t.t_threads[k]; 3970 k = (k + 1) % nthreads; 3971 3972 // we did a full pass through all the threads 3973 if (k == start_k) 3974 pass = pass << 1; 3975 3976 } while (!__kmp_give_task(thread, k, ptask, pass)); 3977 3978 __kmp_second_top_half_finish_proxy(taskdata); 3979 3980 KA_TRACE( 3981 10, 3982 ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n", 3983 taskdata)); 3984 } 3985 3986 kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid, 3987 kmp_task_t *task) { 3988 kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task); 3989 if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) { 3990 td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION; 3991 td->td_allow_completion_event.ed.task = task; 3992 __kmp_init_tas_lock(&td->td_allow_completion_event.lock); 3993 } 3994 return &td->td_allow_completion_event; 3995 } 3996 3997 void __kmp_fulfill_event(kmp_event_t *event) { 3998 if (event->type == KMP_EVENT_ALLOW_COMPLETION) { 3999 kmp_task_t *ptask = event->ed.task; 4000 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 4001 bool detached = false; 4002 int gtid = __kmp_get_gtid(); 4003 4004 // The associated task might have completed or could be completing at this 4005 // point. 
4006 // We need to take the lock to avoid races 4007 __kmp_acquire_tas_lock(&event->lock, gtid); 4008 if (taskdata->td_flags.proxy == TASK_PROXY) { 4009 detached = true; 4010 } else { 4011 #if OMPT_SUPPORT 4012 // The OMPT event must occur under mutual exclusion, 4013 // otherwise the tool might access ptask after free 4014 if (UNLIKELY(ompt_enabled.enabled)) 4015 __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill); 4016 #endif 4017 } 4018 event->type = KMP_EVENT_UNINITIALIZED; 4019 __kmp_release_tas_lock(&event->lock, gtid); 4020 4021 if (detached) { 4022 #if OMPT_SUPPORT 4023 // We free ptask afterwards and know the task is finished, 4024 // so locking is not necessary 4025 if (UNLIKELY(ompt_enabled.enabled)) 4026 __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill); 4027 #endif 4028 // If the task detached complete the proxy task 4029 if (gtid >= 0) { 4030 kmp_team_t *team = taskdata->td_team; 4031 kmp_info_t *thread = __kmp_get_thread(); 4032 if (thread->th.th_team == team) { 4033 __kmpc_proxy_task_completed(gtid, ptask); 4034 return; 4035 } 4036 } 4037 4038 // fallback 4039 __kmpc_proxy_task_completed_ooo(ptask); 4040 } 4041 } 4042 } 4043 4044 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task 4045 // for taskloop 4046 // 4047 // thread: allocating thread 4048 // task_src: pointer to source task to be duplicated 4049 // returns: a pointer to the allocated kmp_task_t structure (task). 4050 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) { 4051 kmp_task_t *task; 4052 kmp_taskdata_t *taskdata; 4053 kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src); 4054 kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task 4055 size_t shareds_offset; 4056 size_t task_size; 4057 4058 KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread, 4059 task_src)); 4060 KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy == 4061 TASK_FULL); // it should not be proxy task 4062 KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT); 4063 task_size = taskdata_src->td_size_alloc; 4064 4065 // Allocate a kmp_taskdata_t block and a kmp_task_t block. 
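// The source task was allocated as one contiguous block of td_size_alloc
// bytes, roughly laid out as (a sketch, not an exact map of every field):
//
//   [ kmp_taskdata_t | kmp_task_t | private data ... | shareds ]
//   ^taskdata_src     ^task_src                       ^task_src->shareds
//
// so the single memcpy below duplicates everything at once, and only the
// pointers that must refer into the new block (e.g. task->shareds) are
// patched up afterwards.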
4066 KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread, 4067 task_size)); 4068 #if USE_FAST_MEMORY 4069 taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size); 4070 #else 4071 taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size); 4072 #endif /* USE_FAST_MEMORY */ 4073 KMP_MEMCPY(taskdata, taskdata_src, task_size); 4074 4075 task = KMP_TASKDATA_TO_TASK(taskdata); 4076 4077 // Initialize new task (only specific fields not affected by memcpy) 4078 taskdata->td_task_id = KMP_GEN_TASK_ID(); 4079 if (task->shareds != NULL) { // need setup shareds pointer 4080 shareds_offset = (char *)task_src->shareds - (char *)taskdata_src; 4081 task->shareds = &((char *)taskdata)[shareds_offset]; 4082 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) == 4083 0); 4084 } 4085 taskdata->td_alloc_thread = thread; 4086 taskdata->td_parent = parent_task; 4087 // task inherits the taskgroup from the parent task 4088 taskdata->td_taskgroup = parent_task->td_taskgroup; 4089 // tied task needs to initialize the td_last_tied at creation, 4090 // untied one does this when it is scheduled for execution 4091 if (taskdata->td_flags.tiedness == TASK_TIED) 4092 taskdata->td_last_tied = taskdata; 4093 4094 // Only need to keep track of child task counts if team parallel and tasking 4095 // not serialized 4096 if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) { 4097 KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks); 4098 if (parent_task->td_taskgroup) 4099 KMP_ATOMIC_INC(&parent_task->td_taskgroup->count); 4100 // Only need to keep track of allocated child tasks for explicit tasks since 4101 // implicit not deallocated 4102 if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) 4103 KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks); 4104 } 4105 4106 KA_TRACE(20, 4107 ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n", 4108 thread, taskdata, taskdata->td_parent)); 4109 #if OMPT_SUPPORT 4110 if (UNLIKELY(ompt_enabled.enabled)) 4111 __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid); 4112 #endif 4113 return task; 4114 } 4115 4116 // Routine optionally generated by the compiler for setting the lastprivate flag 4117 // and calling needed constructors for private/firstprivate objects 4118 // (used to form taskloop tasks from pattern task) 4119 // Parameters: dest task, src task, lastprivate flag. 4120 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32); 4121 4122 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8); 4123 4124 // class to encapsulate manipulating loop bounds in a taskloop task. 4125 // this abstracts away the Intel vs GOMP taskloop interface for setting/getting 4126 // the loop bound variables. 
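//
// Layouts handled by get_lb/get_ub/set_lb/set_ub below:
//   - Intel entry points: lb/ub are two kmp_uint64 fields embedded in the
//     kmp_task_t itself, located via lower_offset/upper_offset.
//   - GOMP (native) entry points: lb/ub are the first two values in
//     task->shareds, each sizeof(long) bytes wide (4 or 8, as recorded in
//     td_size_loop_bounds).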
4127 class kmp_taskloop_bounds_t { 4128 kmp_task_t *task; 4129 const kmp_taskdata_t *taskdata; 4130 size_t lower_offset; 4131 size_t upper_offset; 4132 4133 public: 4134 kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub) 4135 : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)), 4136 lower_offset((char *)lb - (char *)task), 4137 upper_offset((char *)ub - (char *)task) { 4138 KMP_DEBUG_ASSERT((char *)lb > (char *)_task); 4139 KMP_DEBUG_ASSERT((char *)ub > (char *)_task); 4140 } 4141 kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds) 4142 : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)), 4143 lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {} 4144 size_t get_lower_offset() const { return lower_offset; } 4145 size_t get_upper_offset() const { return upper_offset; } 4146 kmp_uint64 get_lb() const { 4147 kmp_int64 retval; 4148 #if defined(KMP_GOMP_COMPAT) 4149 // Intel task just returns the lower bound normally 4150 if (!taskdata->td_flags.native) { 4151 retval = *(kmp_int64 *)((char *)task + lower_offset); 4152 } else { 4153 // GOMP task has to take into account the sizeof(long) 4154 if (taskdata->td_size_loop_bounds == 4) { 4155 kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds); 4156 retval = (kmp_int64)*lb; 4157 } else { 4158 kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds); 4159 retval = (kmp_int64)*lb; 4160 } 4161 } 4162 #else 4163 (void)taskdata; 4164 retval = *(kmp_int64 *)((char *)task + lower_offset); 4165 #endif // defined(KMP_GOMP_COMPAT) 4166 return retval; 4167 } 4168 kmp_uint64 get_ub() const { 4169 kmp_int64 retval; 4170 #if defined(KMP_GOMP_COMPAT) 4171 // Intel task just returns the upper bound normally 4172 if (!taskdata->td_flags.native) { 4173 retval = *(kmp_int64 *)((char *)task + upper_offset); 4174 } else { 4175 // GOMP task has to take into account the sizeof(long) 4176 if (taskdata->td_size_loop_bounds == 4) { 4177 kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1; 4178 retval = (kmp_int64)*ub; 4179 } else { 4180 kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1; 4181 retval = (kmp_int64)*ub; 4182 } 4183 } 4184 #else 4185 retval = *(kmp_int64 *)((char *)task + upper_offset); 4186 #endif // defined(KMP_GOMP_COMPAT) 4187 return retval; 4188 } 4189 void set_lb(kmp_uint64 lb) { 4190 #if defined(KMP_GOMP_COMPAT) 4191 // Intel task just sets the lower bound normally 4192 if (!taskdata->td_flags.native) { 4193 *(kmp_uint64 *)((char *)task + lower_offset) = lb; 4194 } else { 4195 // GOMP task has to take into account the sizeof(long) 4196 if (taskdata->td_size_loop_bounds == 4) { 4197 kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds); 4198 *lower = (kmp_uint32)lb; 4199 } else { 4200 kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds); 4201 *lower = (kmp_uint64)lb; 4202 } 4203 } 4204 #else 4205 *(kmp_uint64 *)((char *)task + lower_offset) = lb; 4206 #endif // defined(KMP_GOMP_COMPAT) 4207 } 4208 void set_ub(kmp_uint64 ub) { 4209 #if defined(KMP_GOMP_COMPAT) 4210 // Intel task just sets the upper bound normally 4211 if (!taskdata->td_flags.native) { 4212 *(kmp_uint64 *)((char *)task + upper_offset) = ub; 4213 } else { 4214 // GOMP task has to take into account the sizeof(long) 4215 if (taskdata->td_size_loop_bounds == 4) { 4216 kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1; 4217 *upper = (kmp_uint32)ub; 4218 } else { 4219 kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1; 4220 *upper = (kmp_uint64)ub; 4221 } 4222 } 4223 #else 4224 *(kmp_uint64 *)((char *)task + 
upper_offset) = ub; 4225 #endif // defined(KMP_GOMP_COMPAT) 4226 } 4227 }; 4228 4229 // __kmp_taskloop_linear: Start tasks of the taskloop linearly 4230 // 4231 // loc Source location information 4232 // gtid Global thread ID 4233 // task Pattern task, exposes the loop iteration range 4234 // lb Pointer to loop lower bound in task structure 4235 // ub Pointer to loop upper bound in task structure 4236 // st Loop stride 4237 // ub_glob Global upper bound (used for lastprivate check) 4238 // num_tasks Number of tasks to execute 4239 // grainsize Number of loop iterations per task 4240 // extras Number of chunks with grainsize+1 iterations 4241 // last_chunk Reduction of grainsize for last task 4242 // tc Iterations count 4243 // task_dup Tasks duplication routine 4244 // codeptr_ra Return address for OMPT events 4245 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task, 4246 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, 4247 kmp_uint64 ub_glob, kmp_uint64 num_tasks, 4248 kmp_uint64 grainsize, kmp_uint64 extras, 4249 kmp_int64 last_chunk, kmp_uint64 tc, 4250 #if OMPT_SUPPORT 4251 void *codeptr_ra, 4252 #endif 4253 void *task_dup) { 4254 KMP_COUNT_BLOCK(OMP_TASKLOOP); 4255 KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling); 4256 p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; 4257 // compiler provides global bounds here 4258 kmp_taskloop_bounds_t task_bounds(task, lb, ub); 4259 kmp_uint64 lower = task_bounds.get_lb(); 4260 kmp_uint64 upper = task_bounds.get_ub(); 4261 kmp_uint64 i; 4262 kmp_info_t *thread = __kmp_threads[gtid]; 4263 kmp_taskdata_t *current_task = thread->th.th_current_task; 4264 kmp_task_t *next_task; 4265 kmp_int32 lastpriv = 0; 4266 4267 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + 4268 (last_chunk < 0 ? last_chunk : extras)); 4269 KMP_DEBUG_ASSERT(num_tasks > extras); 4270 KMP_DEBUG_ASSERT(num_tasks > 0); 4271 KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, " 4272 "extras %lld, last_chunk %lld, i=%lld,%lld(%d)%lld, dup %p\n", 4273 gtid, num_tasks, grainsize, extras, last_chunk, lower, upper, 4274 ub_glob, st, task_dup)); 4275 4276 // Launch num_tasks tasks, assign grainsize iterations each task 4277 for (i = 0; i < num_tasks; ++i) { 4278 kmp_uint64 chunk_minus_1; 4279 if (extras == 0) { 4280 chunk_minus_1 = grainsize - 1; 4281 } else { 4282 chunk_minus_1 = grainsize; 4283 --extras; // first extras iterations get bigger chunk (grainsize+1) 4284 } 4285 upper = lower + st * chunk_minus_1; 4286 if (upper > *ub) { 4287 upper = *ub; 4288 } 4289 if (i == num_tasks - 1) { 4290 // schedule the last task, set lastprivate flag if needed 4291 if (st == 1) { // most common case 4292 KMP_DEBUG_ASSERT(upper == *ub); 4293 if (upper == ub_glob) 4294 lastpriv = 1; 4295 } else if (st > 0) { // positive loop stride 4296 KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper); 4297 if ((kmp_uint64)st > ub_glob - upper) 4298 lastpriv = 1; 4299 } else { // negative loop stride 4300 KMP_DEBUG_ASSERT(upper + st < *ub); 4301 if (upper - ub_glob < (kmp_uint64)(-st)) 4302 lastpriv = 1; 4303 } 4304 } 4305 next_task = __kmp_task_dup_alloc(thread, task); // allocate new task 4306 kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task); 4307 kmp_taskloop_bounds_t next_task_bounds = 4308 kmp_taskloop_bounds_t(next_task, task_bounds); 4309 4310 // adjust task-specific bounds 4311 next_task_bounds.set_lb(lower); 4312 if (next_taskdata->td_flags.native) { 4313 next_task_bounds.set_ub(upper + (st > 0 ? 
1 : -1)); 4314 } else { 4315 next_task_bounds.set_ub(upper); 4316 } 4317 if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates, 4318 // etc. 4319 ptask_dup(next_task, task, lastpriv); 4320 KA_TRACE(40, 4321 ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, " 4322 "upper %lld stride %lld, (offsets %p %p)\n", 4323 gtid, i, next_task, lower, upper, st, 4324 next_task_bounds.get_lower_offset(), 4325 next_task_bounds.get_upper_offset())); 4326 #if OMPT_SUPPORT 4327 __kmp_omp_taskloop_task(NULL, gtid, next_task, 4328 codeptr_ra); // schedule new task 4329 #else 4330 __kmp_omp_task(gtid, next_task, true); // schedule new task 4331 #endif 4332 lower = upper + st; // adjust lower bound for the next iteration 4333 } 4334 // free the pattern task and exit 4335 __kmp_task_start(gtid, task, current_task); // make internal bookkeeping 4336 // do not execute the pattern task, just do internal bookkeeping 4337 __kmp_task_finish<false>(gtid, task, current_task); 4338 } 4339 4340 // Structure to keep taskloop parameters for auxiliary task 4341 // kept in the shareds of the task structure. 4342 typedef struct __taskloop_params { 4343 kmp_task_t *task; 4344 kmp_uint64 *lb; 4345 kmp_uint64 *ub; 4346 void *task_dup; 4347 kmp_int64 st; 4348 kmp_uint64 ub_glob; 4349 kmp_uint64 num_tasks; 4350 kmp_uint64 grainsize; 4351 kmp_uint64 extras; 4352 kmp_int64 last_chunk; 4353 kmp_uint64 tc; 4354 kmp_uint64 num_t_min; 4355 #if OMPT_SUPPORT 4356 void *codeptr_ra; 4357 #endif 4358 } __taskloop_params_t; 4359 4360 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *, 4361 kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64, 4362 kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64, 4363 kmp_uint64, 4364 #if OMPT_SUPPORT 4365 void *, 4366 #endif 4367 void *); 4368 4369 // Execute part of the taskloop submitted as a task. 
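// This is the entry routine of the auxiliary task created by
// __kmp_taskloop_recur() below: the parameters describing the deferred half
// of the iteration space travel in the task's shareds as a
// __taskloop_params_t and are unpacked here before either recursing again or
// falling back to the linear scheme.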
// Execute part of the taskloop submitted as a task.
int __kmp_taskloop_task(int gtid, void *ptask) {
  __taskloop_params_t *p =
      (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
  kmp_task_t *task = p->task;
  kmp_uint64 *lb = p->lb;
  kmp_uint64 *ub = p->ub;
  void *task_dup = p->task_dup;
  // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
  kmp_int64 st = p->st;
  kmp_uint64 ub_glob = p->ub_glob;
  kmp_uint64 num_tasks = p->num_tasks;
  kmp_uint64 grainsize = p->grainsize;
  kmp_uint64 extras = p->extras;
  kmp_int64 last_chunk = p->last_chunk;
  kmp_uint64 tc = p->tc;
  kmp_uint64 num_t_min = p->num_t_min;
#if OMPT_SUPPORT
  void *codeptr_ra = p->codeptr_ra;
#endif
#if KMP_DEBUG
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  KMP_DEBUG_ASSERT(task != NULL);
  KA_TRACE(20,
           ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
            " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%lld), dup %p\n",
            gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
            st, task_dup));
#endif
  KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
  if (num_tasks > num_t_min)
    __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
                         grainsize, extras, last_chunk, tc, num_t_min,
#if OMPT_SUPPORT
                         codeptr_ra,
#endif
                         task_dup);
  else
    __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
                          grainsize, extras, last_chunk, tc,
#if OMPT_SUPPORT
                          codeptr_ra,
#endif
                          task_dup);

  KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
  return 0;
}

// Schedule part of the taskloop as a task,
// execute the rest of the taskloop.
//
// loc        Source location information
// gtid       Global thread ID
// task       Pattern task, exposes the loop iteration range
// lb         Pointer to loop lower bound in task structure
// ub         Pointer to loop upper bound in task structure
// st         Loop stride
// ub_glob    Global upper bound (used for lastprivate check)
// num_tasks  Number of tasks to execute
// grainsize  Number of loop iterations per task
// extras     Number of chunks with grainsize+1 iterations
// last_chunk Reduction of grainsize for last task
// tc         Iteration count
// num_t_min  Threshold to launch tasks recursively
// task_dup   Tasks duplication routine
// codeptr_ra Return address for OMPT events
void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
                          kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                          kmp_uint64 ub_glob, kmp_uint64 num_tasks,
                          kmp_uint64 grainsize, kmp_uint64 extras,
                          kmp_int64 last_chunk, kmp_uint64 tc,
                          kmp_uint64 num_t_min,
#if OMPT_SUPPORT
                          void *codeptr_ra,
#endif
                          void *task_dup) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  KMP_DEBUG_ASSERT(task != NULL);
  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
  KA_TRACE(20,
           ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
            " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%lld), dup %p\n",
            gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
            st, task_dup));
  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
  kmp_uint64 lower = *lb;
  kmp_info_t *thread = __kmp_threads[gtid];
  // kmp_taskdata_t *current_task = thread->th.th_current_task;
  kmp_task_t *next_task;
  size_t lower_offset =
      (char *)lb - (char *)task; // remember offset of lb in the task structure
  size_t upper_offset =
      (char *)ub - (char *)task; // remember offset of ub in the task structure
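  // The compiler stores the loop bounds inside the task structure itself, so
  // only their byte offsets are recorded here; the same offsets are reused
  // below to patch the bounds of the duplicated pattern task, e.g.
  // *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1.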

  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
                             (last_chunk < 0 ? last_chunk : extras));
  KMP_DEBUG_ASSERT(num_tasks > extras);
  KMP_DEBUG_ASSERT(num_tasks > 0);

  // split the loop into two halves
  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
  kmp_int64 last_chunk0 = 0, last_chunk1 = 0;
  kmp_uint64 gr_size0 = grainsize;
  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
  if (last_chunk < 0) {
    ext0 = ext1 = 0;
    last_chunk1 = last_chunk;
    tc0 = grainsize * n_tsk0;
    tc1 = tc - tc0;
  } else if (n_tsk0 <= extras) {
    gr_size0++; // integrate extras into grainsize
    ext0 = 0; // no extra iters in 1st half
    ext1 = extras - n_tsk0; // remaining extras
    tc0 = gr_size0 * n_tsk0;
    tc1 = tc - tc0;
  } else { // n_tsk0 > extras
    ext1 = 0; // no extra iters in 2nd half
    ext0 = extras;
    tc1 = grainsize * n_tsk1;
    tc0 = tc - tc1;
  }
  ub0 = lower + st * (tc0 - 1);
  lb1 = ub0 + st;

  // create pattern task for 2nd half of the loop
  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
  // adjust lower bound (upper bound is not changed) for the 2nd half
  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
  if (ptask_dup != NULL) // construct firstprivates, etc.
    ptask_dup(next_task, task, 0);
  *ub = ub0; // adjust upper bound for the 1st half

  // create auxiliary task for 2nd half of the loop
  // make sure new task has same parent task as the pattern task
  kmp_taskdata_t *current_task = thread->th.th_current_task;
  thread->th.th_current_task = taskdata->td_parent;
  kmp_task_t *new_task =
      __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
                            sizeof(__taskloop_params_t), &__kmp_taskloop_task);
  // restore current task
  thread->th.th_current_task = current_task;
  __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
  p->task = next_task;
  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
  p->task_dup = task_dup;
  p->st = st;
  p->ub_glob = ub_glob;
  p->num_tasks = n_tsk1;
  p->grainsize = grainsize;
  p->extras = ext1;
  p->last_chunk = last_chunk1;
  p->tc = tc1;
  p->num_t_min = num_t_min;
#if OMPT_SUPPORT
  p->codeptr_ra = codeptr_ra;
#endif

#if OMPT_SUPPORT
  // schedule new task with correct return address for OMPT events
  __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
#else
  __kmp_omp_task(gtid, new_task, true); // schedule new task
#endif

  // execute the 1st half of current subrange
  if (n_tsk0 > num_t_min)
    __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
                         ext0, last_chunk0, tc0, num_t_min,
#if OMPT_SUPPORT
                         codeptr_ra,
#endif
                         task_dup);
  else
    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
                          gr_size0, ext0, last_chunk0, tc0,
#if OMPT_SUPPORT
                          codeptr_ra,
#endif
                          task_dup);

  KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
}

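// Illustrative worked example of the halving done by __kmp_taskloop_recur
// above: with num_tasks = 9, grainsize = 4, extras = 3 (tc = 9 * 4 + 3 = 39),
// the split gives n_tsk0 = 4 and n_tsk1 = 5. Since n_tsk0 > extras, the first
// half keeps all the extras (ext0 = 3, ext1 = 0), tc1 = 4 * 5 = 20 and
// tc0 = 39 - 20 = 19. The second half is packed into an auxiliary task that
// runs __kmp_taskloop_task, while the first half is split further or, once
// n_tsk0 <= num_t_min, scheduled linearly.
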
static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
                           kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                           int nogroup, int sched, kmp_uint64 grainsize,
                           int modifier, void *task_dup) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  KMP_DEBUG_ASSERT(task != NULL);
  if (nogroup == 0) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
    OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
    __kmpc_taskgroup(loc, gtid);
  }

  // =========================================================================
  // calculate loop parameters
  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
  kmp_uint64 tc;
  // compiler provides global bounds here
  kmp_uint64 lower = task_bounds.get_lb();
  kmp_uint64 upper = task_bounds.get_ub();
  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
  kmp_uint64 num_tasks = 0, extras = 0;
  kmp_int64 last_chunk =
      0; // reduce grainsize of last task by last_chunk in strict mode
  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *current_task = thread->th.th_current_task;

  KA_TRACE(20, ("__kmp_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
                "grain %llu(%d, %d), dup %p\n",
                gtid, taskdata, lower, upper, st, grainsize, sched, modifier,
                task_dup));

  // compute trip count
  if (st == 1) { // most common case
    tc = upper - lower + 1;
  } else if (st < 0) {
    tc = (lower - upper) / (-st) + 1;
  } else { // st > 0
    tc = (upper - lower) / st + 1;
  }
  if (tc == 0) {
    KA_TRACE(20, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid));
    // free the pattern task and exit
    __kmp_task_start(gtid, task, current_task);
    // do not execute anything for zero-trip loop
    __kmp_task_finish<false>(gtid, task, current_task);
    return;
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
  if (ompt_enabled.ompt_callback_work) {
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif

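  // Worked example (informal): for lb = 0, ub = 999, st = 1 the trip count
  // computed above is tc = 1000. The switch below then derives the schedule:
  //  - num_tasks(8)           (sched == 2): num_tasks = 8, grainsize = 125;
  //  - grainsize(300)         (sched == 1): num_tasks = 3, grainsize = 333,
  //    extras = 1, so one task runs 334 iterations;
  //  - grainsize(strict: 300) (sched == 1, modifier != 0): num_tasks = 4,
  //    last_chunk = 1000 - 4 * 300 = -200, so the last task runs only 100
  //    iterations while the others run exactly 300.
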
  if (num_tasks_min == 0)
    // TODO: can we choose better default heuristic?
    num_tasks_min =
        KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);

  // compute num_tasks/grainsize based on the input provided
  switch (sched) {
  case 0: // no schedule clause specified, we can choose the default
    // let's try to schedule (team_size*10) tasks
    grainsize = thread->th.th_team_nproc * 10;
    KMP_FALLTHROUGH();
  case 2: // num_tasks provided
    if (grainsize > tc) {
      num_tasks = tc; // too big num_tasks requested, adjust values
      grainsize = 1;
      extras = 0;
    } else {
      num_tasks = grainsize;
      grainsize = tc / num_tasks;
      extras = tc % num_tasks;
    }
    break;
  case 1: // grainsize provided
    if (grainsize > tc) {
      num_tasks = 1;
      grainsize = tc; // too big grainsize requested, adjust values
      extras = 0;
    } else {
      if (modifier) {
        num_tasks = (tc + grainsize - 1) / grainsize;
        last_chunk = tc - (num_tasks * grainsize);
        extras = 0;
      } else {
        num_tasks = tc / grainsize;
        // adjust grainsize for balanced distribution of iterations
        grainsize = tc / num_tasks;
        extras = tc % num_tasks;
      }
    }
    break;
  default:
    KMP_ASSERT2(0, "unknown scheduling of taskloop");
  }

  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
                             (last_chunk < 0 ? last_chunk : extras));
  KMP_DEBUG_ASSERT(num_tasks > extras);
  KMP_DEBUG_ASSERT(num_tasks > 0);
  // =========================================================================

  // check the value of the if clause first
  // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native)
  if (if_val == 0) { // if(0) specified, mark task as serial
    taskdata->td_flags.task_serial = 1;
    taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
    // always start serial tasks linearly
    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
                          grainsize, extras, last_chunk, tc,
#if OMPT_SUPPORT
                          OMPT_GET_RETURN_ADDRESS(0),
#endif
                          task_dup);
    // !taskdata->td_flags.native => currently force linear spawning of tasks
    // for GOMP_taskloop
  } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
    KA_TRACE(20, ("__kmp_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
                  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
                  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
                  last_chunk));
    __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
                         grainsize, extras, last_chunk, tc, num_tasks_min,
#if OMPT_SUPPORT
                         OMPT_GET_RETURN_ADDRESS(0),
#endif
                         task_dup);
  } else {
    KA_TRACE(20, ("__kmp_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
                  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
                  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
                  last_chunk));
    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
                          grainsize, extras, last_chunk, tc,
#if OMPT_SUPPORT
                          OMPT_GET_RETURN_ADDRESS(0),
#endif
                          task_dup);
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
        &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif

  if (nogroup == 0) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
    OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
    __kmpc_end_taskgroup(loc, gtid);
  }
  KA_TRACE(20, ("__kmp_taskloop(exit): T#%d\n", gtid));
}

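// Example mapping (informal): "#pragma omp taskloop num_tasks(8)" reaches
// __kmp_taskloop with sched = 2 and grainsize = 8, while
// "#pragma omp taskloop grainsize(strict: 100)" arrives with sched = 1,
// grainsize = 100 and modifier = 1; with no schedule clause sched = 0 and the
// runtime targets roughly team_size * 10 tasks. Whether the tasks are then
// spawned recursively or linearly depends on num_tasks_min, the native flag,
// and the if clause, as handled above.
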
/*!
@ingroup TASKING
@param loc       Source location information
@param gtid      Global thread ID
@param task      Task structure
@param if_val    Value of the if clause
@param lb        Pointer to loop lower bound in task structure
@param ub        Pointer to loop upper bound in task structure
@param st        Loop stride
@param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
@param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
@param grainsize Schedule value if specified
@param task_dup  Tasks duplication routine

Execute the taskloop construct.
*/
void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
                     kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
                     int sched, kmp_uint64 grainsize, void *task_dup) {
  __kmp_assert_valid_gtid(gtid);
  KA_TRACE(20, ("__kmpc_taskloop(enter): T#%d\n", gtid));
  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
                 0, task_dup);
  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
}

/*!
@ingroup TASKING
@param loc       Source location information
@param gtid      Global thread ID
@param task      Task structure
@param if_val    Value of the if clause
@param lb        Pointer to loop lower bound in task structure
@param ub        Pointer to loop upper bound in task structure
@param st        Loop stride
@param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
@param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
@param grainsize Schedule value if specified
@param modifier  Modifier 'strict' for sched, 1 if present, 0 otherwise
@param task_dup  Tasks duplication routine

Execute the taskloop construct.
*/
void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
                       kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                       int nogroup, int sched, kmp_uint64 grainsize,
                       int modifier, void *task_dup) {
  __kmp_assert_valid_gtid(gtid);
  KA_TRACE(20, ("__kmpc_taskloop_5(enter): T#%d\n", gtid));
  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
                 modifier, task_dup);
  KA_TRACE(20, ("__kmpc_taskloop_5(exit): T#%d\n", gtid));
}
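
// Illustrative sketch (not built): roughly how a compiler-lowered
// "#pragma omp taskloop grainsize(strict: 4)" over i = 0..99 could reach
// __kmpc_taskloop_5. 'loc', 'task_entry', 'task_dup', the flag value, the
// allocation sizes and the bound offsets below are placeholders for whatever
// the compiler actually emits; only the entry-point signatures are taken from
// the code above.
#if 0
static void __example_lowered_taskloop(ident_t *loc, kmp_int32 gtid,
                                       kmp_routine_entry_t task_entry,
                                       void *task_dup, size_t lb_offset,
                                       size_t ub_offset) {
  // Allocate the pattern task; its outlined body reads the bounds stored in
  // the task structure at lb_offset/ub_offset.
  kmp_task_t *task = __kmpc_omp_task_alloc(
      loc, gtid, /*flags=*/1, sizeof(kmp_task_t) + 2 * sizeof(kmp_uint64),
      /*sizeof_shareds=*/0, task_entry);
  kmp_uint64 *lb = (kmp_uint64 *)((char *)task + lb_offset);
  kmp_uint64 *ub = (kmp_uint64 *)((char *)task + ub_offset);
  *lb = 0;  // first iteration
  *ub = 99; // last iteration (inclusive)
  __kmpc_taskloop_5(loc, gtid, task, /*if_val=*/1, lb, ub, /*st=*/1,
                    /*nogroup=*/0, /*sched=*/1, /*grainsize=*/4,
                    /*modifier=*/1, task_dup);
}
#endif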