1 /* 2 * kmp_tasking.cpp -- OpenMP 3.0 tasking support. 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 8 // See https://llvm.org/LICENSE.txt for license information. 9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "kmp.h" 14 #include "kmp_i18n.h" 15 #include "kmp_itt.h" 16 #include "kmp_stats.h" 17 #include "kmp_wait_release.h" 18 #include "kmp_taskdeps.h" 19 20 #if OMPT_SUPPORT 21 #include "ompt-specific.h" 22 #endif 23 24 /* forward declaration */ 25 static void __kmp_enable_tasking(kmp_task_team_t *task_team, 26 kmp_info_t *this_thr); 27 static void __kmp_alloc_task_deque(kmp_info_t *thread, 28 kmp_thread_data_t *thread_data); 29 static int __kmp_realloc_task_threads_data(kmp_info_t *thread, 30 kmp_task_team_t *task_team); 31 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask); 32 33 #ifdef BUILD_TIED_TASK_STACK 34 35 // __kmp_trace_task_stack: print the tied tasks from the task stack in order 36 // from top do bottom 37 // 38 // gtid: global thread identifier for thread containing stack 39 // thread_data: thread data for task team thread containing stack 40 // threshold: value above which the trace statement triggers 41 // location: string identifying call site of this function (for trace) 42 static void __kmp_trace_task_stack(kmp_int32 gtid, 43 kmp_thread_data_t *thread_data, 44 int threshold, char *location) { 45 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 46 kmp_taskdata_t **stack_top = task_stack->ts_top; 47 kmp_int32 entries = task_stack->ts_entries; 48 kmp_taskdata_t *tied_task; 49 50 KA_TRACE( 51 threshold, 52 ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, " 53 "first_block = %p, stack_top = %p \n", 54 location, gtid, entries, 
task_stack->ts_first_block, stack_top)); 55 56 KMP_DEBUG_ASSERT(stack_top != NULL); 57 KMP_DEBUG_ASSERT(entries > 0); 58 59 while (entries != 0) { 60 KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]); 61 // fix up ts_top if we need to pop from previous block 62 if (entries & TASK_STACK_INDEX_MASK == 0) { 63 kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top); 64 65 stack_block = stack_block->sb_prev; 66 stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE]; 67 } 68 69 // finish bookkeeping 70 stack_top--; 71 entries--; 72 73 tied_task = *stack_top; 74 75 KMP_DEBUG_ASSERT(tied_task != NULL); 76 KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED); 77 78 KA_TRACE(threshold, 79 ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, " 80 "stack_top=%p, tied_task=%p\n", 81 location, gtid, entries, stack_top, tied_task)); 82 } 83 KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]); 84 85 KA_TRACE(threshold, 86 ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n", 87 location, gtid)); 88 } 89 90 // __kmp_init_task_stack: initialize the task stack for the first time 91 // after a thread_data structure is created. 92 // It should not be necessary to do this again (assuming the stack works). 
93 // 94 // gtid: global thread identifier of calling thread 95 // thread_data: thread data for task team thread containing stack 96 static void __kmp_init_task_stack(kmp_int32 gtid, 97 kmp_thread_data_t *thread_data) { 98 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 99 kmp_stack_block_t *first_block; 100 101 // set up the first block of the stack 102 first_block = &task_stack->ts_first_block; 103 task_stack->ts_top = (kmp_taskdata_t **)first_block; 104 memset((void *)first_block, '\0', 105 TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *)); 106 107 // initialize the stack to be empty 108 task_stack->ts_entries = TASK_STACK_EMPTY; 109 first_block->sb_next = NULL; 110 first_block->sb_prev = NULL; 111 } 112 113 // __kmp_free_task_stack: free the task stack when thread_data is destroyed. 114 // 115 // gtid: global thread identifier for calling thread 116 // thread_data: thread info for thread containing stack 117 static void __kmp_free_task_stack(kmp_int32 gtid, 118 kmp_thread_data_t *thread_data) { 119 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 120 kmp_stack_block_t *stack_block = &task_stack->ts_first_block; 121 122 KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY); 123 // free from the second block of the stack 124 while (stack_block != NULL) { 125 kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL; 126 127 stack_block->sb_next = NULL; 128 stack_block->sb_prev = NULL; 129 if (stack_block != &task_stack->ts_first_block) { 130 __kmp_thread_free(thread, 131 stack_block); // free the block, if not the first 132 } 133 stack_block = next_block; 134 } 135 // initialize the stack to be empty 136 task_stack->ts_entries = 0; 137 task_stack->ts_top = NULL; 138 } 139 140 // __kmp_push_task_stack: Push the tied task onto the task stack. 141 // Grow the stack if necessary by allocating another block. 
142 // 143 // gtid: global thread identifier for calling thread 144 // thread: thread info for thread containing stack 145 // tied_task: the task to push on the stack 146 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread, 147 kmp_taskdata_t *tied_task) { 148 // GEH - need to consider what to do if tt_threads_data not allocated yet 149 kmp_thread_data_t *thread_data = 150 &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)]; 151 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 152 153 if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) { 154 return; // Don't push anything on stack if team or team tasks are serialized 155 } 156 157 KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED); 158 KMP_DEBUG_ASSERT(task_stack->ts_top != NULL); 159 160 KA_TRACE(20, 161 ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n", 162 gtid, thread, tied_task)); 163 // Store entry 164 *(task_stack->ts_top) = tied_task; 165 166 // Do bookkeeping for next push 167 task_stack->ts_top++; 168 task_stack->ts_entries++; 169 170 if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) { 171 // Find beginning of this task block 172 kmp_stack_block_t *stack_block = 173 (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE); 174 175 // Check if we already have a block 176 if (stack_block->sb_next != 177 NULL) { // reset ts_top to beginning of next block 178 task_stack->ts_top = &stack_block->sb_next->sb_block[0]; 179 } else { // Alloc new block and link it up 180 kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc( 181 thread, sizeof(kmp_stack_block_t)); 182 183 task_stack->ts_top = &new_block->sb_block[0]; 184 stack_block->sb_next = new_block; 185 new_block->sb_prev = stack_block; 186 new_block->sb_next = NULL; 187 188 KA_TRACE( 189 30, 190 ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n", 191 gtid, tied_task, new_block)); 192 } 193 } 194 KA_TRACE(20, 
("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid, 195 tied_task)); 196 } 197 198 // __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return 199 // the task, just check to make sure it matches the ending task passed in. 200 // 201 // gtid: global thread identifier for the calling thread 202 // thread: thread info structure containing stack 203 // tied_task: the task popped off the stack 204 // ending_task: the task that is ending (should match popped task) 205 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread, 206 kmp_taskdata_t *ending_task) { 207 // GEH - need to consider what to do if tt_threads_data not allocated yet 208 kmp_thread_data_t *thread_data = 209 &thread->th.th_task_team->tt_threads_data[__kmp_tid_from_gtid(gtid)]; 210 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 211 kmp_taskdata_t *tied_task; 212 213 if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) { 214 // Don't pop anything from stack if team or team tasks are serialized 215 return; 216 } 217 218 KMP_DEBUG_ASSERT(task_stack->ts_top != NULL); 219 KMP_DEBUG_ASSERT(task_stack->ts_entries > 0); 220 221 KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid, 222 thread)); 223 224 // fix up ts_top if we need to pop from previous block 225 if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) { 226 kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top); 227 228 stack_block = stack_block->sb_prev; 229 task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE]; 230 } 231 232 // finish bookkeeping 233 task_stack->ts_top--; 234 task_stack->ts_entries--; 235 236 tied_task = *(task_stack->ts_top); 237 238 KMP_DEBUG_ASSERT(tied_task != NULL); 239 KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED); 240 KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly 241 242 KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid, 
243 tied_task)); 244 return; 245 } 246 #endif /* BUILD_TIED_TASK_STACK */ 247 248 // returns 1 if new task is allowed to execute, 0 otherwise 249 // checks Task Scheduling constraint (if requested) and 250 // mutexinoutset dependencies if any 251 static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained, 252 const kmp_taskdata_t *tasknew, 253 const kmp_taskdata_t *taskcurr) { 254 if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) { 255 // Check if the candidate obeys the Task Scheduling Constraints (TSC) 256 // only descendant of all deferred tied tasks can be scheduled, checking 257 // the last one is enough, as it in turn is the descendant of all others 258 kmp_taskdata_t *current = taskcurr->td_last_tied; 259 KMP_DEBUG_ASSERT(current != NULL); 260 // check if the task is not suspended on barrier 261 if (current->td_flags.tasktype == TASK_EXPLICIT || 262 current->td_taskwait_thread > 0) { // <= 0 on barrier 263 kmp_int32 level = current->td_level; 264 kmp_taskdata_t *parent = tasknew->td_parent; 265 while (parent != current && parent->td_level > level) { 266 // check generation up to the level of the current task 267 parent = parent->td_parent; 268 KMP_DEBUG_ASSERT(parent != NULL); 269 } 270 if (parent != current) 271 return false; 272 } 273 } 274 // Check mutexinoutset dependencies, acquire locks 275 kmp_depnode_t *node = tasknew->td_depnode; 276 if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) { 277 for (int i = 0; i < node->dn.mtx_num_locks; ++i) { 278 KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL); 279 if (__kmp_test_lock(node->dn.mtx_locks[i], gtid)) 280 continue; 281 // could not get the lock, release previous locks 282 for (int j = i - 1; j >= 0; --j) 283 __kmp_release_lock(node->dn.mtx_locks[j], gtid); 284 return false; 285 } 286 // negative num_locks means all locks acquired successfully 287 node->dn.mtx_num_locks = -node->dn.mtx_num_locks; 288 } 289 return true; 290 } 291 292 // __kmp_realloc_task_deque: 293 // 
// Re-allocates a task deque for a particular thread, copies the content from
// the old deque and adjusts the necessary data structures relating to the
// deque. This operation must be done with the deque_lock being held
static void __kmp_realloc_task_deque(kmp_info_t *thread,
                                     kmp_thread_data_t *thread_data) {
  kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
  // Only grown when completely full, so the old deque holds exactly 'size'
  // tasks.
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
  kmp_int32 new_size = 2 * size;

  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
                "%d] for thread_data %p\n",
                __kmp_gtid_from_thread(thread), size, new_size, thread_data));

  kmp_taskdata_t **new_deque =
      (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));

  // Copy in ring order starting at the old head, so the new deque is linear
  // with head at index 0 and tail at 'size'.
  int i, j;
  for (i = thread_data->td.td_deque_head, j = 0; j < size;
       i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
    new_deque[j] = thread_data->td.td_deque[i];

  __kmp_free(thread_data->td.td_deque);

  thread_data->td.td_deque_head = 0;
  thread_data->td.td_deque_tail = size;
  thread_data->td.td_deque = new_deque;
  thread_data->td.td_deque_size = new_size;
}

// __kmp_push_task: Add a task to the thread's deque
// Returns TASK_SUCCESSFULLY_PUSHED, or TASK_NOT_PUSHED when the caller must
// execute the task immediately (serialized team, or full deque with
// throttling enabled).
static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);

  // We don't need to map to shadow gtid if it is already hidden helper thread
  if (taskdata->td_flags.hidden_helper && !KMP_HIDDEN_HELPER_THREAD(gtid)) {
    gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
    thread = __kmp_threads[gtid];
  }

  kmp_task_team_t *task_team = thread->th.th_task_team;
  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
  kmp_thread_data_t *thread_data;

  KA_TRACE(20,
           ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));

  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(
        20,
        ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
         gtid, counter, taskdata));
  }

  // The first check avoids building task_team thread data if serialized
  if (UNLIKELY(taskdata->td_flags.task_serial)) {
    KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
                  "TASK_NOT_PUSHED for task %p\n",
                  gtid, taskdata));
    return TASK_NOT_PUSHED;
  }

  // Now that serialized tasks have returned, we can assume that we are not in
  // immediate exec mode
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  if (UNLIKELY(!KMP_TASKING_ENABLED(task_team))) {
    __kmp_enable_tasking(task_team, thread);
  }
  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);

  // Find tasking deque specific to encountering thread
  thread_data = &task_team->tt.tt_threads_data[tid];

  // No lock needed since only owner can allocate. If the task is hidden_helper,
  // we don't need it either because we have initialized the dequeue for hidden
  // helper thread data.
  if (UNLIKELY(thread_data->td.td_deque == NULL)) {
    __kmp_alloc_task_deque(thread, thread_data);
  }

  int locked = 0;
  // Check if deque is full
  // NOTE: when __kmp_task_is_allowed succeeds it may have acquired the task's
  // mutexinoutset locks; the caller that receives TASK_NOT_PUSHED then runs
  // the task immediately holding them.
  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    if (__kmp_enable_task_throttling &&
        __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                              thread->th.th_current_task)) {
      KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
                    "TASK_NOT_PUSHED for task %p\n",
                    gtid, taskdata));
      return TASK_NOT_PUSHED;
    } else {
      __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
      locked = 1;
      if (TCR_4(thread_data->td.td_deque_ntasks) >=
          TASK_DEQUE_SIZE(thread_data->td)) {
        // expand deque to push the task which is not allowed to execute
        __kmp_realloc_task_deque(thread, thread_data);
      }
    }
  }
  // Lock the deque for the task push operation
  if (!locked) {
    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
    // Need to recheck as we can get a proxy task from thread outside of OpenMP
    if (TCR_4(thread_data->td.td_deque_ntasks) >=
        TASK_DEQUE_SIZE(thread_data->td)) {
      if (__kmp_enable_task_throttling &&
          __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                                thread->th.th_current_task)) {
        __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
        KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
                      "returning TASK_NOT_PUSHED for task %p\n",
                      gtid, taskdata));
        return TASK_NOT_PUSHED;
      } else {
        // expand deque to push the task which is not allowed to execute
        __kmp_realloc_task_deque(thread, thread_data);
      }
    }
  }
  // Must have room since no thread can add tasks but calling thread
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
                   TASK_DEQUE_SIZE(thread_data->td));

  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
      taskdata; // Push taskdata
  // Wrap index.
  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
  KMP_FSYNC_RELEASING(taskdata); // releasing child
  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
                "task=%p ntasks=%d head=%u tail=%u\n",
                gtid, taskdata, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));

  // Read the flag before releasing the lock: once unlocked, the task can be
  // stolen, executed and freed, so taskdata must not be dereferenced below.
  auto hidden_helper = taskdata->td_flags.hidden_helper;

  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  // Signal one worker thread to execute the task
  if (UNLIKELY(hidden_helper)) {
    // Wake hidden helper threads up if they're sleeping
    __kmp_hidden_helper_worker_thread_signal();
  }

  return TASK_SUCCESSFULLY_PUSHED;
}

// __kmp_pop_current_task_from_thread: set up current task from called thread
// when team ends
//
// this_thr: thread structure to set current_task in.
void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));

  // Restore the parent task as the thread's current task.
  this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;

  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));
}

// __kmp_push_current_task_to_thread: set up current task in called thread for a
// new team
//
// this_thr: thread structure to set up
// team: team for implicit task data
// tid: thread within team to set up
void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
                                       int tid) {
  // current task of the thread is a parent of the new just created implicit
  // tasks of new team
  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));

  KMP_DEBUG_ASSERT(this_thr != NULL);

  if (tid == 0) {
    // Primary thread: link its current task as parent of the team's implicit
    // task 0 (unless it already is the implicit task).
    if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
      team->t.t_implicit_task_taskdata[0].td_parent =
          this_thr->th.th_current_task;
      this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
    }
  } else {
    // Worker threads: inherit the parent recorded for implicit task 0.
    team->t.t_implicit_task_taskdata[tid].td_parent =
        team->t.t_implicit_task_taskdata[0].td_parent;
    this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
  }

  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));
}

// __kmp_task_start: bookkeeping for a task starting execution
//
// GTID: global thread id of calling thread
// task: task starting execution
// current_task: task suspending
static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
                             kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];

  KA_TRACE(10,
           ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
            gtid, taskdata, current_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  // mark currently executing task as suspended
  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
  current_task->td_flags.executing = 0;

// Add task to stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_push_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  // mark starting task as executing and as current task
  thread->th.th_current_task = taskdata;

  // An untied task may be (re)started more than once; tied tasks start exactly
  // once, hence the relaxed assertions below.
  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  taskdata->td_flags.started = 1;
  taskdata->td_flags.executing = 1;
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  // GEH TODO: shouldn't we pass some sort of location identifier here?
  // APT: yes, we will pass location here.
  // need to store current thread state (in a thread or taskdata structure)
  // before setting work_state, otherwise wrong state is set after end of task

  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));

  return;
}

#if OMPT_SUPPORT
//------------------------------------------------------------------------------
// __ompt_task_init:
// Initialize OMPT fields maintained by a task. This will only be called after
// ompt_start_tool, so we already know whether ompt is enabled or not.

static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
  // The calls to __ompt_task_init already have the ompt_enabled condition.
  task->ompt_task_info.task_data.value = 0;
  task->ompt_task_info.frame.exit_frame = ompt_data_none;
  task->ompt_task_info.frame.enter_frame = ompt_data_none;
  task->ompt_task_info.frame.exit_frame_flags =
      ompt_frame_runtime | ompt_frame_framepointer;
  task->ompt_task_info.frame.enter_frame_flags =
      ompt_frame_runtime | ompt_frame_framepointer;
}

// __ompt_task_start:
// Build and trigger task-begin event
static inline void __ompt_task_start(kmp_task_t *task,
                                     kmp_taskdata_t *current_task,
                                     kmp_int32 gtid) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  ompt_task_status_t status = ompt_task_switch;
  // A prior taskyield leaves a flag on the thread; report 'yield' instead of
  // 'switch' and clear the flag.
  if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
    status = ompt_task_yield;
    __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
  }
  /* let OMPT know that we're about to run this task */
  if (ompt_enabled.ompt_callback_task_schedule) {
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        &(current_task->ompt_task_info.task_data), status,
        &(taskdata->ompt_task_info.task_data));
  }
  taskdata->ompt_task_info.scheduling_parent = current_task;
}

// __ompt_task_finish:
// Build and trigger final task-schedule event
static inline void __ompt_task_finish(kmp_task_t *task,
                                      kmp_taskdata_t *resumed_task,
                                      ompt_task_status_t status) {
  if (ompt_enabled.ompt_callback_task_schedule) {
    kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
    // Report cancellation status if this task's taskgroup was cancelled.
    if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
        taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
      status = ompt_task_cancel;
    }

    /* let OMPT know that we're returning to the callee task */
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        &(taskdata->ompt_task_info.task_data), status,
        (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
  }
}
#endif

// Common implementation for the if0 (undeferred) task-begin entry points;
// the ompt template parameter compiles away all OMPT code when false.
template <bool ompt>
static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
                                               kmp_task_t *task,
                                               void *frame_address,
                                               void *return_address) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
                "current_task=%p\n",
                gtid, loc_ref, taskdata, current_task));

  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
                  "incremented for task %p\n",
                  gtid, counter, taskdata));
  }

  taskdata->td_flags.task_serial =
      1; // Execute this task immediately, not deferred.
  __kmp_task_start(gtid, task, current_task);

#if OMPT_SUPPORT
  if (ompt) {
    if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
      current_task->ompt_task_info.frame.enter_frame.ptr =
          taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
      current_task->ompt_task_info.frame.enter_frame_flags =
          taskdata->ompt_task_info.frame.exit_frame_flags =
              ompt_frame_application | ompt_frame_framepointer;
    }
    if (ompt_enabled.ompt_callback_task_create) {
      ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
          &(parent_info->task_data), &(parent_info->frame),
          &(taskdata->ompt_task_info.task_data),
          ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
          return_address);
    }
    __ompt_task_start(task, current_task, gtid);
  }
#endif // OMPT_SUPPORT

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, taskdata));
}

#if OMPT_SUPPORT
OMPT_NOINLINE
static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                           kmp_task_t *task,
                                           void *frame_address,
                                           void *return_address) {
  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
                                           return_address);
}
#endif // OMPT_SUPPORT

// __kmpc_omp_task_begin_if0: report that a given serialized task has started
// execution
//
// loc_ref: source location information; points to beginning of task block.
// gtid: global thread number.
// task: task thunk for the started task.
682 void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid, 683 kmp_task_t *task) { 684 #if OMPT_SUPPORT 685 if (UNLIKELY(ompt_enabled.enabled)) { 686 OMPT_STORE_RETURN_ADDRESS(gtid); 687 __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task, 688 OMPT_GET_FRAME_ADDRESS(1), 689 OMPT_LOAD_RETURN_ADDRESS(gtid)); 690 return; 691 } 692 #endif 693 __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL); 694 } 695 696 #ifdef TASK_UNUSED 697 // __kmpc_omp_task_begin: report that a given task has started execution 698 // NEVER GENERATED BY COMPILER, DEPRECATED!!! 699 void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) { 700 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 701 702 KA_TRACE( 703 10, 704 ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n", 705 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task)); 706 707 __kmp_task_start(gtid, task, current_task); 708 709 KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid, 710 loc_ref, KMP_TASK_TO_TASKDATA(task))); 711 return; 712 } 713 #endif // TASK_UNUSED 714 715 // __kmp_free_task: free the current task space and the space for shareds 716 // 717 // gtid: Global thread ID of calling thread 718 // taskdata: task to free 719 // thread: thread data structure of caller 720 static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata, 721 kmp_info_t *thread) { 722 KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid, 723 taskdata)); 724 725 // Check to make sure all flags and counters have the correct values 726 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT); 727 KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0); 728 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1); 729 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0); 730 KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 || 731 taskdata->td_flags.task_serial == 1); 732 
KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0); 733 734 taskdata->td_flags.freed = 1; 735 // deallocate the taskdata and shared variable blocks associated with this task 736 #if USE_FAST_MEMORY 737 __kmp_fast_free(thread, taskdata); 738 #else /* ! USE_FAST_MEMORY */ 739 __kmp_thread_free(thread, taskdata); 740 #endif 741 KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata)); 742 } 743 744 // __kmp_free_task_and_ancestors: free the current task and ancestors without 745 // children 746 // 747 // gtid: Global thread ID of calling thread 748 // taskdata: task to free 749 // thread: thread data structure of caller 750 static void __kmp_free_task_and_ancestors(kmp_int32 gtid, 751 kmp_taskdata_t *taskdata, 752 kmp_info_t *thread) { 753 // Proxy tasks must always be allowed to free their parents 754 // because they can be run in background even in serial mode. 755 kmp_int32 team_serial = 756 (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) && 757 !taskdata->td_flags.proxy; 758 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT); 759 760 kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1; 761 KMP_DEBUG_ASSERT(children >= 0); 762 763 // Now, go up the ancestor tree to see if any ancestors can now be freed. 764 while (children == 0) { 765 kmp_taskdata_t *parent_taskdata = taskdata->td_parent; 766 767 KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete " 768 "and freeing itself\n", 769 gtid, taskdata)); 770 771 // --- Deallocate my ancestor task --- 772 __kmp_free_task(gtid, taskdata, thread); 773 774 taskdata = parent_taskdata; 775 776 if (team_serial) 777 return; 778 // Stop checking ancestors at implicit task instead of walking up ancestor 779 // tree to avoid premature deallocation of ancestors. 780 if (taskdata->td_flags.tasktype == TASK_IMPLICIT) { 781 if (taskdata->td_dephash) { // do we need to cleanup dephash? 
782 int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks); 783 kmp_tasking_flags_t flags_old = taskdata->td_flags; 784 if (children == 0 && flags_old.complete == 1) { 785 kmp_tasking_flags_t flags_new = flags_old; 786 flags_new.complete = 0; 787 if (KMP_COMPARE_AND_STORE_ACQ32( 788 RCAST(kmp_int32 *, &taskdata->td_flags), 789 *RCAST(kmp_int32 *, &flags_old), 790 *RCAST(kmp_int32 *, &flags_new))) { 791 KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans " 792 "dephash of implicit task %p\n", 793 gtid, taskdata)); 794 // cleanup dephash of finished implicit task 795 __kmp_dephash_free_entries(thread, taskdata->td_dephash); 796 } 797 } 798 } 799 return; 800 } 801 // Predecrement simulated by "- 1" calculation 802 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1; 803 KMP_DEBUG_ASSERT(children >= 0); 804 } 805 806 KA_TRACE( 807 20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; " 808 "not freeing it yet\n", 809 gtid, taskdata, children)); 810 } 811 812 // __kmp_task_finish: bookkeeping to do when a task finishes execution 813 // 814 // gtid: global thread ID for calling thread 815 // task: task to be finished 816 // resumed_task: task to be resumed. (may be NULL if task is serialized) 817 // 818 // template<ompt>: effectively ompt_enabled.enabled!=0 819 // the version with ompt=false is inlined, allowing to optimize away all ompt 820 // code in this case 821 template <bool ompt> 822 static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, 823 kmp_taskdata_t *resumed_task) { 824 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 825 kmp_info_t *thread = __kmp_threads[gtid]; 826 kmp_task_team_t *task_team = 827 thread->th.th_task_team; // might be NULL for serial teams... 
#if KMP_DEBUG
  // Child count is only tracked for the trace message in debug builds.
  kmp_int32 children = 0;
#endif
  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
                "task %p\n",
                gtid, taskdata, resumed_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

// Pop task from stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_pop_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
    // untied task needs to check the counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
    KA_TRACE(
        20,
        ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
         gtid, counter, taskdata));
    if (counter > 0) {
      // untied task is not done, to be continued possibly by other thread, do
      // not free it now
      if (resumed_task == NULL) {
        KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
        resumed_task = taskdata->td_parent; // In a serialized task, the resumed
        // task is the parent
      }
      thread->th.th_current_task = resumed_task; // restore current_task
      resumed_task->td_flags.executing = 1; // resume previous task
      KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
                    "resuming task %p\n",
                    gtid, taskdata, resumed_task));
      return;
    }
  }

  // bookkeeping for resuming task:
  // GEH - note tasking_ser => task_serial
  KMP_DEBUG_ASSERT(
      (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
      taskdata->td_flags.task_serial);
  if (taskdata->td_flags.task_serial) {
    if (resumed_task == NULL) {
      resumed_task = taskdata->td_parent; // In a serialized task, the resumed
      // task is the parent
    }
  } else {
    KMP_DEBUG_ASSERT(resumed_task !=
                     NULL); // verify that resumed task is passed as argument
  }

  /* If the tasks' destructor thunk flag has been set, we need to invoke the
     destructor thunk that has been generated by the compiler. The code is
     placed here, since at this point other tasks might have been released
     hence overlapping the destructor invocations with some other work in the
     released tasks. The OpenMP spec is not specific on when the destructors
     are invoked, so we should be free to choose. */
  if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
    kmp_routine_entry_t destr_thunk = task->data1.destructors;
    KMP_ASSERT(destr_thunk);
    destr_thunk(gtid, task);
  }

  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  bool detach = false;
  if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) {
    if (taskdata->td_allow_completion_event.type ==
        KMP_EVENT_ALLOW_COMPLETION) {
      // event hasn't been fulfilled yet. Try to detach task.
      // Re-check the event type under the lock: omp_fulfill_event may have
      // fulfilled it between the unlocked check above and here.
      __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
      if (taskdata->td_allow_completion_event.type ==
          KMP_EVENT_ALLOW_COMPLETION) {
        // task finished execution
        KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
        taskdata->td_flags.executing = 0; // suspend the finishing task

#if OMPT_SUPPORT
        // For a detached task, which is not completed, report task_detach
        // here; the later omp_fulfill_event signals completion.
        // Locking is necessary to avoid a race with ompt_task_late_fulfill.
        if (ompt)
          __ompt_task_finish(task, resumed_task, ompt_task_detach);
#endif

        // no access to taskdata after this point!
        // __kmp_fulfill_event might free taskdata at any time from now

        taskdata->td_flags.proxy = TASK_PROXY; // proxify!
        detach = true;
      }
      __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
    }
  }

  if (!detach) {
    taskdata->td_flags.complete = 1; // mark the task as completed

#if OMPT_SUPPORT
    // This is not a detached task, we are done here
    if (ompt)
      __ompt_task_finish(task, resumed_task, ompt_task_complete);
#endif
    // Only need to keep track of count if team parallel and tasking not
    // serialized, or task is detachable and event has already been fulfilled
    if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
        taskdata->td_flags.detachable == TASK_DETACHABLE ||
        taskdata->td_flags.hidden_helper) {
      __kmp_release_deps(gtid, taskdata);
      // Predecrement simulated by "- 1" calculation
#if KMP_DEBUG
      children = -1 +
#endif
          KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
      KMP_DEBUG_ASSERT(children >= 0);
      if (taskdata->td_taskgroup)
        KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
    } else if (task_team && (task_team->tt.tt_found_proxy_tasks ||
                             task_team->tt.tt_hidden_helper_task_encountered)) {
      // if we found proxy or hidden helper tasks there could exist a dependency
      // chain with the proxy task as origin
      __kmp_release_deps(gtid, taskdata);
    }
    // td_flags.executing must be marked as 0 after __kmp_release_deps has been
    // called. Otherwise, if a task is executed immediately from the
    // release_deps code, the flag will be reset to 1 again by this same
    // function
    KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
    taskdata->td_flags.executing = 0; // suspend the finishing task
  }

  KA_TRACE(
      20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
           gtid, taskdata, children));

  // Free this task and then ancestor tasks if they have no children.
  // Restore th_current_task first as suggested by John:
  // johnmc: if an asynchronous inquiry peers into the runtime system
  // it doesn't see the freed task as the current task.
  thread->th.th_current_task = resumed_task;
  if (!detach)
    __kmp_free_task_and_ancestors(gtid, taskdata, thread);

  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
  resumed_task->td_flags.executing = 1; // resume previous task

  KA_TRACE(
      10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
           gtid, taskdata, resumed_task));

  return;
}

// __kmpc_omp_task_complete_if0_template: shared implementation for the
// plain and OMPT entry points below; `ompt` selects whether OMPT frame
// bookkeeping is performed.
template <bool ompt>
static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
                                                  kmp_int32 gtid,
                                                  kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
  KMP_DEBUG_ASSERT(gtid >= 0);
  // this routine will provide task to resume
  __kmp_task_finish<ompt>(gtid, task, NULL);

  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));

#if OMPT_SUPPORT
  if (ompt) {
    ompt_frame_t *ompt_frame;
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    ompt_frame->enter_frame = ompt_data_none;
    ompt_frame->enter_frame_flags =
        ompt_frame_runtime | ompt_frame_framepointer;
  }
#endif

  return;
}

#if OMPT_SUPPORT
OMPT_NOINLINE
void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                       kmp_task_t *task) {
  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
}
#endif // OMPT_SUPPORT

// __kmpc_omp_task_complete_if0: report that a task has completed execution
//
// loc_ref: source location information; points to end of task block.
// gtid: global thread number.
// task: task thunk for the completed task.
// NOTE(review): name suggests this is the completion entry for undeferred
// ("if(0)") tasks executed inline by the encountering thread -- confirm
// against the compiler's codegen.
void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *task) {
#if OMPT_SUPPORT
  // Dispatch to the OMPT-instrumented instantiation only when a tool is
  // attached; otherwise fall through to the OMPT-free fast path.
  if (UNLIKELY(ompt_enabled.enabled)) {
    __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
    return;
  }
#endif
  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
}

#ifdef TASK_UNUSED
// __kmpc_omp_task_complete: report that a task has completed execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
                              kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));

  __kmp_task_finish<false>(gtid, task,
                           NULL); // Not sure how to find task to resume

  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED

// __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
// task for a given thread
//
// loc_ref: reference to source location of parallel region
// this_thr: thread data structure corresponding to implicit task
// team: team for this_thr
// tid: thread id of given thread within team
// set_curr_task: TRUE if need to push current task to thread
// NOTE: Routine does not set up the implicit task ICVS. This is assumed to
// have already been done elsewhere.
// TODO: Get better loc_ref.
Value passed in may be NULL 1068 void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr, 1069 kmp_team_t *team, int tid, int set_curr_task) { 1070 kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid]; 1071 1072 KF_TRACE( 1073 10, 1074 ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n", 1075 tid, team, task, set_curr_task ? "TRUE" : "FALSE")); 1076 1077 task->td_task_id = KMP_GEN_TASK_ID(); 1078 task->td_team = team; 1079 // task->td_parent = NULL; // fix for CQ230101 (broken parent task info 1080 // in debugger) 1081 task->td_ident = loc_ref; 1082 task->td_taskwait_ident = NULL; 1083 task->td_taskwait_counter = 0; 1084 task->td_taskwait_thread = 0; 1085 1086 task->td_flags.tiedness = TASK_TIED; 1087 task->td_flags.tasktype = TASK_IMPLICIT; 1088 task->td_flags.proxy = TASK_FULL; 1089 1090 // All implicit tasks are executed immediately, not deferred 1091 task->td_flags.task_serial = 1; 1092 task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec); 1093 task->td_flags.team_serial = (team->t.t_serialized) ? 
1 : 0; 1094 1095 task->td_flags.started = 1; 1096 task->td_flags.executing = 1; 1097 task->td_flags.complete = 0; 1098 task->td_flags.freed = 0; 1099 1100 task->td_depnode = NULL; 1101 task->td_last_tied = task; 1102 task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED; 1103 1104 if (set_curr_task) { // only do this init first time thread is created 1105 KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0); 1106 // Not used: don't need to deallocate implicit task 1107 KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0); 1108 task->td_taskgroup = NULL; // An implicit task does not have taskgroup 1109 task->td_dephash = NULL; 1110 __kmp_push_current_task_to_thread(this_thr, team, tid); 1111 } else { 1112 KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0); 1113 KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0); 1114 } 1115 1116 #if OMPT_SUPPORT 1117 if (UNLIKELY(ompt_enabled.enabled)) 1118 __ompt_task_init(task, tid); 1119 #endif 1120 1121 KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid, 1122 team, task)); 1123 } 1124 1125 // __kmp_finish_implicit_task: Release resources associated to implicit tasks 1126 // at the end of parallel regions. Some resources are kept for reuse in the next 1127 // parallel region. 
//
// thread: thread data structure corresponding to implicit task
void __kmp_finish_implicit_task(kmp_info_t *thread) {
  kmp_taskdata_t *task = thread->th.th_current_task;
  if (task->td_dephash) {
    int children;
    // Mark the implicit task complete so the CAS below can transition
    // complete 1 -> 0; the winner of that transition cleans the dephash.
    task->td_flags.complete = 1;
    children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
    kmp_tasking_flags_t flags_old = task->td_flags;
    if (children == 0 && flags_old.complete == 1) {
      kmp_tasking_flags_t flags_new = flags_old;
      flags_new.complete = 0;
      // Only the thread whose CAS succeeds frees the dephash entries,
      // preventing a double free when finishers race.
      if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
                                      *RCAST(kmp_int32 *, &flags_old),
                                      *RCAST(kmp_int32 *, &flags_new))) {
        KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
                       "dephash of implicit task %p\n",
                       thread->th.th_info.ds.ds_gtid, task));
        __kmp_dephash_free_entries(thread, task->td_dephash);
      }
    }
  }
}

// __kmp_free_implicit_task: Release resources associated to implicit tasks
// when these are destroyed
//
// thread: thread data structure corresponding to implicit task
void __kmp_free_implicit_task(kmp_info_t *thread) {
  kmp_taskdata_t *task = thread->th.th_current_task;
  // Unlike __kmp_finish_implicit_task (which frees only the entries for
  // reuse), this releases the dephash structure itself.
  if (task && task->td_dephash) {
    __kmp_dephash_free(thread, task->td_dephash);
    task->td_dephash = NULL;
  }
}

// Round up a size to a power of two specified by val: Used to insert padding
// between structures co-allocated using a single malloc() call
// NOTE(review): the mask arithmetic assumes val is a power of two -- confirm
// at call sites (currently called with sizeof(void *)).
static size_t __kmp_round_up_to_val(size_t size, size_t val) {
  if (size & (val - 1)) {
    size &= ~(val - 1);
    if (size <= KMP_SIZE_T_MAX - val) {
      size += val; // Round up if there is no overflow.
    }
  }
  return size;
} // __kmp_round_up_to_val

// __kmp_task_alloc: Allocate the taskdata and task data structures for a task
//
// loc_ref: source location information
// gtid: global thread number.
// flags: include tiedness & task type (explicit vs. implicit) of the ''new''
// task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
// sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
// private vars accessed in task.
// sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
// in task.
// task_entry: Pointer to task code entry point generated by compiler.
// returns: a pointer to the allocated kmp_task_t structure (task).
kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                             kmp_tasking_flags_t *flags,
                             size_t sizeof_kmp_task_t, size_t sizeof_shareds,
                             kmp_routine_entry_t task_entry) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_team_t *team = thread->th.th_team;
  kmp_taskdata_t *parent_task = thread->th.th_current_task;
  size_t shareds_offset;

  // Lazily bring the runtime (and, if requested, the hidden-helper
  // machinery) up before allocating the first task.
  if (UNLIKELY(!TCR_4(__kmp_init_middle)))
    __kmp_middle_initialize();

  if (flags->hidden_helper) {
    if (__kmp_enable_hidden_helper) {
      if (!TCR_4(__kmp_init_hidden_helper))
        __kmp_hidden_helper_initialize();
    } else {
      // If the hidden helper task is not enabled, reset the flag to FALSE.
      flags->hidden_helper = FALSE;
    }
  }

  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
                sizeof_shareds, task_entry));

  KMP_DEBUG_ASSERT(parent_task);
  // Children of a final task are themselves final (OpenMP final clause).
  if (parent_task->td_flags.final) {
    // NOTE(review): this inner if has an empty body and is dead code --
    // candidate for removal upstream.
    if (flags->merged_if0) {
    }
    flags->final = 1;
  }

  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
    // Untied task encountered causes the TSC algorithm to check entire deque of
    // the victim thread. If no untied task encountered, then checking the head
    // of the deque should be enough.
    KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
  }

  // Detachable tasks are not proxy tasks yet but could be in the future. Doing
  // the tasking setup
  // when that happens is too late.
  if (UNLIKELY(flags->proxy == TASK_PROXY ||
               flags->detachable == TASK_DETACHABLE || flags->hidden_helper)) {
    if (flags->proxy == TASK_PROXY) {
      flags->tiedness = TASK_UNTIED;
      flags->merged_if0 = 1;
    }
    /* are we running in a sequential parallel or tskm_immediate_exec... we need
       tasking support enabled */
    if ((thread->th.th_task_team) == NULL) {
      /* This should only happen if the team is serialized
          setup a task team and propagate it to the thread */
      KMP_DEBUG_ASSERT(team->t.t_serialized);
      KA_TRACE(30,
               ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
                gtid));
      // 1 indicates setup the current team regardless of nthreads
      __kmp_task_team_setup(thread, team, 1);
      thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
    }
    kmp_task_team_t *task_team = thread->th.th_task_team;

    /* tasking must be enabled now as the task might not be pushed */
    if (!KMP_TASKING_ENABLED(task_team)) {
      KA_TRACE(
          30,
          ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
      __kmp_enable_tasking(task_team, thread);
      kmp_int32 tid = thread->th.th_info.ds.ds_tid;
      kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
      // No lock needed since only owner can allocate
      if (thread_data->td.td_deque == NULL) {
        __kmp_alloc_task_deque(thread, thread_data);
      }
    }

    // Record on the task team that proxy / hidden helper tasks exist so
    // later dependence processing considers them.
    if ((flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) &&
        task_team->tt.tt_found_proxy_tasks == FALSE)
      TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
    if (flags->hidden_helper &&
        task_team->tt.tt_hidden_helper_task_encountered == FALSE)
      TCW_4(task_team->tt.tt_hidden_helper_task_encountered, TRUE);
  }

  // Calculate shared structure offset including padding after kmp_task_t struct
  // to align pointers in shared struct
  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));

  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
                shareds_offset));
  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
                sizeof_shareds));

// Avoid double allocation here by combining shareds with taskdata
#if USE_FAST_MEMORY
  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
                                                               sizeof_shareds);
#else /* ! USE_FAST_MEMORY */
  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
                                                               sizeof_shareds);
#endif /* USE_FAST_MEMORY */

  task = KMP_TASKDATA_TO_TASK(taskdata);

// Make sure task & taskdata are aligned appropriately
#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
#else
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
#endif
  if (sizeof_shareds > 0) {
    // Avoid double allocation here by combining shareds with taskdata
    task->shareds = &((char *)taskdata)[shareds_offset];
    // Make sure shareds struct is aligned to pointer size
    KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
                     0);
  } else {
    task->shareds = NULL;
  }
  task->routine = task_entry;
  task->part_id = 0; // AC: Always start with 0 part id

  taskdata->td_task_id = KMP_GEN_TASK_ID();
  taskdata->td_team = thread->th.th_team;
  taskdata->td_alloc_thread = thread;
  taskdata->td_parent = parent_task;
  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
  taskdata->td_ident = loc_ref;
  taskdata->td_taskwait_ident = NULL;
  taskdata->td_taskwait_counter = 0;
  taskdata->td_taskwait_thread = 0;
  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
  // avoid copying icvs for proxy tasks
  if (flags->proxy == TASK_FULL)
    copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);

  taskdata->td_flags = *flags;
  taskdata->td_task_team = thread->th.th_task_team;
  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
  taskdata->td_flags.tasktype = TASK_EXPLICIT;
  // If it is hidden helper task, we need to set the team and task team
  // correspondingly.
  if (flags->hidden_helper) {
    kmp_info_t *shadow_thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
    taskdata->td_team = shadow_thread->th.th_team;
    taskdata->td_task_team = shadow_thread->th.th_task_team;
  }

  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);

  // GEH - TODO: fix this to copy parent task's value of team_serial flag
  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  // GEH - Note we serialize the task if the team is serialized to make sure
  // implicit parallel region tasks are not left until program termination to
  // execute. Also, it helps locality to execute immediately.

  taskdata->td_flags.task_serial =
      (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
       taskdata->td_flags.tasking_ser || flags->merged_if0);

  taskdata->td_flags.started = 0;
  taskdata->td_flags.executing = 0;
  taskdata->td_flags.complete = 0;
  taskdata->td_flags.freed = 0;

  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
  // start at one because counts current task and children
  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
  taskdata->td_taskgroup =
      parent_task->td_taskgroup; // task inherits taskgroup from the parent task
  taskdata->td_dephash = NULL;
  taskdata->td_depnode = NULL;
  if (flags->tiedness == TASK_UNTIED)
    taskdata->td_last_tied = NULL; // will be set when the task is scheduled
  else
    taskdata->td_last_tied = taskdata;
  taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(taskdata, gtid);
#endif
  // Only need to keep track of child task counts if team parallel and tasking
  // not serialized or if it is a proxy or detachable or hidden helper task
  if (flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE ||
      flags->hidden_helper ||
      !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
    KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
    if (parent_task->td_taskgroup)
      KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
    // Only need to keep track of allocated child tasks for explicit tasks since
    // implicit not deallocated
    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
      KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
    }
    if (flags->hidden_helper) {
      taskdata->td_flags.task_serial = FALSE;
      // Increment the number of hidden helper tasks to be executed
      KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks);
    }
  }

  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
                gtid, taskdata, taskdata->td_parent));

  return task;
}

// __kmpc_omp_task_alloc: compiler-facing wrapper that reinterprets the
// kmp_int32 flags word as kmp_tasking_flags_t and delegates to
// __kmp_task_alloc.
kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_int32 flags, size_t sizeof_kmp_task_t,
                                  size_t sizeof_shareds,
                                  kmp_routine_entry_t task_entry) {
  kmp_task_t *retval;
  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
  __kmp_assert_valid_gtid(gtid);
  input_flags->native = FALSE;
  // __kmp_task_alloc() sets up all other runtime flags
  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, input_flags->tiedness ? "tied  " : "untied",
                input_flags->proxy ? "proxy" : "",
                input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
                sizeof_shareds, task_entry));

  retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
                            sizeof_shareds, task_entry);

  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));

  return retval;
}

// __kmpc_omp_target_task_alloc: allocation entry point for target tasks;
// forces untied tiedness per the specification and routes through the
// hidden helper machinery when enabled. device_id is currently unused here.
kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                                         kmp_int32 flags,
                                         size_t sizeof_kmp_task_t,
                                         size_t sizeof_shareds,
                                         kmp_routine_entry_t task_entry,
                                         kmp_int64 device_id) {
  auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
  // target task is untied defined in the specification
  input_flags.tiedness = TASK_UNTIED;

  if (__kmp_enable_hidden_helper)
    input_flags.hidden_helper = TRUE;

  return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
                               sizeof_shareds, task_entry);
}

/*!
@ingroup TASKING
@param loc_ref location of the original task directive
@param gtid Global Thread ID of encountering thread
@param new_task task thunk allocated by __kmpc_omp_task_alloc() for the ''new
task''
@param naffins Number of affinity items
@param affin_list List of affinity items
@return Returns non-zero if registering affinity information was not successful.
 Returns 0 if registration was successful
This entry registers the affinity information attached to a task with the task
thunk structure kmp_taskdata_t.
*/
kmp_int32
__kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *new_task, kmp_int32 naffins,
                                  kmp_task_affinity_info_t *affin_list) {
  // Affinity hints are currently ignored; registration always succeeds.
  return 0;
}

// __kmp_invoke_task: invoke the specified task
//
// gtid: global thread ID of caller
// task: the task to invoke
// current_task: the task to resume after task invocation
static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread;
  int discard = 0 /* false */;
  KA_TRACE(
      30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
           gtid, taskdata, current_task));
  KMP_DEBUG_ASSERT(task);
  if (UNLIKELY(taskdata->td_flags.proxy == TASK_PROXY &&
               taskdata->td_flags.complete == 1)) {
    // This is a proxy task that was already completed but it needs to run
    // its bottom-half finish
    KA_TRACE(
        30,
        ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
         gtid, taskdata));

    __kmp_bottom_half_finish_proxy(gtid, task);

    KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
                  "proxy task %p, resuming task %p\n",
                  gtid, taskdata, current_task));

    return;
  }

#if OMPT_SUPPORT
  // For untied tasks, the first task executed only calls __kmpc_omp_task and
  // does not execute code.
  ompt_thread_info_t oldInfo;
  if (UNLIKELY(ompt_enabled.enabled)) {
    // Store the threads states and restore them after the task
    thread = __kmp_threads[gtid];
    oldInfo = thread->th.ompt_thread_info;
    thread->th.ompt_thread_info.wait_id = 0;
    thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
                                            ? ompt_state_work_serial
                                            : ompt_state_work_parallel;
    taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
#endif

  // Decrement the counter of hidden helper tasks to be executed
  if (taskdata->td_flags.hidden_helper) {
    // Hidden helper tasks can only be executed by hidden helper threads
    KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid));
    KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks);
  }

  // Proxy tasks are not handled by the runtime
  if (taskdata->td_flags.proxy != TASK_PROXY) {
    __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
  }

  // TODO: cancel tasks if the parallel region has also been cancelled
  // TODO: check if this sequence can be hoisted above __kmp_task_start
  // if cancellation has been enabled for this run ...
  if (UNLIKELY(__kmp_omp_cancellation)) {
    thread = __kmp_threads[gtid];
    kmp_team_t *this_team = thread->th.th_team;
    kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
    if ((taskgroup && taskgroup->cancel_request) ||
        (this_team->t.t_cancel_request == cancel_parallel)) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
      ompt_data_t *task_data;
      if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
        __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
        ompt_callbacks.ompt_callback(ompt_callback_cancel)(
            task_data,
            ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
                                                      : ompt_cancel_parallel) |
                ompt_cancel_discarded_task,
            NULL);
      }
#endif
      KMP_COUNT_BLOCK(TASK_cancelled);
      // this task belongs to a task group and we need to cancel it
      discard = 1 /* true */;
    }
  }

  // Invoke the task routine and pass in relevant data.
  // Thunks generated by gcc take a different argument list.
  if (!discard) {
    if (taskdata->td_flags.tiedness == TASK_UNTIED) {
      taskdata->td_last_tied = current_task->td_last_tied;
      KMP_DEBUG_ASSERT(taskdata->td_last_tied);
    }
#if KMP_STATS_ENABLED
    KMP_COUNT_BLOCK(TASK_executed);
    // Attribute the task's runtime to the construct the thread was in when
    // it picked the task up.
    switch (KMP_GET_THREAD_STATE()) {
    case FORK_JOIN_BARRIER:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
      break;
    case PLAIN_BARRIER:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
      break;
    case TASKYIELD:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
      break;
    case TASKWAIT:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
      break;
    case TASKGROUP:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
      break;
    default:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
      break;
    }
#endif // KMP_STATS_ENABLED

// OMPT task begin
#if OMPT_SUPPORT
    if (UNLIKELY(ompt_enabled.enabled))
      __ompt_task_start(task, current_task, gtid);
#endif

#if OMPD_SUPPORT
    if (ompd_state & OMPD_ENABLE_BP)
      ompd_bp_task_begin();
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    kmp_uint64 cur_time;
    kmp_int32 kmp_itt_count_task =
        __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
        current_task->td_flags.tasktype == TASK_IMPLICIT;
    if (kmp_itt_count_task) {
      thread = __kmp_threads[gtid];
      // Time outer level explicit task on barrier for adjusting imbalance time
      if (thread->th.th_bar_arrive_time)
        cur_time = __itt_get_timestamp();
      else
        kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
    }
    KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
#endif

#ifdef KMP_GOMP_COMPAT
    if (taskdata->td_flags.native) {
      // GOMP-style thunk: single void* argument (the shareds block).
      ((void (*)(void *))(*(task->routine)))(task->shareds);
    } else
#endif /* KMP_GOMP_COMPAT */
    {
      (*(task->routine))(gtid, task);
    }
    KMP_POP_PARTITIONED_TIMER();

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (kmp_itt_count_task) {
      // Barrier imbalance - adjust arrive time with the task duration
      thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
    }
    KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed)
    KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent
#endif
  }

#if OMPD_SUPPORT
  if (ompd_state & OMPD_ENABLE_BP)
    ompd_bp_task_end();
#endif

  // Proxy tasks are not handled by the runtime
  if (taskdata->td_flags.proxy != TASK_PROXY) {
#if OMPT_SUPPORT
    if (UNLIKELY(ompt_enabled.enabled)) {
      thread->th.ompt_thread_info = oldInfo;
      if (taskdata->td_flags.tiedness == TASK_TIED) {
        taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
      }
      __kmp_task_finish<true>(gtid, task, current_task);
    } else
#endif
      __kmp_task_finish<false>(gtid, task, current_task);
  }

  KA_TRACE(
      30,
      ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
       gtid, taskdata, current_task));
  return;
}

// __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
//
// loc_ref: location of original task pragma (ignored)
// gtid: Global Thread ID of encountering thread
// new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
// Returns:
//    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
//    be resumed later.
1667 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1668 // resumed later. 1669 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid, 1670 kmp_task_t *new_task) { 1671 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1672 1673 KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid, 1674 loc_ref, new_taskdata)); 1675 1676 #if OMPT_SUPPORT 1677 kmp_taskdata_t *parent; 1678 if (UNLIKELY(ompt_enabled.enabled)) { 1679 parent = new_taskdata->td_parent; 1680 if (ompt_enabled.ompt_callback_task_create) { 1681 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 1682 &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame), 1683 &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0, 1684 OMPT_GET_RETURN_ADDRESS(0)); 1685 } 1686 } 1687 #endif 1688 1689 /* Should we execute the new task or queue it? For now, let's just always try 1690 to queue it. If the queue fills up, then we'll execute it. */ 1691 1692 if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer 1693 { // Execute this task immediately 1694 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 1695 new_taskdata->td_flags.task_serial = 1; 1696 __kmp_invoke_task(gtid, new_task, current_task); 1697 } 1698 1699 KA_TRACE( 1700 10, 1701 ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: " 1702 "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n", 1703 gtid, loc_ref, new_taskdata)); 1704 1705 #if OMPT_SUPPORT 1706 if (UNLIKELY(ompt_enabled.enabled)) { 1707 parent->ompt_task_info.frame.enter_frame = ompt_data_none; 1708 } 1709 #endif 1710 return TASK_CURRENT_NOT_QUEUED; 1711 } 1712 1713 // __kmp_omp_task: Schedule a non-thread-switchable task for execution 1714 // 1715 // gtid: Global Thread ID of encountering thread 1716 // new_task:non-thread-switchable task thunk allocated by __kmp_omp_task_alloc() 1717 // serialize_immediate: if TRUE then if the task is executed 
immediately its 1718 // execution will be serialized 1719 // Returns: 1720 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1721 // be resumed later. 1722 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1723 // resumed later. 1724 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task, 1725 bool serialize_immediate) { 1726 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1727 1728 /* Should we execute the new task or queue it? For now, let's just always try 1729 to queue it. If the queue fills up, then we'll execute it. */ 1730 if (new_taskdata->td_flags.proxy == TASK_PROXY || 1731 __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer 1732 { // Execute this task immediately 1733 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 1734 if (serialize_immediate) 1735 new_taskdata->td_flags.task_serial = 1; 1736 __kmp_invoke_task(gtid, new_task, current_task); 1737 } 1738 1739 return TASK_CURRENT_NOT_QUEUED; 1740 } 1741 1742 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a 1743 // non-thread-switchable task from the parent thread only! 1744 // 1745 // loc_ref: location of original task pragma (ignored) 1746 // gtid: Global Thread ID of encountering thread 1747 // new_task: non-thread-switchable task thunk allocated by 1748 // __kmp_omp_task_alloc() 1749 // Returns: 1750 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1751 // be resumed later. 1752 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1753 // resumed later. 
1754 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid, 1755 kmp_task_t *new_task) { 1756 kmp_int32 res; 1757 KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK); 1758 1759 #if KMP_DEBUG || OMPT_SUPPORT 1760 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1761 #endif 1762 KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref, 1763 new_taskdata)); 1764 __kmp_assert_valid_gtid(gtid); 1765 1766 #if OMPT_SUPPORT 1767 kmp_taskdata_t *parent = NULL; 1768 if (UNLIKELY(ompt_enabled.enabled)) { 1769 if (!new_taskdata->td_flags.started) { 1770 OMPT_STORE_RETURN_ADDRESS(gtid); 1771 parent = new_taskdata->td_parent; 1772 if (!parent->ompt_task_info.frame.enter_frame.ptr) { 1773 parent->ompt_task_info.frame.enter_frame.ptr = 1774 OMPT_GET_FRAME_ADDRESS(0); 1775 } 1776 if (ompt_enabled.ompt_callback_task_create) { 1777 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 1778 &(parent->ompt_task_info.task_data), 1779 &(parent->ompt_task_info.frame), 1780 &(new_taskdata->ompt_task_info.task_data), 1781 ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, 1782 OMPT_LOAD_RETURN_ADDRESS(gtid)); 1783 } 1784 } else { 1785 // We are scheduling the continuation of an UNTIED task. 1786 // Scheduling back to the parent task. 
1787 __ompt_task_finish(new_task, 1788 new_taskdata->ompt_task_info.scheduling_parent, 1789 ompt_task_switch); 1790 new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none; 1791 } 1792 } 1793 #endif 1794 1795 res = __kmp_omp_task(gtid, new_task, true); 1796 1797 KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning " 1798 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n", 1799 gtid, loc_ref, new_taskdata)); 1800 #if OMPT_SUPPORT 1801 if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) { 1802 parent->ompt_task_info.frame.enter_frame = ompt_data_none; 1803 } 1804 #endif 1805 return res; 1806 } 1807 1808 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule 1809 // a taskloop task with the correct OMPT return address 1810 // 1811 // loc_ref: location of original task pragma (ignored) 1812 // gtid: Global Thread ID of encountering thread 1813 // new_task: non-thread-switchable task thunk allocated by 1814 // __kmp_omp_task_alloc() 1815 // codeptr_ra: return address for OMPT callback 1816 // Returns: 1817 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1818 // be resumed later. 1819 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1820 // resumed later. 
1821 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid, 1822 kmp_task_t *new_task, void *codeptr_ra) { 1823 kmp_int32 res; 1824 KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK); 1825 1826 #if KMP_DEBUG || OMPT_SUPPORT 1827 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1828 #endif 1829 KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref, 1830 new_taskdata)); 1831 1832 #if OMPT_SUPPORT 1833 kmp_taskdata_t *parent = NULL; 1834 if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) { 1835 parent = new_taskdata->td_parent; 1836 if (!parent->ompt_task_info.frame.enter_frame.ptr) 1837 parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1838 if (ompt_enabled.ompt_callback_task_create) { 1839 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 1840 &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame), 1841 &(new_taskdata->ompt_task_info.task_data), 1842 ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, 1843 codeptr_ra); 1844 } 1845 } 1846 #endif 1847 1848 res = __kmp_omp_task(gtid, new_task, true); 1849 1850 KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning " 1851 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n", 1852 gtid, loc_ref, new_taskdata)); 1853 #if OMPT_SUPPORT 1854 if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) { 1855 parent->ompt_task_info.frame.enter_frame = ompt_data_none; 1856 } 1857 #endif 1858 return res; 1859 } 1860 1861 template <bool ompt> 1862 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid, 1863 void *frame_address, 1864 void *return_address) { 1865 kmp_taskdata_t *taskdata = nullptr; 1866 kmp_info_t *thread; 1867 int thread_finished = FALSE; 1868 KMP_SET_THREAD_STATE_BLOCK(TASKWAIT); 1869 1870 KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref)); 1871 KMP_DEBUG_ASSERT(gtid >= 0); 1872 1873 if (__kmp_tasking_mode != tskm_immediate_exec) { 1874 thread = __kmp_threads[gtid]; 
1875 taskdata = thread->th.th_current_task; 1876 1877 #if OMPT_SUPPORT && OMPT_OPTIONAL 1878 ompt_data_t *my_task_data; 1879 ompt_data_t *my_parallel_data; 1880 1881 if (ompt) { 1882 my_task_data = &(taskdata->ompt_task_info.task_data); 1883 my_parallel_data = OMPT_CUR_TEAM_DATA(thread); 1884 1885 taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address; 1886 1887 if (ompt_enabled.ompt_callback_sync_region) { 1888 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 1889 ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data, 1890 my_task_data, return_address); 1891 } 1892 1893 if (ompt_enabled.ompt_callback_sync_region_wait) { 1894 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 1895 ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data, 1896 my_task_data, return_address); 1897 } 1898 } 1899 #endif // OMPT_SUPPORT && OMPT_OPTIONAL 1900 1901 // Debugger: The taskwait is active. Store location and thread encountered the 1902 // taskwait. 1903 #if USE_ITT_BUILD 1904 // Note: These values are used by ITT events as well. 1905 #endif /* USE_ITT_BUILD */ 1906 taskdata->td_taskwait_counter += 1; 1907 taskdata->td_taskwait_ident = loc_ref; 1908 taskdata->td_taskwait_thread = gtid + 1; 1909 1910 #if USE_ITT_BUILD 1911 void *itt_sync_obj = NULL; 1912 #if USE_ITT_NOTIFY 1913 KMP_ITT_TASKWAIT_STARTING(itt_sync_obj); 1914 #endif /* USE_ITT_NOTIFY */ 1915 #endif /* USE_ITT_BUILD */ 1916 1917 bool must_wait = 1918 !taskdata->td_flags.team_serial && !taskdata->td_flags.final; 1919 1920 must_wait = must_wait || (thread->th.th_task_team != NULL && 1921 thread->th.th_task_team->tt.tt_found_proxy_tasks); 1922 // If hidden helper thread is encountered, we must enable wait here. 
1923 must_wait = 1924 must_wait || 1925 (__kmp_enable_hidden_helper && thread->th.th_task_team != NULL && 1926 thread->th.th_task_team->tt.tt_hidden_helper_task_encountered); 1927 1928 if (must_wait) { 1929 kmp_flag_32<false, false> flag( 1930 RCAST(std::atomic<kmp_uint32> *, 1931 &(taskdata->td_incomplete_child_tasks)), 1932 0U); 1933 while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) { 1934 flag.execute_tasks(thread, gtid, FALSE, 1935 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 1936 __kmp_task_stealing_constraint); 1937 } 1938 } 1939 #if USE_ITT_BUILD 1940 KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj); 1941 KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children 1942 #endif /* USE_ITT_BUILD */ 1943 1944 // Debugger: The taskwait is completed. Location remains, but thread is 1945 // negated. 1946 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; 1947 1948 #if OMPT_SUPPORT && OMPT_OPTIONAL 1949 if (ompt) { 1950 if (ompt_enabled.ompt_callback_sync_region_wait) { 1951 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 1952 ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data, 1953 my_task_data, return_address); 1954 } 1955 if (ompt_enabled.ompt_callback_sync_region) { 1956 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 1957 ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data, 1958 my_task_data, return_address); 1959 } 1960 taskdata->ompt_task_info.frame.enter_frame = ompt_data_none; 1961 } 1962 #endif // OMPT_SUPPORT && OMPT_OPTIONAL 1963 1964 } 1965 1966 KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, " 1967 "returning TASK_CURRENT_NOT_QUEUED\n", 1968 gtid, taskdata)); 1969 1970 return TASK_CURRENT_NOT_QUEUED; 1971 } 1972 1973 #if OMPT_SUPPORT && OMPT_OPTIONAL 1974 OMPT_NOINLINE 1975 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid, 1976 void *frame_address, 1977 void *return_address) { 1978 return 
__kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address, 1979 return_address); 1980 } 1981 #endif // OMPT_SUPPORT && OMPT_OPTIONAL 1982 1983 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are 1984 // complete 1985 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) { 1986 #if OMPT_SUPPORT && OMPT_OPTIONAL 1987 if (UNLIKELY(ompt_enabled.enabled)) { 1988 OMPT_STORE_RETURN_ADDRESS(gtid); 1989 return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0), 1990 OMPT_LOAD_RETURN_ADDRESS(gtid)); 1991 } 1992 #endif 1993 return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL); 1994 } 1995 1996 // __kmpc_omp_taskyield: switch to a different task 1997 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) { 1998 kmp_taskdata_t *taskdata = NULL; 1999 kmp_info_t *thread; 2000 int thread_finished = FALSE; 2001 2002 KMP_COUNT_BLOCK(OMP_TASKYIELD); 2003 KMP_SET_THREAD_STATE_BLOCK(TASKYIELD); 2004 2005 KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n", 2006 gtid, loc_ref, end_part)); 2007 __kmp_assert_valid_gtid(gtid); 2008 2009 if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) { 2010 thread = __kmp_threads[gtid]; 2011 taskdata = thread->th.th_current_task; 2012 // Should we model this as a task wait or not? 2013 // Debugger: The taskwait is active. Store location and thread encountered the 2014 // taskwait. 2015 #if USE_ITT_BUILD 2016 // Note: These values are used by ITT events as well. 
2017 #endif /* USE_ITT_BUILD */ 2018 taskdata->td_taskwait_counter += 1; 2019 taskdata->td_taskwait_ident = loc_ref; 2020 taskdata->td_taskwait_thread = gtid + 1; 2021 2022 #if USE_ITT_BUILD 2023 void *itt_sync_obj = NULL; 2024 #if USE_ITT_NOTIFY 2025 KMP_ITT_TASKWAIT_STARTING(itt_sync_obj); 2026 #endif /* USE_ITT_NOTIFY */ 2027 #endif /* USE_ITT_BUILD */ 2028 if (!taskdata->td_flags.team_serial) { 2029 kmp_task_team_t *task_team = thread->th.th_task_team; 2030 if (task_team != NULL) { 2031 if (KMP_TASKING_ENABLED(task_team)) { 2032 #if OMPT_SUPPORT 2033 if (UNLIKELY(ompt_enabled.enabled)) 2034 thread->th.ompt_thread_info.ompt_task_yielded = 1; 2035 #endif 2036 __kmp_execute_tasks_32( 2037 thread, gtid, (kmp_flag_32<> *)NULL, FALSE, 2038 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 2039 __kmp_task_stealing_constraint); 2040 #if OMPT_SUPPORT 2041 if (UNLIKELY(ompt_enabled.enabled)) 2042 thread->th.ompt_thread_info.ompt_task_yielded = 0; 2043 #endif 2044 } 2045 } 2046 } 2047 #if USE_ITT_BUILD 2048 KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj); 2049 #endif /* USE_ITT_BUILD */ 2050 2051 // Debugger: The taskwait is completed. Location remains, but thread is 2052 // negated. 2053 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; 2054 } 2055 2056 KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, " 2057 "returning TASK_CURRENT_NOT_QUEUED\n", 2058 gtid, taskdata)); 2059 2060 return TASK_CURRENT_NOT_QUEUED; 2061 } 2062 2063 // Task Reduction implementation 2064 // 2065 // Note: initial implementation didn't take into account the possibility 2066 // to specify omp_orig for initializer of the UDR (user defined reduction). 2067 // Corrected implementation takes into account the omp_orig object. 2068 // Compiler is free to use old implementation if omp_orig is not specified. 2069 2070 /*! 2071 @ingroup BASIC_TYPES 2072 @{ 2073 */ 2074 2075 /*! 2076 Flags for special info per task reduction item. 2077 */ 2078 typedef struct kmp_taskred_flags { 2079 /*! 
1 - use lazy alloc/init (e.g. big objects, #tasks < #threads) */ 2080 unsigned lazy_priv : 1; 2081 unsigned reserved31 : 31; 2082 } kmp_taskred_flags_t; 2083 2084 /*! 2085 Internal struct for reduction data item related info set up by compiler. 2086 */ 2087 typedef struct kmp_task_red_input { 2088 void *reduce_shar; /**< shared between tasks item to reduce into */ 2089 size_t reduce_size; /**< size of data item in bytes */ 2090 // three compiler-generated routines (init, fini are optional): 2091 void *reduce_init; /**< data initialization routine (single parameter) */ 2092 void *reduce_fini; /**< data finalization routine */ 2093 void *reduce_comb; /**< data combiner routine */ 2094 kmp_taskred_flags_t flags; /**< flags for additional info from compiler */ 2095 } kmp_task_red_input_t; 2096 2097 /*! 2098 Internal struct for reduction data item related info saved by the library. 2099 */ 2100 typedef struct kmp_taskred_data { 2101 void *reduce_shar; /**< shared between tasks item to reduce into */ 2102 size_t reduce_size; /**< size of data item */ 2103 kmp_taskred_flags_t flags; /**< flags for additional info from compiler */ 2104 void *reduce_priv; /**< array of thread specific items */ 2105 void *reduce_pend; /**< end of private data for faster comparison op */ 2106 // three compiler-generated routines (init, fini are optional): 2107 void *reduce_comb; /**< data combiner routine */ 2108 void *reduce_init; /**< data initialization routine (two parameters) */ 2109 void *reduce_fini; /**< data finalization routine */ 2110 void *reduce_orig; /**< original item (can be used in UDR initializer) */ 2111 } kmp_taskred_data_t; 2112 2113 /*! 2114 Internal struct for reduction data item related info set up by compiler. 2115 2116 New interface: added reduce_orig field to provide omp_orig for UDR initializer. 
2117 */ 2118 typedef struct kmp_taskred_input { 2119 void *reduce_shar; /**< shared between tasks item to reduce into */ 2120 void *reduce_orig; /**< original reduction item used for initialization */ 2121 size_t reduce_size; /**< size of data item */ 2122 // three compiler-generated routines (init, fini are optional): 2123 void *reduce_init; /**< data initialization routine (two parameters) */ 2124 void *reduce_fini; /**< data finalization routine */ 2125 void *reduce_comb; /**< data combiner routine */ 2126 kmp_taskred_flags_t flags; /**< flags for additional info from compiler */ 2127 } kmp_taskred_input_t; 2128 /*! 2129 @} 2130 */ 2131 2132 template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src); 2133 template <> 2134 void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item, 2135 kmp_task_red_input_t &src) { 2136 item.reduce_orig = NULL; 2137 } 2138 template <> 2139 void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item, 2140 kmp_taskred_input_t &src) { 2141 if (src.reduce_orig != NULL) { 2142 item.reduce_orig = src.reduce_orig; 2143 } else { 2144 item.reduce_orig = src.reduce_shar; 2145 } // non-NULL reduce_orig means new interface used 2146 } 2147 2148 template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, size_t j); 2149 template <> 2150 void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item, 2151 size_t offset) { 2152 ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset); 2153 } 2154 template <> 2155 void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item, 2156 size_t offset) { 2157 ((void (*)(void *, void *))item.reduce_init)( 2158 (char *)(item.reduce_priv) + offset, item.reduce_orig); 2159 } 2160 2161 template <typename T> 2162 void *__kmp_task_reduction_init(int gtid, int num, T *data) { 2163 __kmp_assert_valid_gtid(gtid); 2164 kmp_info_t *thread = __kmp_threads[gtid]; 2165 kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup; 2166 
kmp_uint32 nth = thread->th.th_team_nproc; 2167 kmp_taskred_data_t *arr; 2168 2169 // check input data just in case 2170 KMP_ASSERT(tg != NULL); 2171 KMP_ASSERT(data != NULL); 2172 KMP_ASSERT(num > 0); 2173 if (nth == 1) { 2174 KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n", 2175 gtid, tg)); 2176 return (void *)tg; 2177 } 2178 KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n", 2179 gtid, tg, num)); 2180 arr = (kmp_taskred_data_t *)__kmp_thread_malloc( 2181 thread, num * sizeof(kmp_taskred_data_t)); 2182 for (int i = 0; i < num; ++i) { 2183 size_t size = data[i].reduce_size - 1; 2184 // round the size up to cache line per thread-specific item 2185 size += CACHE_LINE - size % CACHE_LINE; 2186 KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory 2187 arr[i].reduce_shar = data[i].reduce_shar; 2188 arr[i].reduce_size = size; 2189 arr[i].flags = data[i].flags; 2190 arr[i].reduce_comb = data[i].reduce_comb; 2191 arr[i].reduce_init = data[i].reduce_init; 2192 arr[i].reduce_fini = data[i].reduce_fini; 2193 __kmp_assign_orig<T>(arr[i], data[i]); 2194 if (!arr[i].flags.lazy_priv) { 2195 // allocate cache-line aligned block and fill it with zeros 2196 arr[i].reduce_priv = __kmp_allocate(nth * size); 2197 arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size; 2198 if (arr[i].reduce_init != NULL) { 2199 // initialize all thread-specific items 2200 for (size_t j = 0; j < nth; ++j) { 2201 __kmp_call_init<T>(arr[i], j * size); 2202 } 2203 } 2204 } else { 2205 // only allocate space for pointers now, 2206 // objects will be lazily allocated/initialized if/when requested 2207 // note that __kmp_allocate zeroes the allocated memory 2208 arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *)); 2209 } 2210 } 2211 tg->reduce_data = (void *)arr; 2212 tg->reduce_num_data = num; 2213 return (void *)tg; 2214 } 2215 2216 /*! 
2217 @ingroup TASKING 2218 @param gtid Global thread ID 2219 @param num Number of data items to reduce 2220 @param data Array of data for reduction 2221 @return The taskgroup identifier 2222 2223 Initialize task reduction for the taskgroup. 2224 2225 Note: this entry supposes the optional compiler-generated initializer routine 2226 has single parameter - pointer to object to be initialized. That means 2227 the reduction either does not use omp_orig object, or the omp_orig is accessible 2228 without help of the runtime library. 2229 */ 2230 void *__kmpc_task_reduction_init(int gtid, int num, void *data) { 2231 return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data); 2232 } 2233 2234 /*! 2235 @ingroup TASKING 2236 @param gtid Global thread ID 2237 @param num Number of data items to reduce 2238 @param data Array of data for reduction 2239 @return The taskgroup identifier 2240 2241 Initialize task reduction for the taskgroup. 2242 2243 Note: this entry supposes the optional compiler-generated initializer routine 2244 has two parameters, pointer to object to be initialized and pointer to omp_orig 2245 */ 2246 void *__kmpc_taskred_init(int gtid, int num, void *data) { 2247 return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data); 2248 } 2249 2250 // Copy task reduction data (except for shared pointers). 
2251 template <typename T> 2252 void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data, 2253 kmp_taskgroup_t *tg, void *reduce_data) { 2254 kmp_taskred_data_t *arr; 2255 KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p," 2256 " from data %p\n", 2257 thr, tg, reduce_data)); 2258 arr = (kmp_taskred_data_t *)__kmp_thread_malloc( 2259 thr, num * sizeof(kmp_taskred_data_t)); 2260 // threads will share private copies, thunk routines, sizes, flags, etc.: 2261 KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t)); 2262 for (int i = 0; i < num; ++i) { 2263 arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers 2264 } 2265 tg->reduce_data = (void *)arr; 2266 tg->reduce_num_data = num; 2267 } 2268 2269 /*! 2270 @ingroup TASKING 2271 @param gtid Global thread ID 2272 @param tskgrp The taskgroup ID (optional) 2273 @param data Shared location of the item 2274 @return The pointer to per-thread data 2275 2276 Get thread-specific location of data item 2277 */ 2278 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) { 2279 __kmp_assert_valid_gtid(gtid); 2280 kmp_info_t *thread = __kmp_threads[gtid]; 2281 kmp_int32 nth = thread->th.th_team_nproc; 2282 if (nth == 1) 2283 return data; // nothing to do 2284 2285 kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp; 2286 if (tg == NULL) 2287 tg = thread->th.th_current_task->td_taskgroup; 2288 KMP_ASSERT(tg != NULL); 2289 kmp_taskred_data_t *arr = (kmp_taskred_data_t *)(tg->reduce_data); 2290 kmp_int32 num = tg->reduce_num_data; 2291 kmp_int32 tid = thread->th.th_info.ds.ds_tid; 2292 2293 KMP_ASSERT(data != NULL); 2294 while (tg != NULL) { 2295 for (int i = 0; i < num; ++i) { 2296 if (!arr[i].flags.lazy_priv) { 2297 if (data == arr[i].reduce_shar || 2298 (data >= arr[i].reduce_priv && data < arr[i].reduce_pend)) 2299 return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size; 2300 } else { 2301 // check shared location first 2302 void **p_priv = (void 
**)(arr[i].reduce_priv); 2303 if (data == arr[i].reduce_shar) 2304 goto found; 2305 // check if we get some thread specific location as parameter 2306 for (int j = 0; j < nth; ++j) 2307 if (data == p_priv[j]) 2308 goto found; 2309 continue; // not found, continue search 2310 found: 2311 if (p_priv[tid] == NULL) { 2312 // allocate thread specific object lazily 2313 p_priv[tid] = __kmp_allocate(arr[i].reduce_size); 2314 if (arr[i].reduce_init != NULL) { 2315 if (arr[i].reduce_orig != NULL) { // new interface 2316 ((void (*)(void *, void *))arr[i].reduce_init)( 2317 p_priv[tid], arr[i].reduce_orig); 2318 } else { // old interface (single parameter) 2319 ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]); 2320 } 2321 } 2322 } 2323 return p_priv[tid]; 2324 } 2325 } 2326 tg = tg->parent; 2327 arr = (kmp_taskred_data_t *)(tg->reduce_data); 2328 num = tg->reduce_num_data; 2329 } 2330 KMP_ASSERT2(0, "Unknown task reduction item"); 2331 return NULL; // ERROR, this line never executed 2332 } 2333 2334 // Finalize task reduction. 
2335 // Called from __kmpc_end_taskgroup() 2336 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) { 2337 kmp_int32 nth = th->th.th_team_nproc; 2338 KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1 2339 kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data; 2340 kmp_int32 num = tg->reduce_num_data; 2341 for (int i = 0; i < num; ++i) { 2342 void *sh_data = arr[i].reduce_shar; 2343 void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini); 2344 void (*f_comb)(void *, void *) = 2345 (void (*)(void *, void *))(arr[i].reduce_comb); 2346 if (!arr[i].flags.lazy_priv) { 2347 void *pr_data = arr[i].reduce_priv; 2348 size_t size = arr[i].reduce_size; 2349 for (int j = 0; j < nth; ++j) { 2350 void *priv_data = (char *)pr_data + j * size; 2351 f_comb(sh_data, priv_data); // combine results 2352 if (f_fini) 2353 f_fini(priv_data); // finalize if needed 2354 } 2355 } else { 2356 void **pr_data = (void **)(arr[i].reduce_priv); 2357 for (int j = 0; j < nth; ++j) { 2358 if (pr_data[j] != NULL) { 2359 f_comb(sh_data, pr_data[j]); // combine results 2360 if (f_fini) 2361 f_fini(pr_data[j]); // finalize if needed 2362 __kmp_free(pr_data[j]); 2363 } 2364 } 2365 } 2366 __kmp_free(arr[i].reduce_priv); 2367 } 2368 __kmp_thread_free(th, arr); 2369 tg->reduce_data = NULL; 2370 tg->reduce_num_data = 0; 2371 } 2372 2373 // Cleanup task reduction data for parallel or worksharing, 2374 // do not touch task private data other threads still working with. 
2375 // Called from __kmpc_end_taskgroup() 2376 static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) { 2377 __kmp_thread_free(th, tg->reduce_data); 2378 tg->reduce_data = NULL; 2379 tg->reduce_num_data = 0; 2380 } 2381 2382 template <typename T> 2383 void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws, 2384 int num, T *data) { 2385 __kmp_assert_valid_gtid(gtid); 2386 kmp_info_t *thr = __kmp_threads[gtid]; 2387 kmp_int32 nth = thr->th.th_team_nproc; 2388 __kmpc_taskgroup(loc, gtid); // form new taskgroup first 2389 if (nth == 1) { 2390 KA_TRACE(10, 2391 ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n", 2392 gtid, thr->th.th_current_task->td_taskgroup)); 2393 return (void *)thr->th.th_current_task->td_taskgroup; 2394 } 2395 kmp_team_t *team = thr->th.th_team; 2396 void *reduce_data; 2397 kmp_taskgroup_t *tg; 2398 reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]); 2399 if (reduce_data == NULL && 2400 __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data, 2401 (void *)1)) { 2402 // single thread enters this block to initialize common reduction data 2403 KMP_DEBUG_ASSERT(reduce_data == NULL); 2404 // first initialize own data, then make a copy other threads can use 2405 tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data); 2406 reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t)); 2407 KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t)); 2408 // fini counters should be 0 at this point 2409 KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0); 2410 KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0); 2411 KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data); 2412 } else { 2413 while ( 2414 (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) == 2415 (void *)1) { // wait for task reduction initialization 2416 KMP_CPU_PAUSE(); 2417 } 2418 
KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here 2419 tg = thr->th.th_current_task->td_taskgroup; 2420 __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data); 2421 } 2422 return tg; 2423 } 2424 2425 /*! 2426 @ingroup TASKING 2427 @param loc Source location info 2428 @param gtid Global thread ID 2429 @param is_ws Is 1 if the reduction is for worksharing, 0 otherwise 2430 @param num Number of data items to reduce 2431 @param data Array of data for reduction 2432 @return The taskgroup identifier 2433 2434 Initialize task reduction for a parallel or worksharing. 2435 2436 Note: this entry supposes the optional compiler-generated initializer routine 2437 has single parameter - pointer to object to be initialized. That means 2438 the reduction either does not use omp_orig object, or the omp_orig is accessible 2439 without help of the runtime library. 2440 */ 2441 void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws, 2442 int num, void *data) { 2443 return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num, 2444 (kmp_task_red_input_t *)data); 2445 } 2446 2447 /*! 2448 @ingroup TASKING 2449 @param loc Source location info 2450 @param gtid Global thread ID 2451 @param is_ws Is 1 if the reduction is for worksharing, 0 otherwise 2452 @param num Number of data items to reduce 2453 @param data Array of data for reduction 2454 @return The taskgroup identifier 2455 2456 Initialize task reduction for a parallel or worksharing. 2457 2458 Note: this entry supposes the optional compiler-generated initializer routine 2459 has two parameters, pointer to object to be initialized and pointer to omp_orig 2460 */ 2461 void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num, 2462 void *data) { 2463 return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num, 2464 (kmp_taskred_input_t *)data); 2465 } 2466 2467 /*! 
@ingroup TASKING
@param loc Source location info
@param gtid Global thread ID
@param is_ws Is 1 if the reduction is for worksharing, 0 otherwise

Finalize task reduction for a parallel or worksharing.
*/
void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
  // The reduction modifier is implemented on top of an implicit taskgroup,
  // so finalizing it is just ending that taskgroup; is_ws is not needed here.
  __kmpc_end_taskgroup(loc, gtid);
}

// __kmpc_taskgroup: Start a new taskgroup
void __kmpc_taskgroup(ident_t *loc, int gtid) {
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = thread->th.th_current_task;
  kmp_taskgroup_t *tg_new =
      (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
  KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
  KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
  KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
  // Link the new taskgroup under the current one; the parent is restored in
  // __kmpc_end_taskgroup, so taskgroups nest as a stack per task.
  tg_new->parent = taskdata->td_taskgroup;
  tg_new->reduce_data = NULL;
  tg_new->reduce_num_data = 0;
  tg_new->gomp_data = NULL;
  taskdata->td_taskgroup = tg_new;

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
    void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
    if (!codeptr)
      codeptr = OMPT_GET_RETURN_ADDRESS(0);
    kmp_team_t *team = thread->th.th_team;
    ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
    // FIXME: I think this is wrong for lwt!
    ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;

    ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
        ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
        &(my_task_data), codeptr);
  }
#endif
}

// __kmpc_end_taskgroup: Wait until all tasks generated by the current task
// and its descendants are complete
void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = thread->th.th_current_task;
  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
  int thread_finished = FALSE;

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_team_t *team;
  ompt_data_t my_task_data;
  ompt_data_t my_parallel_data;
  void *codeptr = nullptr;
  if (UNLIKELY(ompt_enabled.enabled)) {
    team = thread->th.th_team;
    my_task_data = taskdata->ompt_task_info.task_data;
    // FIXME: I think this is wrong for lwt!
    my_parallel_data = team->t.ompt_team_info.parallel_data;
    codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
    if (!codeptr)
      codeptr = OMPT_GET_RETURN_ADDRESS(0);
  }
#endif

  KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
  KMP_DEBUG_ASSERT(taskgroup != NULL);
  KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    // mark task as waiting not on a barrier
    taskdata->td_taskwait_counter += 1;
    taskdata->td_taskwait_ident = loc;
    taskdata->td_taskwait_thread = gtid + 1;
#if USE_ITT_BUILD
    // For ITT the taskgroup wait is similar to taskwait until we need to
    // distinguish them
    void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
    KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
#endif /* USE_ITT_NOTIFY */
#endif /* USE_ITT_BUILD */

#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
          &(my_task_data), codeptr);
    }
#endif

    // Execute other tasks while waiting for taskgroup->count (the number of
    // incomplete tasks in the group) to drain to zero. A serialized team can
    // skip the wait unless proxy or hidden-helper tasks may still complete
    // the count asynchronously.
    if (!taskdata->td_flags.team_serial ||
        (thread->th.th_task_team != NULL &&
         (thread->th.th_task_team->tt.tt_found_proxy_tasks ||
          thread->th.th_task_team->tt.tt_hidden_helper_task_encountered))) {
      kmp_flag_32<false, false> flag(
          RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 0U);
      while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
        flag.execute_tasks(thread, gtid, FALSE,
                           &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
                           __kmp_task_stealing_constraint);
      }
    }
    taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting

#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
          &(my_task_data), codeptr);
    }
#endif

#if USE_ITT_BUILD
    KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
    KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants
#endif /* USE_ITT_BUILD */
  }
  KMP_DEBUG_ASSERT(taskgroup->count == 0);

  if (taskgroup->reduce_data != NULL &&
      !taskgroup->gomp_data) { // need to reduce?
    int cnt;
    void *reduce_data;
    kmp_team_t *t = thread->th.th_team;
    kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data;
    // check if <priv> data of the first reduction variable shared for the team
    void *priv0 = arr[0].reduce_priv;
    if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
        ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
      // finishing task reduction on parallel
      cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
      if (cnt == thread->th.th_team_nproc - 1) {
        // we are the last thread passing __kmpc_reduction_modifier_fini()
        // finalize task reduction:
        __kmp_task_reduction_fini(thread, taskgroup);
        // cleanup fields in the team structure:
        // TODO: is relaxed store enough here (whole barrier should follow)?
        __kmp_thread_free(thread, reduce_data);
        KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
        KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
      } else {
        // we are not the last thread passing __kmpc_reduction_modifier_fini(),
        // so do not finalize reduction, just clean own copy of the data
        __kmp_task_reduction_clean(thread, taskgroup);
      }
    } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
                   NULL &&
               ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
      // finishing task reduction on worksharing
      cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
      if (cnt == thread->th.th_team_nproc - 1) {
        // we are the last thread passing __kmpc_reduction_modifier_fini()
        __kmp_task_reduction_fini(thread, taskgroup);
        // cleanup fields in team structure:
        // TODO: is relaxed store enough here (whole barrier should follow)?
        __kmp_thread_free(thread, reduce_data);
        KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
        KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
      } else {
        // we are not the last thread passing __kmpc_reduction_modifier_fini(),
        // so do not finalize reduction, just clean own copy of the data
        __kmp_task_reduction_clean(thread, taskgroup);
      }
    } else {
      // finishing task reduction on taskgroup
      __kmp_task_reduction_fini(thread, taskgroup);
    }
  }
  // Restore parent taskgroup for the current task
  taskdata->td_taskgroup = taskgroup->parent;
  __kmp_thread_free(thread, taskgroup);

  KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
                gtid, taskdata));

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
    ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
        ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
        &(my_task_data), codeptr);
  }
#endif
}

//
// __kmp_remove_my_task: remove a task from my own deque
static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
                                        kmp_task_team_t *task_team,
                                        kmp_int32 is_constrained) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_thread_data_t *thread_data;
  kmp_uint32 tail;

  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
                   NULL); // Caller should check this condition

  thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];

  KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
                gtid, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));

  // Cheap unlocked check first: avoid the lock when the deque looks empty.
  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
    KA_TRACE(10,
             ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
              "ntasks=%d head=%u tail=%u\n",
              gtid, thread_data->td.td_deque_ntasks,
              thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
    return NULL;
  }

  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);

  // Re-check under the lock; a stealer may have emptied the deque meanwhile.
  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
    __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
    KA_TRACE(10,
             ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
              "ntasks=%d head=%u tail=%u\n",
              gtid, thread_data->td.td_deque_ntasks,
              thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
    return NULL;
  }

  // The owner pops from the tail (thieves take from the head).
  tail = (thread_data->td.td_deque_tail - 1) &
         TASK_DEQUE_MASK(thread_data->td); // Wrap index.
  taskdata = thread_data->td.td_deque[tail];

  if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
                             thread->th.th_current_task)) {
    // The TSC does not allow to steal victim task
    __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
    KA_TRACE(10,
             ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
              "ntasks=%d head=%u tail=%u\n",
              gtid, thread_data->td.td_deque_ntasks,
              thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
    return NULL;
  }

  // Commit the pop: shrink the deque by one from the tail.
  thread_data->td.td_deque_tail = tail;
  TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);

  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
                "ntasks=%d head=%u tail=%u\n",
                gtid, taskdata, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));

  task = KMP_TASKDATA_TO_TASK(taskdata);
  return task;
}

// __kmp_steal_task: remove a task from another thread's deque
// Assume that calling thread has already checked existence of
// task_team thread_data before calling this routine.
static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
                                    kmp_task_team_t *task_team,
                                    std::atomic<kmp_int32> *unfinished_threads,
                                    int *thread_finished,
                                    kmp_int32 is_constrained) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_taskdata_t *current;
  kmp_thread_data_t *victim_td, *threads_data;
  kmp_int32 target;
  kmp_int32 victim_tid;

  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);

  threads_data = task_team->tt.tt_threads_data;
  KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition

  victim_tid = victim_thr->th.th_info.ds.ds_tid;
  victim_td = &threads_data[victim_tid];

  KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
                "task_team=%p ntasks=%d head=%u tail=%u\n",
                gtid, __kmp_gtid_from_thread(victim_thr), task_team,
                victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
                victim_td->td.td_deque_tail));

  // Cheap unlocked check first: skip the lock if the victim looks empty.
  if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
    KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
                  "task_team=%p ntasks=%d head=%u tail=%u\n",
                  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
                  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
                  victim_td->td.td_deque_tail));
    return NULL;
  }

  __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);

  int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
  // Check again after we acquire the lock
  if (ntasks == 0) {
    __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
    KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
                  "task_team=%p ntasks=%d head=%u tail=%u\n",
                  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
                  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
    return NULL;
  }

  KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);

  // Thieves take from the head (the victim's owner pops from the tail).
  current = __kmp_threads[gtid]->th.th_current_task;
  taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
    // Bump head pointer and Wrap.
    victim_td->td.td_deque_head =
        (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
  } else {
    if (!task_team->tt.tt_untied_task_encountered) {
      // The TSC does not allow to steal victim task
      __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
      KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
                    "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
                    gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
                    victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
      return NULL;
    }
    int i;
    // walk through victim's deque trying to steal any task
    target = victim_td->td.td_deque_head;
    taskdata = NULL;
    for (i = 1; i < ntasks; ++i) {
      target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
      taskdata = victim_td->td.td_deque[target];
      if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
        break; // found victim task
      } else {
        taskdata = NULL;
      }
    }
    if (taskdata == NULL) {
      // No appropriate candidate to steal found
      __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
      KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
                    "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
                    gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
                    victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
      return NULL;
    }
    // Took a task from the middle of the deque: close the hole by shifting
    // every later entry left by one, then pull the tail back.
    int prev = target;
    for (i = i + 1; i < ntasks; ++i) {
      // shift remaining tasks in the deque left by 1
      target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
      victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
      prev = target;
    }
    KMP_DEBUG_ASSERT(
        victim_td->td.td_deque_tail ==
        (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
    victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped))
  }
  if (*thread_finished) {
    // We need to un-mark this victim as a finished victim. This must be done
    // before releasing the lock, or else other threads (starting with the
    // primary thread victim) might be prematurely released from the barrier!!!
#if KMP_DEBUG
    kmp_int32 count =
#endif
        KMP_ATOMIC_INC(unfinished_threads);
    // NOTE(review): the "count + 1" below assumes KMP_ATOMIC_INC returns the
    // pre-increment value — confirm against kmp_os.h.
    KA_TRACE(
        20,
        ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
         gtid, count + 1, task_team));
    *thread_finished = FALSE;
  }
  TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);

  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);

  KMP_COUNT_BLOCK(TASK_stolen);
  KA_TRACE(10,
           ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
            "task_team=%p ntasks=%d head=%u tail=%u\n",
            gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
            ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));

  task = KMP_TASKDATA_TO_TASK(taskdata);
  return task;
}

// __kmp_execute_tasks_template: Choose and execute tasks until either the
// condition is statisfied (return true) or there are none left (return false).
//
// final_spin is TRUE if this is the spin at the release barrier.
// thread_finished indicates whether the thread is finished executing all
// the tasks it has on its deque, and is at the release barrier.
// spinner is the location on which to spin.
// spinner == NULL means only execute a single task and return.
// checker is the value to check to terminate the spin.
template <class C>
static inline int __kmp_execute_tasks_template(
    kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  kmp_task_team_t *task_team = thread->th.th_task_team;
  kmp_thread_data_t *threads_data;
  kmp_task_t *task;
  kmp_info_t *other_thread;
  kmp_taskdata_t *current_task = thread->th.th_current_task;
  std::atomic<kmp_int32> *unfinished_threads;
  // victim_tid == -2 means "no victim chosen yet"; -1 means "no last victim".
  kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
            tid = thread->th.th_info.ds.ds_tid;

  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);

  if (task_team == NULL || current_task == NULL)
    return FALSE;

  KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
                "*thread_finished=%d\n",
                gtid, final_spin, *thread_finished));

  thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);

  KMP_DEBUG_ASSERT(threads_data != NULL);

  nthreads = task_team->tt.tt_nproc;
  unfinished_threads = &(task_team->tt.tt_unfinished_threads);
  KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks ||
                   task_team->tt.tt_hidden_helper_task_encountered);
  KMP_DEBUG_ASSERT(*unfinished_threads >= 0);

  while (1) { // Outer loop keeps trying to find tasks in case of single thread
    // getting tasks from target constructs
    while (1) { // Inner loop to find a task and execute it
      task = NULL;
      if (use_own_tasks) { // check on own queue first
        task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
      }
      if ((task == NULL) && (nthreads > 1)) { // Steal a task
        int asleep = 1;
        use_own_tasks = 0;
        // Try to steal from the last place I stole from successfully.
        if (victim_tid == -2) { // haven't stolen anything yet
          victim_tid = threads_data[tid].td.td_deque_last_stolen;
          if (victim_tid !=
              -1) // if we have a last stolen from victim, get the thread
            other_thread = threads_data[victim_tid].td.td_thr;
        }
        if (victim_tid != -1) { // found last victim
          asleep = 0;
        } else if (!new_victim) { // no recent steals and we haven't already
          // used a new victim; select a random thread
          do { // Find a different thread to steal work from.
            // Pick a random thread. Initial plan was to cycle through all the
            // threads, and only return if we tried to steal from every thread,
            // and failed. Arch says that's not such a great idea.
            victim_tid = __kmp_get_random(thread) % (nthreads - 1);
            if (victim_tid >= tid) {
              ++victim_tid; // Adjusts random distribution to exclude self
            }
            // Found a potential victim
            other_thread = threads_data[victim_tid].td.td_thr;
            // There is a slight chance that __kmp_enable_tasking() did not wake
            // up all threads waiting at the barrier. If victim is sleeping,
            // then wake it up. Since we were going to pay the cache miss
            // penalty for referencing another thread's kmp_info_t struct
            // anyway,
            // the check shouldn't cost too much performance at this point. In
            // extra barrier mode, tasks do not sleep at the separate tasking
            // barrier, so this isn't a problem.
            asleep = 0;
            if ((__kmp_tasking_mode == tskm_task_teams) &&
                (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
                (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
                 NULL)) {
              asleep = 1;
              __kmp_null_resume_wrapper(other_thread);
              // A sleeping thread should not have any tasks on it's queue.
              // There is a slight possibility that it resumes, steals a task
              // from another thread, which spawns more tasks, all in the time
              // that it takes this thread to check => don't write an assertion
              // that the victim's queue is empty. Try stealing from a
              // different thread.
            }
          } while (asleep);
        }

        if (!asleep) {
          // We have a victim to try to steal from
          task = __kmp_steal_task(other_thread, gtid, task_team,
                                  unfinished_threads, thread_finished,
                                  is_constrained);
        }
        if (task != NULL) { // set last stolen to victim
          if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
            threads_data[tid].td.td_deque_last_stolen = victim_tid;
            // The pre-refactored code did not try more than 1 successful new
            // vicitm, unless the last one generated more local tasks;
            // new_victim keeps track of this
            new_victim = 1;
          }
        } else { // No tasks found; unset last_stolen
          KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
          victim_tid = -2; // no successful victim found
        }
      }

      if (task == NULL)
        break; // break out of tasking loop

// Found a task; execute it
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
        if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
          // get the object reliably
          itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
        }
        __kmp_itt_task_starting(itt_sync_obj);
      }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
      __kmp_invoke_task(gtid, task, current_task);
#if USE_ITT_BUILD
      if (itt_sync_obj != NULL)
        __kmp_itt_task_finished(itt_sync_obj);
#endif /* USE_ITT_BUILD */
      // If this thread is only partway through the barrier and the condition is
      // met, then return now, so that the barrier gather/release pattern can
      // proceed. If this thread is in the last spin loop in the barrier,
      // waiting to be released, we know that the termination condition will not
      // be satisfied, so don't waste any cycles checking it.
      if (flag == NULL || (!final_spin && flag->done_check())) {
        KA_TRACE(
            15,
            ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
             gtid));
        return TRUE;
      }
      if (thread->th.th_task_team == NULL) {
        break;
      }
      KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
      // If execution of a stolen task results in more tasks being placed on our
      // run queue, reset use_own_tasks
      if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
        KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
                      "other tasks, restart\n",
                      gtid));
        use_own_tasks = 1;
        new_victim = 0;
      }
    }

    // The task source has been exhausted. If in final spin loop of barrier,
    // check if termination condition is satisfied. The work queue may be empty
    // but there might be proxy tasks still executing.
    if (final_spin &&
        KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {
      // First, decrement the #unfinished threads, if that has not already been
      // done. This decrement might be to the spin location, and result in the
      // termination condition being satisfied.
      if (!*thread_finished) {
#if KMP_DEBUG
        kmp_int32 count = -1 +
#endif
            KMP_ATOMIC_DEC(unfinished_threads);
        KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
                      "unfinished_threads to %d task_team=%p\n",
                      gtid, count, task_team));
        *thread_finished = TRUE;
      }

      // It is now unsafe to reference thread->th.th_team !!!
      // Decrementing task_team->tt.tt_unfinished_threads can allow the primary
      // thread to pass through the barrier, where it might reset each thread's
      // th.th_team field for the next parallel region. If we can steal more
      // work, we know that this has not happened yet.
      if (flag != NULL && flag->done_check()) {
        KA_TRACE(
            15,
            ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
             gtid));
        return TRUE;
      }
    }

    // If this thread's task team is NULL, primary thread has recognized that
    // there are no more tasks; bail out
    if (thread->th.th_task_team == NULL) {
      KA_TRACE(15,
               ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
      return FALSE;
    }

    // We could be getting tasks from target constructs; if this is the only
    // thread, keep trying to execute tasks from own queue
    if (nthreads == 1 &&
        KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks))
      use_own_tasks = 1;
    else {
      KA_TRACE(15,
               ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
      return FALSE;
    }
  }
}

// Thin typed wrappers over __kmp_execute_tasks_template for each flag type.
template <bool C, bool S>
int __kmp_execute_tasks_32(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32<C, S> *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

template <bool C, bool S>
int __kmp_execute_tasks_64(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64<C, S> *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

template <bool C, bool S>
int __kmp_atomic_execute_tasks_64(
    kmp_info_t *thread, kmp_int32 gtid, kmp_atomic_flag_64<C, S> *flag,
    int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

int __kmp_execute_tasks_oncore(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

// Explicit instantiations for the <sleep-capable, signed> flag variants used
// elsewhere in the runtime (the wrappers are defined in this translation
// unit, so callers need these instantiated here).
template int
__kmp_execute_tasks_32<false, false>(kmp_info_t *, kmp_int32,
                                     kmp_flag_32<false, false> *, int,
                                     int *USE_ITT_BUILD_ARG(void *), kmp_int32);

template int __kmp_execute_tasks_64<false, true>(kmp_info_t *, kmp_int32,
                                                 kmp_flag_64<false, true> *,
                                                 int,
                                                 int *USE_ITT_BUILD_ARG(void *),
                                                 kmp_int32);

template int __kmp_execute_tasks_64<true, false>(kmp_info_t *, kmp_int32,
                                                 kmp_flag_64<true, false> *,
                                                 int,
                                                 int *USE_ITT_BUILD_ARG(void *),
                                                 kmp_int32);

template int __kmp_atomic_execute_tasks_64<false, true>(
    kmp_info_t *, kmp_int32, kmp_atomic_flag_64<false, true> *, int,
    int *USE_ITT_BUILD_ARG(void *), kmp_int32);

template int __kmp_atomic_execute_tasks_64<true, false>(
    kmp_info_t *, kmp_int32, kmp_atomic_flag_64<true, false> *, int,
    int *USE_ITT_BUILD_ARG(void *), kmp_int32);

// __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
// next barrier so they can assist in executing enqueued tasks.
// First thread in allocates the task team atomically.
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
                                 kmp_info_t *this_thr) {
  kmp_thread_data_t *threads_data;
  int nthreads, i, is_init_thread;

  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
                __kmp_gtid_from_thread(this_thr)));

  KMP_DEBUG_ASSERT(task_team != NULL);
  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);

  nthreads = task_team->tt.tt_nproc;
  KMP_DEBUG_ASSERT(nthreads > 0);
  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);

  // Allocate or increase the size of threads_data if necessary
  is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);

  if (!is_init_thread) {
    // Some other thread already set up the array.
    KA_TRACE(
        20,
        ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
         __kmp_gtid_from_thread(this_thr)));
    return;
  }
  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
  KMP_DEBUG_ASSERT(threads_data != NULL);

  if (__kmp_tasking_mode == tskm_task_teams &&
      (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
    // Release any threads sleeping at the barrier, so that they can steal
    // tasks and execute them. In extra barrier mode, tasks do not sleep
    // at the separate tasking barrier, so this isn't a problem.
    for (i = 0; i < nthreads; i++) {
      void *sleep_loc;
      kmp_info_t *thread = threads_data[i].td.td_thr;

      // Never try to wake ourselves.
      if (i == this_thr->th.th_info.ds.ds_tid) {
        continue;
      }
      // Since we haven't locked the thread's suspend mutex lock at this
      // point, there is a small window where a thread might be putting
      // itself to sleep, but hasn't set the th_sleep_loc field yet.
      // To work around this, __kmp_execute_tasks_template() periodically checks
      // see if other threads are sleeping (using the same random mechanism that
      // is used for task stealing) and awakens them if they are.
      if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
          NULL) {
        KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
                      __kmp_gtid_from_thread(this_thr),
                      __kmp_gtid_from_thread(thread)));
        __kmp_null_resume_wrapper(thread);
      } else {
        KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
                      __kmp_gtid_from_thread(this_thr),
                      __kmp_gtid_from_thread(thread)));
      }
    }
  }

  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
                __kmp_gtid_from_thread(this_thr)));
}

/* // TODO: Check the comment consistency
 * Utility routines for "task teams". A task team (kmp_task_team_t) is kind of
 * like a shadow of the kmp_team_t data struct, with a different lifetime.
 * After a child thread checks into a barrier and calls __kmp_release() from
 * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
 * longer assume that the kmp_team_t structure is intact (at any moment, the
 * primary thread may exit the barrier code and free the team data structure,
 * and return the threads to the thread pool).
 *
 * This does not work with the tasking code, as the thread is still
 * expected to participate in the execution of any tasks that may have been
 * spawned by a member of the team, and the thread still needs access to
 * each thread in the team, so that it can steal work from it.
 *
 * Enter the existence of the kmp_task_team_t struct. It employs a reference
 * counting mechanism, and is allocated by the primary thread before calling
 * __kmp_<barrier_kind>_release, and then is released by the last thread to
 * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
 * of the kmp_task_team_t structs for consecutive barriers can overlap
 * (and will, unless the primary thread is the last thread to exit the barrier
 * release phase, which is not typical). The existence of such a struct is
 * useful outside the context of tasking.
 *
 * We currently use the existence of the threads array as an indicator that
 * tasks were spawned since the last barrier. If the structure is to be
 * useful outside the context of tasking, then this will have to change, but
 * not setting the field minimizes the performance impact of tasking on
 * barriers, when no explicit tasks were spawned (pushed, actually).
 */

static kmp_task_team_t *__kmp_free_task_teams =
    NULL; // Free list for task_team data structures
// Lock for task team data structures
kmp_bootstrap_lock_t __kmp_task_team_lock =
    KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);

// __kmp_alloc_task_deque:
// Allocates a task deque for a particular thread, and initialize the necessary
// data structures relating to the deque. This only happens once per thread
// per task team since task teams are recycled. No lock is needed during
// allocation since each thread allocates its own deque.
3250 static void __kmp_alloc_task_deque(kmp_info_t *thread, 3251 kmp_thread_data_t *thread_data) { 3252 __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock); 3253 KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL); 3254 3255 // Initialize last stolen task field to "none" 3256 thread_data->td.td_deque_last_stolen = -1; 3257 3258 KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0); 3259 KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0); 3260 KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0); 3261 3262 KE_TRACE( 3263 10, 3264 ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n", 3265 __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data)); 3266 // Allocate space for task deque, and zero the deque 3267 // Cannot use __kmp_thread_calloc() because threads not around for 3268 // kmp_reap_task_team( ). 3269 thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate( 3270 INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *)); 3271 thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE; 3272 } 3273 3274 // __kmp_free_task_deque: 3275 // Deallocates a task deque for a particular thread. Happens at library 3276 // deallocation so don't need to reset all thread data fields. 
3277 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) { 3278 if (thread_data->td.td_deque != NULL) { 3279 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 3280 TCW_4(thread_data->td.td_deque_ntasks, 0); 3281 __kmp_free(thread_data->td.td_deque); 3282 thread_data->td.td_deque = NULL; 3283 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 3284 } 3285 3286 #ifdef BUILD_TIED_TASK_STACK 3287 // GEH: Figure out what to do here for td_susp_tied_tasks 3288 if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) { 3289 __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data); 3290 } 3291 #endif // BUILD_TIED_TASK_STACK 3292 } 3293 3294 // __kmp_realloc_task_threads_data: 3295 // Allocates a threads_data array for a task team, either by allocating an 3296 // initial array or enlarging an existing array. Only the first thread to get 3297 // the lock allocs or enlarges the array and re-initializes the array elements. 3298 // That thread returns "TRUE", the rest return "FALSE". 3299 // Assumes that the new array size is given by task_team -> tt.tt_nproc. 3300 // The current size is given by task_team -> tt.tt_max_threads. 3301 static int __kmp_realloc_task_threads_data(kmp_info_t *thread, 3302 kmp_task_team_t *task_team) { 3303 kmp_thread_data_t **threads_data_p; 3304 kmp_int32 nthreads, maxthreads; 3305 int is_init_thread = FALSE; 3306 3307 if (TCR_4(task_team->tt.tt_found_tasks)) { 3308 // Already reallocated and initialized. 3309 return FALSE; 3310 } 3311 3312 threads_data_p = &task_team->tt.tt_threads_data; 3313 nthreads = task_team->tt.tt_nproc; 3314 maxthreads = task_team->tt.tt_max_threads; 3315 3316 // All threads must lock when they encounter the first task of the implicit 3317 // task region to make sure threads_data fields are (re)initialized before 3318 // used. 
3319 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock); 3320 3321 if (!TCR_4(task_team->tt.tt_found_tasks)) { 3322 // first thread to enable tasking 3323 kmp_team_t *team = thread->th.th_team; 3324 int i; 3325 3326 is_init_thread = TRUE; 3327 if (maxthreads < nthreads) { 3328 3329 if (*threads_data_p != NULL) { 3330 kmp_thread_data_t *old_data = *threads_data_p; 3331 kmp_thread_data_t *new_data = NULL; 3332 3333 KE_TRACE( 3334 10, 3335 ("__kmp_realloc_task_threads_data: T#%d reallocating " 3336 "threads data for task_team %p, new_size = %d, old_size = %d\n", 3337 __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads)); 3338 // Reallocate threads_data to have more elements than current array 3339 // Cannot use __kmp_thread_realloc() because threads not around for 3340 // kmp_reap_task_team( ). Note all new array entries are initialized 3341 // to zero by __kmp_allocate(). 3342 new_data = (kmp_thread_data_t *)__kmp_allocate( 3343 nthreads * sizeof(kmp_thread_data_t)); 3344 // copy old data to new data 3345 KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t), 3346 (void *)old_data, maxthreads * sizeof(kmp_thread_data_t)); 3347 3348 #ifdef BUILD_TIED_TASK_STACK 3349 // GEH: Figure out if this is the right thing to do 3350 for (i = maxthreads; i < nthreads; i++) { 3351 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 3352 __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data); 3353 } 3354 #endif // BUILD_TIED_TASK_STACK 3355 // Install the new data and free the old data 3356 (*threads_data_p) = new_data; 3357 __kmp_free(old_data); 3358 } else { 3359 KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating " 3360 "threads data for task_team %p, size = %d\n", 3361 __kmp_gtid_from_thread(thread), task_team, nthreads)); 3362 // Make the initial allocate for threads_data array, and zero entries 3363 // Cannot use __kmp_thread_calloc() because threads not around for 3364 // kmp_reap_task_team( ). 
3365 *threads_data_p = (kmp_thread_data_t *)__kmp_allocate( 3366 nthreads * sizeof(kmp_thread_data_t)); 3367 #ifdef BUILD_TIED_TASK_STACK 3368 // GEH: Figure out if this is the right thing to do 3369 for (i = 0; i < nthreads; i++) { 3370 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 3371 __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data); 3372 } 3373 #endif // BUILD_TIED_TASK_STACK 3374 } 3375 task_team->tt.tt_max_threads = nthreads; 3376 } else { 3377 // If array has (more than) enough elements, go ahead and use it 3378 KMP_DEBUG_ASSERT(*threads_data_p != NULL); 3379 } 3380 3381 // initialize threads_data pointers back to thread_info structures 3382 for (i = 0; i < nthreads; i++) { 3383 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 3384 thread_data->td.td_thr = team->t.t_threads[i]; 3385 3386 if (thread_data->td.td_deque_last_stolen >= nthreads) { 3387 // The last stolen field survives across teams / barrier, and the number 3388 // of threads may have changed. It's possible (likely?) that a new 3389 // parallel region will exhibit the same behavior as previous region. 3390 thread_data->td.td_deque_last_stolen = -1; 3391 } 3392 } 3393 3394 KMP_MB(); 3395 TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE); 3396 } 3397 3398 __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock); 3399 return is_init_thread; 3400 } 3401 3402 // __kmp_free_task_threads_data: 3403 // Deallocates a threads_data array for a task team, including any attached 3404 // tasking deques. Only occurs at library shutdown. 
3405 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) { 3406 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock); 3407 if (task_team->tt.tt_threads_data != NULL) { 3408 int i; 3409 for (i = 0; i < task_team->tt.tt_max_threads; i++) { 3410 __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]); 3411 } 3412 __kmp_free(task_team->tt.tt_threads_data); 3413 task_team->tt.tt_threads_data = NULL; 3414 } 3415 __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock); 3416 } 3417 3418 // __kmp_allocate_task_team: 3419 // Allocates a task team associated with a specific team, taking it from 3420 // the global task team free list if possible. Also initializes data 3421 // structures. 3422 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread, 3423 kmp_team_t *team) { 3424 kmp_task_team_t *task_team = NULL; 3425 int nthreads; 3426 3427 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n", 3428 (thread ? __kmp_gtid_from_thread(thread) : -1), team)); 3429 3430 if (TCR_PTR(__kmp_free_task_teams) != NULL) { 3431 // Take a task team from the task team pool 3432 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 3433 if (__kmp_free_task_teams != NULL) { 3434 task_team = __kmp_free_task_teams; 3435 TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next); 3436 task_team->tt.tt_next = NULL; 3437 } 3438 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 3439 } 3440 3441 if (task_team == NULL) { 3442 KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating " 3443 "task team for team %p\n", 3444 __kmp_gtid_from_thread(thread), team)); 3445 // Allocate a new task team if one is not available. Cannot use 3446 // __kmp_thread_malloc because threads not around for kmp_reap_task_team. 
3447 task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t)); 3448 __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock); 3449 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG 3450 // suppress race conditions detection on synchronization flags in debug mode 3451 // this helps to analyze library internals eliminating false positives 3452 __itt_suppress_mark_range( 3453 __itt_suppress_range, __itt_suppress_threading_errors, 3454 &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks)); 3455 __itt_suppress_mark_range(__itt_suppress_range, 3456 __itt_suppress_threading_errors, 3457 CCAST(kmp_uint32 *, &task_team->tt.tt_active), 3458 sizeof(task_team->tt.tt_active)); 3459 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */ 3460 // Note: __kmp_allocate zeroes returned memory, othewise we would need: 3461 // task_team->tt.tt_threads_data = NULL; 3462 // task_team->tt.tt_max_threads = 0; 3463 // task_team->tt.tt_next = NULL; 3464 } 3465 3466 TCW_4(task_team->tt.tt_found_tasks, FALSE); 3467 TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); 3468 task_team->tt.tt_nproc = nthreads = team->t.t_nproc; 3469 3470 KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads); 3471 TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE); 3472 TCW_4(task_team->tt.tt_active, TRUE); 3473 3474 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p " 3475 "unfinished_threads init'd to %d\n", 3476 (thread ? __kmp_gtid_from_thread(thread) : -1), task_team, 3477 KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads))); 3478 return task_team; 3479 } 3480 3481 // __kmp_free_task_team: 3482 // Frees the task team associated with a specific thread, and adds it 3483 // to the global task team free list. 3484 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) { 3485 KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n", 3486 thread ? 
__kmp_gtid_from_thread(thread) : -1, task_team)); 3487 3488 // Put task team back on free list 3489 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 3490 3491 KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL); 3492 task_team->tt.tt_next = __kmp_free_task_teams; 3493 TCW_PTR(__kmp_free_task_teams, task_team); 3494 3495 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 3496 } 3497 3498 // __kmp_reap_task_teams: 3499 // Free all the task teams on the task team free list. 3500 // Should only be done during library shutdown. 3501 // Cannot do anything that needs a thread structure or gtid since they are 3502 // already gone. 3503 void __kmp_reap_task_teams(void) { 3504 kmp_task_team_t *task_team; 3505 3506 if (TCR_PTR(__kmp_free_task_teams) != NULL) { 3507 // Free all task_teams on the free list 3508 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 3509 while ((task_team = __kmp_free_task_teams) != NULL) { 3510 __kmp_free_task_teams = task_team->tt.tt_next; 3511 task_team->tt.tt_next = NULL; 3512 3513 // Free threads_data if necessary 3514 if (task_team->tt.tt_threads_data != NULL) { 3515 __kmp_free_task_threads_data(task_team); 3516 } 3517 __kmp_free(task_team); 3518 } 3519 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 3520 } 3521 } 3522 3523 // __kmp_wait_to_unref_task_teams: 3524 // Some threads could still be in the fork barrier release code, possibly 3525 // trying to steal tasks. Wait for each thread to unreference its task team. 3526 void __kmp_wait_to_unref_task_teams(void) { 3527 kmp_info_t *thread; 3528 kmp_uint32 spins; 3529 int done; 3530 3531 KMP_INIT_YIELD(spins); 3532 3533 for (;;) { 3534 done = TRUE; 3535 3536 // TODO: GEH - this may be is wrong because some sync would be necessary 3537 // in case threads are added to the pool during the traversal. Need to 3538 // verify that lock for thread pool is held when calling this routine. 
3539 for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL; 3540 thread = thread->th.th_next_pool) { 3541 #if KMP_OS_WINDOWS 3542 DWORD exit_val; 3543 #endif 3544 if (TCR_PTR(thread->th.th_task_team) == NULL) { 3545 KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n", 3546 __kmp_gtid_from_thread(thread))); 3547 continue; 3548 } 3549 #if KMP_OS_WINDOWS 3550 // TODO: GEH - add this check for Linux* OS / OS X* as well? 3551 if (!__kmp_is_thread_alive(thread, &exit_val)) { 3552 thread->th.th_task_team = NULL; 3553 continue; 3554 } 3555 #endif 3556 3557 done = FALSE; // Because th_task_team pointer is not NULL for this thread 3558 3559 KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to " 3560 "unreference task_team\n", 3561 __kmp_gtid_from_thread(thread))); 3562 3563 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 3564 void *sleep_loc; 3565 // If the thread is sleeping, awaken it. 3566 if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) != 3567 NULL) { 3568 KA_TRACE( 3569 10, 3570 ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n", 3571 __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread))); 3572 __kmp_null_resume_wrapper(thread); 3573 } 3574 } 3575 } 3576 if (done) { 3577 break; 3578 } 3579 3580 // If oversubscribed or have waited a bit, yield. 3581 KMP_YIELD_OVERSUB_ELSE_SPIN(spins); 3582 } 3583 } 3584 3585 // __kmp_task_team_setup: Create a task_team for the current team, but use 3586 // an already created, unused one if it already exists. 3587 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) { 3588 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 3589 3590 // If this task_team hasn't been created yet, allocate it. It will be used in 3591 // the region after the next. 3592 // If it exists, it is the current task team and shouldn't be touched yet as 3593 // it may still be in use. 
3594 if (team->t.t_task_team[this_thr->th.th_task_state] == NULL && 3595 (always || team->t.t_nproc > 1)) { 3596 team->t.t_task_team[this_thr->th.th_task_state] = 3597 __kmp_allocate_task_team(this_thr, team); 3598 KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p" 3599 " for team %d at parity=%d\n", 3600 __kmp_gtid_from_thread(this_thr), 3601 team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id, 3602 this_thr->th.th_task_state)); 3603 } 3604 3605 // After threads exit the release, they will call sync, and then point to this 3606 // other task_team; make sure it is allocated and properly initialized. As 3607 // threads spin in the barrier release phase, they will continue to use the 3608 // previous task_team struct(above), until they receive the signal to stop 3609 // checking for tasks (they can't safely reference the kmp_team_t struct, 3610 // which could be reallocated by the primary thread). No task teams are formed 3611 // for serialized teams. 3612 if (team->t.t_nproc > 1) { 3613 int other_team = 1 - this_thr->th.th_task_state; 3614 KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2); 3615 if (team->t.t_task_team[other_team] == NULL) { // setup other team as well 3616 team->t.t_task_team[other_team] = 3617 __kmp_allocate_task_team(this_thr, team); 3618 KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new " 3619 "task_team %p for team %d at parity=%d\n", 3620 __kmp_gtid_from_thread(this_thr), 3621 team->t.t_task_team[other_team], team->t.t_id, other_team)); 3622 } else { // Leave the old task team struct in place for the upcoming region; 3623 // adjust as needed 3624 kmp_task_team_t *task_team = team->t.t_task_team[other_team]; 3625 if (!task_team->tt.tt_active || 3626 team->t.t_nproc != task_team->tt.tt_nproc) { 3627 TCW_4(task_team->tt.tt_nproc, team->t.t_nproc); 3628 TCW_4(task_team->tt.tt_found_tasks, FALSE); 3629 TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); 3630 
KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, 3631 team->t.t_nproc); 3632 TCW_4(task_team->tt.tt_active, TRUE); 3633 } 3634 // if team size has changed, the first thread to enable tasking will 3635 // realloc threads_data if necessary 3636 KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team " 3637 "%p for team %d at parity=%d\n", 3638 __kmp_gtid_from_thread(this_thr), 3639 team->t.t_task_team[other_team], team->t.t_id, other_team)); 3640 } 3641 } 3642 3643 // For regular thread, task enabling should be called when the task is going 3644 // to be pushed to a dequeue. However, for the hidden helper thread, we need 3645 // it ahead of time so that some operations can be performed without race 3646 // condition. 3647 if (this_thr == __kmp_hidden_helper_main_thread) { 3648 for (int i = 0; i < 2; ++i) { 3649 kmp_task_team_t *task_team = team->t.t_task_team[i]; 3650 if (KMP_TASKING_ENABLED(task_team)) { 3651 continue; 3652 } 3653 __kmp_enable_tasking(task_team, this_thr); 3654 for (int j = 0; j < task_team->tt.tt_nproc; ++j) { 3655 kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[j]; 3656 if (thread_data->td.td_deque == NULL) { 3657 __kmp_alloc_task_deque(__kmp_hidden_helper_threads[j], thread_data); 3658 } 3659 } 3660 } 3661 } 3662 } 3663 3664 // __kmp_task_team_sync: Propagation of task team data from team to threads 3665 // which happens just after the release phase of a team barrier. This may be 3666 // called by any thread, but only for teams with # threads > 1. 3667 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) { 3668 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 3669 3670 // Toggle the th_task_state field, to switch which task_team this thread 3671 // refers to 3672 this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state); 3673 3674 // It is now safe to propagate the task team pointer from the team struct to 3675 // the current thread. 
3676 TCW_PTR(this_thr->th.th_task_team, 3677 team->t.t_task_team[this_thr->th.th_task_state]); 3678 KA_TRACE(20, 3679 ("__kmp_task_team_sync: Thread T#%d task team switched to task_team " 3680 "%p from Team #%d (parity=%d)\n", 3681 __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team, 3682 team->t.t_id, this_thr->th.th_task_state)); 3683 } 3684 3685 // __kmp_task_team_wait: Primary thread waits for outstanding tasks after the 3686 // barrier gather phase. Only called by primary thread if #threads in team > 1 3687 // or if proxy tasks were created. 3688 // 3689 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off 3690 // by passing in 0 optionally as the last argument. When wait is zero, primary 3691 // thread does not wait for unfinished_threads to reach 0. 3692 void __kmp_task_team_wait( 3693 kmp_info_t *this_thr, 3694 kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) { 3695 kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state]; 3696 3697 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 3698 KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team); 3699 3700 if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) { 3701 if (wait) { 3702 KA_TRACE(20, ("__kmp_task_team_wait: Primary T#%d waiting for all tasks " 3703 "(for unfinished_threads to reach 0) on task_team = %p\n", 3704 __kmp_gtid_from_thread(this_thr), task_team)); 3705 // Worker threads may have dropped through to release phase, but could 3706 // still be executing tasks. Wait here for tasks to complete. To avoid 3707 // memory contention, only primary thread checks termination condition. 3708 kmp_flag_32<false, false> flag( 3709 RCAST(std::atomic<kmp_uint32> *, 3710 &task_team->tt.tt_unfinished_threads), 3711 0U); 3712 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 3713 } 3714 // Deactivate the old task team, so that the worker threads will stop 3715 // referencing it while spinning. 
3716 KA_TRACE( 3717 20, 3718 ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: " 3719 "setting active to false, setting local and team's pointer to NULL\n", 3720 __kmp_gtid_from_thread(this_thr), task_team)); 3721 KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 || 3722 task_team->tt.tt_found_proxy_tasks == TRUE); 3723 TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE); 3724 KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0); 3725 TCW_SYNC_4(task_team->tt.tt_active, FALSE); 3726 KMP_MB(); 3727 3728 TCW_PTR(this_thr->th.th_task_team, NULL); 3729 } 3730 } 3731 3732 // __kmp_tasking_barrier: 3733 // This routine is called only when __kmp_tasking_mode == tskm_extra_barrier. 3734 // Internal function to execute all tasks prior to a regular barrier or a join 3735 // barrier. It is a full barrier itself, which unfortunately turns regular 3736 // barriers into double barriers and join barriers into 1 1/2 barriers. 3737 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) { 3738 std::atomic<kmp_uint32> *spin = RCAST( 3739 std::atomic<kmp_uint32> *, 3740 &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads); 3741 int flag = FALSE; 3742 KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier); 3743 3744 #if USE_ITT_BUILD 3745 KMP_FSYNC_SPIN_INIT(spin, NULL); 3746 #endif /* USE_ITT_BUILD */ 3747 kmp_flag_32<false, false> spin_flag(spin, 0U); 3748 while (!spin_flag.execute_tasks(thread, gtid, TRUE, 3749 &flag USE_ITT_BUILD_ARG(NULL), 0)) { 3750 #if USE_ITT_BUILD 3751 // TODO: What about itt_sync_obj?? 
3752 KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin)); 3753 #endif /* USE_ITT_BUILD */ 3754 3755 if (TCR_4(__kmp_global.g.g_done)) { 3756 if (__kmp_global.g.g_abort) 3757 __kmp_abort_thread(); 3758 break; 3759 } 3760 KMP_YIELD(TRUE); 3761 } 3762 #if USE_ITT_BUILD 3763 KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin)); 3764 #endif /* USE_ITT_BUILD */ 3765 } 3766 3767 // __kmp_give_task puts a task into a given thread queue if: 3768 // - the queue for that thread was created 3769 // - there's space in that queue 3770 // Because of this, __kmp_push_task needs to check if there's space after 3771 // getting the lock 3772 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task, 3773 kmp_int32 pass) { 3774 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 3775 kmp_task_team_t *task_team = taskdata->td_task_team; 3776 3777 KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n", 3778 taskdata, tid)); 3779 3780 // If task_team is NULL something went really bad... 
3781 KMP_DEBUG_ASSERT(task_team != NULL); 3782 3783 bool result = false; 3784 kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid]; 3785 3786 if (thread_data->td.td_deque == NULL) { 3787 // There's no queue in this thread, go find another one 3788 // We're guaranteed that at least one thread has a queue 3789 KA_TRACE(30, 3790 ("__kmp_give_task: thread %d has no queue while giving task %p.\n", 3791 tid, taskdata)); 3792 return result; 3793 } 3794 3795 if (TCR_4(thread_data->td.td_deque_ntasks) >= 3796 TASK_DEQUE_SIZE(thread_data->td)) { 3797 KA_TRACE( 3798 30, 3799 ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", 3800 taskdata, tid)); 3801 3802 // if this deque is bigger than the pass ratio give a chance to another 3803 // thread 3804 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass) 3805 return result; 3806 3807 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 3808 if (TCR_4(thread_data->td.td_deque_ntasks) >= 3809 TASK_DEQUE_SIZE(thread_data->td)) { 3810 // expand deque to push the task which is not allowed to execute 3811 __kmp_realloc_task_deque(thread, thread_data); 3812 } 3813 3814 } else { 3815 3816 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 3817 3818 if (TCR_4(thread_data->td.td_deque_ntasks) >= 3819 TASK_DEQUE_SIZE(thread_data->td)) { 3820 KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to " 3821 "thread %d.\n", 3822 taskdata, tid)); 3823 3824 // if this deque is bigger than the pass ratio give a chance to another 3825 // thread 3826 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass) 3827 goto release_and_exit; 3828 3829 __kmp_realloc_task_deque(thread, thread_data); 3830 } 3831 } 3832 3833 // lock is held here, and there is space in the deque 3834 3835 thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata; 3836 // Wrap index. 
3837 thread_data->td.td_deque_tail = 3838 (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td); 3839 TCW_4(thread_data->td.td_deque_ntasks, 3840 TCR_4(thread_data->td.td_deque_ntasks) + 1); 3841 3842 result = true; 3843 KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n", 3844 taskdata, tid)); 3845 3846 release_and_exit: 3847 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 3848 3849 return result; 3850 } 3851 3852 #define PROXY_TASK_FLAG 0x40000000 3853 /* The finish of the proxy tasks is divided in two pieces: 3854 - the top half is the one that can be done from a thread outside the team 3855 - the bottom half must be run from a thread within the team 3856 3857 In order to run the bottom half the task gets queued back into one of the 3858 threads of the team. Once the td_incomplete_child_task counter of the parent 3859 is decremented the threads can leave the barriers. So, the bottom half needs 3860 to be queued before the counter is decremented. The top half is therefore 3861 divided in two parts: 3862 - things that can be run before queuing the bottom half 3863 - things that must be run after queuing the bottom half 3864 3865 This creates a second race as the bottom half can free the task before the 3866 second top half is executed. To avoid this we use the 3867 td_incomplete_child_task of the proxy task to synchronize the top and bottom 3868 half. 
*/ 3869 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) { 3870 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT); 3871 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); 3872 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0); 3873 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0); 3874 3875 taskdata->td_flags.complete = 1; // mark the task as completed 3876 3877 if (taskdata->td_taskgroup) 3878 KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count); 3879 3880 // Create an imaginary children for this task so the bottom half cannot 3881 // release the task before we have completed the second top half 3882 KMP_ATOMIC_OR(&taskdata->td_incomplete_child_tasks, PROXY_TASK_FLAG); 3883 } 3884 3885 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) { 3886 #if KMP_DEBUG 3887 kmp_int32 children = 0; 3888 // Predecrement simulated by "- 1" calculation 3889 children = -1 + 3890 #endif 3891 KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks); 3892 KMP_DEBUG_ASSERT(children >= 0); 3893 3894 // Remove the imaginary children 3895 KMP_ATOMIC_AND(&taskdata->td_incomplete_child_tasks, ~PROXY_TASK_FLAG); 3896 } 3897 3898 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) { 3899 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 3900 kmp_info_t *thread = __kmp_threads[gtid]; 3901 3902 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); 3903 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 3904 1); // top half must run before bottom half 3905 3906 // We need to wait to make sure the top half is finished 3907 // Spinning here should be ok as this should happen quickly 3908 while ((KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) & 3909 PROXY_TASK_FLAG) > 0) 3910 ; 3911 3912 __kmp_release_deps(gtid, taskdata); 3913 __kmp_free_task_and_ancestors(gtid, taskdata, thread); 3914 } 3915 3916 /*! 
3917 @ingroup TASKING 3918 @param gtid Global Thread ID of encountering thread 3919 @param ptask Task which execution is completed 3920 3921 Execute the completion of a proxy task from a thread of that is part of the 3922 team. Run first and bottom halves directly. 3923 */ 3924 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) { 3925 KMP_DEBUG_ASSERT(ptask != NULL); 3926 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 3927 KA_TRACE( 3928 10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n", 3929 gtid, taskdata)); 3930 __kmp_assert_valid_gtid(gtid); 3931 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); 3932 3933 __kmp_first_top_half_finish_proxy(taskdata); 3934 __kmp_second_top_half_finish_proxy(taskdata); 3935 __kmp_bottom_half_finish_proxy(gtid, ptask); 3936 3937 KA_TRACE(10, 3938 ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n", 3939 gtid, taskdata)); 3940 } 3941 3942 void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start = 0) { 3943 KMP_DEBUG_ASSERT(ptask != NULL); 3944 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 3945 3946 // Enqueue task to complete bottom half completion from a thread within the 3947 // corresponding team 3948 kmp_team_t *team = taskdata->td_team; 3949 kmp_int32 nthreads = team->t.t_nproc; 3950 kmp_info_t *thread; 3951 3952 // This should be similar to start_k = __kmp_get_random( thread ) % nthreads 3953 // but we cannot use __kmp_get_random here 3954 kmp_int32 start_k = start % nthreads; 3955 kmp_int32 pass = 1; 3956 kmp_int32 k = start_k; 3957 3958 do { 3959 // For now we're just linearly trying to find a thread 3960 thread = team->t.t_threads[k]; 3961 k = (k + 1) % nthreads; 3962 3963 // we did a full pass through all the threads 3964 if (k == start_k) 3965 pass = pass << 1; 3966 3967 } while (!__kmp_give_task(thread, k, ptask, pass)); 3968 } 3969 3970 /*! 
3971 @ingroup TASKING 3972 @param ptask Task which execution is completed 3973 3974 Execute the completion of a proxy task from a thread that could not belong to 3975 the team. 3976 */ 3977 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) { 3978 KMP_DEBUG_ASSERT(ptask != NULL); 3979 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 3980 3981 KA_TRACE( 3982 10, 3983 ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n", 3984 taskdata)); 3985 3986 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); 3987 3988 __kmp_first_top_half_finish_proxy(taskdata); 3989 3990 __kmpc_give_task(ptask); 3991 3992 __kmp_second_top_half_finish_proxy(taskdata); 3993 3994 KA_TRACE( 3995 10, 3996 ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n", 3997 taskdata)); 3998 } 3999 4000 kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid, 4001 kmp_task_t *task) { 4002 kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task); 4003 if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) { 4004 td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION; 4005 td->td_allow_completion_event.ed.task = task; 4006 __kmp_init_tas_lock(&td->td_allow_completion_event.lock); 4007 } 4008 return &td->td_allow_completion_event; 4009 } 4010 4011 void __kmp_fulfill_event(kmp_event_t *event) { 4012 if (event->type == KMP_EVENT_ALLOW_COMPLETION) { 4013 kmp_task_t *ptask = event->ed.task; 4014 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 4015 bool detached = false; 4016 int gtid = __kmp_get_gtid(); 4017 4018 // The associated task might have completed or could be completing at this 4019 // point. 
4020 // We need to take the lock to avoid races 4021 __kmp_acquire_tas_lock(&event->lock, gtid); 4022 if (taskdata->td_flags.proxy == TASK_PROXY) { 4023 detached = true; 4024 } else { 4025 #if OMPT_SUPPORT 4026 // The OMPT event must occur under mutual exclusion, 4027 // otherwise the tool might access ptask after free 4028 if (UNLIKELY(ompt_enabled.enabled)) 4029 __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill); 4030 #endif 4031 } 4032 event->type = KMP_EVENT_UNINITIALIZED; 4033 __kmp_release_tas_lock(&event->lock, gtid); 4034 4035 if (detached) { 4036 #if OMPT_SUPPORT 4037 // We free ptask afterwards and know the task is finished, 4038 // so locking is not necessary 4039 if (UNLIKELY(ompt_enabled.enabled)) 4040 __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill); 4041 #endif 4042 // If the task detached complete the proxy task 4043 if (gtid >= 0) { 4044 kmp_team_t *team = taskdata->td_team; 4045 kmp_info_t *thread = __kmp_get_thread(); 4046 if (thread->th.th_team == team) { 4047 __kmpc_proxy_task_completed(gtid, ptask); 4048 return; 4049 } 4050 } 4051 4052 // fallback 4053 __kmpc_proxy_task_completed_ooo(ptask); 4054 } 4055 } 4056 } 4057 4058 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task 4059 // for taskloop 4060 // 4061 // thread: allocating thread 4062 // task_src: pointer to source task to be duplicated 4063 // returns: a pointer to the allocated kmp_task_t structure (task). 
kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
  kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task
  size_t shareds_offset;
  size_t task_size;

  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
                task_src));
  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
                   TASK_FULL); // it should not be proxy task
  KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
  // td_size_alloc covers the taskdata header, the kmp_task_t and the shareds,
  // so one allocation + one memcpy clones the whole task image.
  task_size = taskdata_src->td_size_alloc;

  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
  KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
                task_size));
#if USE_FAST_MEMORY
  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
#else
  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
#endif /* USE_FAST_MEMORY */
  // Bitwise copy of the source task; the fields that must differ in the
  // clone are patched individually below.
  KMP_MEMCPY(taskdata, taskdata_src, task_size);

  task = KMP_TASKDATA_TO_TASK(taskdata);

  // Initialize new task (only specific fields not affected by memcpy)
  taskdata->td_task_id = KMP_GEN_TASK_ID();
  if (task->shareds != NULL) { // need setup shareds pointer
    // shareds live inside the same allocation, so the clone's shareds sit at
    // the same relative offset from its own taskdata.
    shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
    task->shareds = &((char *)taskdata)[shareds_offset];
    // shareds must stay pointer-aligned after the offset arithmetic
    KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
                     0);
  }
  taskdata->td_alloc_thread = thread;
  taskdata->td_parent = parent_task;
  // task inherits the taskgroup from the parent task
  taskdata->td_taskgroup = parent_task->td_taskgroup;
  // tied task needs to initialize the td_last_tied at creation,
  // untied one does this when it is scheduled for execution
  if (taskdata->td_flags.tiedness == TASK_TIED)
    taskdata->td_last_tied = taskdata;

  // Only need to keep track of child task counts if team parallel and tasking
  // not serialized
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
    KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
    if (parent_task->td_taskgroup)
      KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
    // Only need to keep track of allocated child tasks for explicit tasks since
    // implicit not deallocated
    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
      KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
  }

  KA_TRACE(20,
           ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
            thread, taskdata, taskdata->td_parent));
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
#endif
  return task;
}

// Routine optionally generated by the compiler for setting the lastprivate flag
// and calling needed constructors for private/firstprivate objects
// (used to form taskloop tasks from pattern task)
// Parameters: dest task, src task, lastprivate flag.
typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);

// GOMP compatibility code below relies on loop bounds being stored as 'long'
// of either 4 or 8 bytes; rule out exotic platforms at compile time.
KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);

// class to encapsulate manipulating loop bounds in a taskloop task.
// this abstracts away the Intel vs GOMP taskloop interface for setting/getting
// the loop bound variables.
class kmp_taskloop_bounds_t {
  kmp_task_t *task; // task whose bounds are being accessed
  const kmp_taskdata_t *taskdata; // cached taskdata of 'task'
  size_t lower_offset; // byte offset of lb within the kmp_task_t
  size_t upper_offset; // byte offset of ub within the kmp_task_t

public:
  // Construct from the compiler-provided lb/ub pointers, which must point
  // inside the task structure (asserted below).
  kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
      : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
        lower_offset((char *)lb - (char *)task),
        upper_offset((char *)ub - (char *)task) {
    KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
    KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
  }
  // Construct for a new task reusing the offsets computed for another task
  // of identical layout (used when cloning taskloop chunk tasks).
  kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
      : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
        lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
  size_t get_lower_offset() const { return lower_offset; }
  size_t get_upper_offset() const { return upper_offset; }
  kmp_uint64 get_lb() const {
    kmp_int64 retval;
#if defined(KMP_GOMP_COMPAT)
    // Intel task just returns the lower bound normally
    if (!taskdata->td_flags.native) {
      retval = *(kmp_int64 *)((char *)task + lower_offset);
    } else {
      // GOMP task has to take into account the sizeof(long):
      // bounds are the first two longs of the shareds (lb at index 0).
      if (taskdata->td_size_loop_bounds == 4) {
        kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
        retval = (kmp_int64)*lb;
      } else {
        kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
        retval = (kmp_int64)*lb;
      }
    }
#else
    (void)taskdata;
    retval = *(kmp_int64 *)((char *)task + lower_offset);
#endif // defined(KMP_GOMP_COMPAT)
    return retval;
  }
  kmp_uint64 get_ub() const {
    kmp_int64 retval;
#if defined(KMP_GOMP_COMPAT)
    // Intel task just returns the upper bound normally
    if (!taskdata->td_flags.native) {
      retval = *(kmp_int64 *)((char *)task + upper_offset);
    } else {
      // GOMP task has to take into account the sizeof(long):
      // ub is the second long of the shareds (hence the +1).
      if (taskdata->td_size_loop_bounds == 4) {
        kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
        retval = (kmp_int64)*ub;
      } else {
        kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
        retval = (kmp_int64)*ub;
      }
    }
#else
    retval = *(kmp_int64 *)((char *)task + upper_offset);
#endif // defined(KMP_GOMP_COMPAT)
    return retval;
  }
  void set_lb(kmp_uint64 lb) {
#if defined(KMP_GOMP_COMPAT)
    // Intel task just sets the lower bound normally
    if (!taskdata->td_flags.native) {
      *(kmp_uint64 *)((char *)task + lower_offset) = lb;
    } else {
      // GOMP task has to take into account the sizeof(long)
      if (taskdata->td_size_loop_bounds == 4) {
        kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
        *lower = (kmp_uint32)lb;
      } else {
        kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
        *lower = (kmp_uint64)lb;
      }
    }
#else
    *(kmp_uint64 *)((char *)task + lower_offset) = lb;
#endif // defined(KMP_GOMP_COMPAT)
  }
  void set_ub(kmp_uint64 ub) {
#if defined(KMP_GOMP_COMPAT)
    // Intel task just sets the upper bound normally
    if (!taskdata->td_flags.native) {
      *(kmp_uint64 *)((char *)task + upper_offset) = ub;
    } else {
      // GOMP task has to take into account the sizeof(long)
      if (taskdata->td_size_loop_bounds == 4) {
        kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
        *upper = (kmp_uint32)ub;
      } else {
        kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
        *upper = (kmp_uint64)ub;
      }
    }
#else
    *(kmp_uint64 *)((char *)task + upper_offset) = ub;
#endif // defined(KMP_GOMP_COMPAT)
  }
};

// __kmp_taskloop_linear: Start tasks of the taskloop linearly
//
// loc        Source location information
// gtid       Global thread ID
// task       Pattern task, exposes the loop iteration range
// lb         Pointer to loop lower bound in task structure
// ub         Pointer to loop upper bound in task structure
// st         Loop stride
// ub_glob    Global upper bound (used for lastprivate check)
// num_tasks  Number of tasks to execute
// grainsize  Number of loop iterations per task
// extras     Number of chunks with grainsize+1 iterations
// last_chunk Reduction of grainsize for last task
// tc         Iterations count
// task_dup   Tasks duplication routine
// codeptr_ra Return address for OMPT events
void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
                           kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                           kmp_uint64 ub_glob, kmp_uint64 num_tasks,
                           kmp_uint64 grainsize, kmp_uint64 extras,
                           kmp_int64 last_chunk, kmp_uint64 tc,
#if OMPT_SUPPORT
                           void *codeptr_ra,
#endif
                           void *task_dup) {
  KMP_COUNT_BLOCK(OMP_TASKLOOP);
  KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
  // compiler provides global bounds here
  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
  kmp_uint64 lower = task_bounds.get_lb();
  kmp_uint64 upper = task_bounds.get_ub();
  kmp_uint64 i;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *current_task = thread->th.th_current_task;
  kmp_task_t *next_task;
  kmp_int32 lastpriv = 0;

  // sanity: the chunking parameters must exactly cover the trip count
  // (negative last_chunk means the last task is shortened in strict mode)
  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
                             (last_chunk < 0 ? last_chunk : extras));
  KMP_DEBUG_ASSERT(num_tasks > extras);
  KMP_DEBUG_ASSERT(num_tasks > 0);
  // NOTE(review): some KA_TRACE conversion specifiers here are narrower than
  // the 64-bit arguments they print (debug-only trace; kept as-is).
  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
                "extras %lld, last_chunk %lld, i=%lld,%lld(%d)%lld, dup %p\n",
                gtid, num_tasks, grainsize, extras, last_chunk, lower, upper,
                ub_glob, st, task_dup));

  // Launch num_tasks tasks, assign grainsize iterations each task
  for (i = 0; i < num_tasks; ++i) {
    kmp_uint64 chunk_minus_1;
    if (extras == 0) {
      chunk_minus_1 = grainsize - 1;
    } else {
      chunk_minus_1 = grainsize;
      --extras; // first extras iterations get bigger chunk (grainsize+1)
    }
    upper = lower + st * chunk_minus_1;
    // clamp to the pattern task's upper bound (last, possibly short, chunk)
    if (upper > *ub) {
      upper = *ub;
    }
    if (i == num_tasks - 1) {
      // schedule the last task, set lastprivate flag if needed
      if (st == 1) { // most common case
        KMP_DEBUG_ASSERT(upper == *ub);
        if (upper == ub_glob)
          lastpriv = 1;
      } else if (st > 0) { // positive loop stride
        KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
        if ((kmp_uint64)st > ub_glob - upper)
          lastpriv = 1;
      } else { // negative loop stride
        KMP_DEBUG_ASSERT(upper + st < *ub);
        if (upper - ub_glob < (kmp_uint64)(-st))
          lastpriv = 1;
      }
    }
    next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
    kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
    kmp_taskloop_bounds_t next_task_bounds =
        kmp_taskloop_bounds_t(next_task, task_bounds);

    // adjust task-specific bounds
    next_task_bounds.set_lb(lower);
    if (next_taskdata->td_flags.native) {
      // GOMP tasks use an exclusive upper bound, hence the +/-1 adjustment
      next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
    } else {
      next_task_bounds.set_ub(upper);
    }
    if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates,
                           // etc.
      ptask_dup(next_task, task, lastpriv);
    KA_TRACE(40,
             ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
              "upper %lld stride %lld, (offsets %p %p)\n",
              gtid, i, next_task, lower, upper, st,
              next_task_bounds.get_lower_offset(),
              next_task_bounds.get_upper_offset()));
#if OMPT_SUPPORT
    __kmp_omp_taskloop_task(NULL, gtid, next_task,
                            codeptr_ra); // schedule new task
#else
    __kmp_omp_task(gtid, next_task, true); // schedule new task
#endif
    lower = upper + st; // adjust lower bound for the next iteration
  }
  // free the pattern task and exit
  __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
  // do not execute the pattern task, just do internal bookkeeping
  __kmp_task_finish<false>(gtid, task, current_task);
}

// Structure to keep taskloop parameters for auxiliary task
// kept in the shareds of the task structure.
typedef struct __taskloop_params {
  kmp_task_t *task; // pattern task for the subrange
  kmp_uint64 *lb; // pointer to lb inside 'task'
  kmp_uint64 *ub; // pointer to ub inside 'task'
  void *task_dup; // compiler-generated duplication routine (may be NULL)
  kmp_int64 st; // loop stride
  kmp_uint64 ub_glob; // global upper bound, for lastprivate detection
  kmp_uint64 num_tasks; // tasks to create for this subrange
  kmp_uint64 grainsize; // iterations per task
  kmp_uint64 extras; // chunks getting one extra iteration
  kmp_int64 last_chunk; // reduction of last chunk (strict modifier)
  kmp_uint64 tc; // trip count of the subrange
  kmp_uint64 num_t_min; // threshold for recursive vs linear spawning
#if OMPT_SUPPORT
  void *codeptr_ra; // return address for OMPT events
#endif
} __taskloop_params_t;

void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
                          kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
                          kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64,
                          kmp_uint64,
#if OMPT_SUPPORT
                          void *,
#endif
                          void *);

// Execute part of the taskloop submitted as a task.
4384 int __kmp_taskloop_task(int gtid, void *ptask) { 4385 __taskloop_params_t *p = 4386 (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds; 4387 kmp_task_t *task = p->task; 4388 kmp_uint64 *lb = p->lb; 4389 kmp_uint64 *ub = p->ub; 4390 void *task_dup = p->task_dup; 4391 // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; 4392 kmp_int64 st = p->st; 4393 kmp_uint64 ub_glob = p->ub_glob; 4394 kmp_uint64 num_tasks = p->num_tasks; 4395 kmp_uint64 grainsize = p->grainsize; 4396 kmp_uint64 extras = p->extras; 4397 kmp_int64 last_chunk = p->last_chunk; 4398 kmp_uint64 tc = p->tc; 4399 kmp_uint64 num_t_min = p->num_t_min; 4400 #if OMPT_SUPPORT 4401 void *codeptr_ra = p->codeptr_ra; 4402 #endif 4403 #if KMP_DEBUG 4404 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 4405 KMP_DEBUG_ASSERT(task != NULL); 4406 KA_TRACE(20, 4407 ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize" 4408 " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n", 4409 gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub, 4410 st, task_dup)); 4411 #endif 4412 KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min); 4413 if (num_tasks > num_t_min) 4414 __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks, 4415 grainsize, extras, last_chunk, tc, num_t_min, 4416 #if OMPT_SUPPORT 4417 codeptr_ra, 4418 #endif 4419 task_dup); 4420 else 4421 __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks, 4422 grainsize, extras, last_chunk, tc, 4423 #if OMPT_SUPPORT 4424 codeptr_ra, 4425 #endif 4426 task_dup); 4427 4428 KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid)); 4429 return 0; 4430 } 4431 4432 // Schedule part of the taskloop as a task, 4433 // execute the rest of the taskloop. 
//
// loc        Source location information
// gtid       Global thread ID
// task       Pattern task, exposes the loop iteration range
// lb         Pointer to loop lower bound in task structure
// ub         Pointer to loop upper bound in task structure
// st         Loop stride
// ub_glob    Global upper bound (used for lastprivate check)
// num_tasks  Number of tasks to execute
// grainsize  Number of loop iterations per task
// extras     Number of chunks with grainsize+1 iterations
// last_chunk Reduction of grainsize for last task
// tc         Iterations count
// num_t_min  Threshold to launch tasks recursively
// task_dup   Tasks duplication routine
// codeptr_ra Return address for OMPT events
void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
                          kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                          kmp_uint64 ub_glob, kmp_uint64 num_tasks,
                          kmp_uint64 grainsize, kmp_uint64 extras,
                          kmp_int64 last_chunk, kmp_uint64 tc,
                          kmp_uint64 num_t_min,
#if OMPT_SUPPORT
                          void *codeptr_ra,
#endif
                          void *task_dup) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  KMP_DEBUG_ASSERT(task != NULL);
  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
  KA_TRACE(20,
           ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
            " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
            gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
            st, task_dup));
  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
  kmp_uint64 lower = *lb;
  kmp_info_t *thread = __kmp_threads[gtid];
  // kmp_taskdata_t *current_task = thread->th.th_current_task;
  kmp_task_t *next_task;
  size_t lower_offset =
      (char *)lb - (char *)task; // remember offset of lb in the task structure
  size_t upper_offset =
      (char *)ub - (char *)task; // remember offset of ub in the task structure

  // chunking parameters must exactly cover the trip count of this subrange
  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
                             (last_chunk < 0 ? last_chunk : extras));
  KMP_DEBUG_ASSERT(num_tasks > extras);
  KMP_DEBUG_ASSERT(num_tasks > 0);

  // split the loop in two halves
  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
  kmp_int64 last_chunk0 = 0, last_chunk1 = 0;
  kmp_uint64 gr_size0 = grainsize;
  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
  if (last_chunk < 0) {
    // strict-modifier case: the shortened chunk stays in the 2nd half
    ext0 = ext1 = 0;
    last_chunk1 = last_chunk;
    tc0 = grainsize * n_tsk0;
    tc1 = tc - tc0;
  } else if (n_tsk0 <= extras) {
    // every task of the 1st half carries an extra iteration
    gr_size0++; // integrate extras into grainsize
    ext0 = 0; // no extra iters in 1st half
    ext1 = extras - n_tsk0; // remaining extras
    tc0 = gr_size0 * n_tsk0;
    tc1 = tc - tc0;
  } else { // n_tsk0 > extras
    ext1 = 0; // no extra iters in 2nd half
    ext0 = extras;
    tc1 = grainsize * n_tsk1;
    tc0 = tc - tc1;
  }
  ub0 = lower + st * (tc0 - 1); // last iteration of the 1st half
  lb1 = ub0 + st; // first iteration of the 2nd half

  // create pattern task for 2nd half of the loop
  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
  // adjust lower bound (upper bound is not changed) for the 2nd half
  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
  if (ptask_dup != NULL) // construct firstprivates, etc.
    ptask_dup(next_task, task, 0);
  *ub = ub0; // adjust upper bound for the 1st half

  // create auxiliary task for 2nd half of the loop
  // make sure new task has same parent task as the pattern task
  kmp_taskdata_t *current_task = thread->th.th_current_task;
  thread->th.th_current_task = taskdata->td_parent;
  kmp_task_t *new_task =
      __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
                            sizeof(__taskloop_params_t), &__kmp_taskloop_task);
  // restore current task
  thread->th.th_current_task = current_task;
  // pack the 2nd-half parameters into the shareds of the auxiliary task;
  // __kmp_taskloop_task unpacks them when the task runs
  __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
  p->task = next_task;
  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
  p->task_dup = task_dup;
  p->st = st;
  p->ub_glob = ub_glob;
  p->num_tasks = n_tsk1;
  p->grainsize = grainsize;
  p->extras = ext1;
  p->last_chunk = last_chunk1;
  p->tc = tc1;
  p->num_t_min = num_t_min;
#if OMPT_SUPPORT
  p->codeptr_ra = codeptr_ra;
#endif

#if OMPT_SUPPORT
  // schedule new task with correct return address for OMPT events
  __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
#else
  __kmp_omp_task(gtid, new_task, true); // schedule new task
#endif

  // execute the 1st half of current subrange
  if (n_tsk0 > num_t_min)
    __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
                         ext0, last_chunk0, tc0, num_t_min,
#if OMPT_SUPPORT
                         codeptr_ra,
#endif
                         task_dup);
  else
    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
                          gr_size0, ext0, last_chunk0, tc0,
#if OMPT_SUPPORT
                          codeptr_ra,
#endif
                          task_dup);

  KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
}

// __kmp_taskloop: common implementation behind __kmpc_taskloop and
// __kmpc_taskloop_5 -- computes the chunking parameters from the schedule
// clause and dispatches to the linear or recursive spawning scheme.
static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
                           kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                           int nogroup, int sched, kmp_uint64 grainsize,
                           int modifier, void *task_dup) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  KMP_DEBUG_ASSERT(task != NULL);
  if (nogroup == 0) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
    OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
    __kmpc_taskgroup(loc, gtid);
  }

  // =========================================================================
  // calculate loop parameters
  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
  kmp_uint64 tc;
  // compiler provides global bounds here
  kmp_uint64 lower = task_bounds.get_lb();
  kmp_uint64 upper = task_bounds.get_ub();
  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
  kmp_uint64 num_tasks = 0, extras = 0;
  kmp_int64 last_chunk =
      0; // reduce grainsize of last task by last_chunk in strict mode
  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *current_task = thread->th.th_current_task;

  KA_TRACE(20, ("__kmp_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
                "grain %llu(%d, %d), dup %p\n",
                gtid, taskdata, lower, upper, st, grainsize, sched, modifier,
                task_dup));

  // compute trip count
  if (st == 1) { // most common case
    tc = upper - lower + 1;
  } else if (st < 0) {
    tc = (lower - upper) / (-st) + 1;
  } else { // st > 0
    tc = (upper - lower) / st + 1;
  }
  if (tc == 0) {
    KA_TRACE(20, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid));
    // free the pattern task and exit
    __kmp_task_start(gtid, task, current_task);
    // do not execute anything for zero-trip loop
    __kmp_task_finish<false>(gtid, task, current_task);
    return;
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // capture team/task info once; reused by the scope_end callback below
  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
  if (ompt_enabled.ompt_callback_work) {
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif

  if (num_tasks_min == 0)
    // TODO: can we choose better default heuristic?
    num_tasks_min =
        KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);

  // compute num_tasks/grainsize based on the input provided
  switch (sched) {
  case 0: // no schedule clause specified, we can choose the default
    // let's try to schedule (team_size*10) tasks
    grainsize = thread->th.th_team_nproc * 10;
    KMP_FALLTHROUGH();
  case 2: // num_tasks provided
    if (grainsize > tc) {
      num_tasks = tc; // too big num_tasks requested, adjust values
      grainsize = 1;
      extras = 0;
    } else {
      num_tasks = grainsize;
      grainsize = tc / num_tasks;
      extras = tc % num_tasks;
    }
    break;
  case 1: // grainsize provided
    if (grainsize > tc) {
      num_tasks = 1;
      grainsize = tc; // too big grainsize requested, adjust values
      extras = 0;
    } else {
      if (modifier) {
        // strict modifier: keep exact grainsize, shorten only the last task
        num_tasks = (tc + grainsize - 1) / grainsize;
        last_chunk = tc - (num_tasks * grainsize);
        extras = 0;
      } else {
        num_tasks = tc / grainsize;
        // adjust grainsize for balanced distribution of iterations
        grainsize = tc / num_tasks;
        extras = tc % num_tasks;
      }
    }
    break;
  default:
    KMP_ASSERT2(0, "unknown scheduling of taskloop");
  }

  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
                             (last_chunk < 0 ? last_chunk : extras));
  KMP_DEBUG_ASSERT(num_tasks > extras);
  KMP_DEBUG_ASSERT(num_tasks > 0);
  // =========================================================================

  // check if clause value first
  // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native)
  if (if_val == 0) { // if(0) specified, mark task as serial
    taskdata->td_flags.task_serial = 1;
    taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
    // always start serial tasks linearly
    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
                          grainsize, extras, last_chunk, tc,
#if OMPT_SUPPORT
                          OMPT_GET_RETURN_ADDRESS(0),
#endif
                          task_dup);
    // !taskdata->td_flags.native => currently force linear spawning of tasks
    // for GOMP_taskloop
  } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
    KA_TRACE(20, ("__kmp_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
                  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
                  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
                  last_chunk));
    __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
                         grainsize, extras, last_chunk, tc, num_tasks_min,
#if OMPT_SUPPORT
                         OMPT_GET_RETURN_ADDRESS(0),
#endif
                         task_dup);
  } else {
    KA_TRACE(20, ("__kmp_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
                  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
                  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
                  last_chunk));
    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
                          grainsize, extras, last_chunk, tc,
#if OMPT_SUPPORT
                          OMPT_GET_RETURN_ADDRESS(0),
#endif
                          task_dup);
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
        &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif

  if (nogroup == 0) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
    OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
    __kmpc_end_taskgroup(loc, gtid);
  }
  KA_TRACE(20, ("__kmp_taskloop(exit): T#%d\n", gtid));
}

/*!
@ingroup TASKING
@param loc       Source location information
@param gtid      Global thread ID
@param task      Task structure
@param if_val    Value of the if clause
@param lb        Pointer to loop lower bound in task structure
@param ub        Pointer to loop upper bound in task structure
@param st        Loop stride
@param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
@param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
@param grainsize Schedule value if specified
@param task_dup  Tasks duplication routine

Execute the taskloop construct.
*/
void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
                     kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
                     int sched, kmp_uint64 grainsize, void *task_dup) {
  __kmp_assert_valid_gtid(gtid);
  KA_TRACE(20, ("__kmpc_taskloop(enter): T#%d\n", gtid));
  // no 'strict' modifier in this entry point -- pass 0
  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
                 0, task_dup);
  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
}

/*!
@ingroup TASKING
@param loc       Source location information
@param gtid      Global thread ID
@param task      Task structure
@param if_val    Value of the if clause
@param lb        Pointer to loop lower bound in task structure
@param ub        Pointer to loop upper bound in task structure
@param st        Loop stride
@param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
@param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
@param grainsize Schedule value if specified
@param modifier  Modifier 'strict' for sched, 1 if present, 0 otherwise
@param task_dup  Tasks duplication routine

Execute the taskloop construct.
*/
void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
                       kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                       int nogroup, int sched, kmp_uint64 grainsize,
                       int modifier, void *task_dup) {
  __kmp_assert_valid_gtid(gtid);
  KA_TRACE(20, ("__kmpc_taskloop_5(enter): T#%d\n", gtid));
  // same as __kmpc_taskloop, plus the 'strict' schedule modifier flag
  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
                 modifier, task_dup);
  KA_TRACE(20, ("__kmpc_taskloop_5(exit): T#%d\n", gtid));
}