/*
 * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_wait_release.h"
#include "kmp_taskdeps.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

#include "tsan_annotations.h"

/* forward declaration */
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
                                 kmp_info_t *this_thr);
static void __kmp_alloc_task_deque(kmp_info_t *thread,
                                   kmp_thread_data_t *thread_data);
static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
                                           kmp_task_team_t *task_team);

#if OMP_45_ENABLED
static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
#endif

#ifdef BUILD_TIED_TASK_STACK

// __kmp_trace_task_stack: print the tied tasks from the task stack in order
// from top to bottom
//
// gtid: global thread identifier for thread containing stack
// thread_data: thread data for task team thread containing stack
// threshold: value above which the trace statement triggers
// location: string identifying call site of this function (for trace)
static void __kmp_trace_task_stack(kmp_int32 gtid,
                                   kmp_thread_data_t *thread_data,
                                   int threshold, char *location) {
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_taskdata_t **stack_top = task_stack->ts_top;
  kmp_int32 entries = task_stack->ts_entries;
  kmp_taskdata_t *tied_task;

  KA_TRACE(
      threshold,
      ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
       "first_block = %p, stack_top = %p \n",
       location, gtid, entries, task_stack->ts_first_block, stack_top));

  KMP_DEBUG_ASSERT(stack_top != NULL);
  KMP_DEBUG_ASSERT(entries > 0);

  while (entries != 0) {
    KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
    // fix up ts_top if we need to pop from previous block
    if ((entries & TASK_STACK_INDEX_MASK) == 0) {
      kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);

      stack_block = stack_block->sb_prev;
      stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
    }

    // finish bookkeeping
    stack_top--;
    entries--;

    tied_task = *stack_top;

    KMP_DEBUG_ASSERT(tied_task != NULL);
    KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);

    KA_TRACE(threshold,
             ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, "
              "stack_top=%p, tied_task=%p\n",
              location, gtid, entries, stack_top, tied_task));
  }
  KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);

  KA_TRACE(threshold,
           ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
            location, gtid));
}

// __kmp_init_task_stack: initialize the task stack for the first time
// after a thread_data structure is created.
// It should not be necessary to do this again (assuming the stack works).
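// The first block (ts_first_block) is embedded in the kmp_task_stack_t itself;
// __kmp_push_task_stack allocates and links additional blocks on demand.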
//
// gtid: global thread identifier of calling thread
// thread_data: thread data for task team thread containing stack
static void __kmp_init_task_stack(kmp_int32 gtid,
                                  kmp_thread_data_t *thread_data) {
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_stack_block_t *first_block;

  // set up the first block of the stack
  first_block = &task_stack->ts_first_block;
  task_stack->ts_top = (kmp_taskdata_t **)first_block;
  memset((void *)first_block, '\0',
         TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));

  // initialize the stack to be empty
  task_stack->ts_entries = TASK_STACK_EMPTY;
  first_block->sb_next = NULL;
  first_block->sb_prev = NULL;
}

// __kmp_free_task_stack: free the task stack when thread_data is destroyed.
//
// gtid: global thread identifier for calling thread
// thread_data: thread info for thread containing stack
static void __kmp_free_task_stack(kmp_int32 gtid,
                                  kmp_thread_data_t *thread_data) {
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_stack_block_t *stack_block = &task_stack->ts_first_block;

  KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
  // free from the second block of the stack
  while (stack_block != NULL) {
    kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;

    stack_block->sb_next = NULL;
    stack_block->sb_prev = NULL;
    if (stack_block != &task_stack->ts_first_block) {
      __kmp_thread_free(__kmp_threads[gtid],
                        stack_block); // free the block, if not the first
    }
    stack_block = next_block;
  }
  // initialize the stack to be empty
  task_stack->ts_entries = 0;
  task_stack->ts_top = NULL;
}

// __kmp_push_task_stack: Push the tied task onto the task stack.
// Grow the stack if necessary by allocating another block.
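// Each block holds TASK_STACK_BLOCK_SIZE entries; full blocks are linked
// through sb_next/sb_prev so pushes and pops can cross block boundaries.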
//
// gtid: global thread identifier for calling thread
// thread: thread info for thread containing stack
// tied_task: the task to push on the stack
static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
                                  kmp_taskdata_t *tied_task) {
  // GEH - need to consider what to do if tt_threads_data not allocated yet
  kmp_thread_data_t *thread_data =
      &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;

  if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
    return; // Don't push anything on stack if team or team tasks are serialized
  }

  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);

  KA_TRACE(20,
           ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
            gtid, thread, tied_task));
  // Store entry
  *(task_stack->ts_top) = tied_task;

  // Do bookkeeping for next push
  task_stack->ts_top++;
  task_stack->ts_entries++;

  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
    // Find beginning of this task block
    kmp_stack_block_t *stack_block =
        (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);

    // Check if we already have a block
    if (stack_block->sb_next !=
        NULL) { // reset ts_top to beginning of next block
      task_stack->ts_top = &stack_block->sb_next->sb_block[0];
    } else { // Alloc new block and link it up
      kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
          thread, sizeof(kmp_stack_block_t));

      task_stack->ts_top = &new_block->sb_block[0];
      stack_block->sb_next = new_block;
      new_block->sb_prev = stack_block;
      new_block->sb_next = NULL;

      KA_TRACE(
          30,
          ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
           gtid, tied_task, new_block));
    }
  }
  KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
                tied_task));
}

// __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return
// the task, just check to make sure it matches the ending task passed in.
//
// gtid: global thread identifier for the calling thread
// thread: thread info structure containing stack
// tied_task: the task popped off the stack
// ending_task: the task that is ending (should match popped task)
static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
                                 kmp_taskdata_t *ending_task) {
  // GEH - need to consider what to do if tt_threads_data not allocated yet
  kmp_thread_data_t *thread_data =
      &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_taskdata_t *tied_task;

  if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
    // Don't pop anything from stack if team or team tasks are serialized
    return;
  }

  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
  KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);

  KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
                thread));

  // fix up ts_top if we need to pop from previous block
  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
    kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);

    stack_block = stack_block->sb_prev;
    task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
  }

  // finish bookkeeping
  task_stack->ts_top--;
  task_stack->ts_entries--;

  tied_task = *(task_stack->ts_top);

  KMP_DEBUG_ASSERT(tied_task != NULL);
  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
  KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly

  KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
                tied_task));
  return;
}
#endif /* BUILD_TIED_TASK_STACK */

// returns 1 if new task is allowed to execute, 0 otherwise
// checks Task Scheduling constraint (if requested) and
// mutexinoutset dependencies if any
static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
                                  const kmp_taskdata_t *tasknew,
                                  const kmp_taskdata_t *taskcurr) {
  if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
    // Check if the candidate obeys the Task Scheduling Constraints (TSC)
    // only descendant of all deferred tied tasks can be scheduled, checking
    // the last one is enough, as it in turn is the descendant of all others
    kmp_taskdata_t *current = taskcurr->td_last_tied;
    KMP_DEBUG_ASSERT(current != NULL);
    // check if the task is not suspended on barrier
    if (current->td_flags.tasktype == TASK_EXPLICIT ||
        current->td_taskwait_thread > 0) { // <= 0 on barrier
      kmp_int32 level = current->td_level;
      kmp_taskdata_t *parent = tasknew->td_parent;
      while (parent != current && parent->td_level > level) {
        // check generation up to the level of the current task
        parent = parent->td_parent;
        KMP_DEBUG_ASSERT(parent != NULL);
      }
      if (parent != current)
        return false;
    }
  }
  // Check mutexinoutset dependencies, acquire locks
  kmp_depnode_t *node = tasknew->td_depnode;
  if (node && (node->dn.mtx_num_locks > 0)) {
    for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
      KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
      if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
        continue;
      // could not get the lock, release previous locks
      for (int j = i - 1; j >= 0; --j)
        __kmp_release_lock(node->dn.mtx_locks[j], gtid);
      return false;
    }
    // negative num_locks means all locks acquired successfully
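    // (__kmp_task_finish negates the count back and releases the locks)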
    node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
  }
  return true;
}

// __kmp_realloc_task_deque:
// Re-allocates a task deque for a particular thread, copies the content from
// the old deque and adjusts the necessary data structures relating to the
// deque. This operation must be done with the deque_lock being held
static void __kmp_realloc_task_deque(kmp_info_t *thread,
                                     kmp_thread_data_t *thread_data) {
  kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
  kmp_int32 new_size = 2 * size;

  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
                "%d] for thread_data %p\n",
                __kmp_gtid_from_thread(thread), size, new_size, thread_data));

  kmp_taskdata_t **new_deque =
      (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));

  int i, j;
  for (i = thread_data->td.td_deque_head, j = 0; j < size;
       i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
    new_deque[j] = thread_data->td.td_deque[i];

  __kmp_free(thread_data->td.td_deque);

  thread_data->td.td_deque_head = 0;
  thread_data->td.td_deque_tail = size;
  thread_data->td.td_deque = new_deque;
  thread_data->td.td_deque_size = new_size;
}

// __kmp_push_task: Add a task to the thread's deque
static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_task_team_t *task_team = thread->th.th_task_team;
  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
  kmp_thread_data_t *thread_data;

  KA_TRACE(20,
           ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));

  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(
        20,
        ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
         gtid, counter, taskdata));
  }

  // The first check avoids building task_team thread data if serialized
  if (taskdata->td_flags.task_serial) {
    KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
                  "TASK_NOT_PUSHED for task %p\n",
                  gtid, taskdata));
    return TASK_NOT_PUSHED;
  }

  // Now that serialized tasks have returned, we can assume that we are not in
  // immediate exec mode
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  if (!KMP_TASKING_ENABLED(task_team)) {
    __kmp_enable_tasking(task_team, thread);
  }
  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);

  // Find tasking deque specific to encountering thread
  thread_data = &task_team->tt.tt_threads_data[tid];

  // No lock needed since only owner can allocate
  if (thread_data->td.td_deque == NULL) {
    __kmp_alloc_task_deque(thread, thread_data);
  }

  int locked = 0;
  // Check if deque is full
  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    if (__kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                              thread->th.th_current_task)) {
      KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
                    "TASK_NOT_PUSHED for task %p\n",
                    gtid, taskdata));
      return TASK_NOT_PUSHED;
    } else {
      __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
      locked = 1;
      // expand deque to push the task which is not allowed to execute
      __kmp_realloc_task_deque(thread, thread_data);
    }
  }
  // Lock the deque for the task push operation
  if (!locked) {
    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
#if OMP_45_ENABLED
    // Need to recheck as we can get a proxy task from thread outside of OpenMP
    if (TCR_4(thread_data->td.td_deque_ntasks) >=
        TASK_DEQUE_SIZE(thread_data->td)) {
      if (__kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                                thread->th.th_current_task)) {
        __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
        KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
                      "returning TASK_NOT_PUSHED for task %p\n",
                      gtid, taskdata));
        return TASK_NOT_PUSHED;
      } else {
        // expand deque to push the task which is not allowed to execute
        __kmp_realloc_task_deque(thread, thread_data);
      }
    }
#endif
  }
  // Must have room since no thread can add tasks but calling thread
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
                   TASK_DEQUE_SIZE(thread_data->td));

  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
      taskdata; // Push taskdata
  // Wrap index.
  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count

  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
                "task=%p ntasks=%d head=%u tail=%u\n",
                gtid, taskdata, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));

  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  return TASK_SUCCESSFULLY_PUSHED;
}

// __kmp_pop_current_task_from_thread: set up current task from called thread
// when team ends
//
// this_thr: thread structure to set current_task in.
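// Simply restores the parent of the current task as the thread's current task;
// the matching setup is done by __kmp_push_current_task_to_thread below.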
void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));

  this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;

  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));
}

// __kmp_push_current_task_to_thread: set up current task in called thread for a
// new team
//
// this_thr: thread structure to set up
// team: team for implicit task data
// tid: thread within team to set up
void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
                                       int tid) {
  // current task of the thread is a parent of the new just created implicit
  // tasks of new team
  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));

  KMP_DEBUG_ASSERT(this_thr != NULL);

  if (tid == 0) {
    if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
      team->t.t_implicit_task_taskdata[0].td_parent =
          this_thr->th.th_current_task;
      this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
    }
  } else {
    team->t.t_implicit_task_taskdata[tid].td_parent =
        team->t.t_implicit_task_taskdata[0].td_parent;
    this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
  }

  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));
}

// __kmp_task_start: bookkeeping for a task starting execution
//
// GTID: global thread id of calling thread
// task: task starting execution
// current_task: task suspending
static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
                             kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];

  KA_TRACE(10,
           ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
            gtid, taskdata, current_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  // mark currently executing task as suspended
  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
  current_task->td_flags.executing = 0;

// Add task to stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_push_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  // mark starting task as executing and as current task
  thread->th.th_current_task = taskdata;

  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  taskdata->td_flags.started = 1;
  taskdata->td_flags.executing = 1;
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  // GEH TODO: shouldn't we pass some sort of location identifier here?
  // APT: yes, we will pass location here.
  // need to store current thread state (in a thread or taskdata structure)
  // before setting work_state, otherwise wrong state is set after end of task

  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));

  return;
}

#if OMPT_SUPPORT
//------------------------------------------------------------------------------
// __ompt_task_init:
// Initialize OMPT fields maintained by a task. This will only be called after
// ompt_start_tool, so we already know whether ompt is enabled or not.

static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
  // The calls to __ompt_task_init already have the ompt_enabled condition.
  task->ompt_task_info.task_data.value = 0;
  task->ompt_task_info.frame.exit_frame = ompt_data_none;
  task->ompt_task_info.frame.enter_frame = ompt_data_none;
  task->ompt_task_info.frame.exit_frame_flags =
      ompt_frame_runtime | ompt_frame_framepointer;
  task->ompt_task_info.frame.enter_frame_flags =
      ompt_frame_runtime | ompt_frame_framepointer;
#if OMP_40_ENABLED
  task->ompt_task_info.ndeps = 0;
  task->ompt_task_info.deps = NULL;
#endif /* OMP_40_ENABLED */
}

// __ompt_task_start:
// Build and trigger task-begin event
static inline void __ompt_task_start(kmp_task_t *task,
                                     kmp_taskdata_t *current_task,
                                     kmp_int32 gtid) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  ompt_task_status_t status = ompt_task_switch;
  if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
    status = ompt_task_yield;
    __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
  }
  /* let OMPT know that we're about to run this task */
  if (ompt_enabled.ompt_callback_task_schedule) {
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        &(current_task->ompt_task_info.task_data), status,
        &(taskdata->ompt_task_info.task_data));
  }
  taskdata->ompt_task_info.scheduling_parent = current_task;
}

// __ompt_task_finish:
// Build and trigger final task-schedule event
static inline void
__ompt_task_finish(kmp_task_t *task, kmp_taskdata_t *resumed_task,
                   ompt_task_status_t status = ompt_task_complete) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
      taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
    status = ompt_task_cancel;
  }

  /* let OMPT know that we're returning to the callee task */
  if (ompt_enabled.ompt_callback_task_schedule) {
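    // Report the task being switched to: the explicitly resumed task if given,
    // else the scheduling parent recorded in __ompt_task_start, else td_parent.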
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        &(taskdata->ompt_task_info.task_data), status,
        &((resumed_task ? resumed_task
                        : (taskdata->ompt_task_info.scheduling_parent
                               ? taskdata->ompt_task_info.scheduling_parent
                               : taskdata->td_parent))
              ->ompt_task_info.task_data));
  }
}
#endif

template <bool ompt>
static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
                                               kmp_task_t *task,
                                               void *frame_address,
                                               void *return_address) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
                "current_task=%p\n",
                gtid, loc_ref, taskdata, current_task));

  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
                  "incremented for task %p\n",
                  gtid, counter, taskdata));
  }

  taskdata->td_flags.task_serial =
      1; // Execute this task immediately, not deferred.
  __kmp_task_start(gtid, task, current_task);

#if OMPT_SUPPORT
  if (ompt) {
    if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
      current_task->ompt_task_info.frame.enter_frame.ptr =
          taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
      current_task->ompt_task_info.frame.enter_frame_flags =
          taskdata->ompt_task_info.frame.exit_frame_flags =
              ompt_frame_application | ompt_frame_framepointer;
    }
    if (ompt_enabled.ompt_callback_task_create) {
      ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
          &(parent_info->task_data), &(parent_info->frame),
          &(taskdata->ompt_task_info.task_data),
          ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
          return_address);
    }
    __ompt_task_start(task, current_task, gtid);
  }
#endif // OMPT_SUPPORT

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, taskdata));
}

#if OMPT_SUPPORT
OMPT_NOINLINE
static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                           kmp_task_t *task,
                                           void *frame_address,
                                           void *return_address) {
  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
                                           return_address);
}
#endif // OMPT_SUPPORT

// __kmpc_omp_task_begin_if0: report that a given serialized task has started
// execution
//
// loc_ref: source location information; points to beginning of task block.
// gtid: global thread number.
// task: task thunk for the started task.
void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
                               kmp_task_t *task) {
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
    __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
                                   OMPT_GET_FRAME_ADDRESS(1),
                                   OMPT_LOAD_RETURN_ADDRESS(gtid));
    return;
  }
#endif
  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
}

#ifdef TASK_UNUSED
// __kmpc_omp_task_begin: report that a given task has started execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(
      10,
      ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
       gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));

  __kmp_task_start(gtid, task, current_task);

  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED

// __kmp_free_task: free the current task space and the space for shareds
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
                            kmp_info_t *thread) {
  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
                taskdata));

  // Check to make sure all flags and counters have the correct values
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
                   taskdata->td_flags.task_serial == 1);
  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);

  taskdata->td_flags.freed = 1;
  ANNOTATE_HAPPENS_BEFORE(taskdata);
// deallocate the taskdata and shared variable blocks associated with this task
#if USE_FAST_MEMORY
  __kmp_fast_free(thread, taskdata);
#else /* ! USE_FAST_MEMORY */
  __kmp_thread_free(thread, taskdata);
#endif

  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
}

// __kmp_free_task_and_ancestors: free the current task and ancestors without
// children
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
                                          kmp_taskdata_t *taskdata,
                                          kmp_info_t *thread) {
#if OMP_45_ENABLED
  // Proxy tasks must always be allowed to free their parents
  // because they can be run in background even in serial mode.
  kmp_int32 team_serial =
      (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
      !taskdata->td_flags.proxy;
#else
  kmp_int32 team_serial =
      taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser;
#endif
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
  KMP_DEBUG_ASSERT(children >= 0);

  // Now, go up the ancestor tree to see if any ancestors can now be freed.
  while (children == 0) {
    kmp_taskdata_t *parent_taskdata = taskdata->td_parent;

    KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
                  "and freeing itself\n",
                  gtid, taskdata));

    // --- Deallocate my ancestor task ---
    __kmp_free_task(gtid, taskdata, thread);

    taskdata = parent_taskdata;

    if (team_serial)
      return;
    // Stop checking ancestors at implicit task instead of walking up ancestor
    // tree to avoid premature deallocation of ancestors.
    if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
      if (taskdata->td_dephash) { // do we need to cleanup dephash?
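        // Atomically flip the 'complete' flag via CAS on the packed flags word
        // so that only one thread frees the dephash entries.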
        int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
        kmp_tasking_flags_t flags_old = taskdata->td_flags;
        if (children == 0 && flags_old.complete == 1) {
          kmp_tasking_flags_t flags_new = flags_old;
          flags_new.complete = 0;
          if (KMP_COMPARE_AND_STORE_ACQ32(
                  RCAST(kmp_int32 *, &taskdata->td_flags),
                  *RCAST(kmp_int32 *, &flags_old),
                  *RCAST(kmp_int32 *, &flags_new))) {
            KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
                           "dephash of implicit task %p\n",
                           gtid, taskdata));
            // cleanup dephash of finished implicit task
            __kmp_dephash_free_entries(thread, taskdata->td_dephash);
          }
        }
      }
      return;
    }
    // Predecrement simulated by "- 1" calculation
    children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
    KMP_DEBUG_ASSERT(children >= 0);
  }

  KA_TRACE(
      20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
           "not freeing it yet\n",
           gtid, taskdata, children));
}

// __kmp_task_finish: bookkeeping to do when a task finishes execution
//
// gtid: global thread ID for calling thread
// task: task to be finished
// resumed_task: task to be resumed. (may be NULL if task is serialized)
template <bool ompt>
static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *resumed_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];
#if OMP_45_ENABLED
  kmp_task_team_t *task_team =
      thread->th.th_task_team; // might be NULL for serial teams...
#endif // OMP_45_ENABLED
  kmp_int32 children = 0;

  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
                "task %p\n",
                gtid, taskdata, resumed_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

// Pop task from stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_pop_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
    // untied task needs to check the counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
    KA_TRACE(
        20,
        ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
         gtid, counter, taskdata));
    if (counter > 0) {
      // untied task is not done, to be continued possibly by other thread, do
      // not free it now
      if (resumed_task == NULL) {
        KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
        resumed_task = taskdata->td_parent; // In a serialized task, the resumed
        // task is the parent
      }
      thread->th.th_current_task = resumed_task; // restore current_task
      resumed_task->td_flags.executing = 1; // resume previous task
      KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
                    "resuming task %p\n",
                    gtid, taskdata, resumed_task));
      return;
    }
  }
#if OMPT_SUPPORT
  if (ompt)
    __ompt_task_finish(task, resumed_task);
#endif

  // Check mutexinoutset dependencies, release locks
  kmp_depnode_t *node = taskdata->td_depnode;
  if (node && (node->dn.mtx_num_locks < 0)) {
    // negative num_locks means all locks were acquired
    node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
    for (int i = node->dn.mtx_num_locks - 1; i >= 0; --i) {
      KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
      __kmp_release_lock(node->dn.mtx_locks[i], gtid);
    }
  }

  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  taskdata->td_flags.complete = 1; // mark the task as completed
  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  // Only need to keep track of count if team parallel and tasking not
  // serialized
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
    // Predecrement simulated by "- 1" calculation
    children =
        KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
    KMP_DEBUG_ASSERT(children >= 0);
#if OMP_40_ENABLED
    if (taskdata->td_taskgroup)
      KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
    __kmp_release_deps(gtid, taskdata);
#if OMP_45_ENABLED
  } else if (task_team && task_team->tt.tt_found_proxy_tasks) {
    // if we found proxy tasks there could exist a dependency chain
    // with the proxy task as origin
    __kmp_release_deps(gtid, taskdata);
#endif // OMP_45_ENABLED
#endif // OMP_40_ENABLED
  }

  // td_flags.executing must be marked as 0 after __kmp_release_deps has been
  // called. Otherwise, if a task is executed immediately from the release_deps
  // code, the flag will be reset to 1 again by this same function
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
  taskdata->td_flags.executing = 0; // suspend the finishing task

  KA_TRACE(
      20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
           gtid, taskdata, children));

#if OMP_40_ENABLED
  /* If the tasks' destructor thunk flag has been set, we need to invoke the
     destructor thunk that has been generated by the compiler. The code is
     placed here, since at this point other tasks might have been released
     hence overlapping the destructor invocations with some other work in the
     released tasks. The OpenMP spec is not specific on when the destructors
     are invoked, so we should be free to choose. */
  if (taskdata->td_flags.destructors_thunk) {
    kmp_routine_entry_t destr_thunk = task->data1.destructors;
    KMP_ASSERT(destr_thunk);
    destr_thunk(gtid, task);
  }
#endif // OMP_40_ENABLED

  // bookkeeping for resuming task:
  // GEH - note tasking_ser => task_serial
  KMP_DEBUG_ASSERT(
      (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
      taskdata->td_flags.task_serial);
  if (taskdata->td_flags.task_serial) {
    if (resumed_task == NULL) {
      resumed_task = taskdata->td_parent; // In a serialized task, the resumed
      // task is the parent
    }
  } else {
    KMP_DEBUG_ASSERT(resumed_task !=
                     NULL); // verify that resumed task is passed as argument
  }

  // Free this task and then ancestor tasks if they have no children.
  // Restore th_current_task first as suggested by John:
  // johnmc: if an asynchronous inquiry peers into the runtime system
  // it doesn't see the freed task as the current task.
  thread->th.th_current_task = resumed_task;
  __kmp_free_task_and_ancestors(gtid, taskdata, thread);

  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
  resumed_task->td_flags.executing = 1; // resume previous task

  KA_TRACE(
      10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
           gtid, taskdata, resumed_task));

  return;
}

template <bool ompt>
static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
                                                  kmp_int32 gtid,
                                                  kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
  // this routine will provide task to resume
  __kmp_task_finish<ompt>(gtid, task, NULL);

  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));

#if OMPT_SUPPORT
  if (ompt) {
    ompt_frame_t *ompt_frame;
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    ompt_frame->enter_frame = ompt_data_none;
    ompt_frame->enter_frame_flags =
        ompt_frame_runtime | ompt_frame_framepointer;
  }
#endif

  return;
}

#if OMPT_SUPPORT
OMPT_NOINLINE
void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                       kmp_task_t *task) {
  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
}
#endif // OMPT_SUPPORT

// __kmpc_omp_task_complete_if0: report that a task has completed execution
//
// loc_ref: source location information; points to end of task block.
// gtid: global thread number.
// task: task thunk for the completed task.
void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *task) {
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
    return;
  }
#endif
  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
}

#ifdef TASK_UNUSED
// __kmpc_omp_task_complete: report that a task has completed execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
                              kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));

  __kmp_task_finish<false>(gtid, task,
                           NULL); // Not sure how to find task to resume

  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED

// __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
// task for a given thread
//
// loc_ref: reference to source location of parallel region
// this_thr: thread data structure corresponding to implicit task
// team: team for this_thr
// tid: thread id of given thread within team
// set_curr_task: TRUE if need to push current task to thread
// NOTE: Routine does not set up the implicit task ICVS. This is assumed to
// have already been done elsewhere.
// TODO: Get better loc_ref. Value passed in may be NULL
void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
                              kmp_team_t *team, int tid, int set_curr_task) {
  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];

  KF_TRACE(
      10,
      ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
       tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
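
  // The implicit task is tied and always executes immediately as part of its
  // parallel region; the fields below are reset on every (re)initialization.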
"TRUE" : "FALSE")); 1042 1043 task->td_task_id = KMP_GEN_TASK_ID(); 1044 task->td_team = team; 1045 // task->td_parent = NULL; // fix for CQ230101 (broken parent task info 1046 // in debugger) 1047 task->td_ident = loc_ref; 1048 task->td_taskwait_ident = NULL; 1049 task->td_taskwait_counter = 0; 1050 task->td_taskwait_thread = 0; 1051 1052 task->td_flags.tiedness = TASK_TIED; 1053 task->td_flags.tasktype = TASK_IMPLICIT; 1054 #if OMP_45_ENABLED 1055 task->td_flags.proxy = TASK_FULL; 1056 #endif 1057 1058 // All implicit tasks are executed immediately, not deferred 1059 task->td_flags.task_serial = 1; 1060 task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec); 1061 task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0; 1062 1063 task->td_flags.started = 1; 1064 task->td_flags.executing = 1; 1065 task->td_flags.complete = 0; 1066 task->td_flags.freed = 0; 1067 1068 #if OMP_40_ENABLED 1069 task->td_depnode = NULL; 1070 #endif 1071 task->td_last_tied = task; 1072 1073 if (set_curr_task) { // only do this init first time thread is created 1074 KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0); 1075 // Not used: don't need to deallocate implicit task 1076 KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0); 1077 #if OMP_40_ENABLED 1078 task->td_taskgroup = NULL; // An implicit task does not have taskgroup 1079 task->td_dephash = NULL; 1080 #endif 1081 __kmp_push_current_task_to_thread(this_thr, team, tid); 1082 } else { 1083 KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0); 1084 KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0); 1085 } 1086 1087 #if OMPT_SUPPORT 1088 if (UNLIKELY(ompt_enabled.enabled)) 1089 __ompt_task_init(task, tid); 1090 #endif 1091 1092 KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid, 1093 team, task)); 1094 } 1095 1096 // __kmp_finish_implicit_task: Release resources associated to implicit tasks 1097 // at the end of parallel regions. Some resources are kept for reuse in the next 1098 // parallel region. 
//
// thread: thread data structure corresponding to implicit task
void __kmp_finish_implicit_task(kmp_info_t *thread) {
  kmp_taskdata_t *task = thread->th.th_current_task;
  if (task->td_dephash) {
    int children;
    task->td_flags.complete = 1;
    children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
    kmp_tasking_flags_t flags_old = task->td_flags;
    if (children == 0 && flags_old.complete == 1) {
      kmp_tasking_flags_t flags_new = flags_old;
      flags_new.complete = 0;
      if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
                                      *RCAST(kmp_int32 *, &flags_old),
                                      *RCAST(kmp_int32 *, &flags_new))) {
        KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
                       "dephash of implicit task %p\n",
                       thread->th.th_info.ds.ds_gtid, task));
        __kmp_dephash_free_entries(thread, task->td_dephash);
      }
    }
  }
}

// __kmp_free_implicit_task: Release resources associated to implicit tasks
// when these regions are destroyed
//
// thread: thread data structure corresponding to implicit task
void __kmp_free_implicit_task(kmp_info_t *thread) {
  kmp_taskdata_t *task = thread->th.th_current_task;
  if (task && task->td_dephash) {
    __kmp_dephash_free(thread, task->td_dephash);
    task->td_dephash = NULL;
  }
}

// Round up a size to a power of two specified by val: Used to insert padding
// between structures co-allocated using a single malloc() call
static size_t __kmp_round_up_to_val(size_t size, size_t val) {
  if (size & (val - 1)) {
    size &= ~(val - 1);
    if (size <= KMP_SIZE_T_MAX - val) {
      size += val; // Round up if there is no overflow.
    }
  }
  return size;
} // __kmp_round_up_to_val

// __kmp_task_alloc: Allocate the taskdata and task data structures for a task
//
// loc_ref: source location information
// gtid: global thread number.
// flags: include tiedness & task type (explicit vs. implicit) of the ''new''
// task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
// sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
// private vars accessed in task.
// sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
// in task.
// task_entry: Pointer to task code entry point generated by compiler.
// returns: a pointer to the allocated kmp_task_t structure (task).
kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                             kmp_tasking_flags_t *flags,
                             size_t sizeof_kmp_task_t, size_t sizeof_shareds,
                             kmp_routine_entry_t task_entry) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_team_t *team = thread->th.th_team;
  kmp_taskdata_t *parent_task = thread->th.th_current_task;
  size_t shareds_offset;

  if (!TCR_4(__kmp_init_middle))
    __kmp_middle_initialize();

  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
                sizeof_shareds, task_entry));

  if (parent_task->td_flags.final) {
    if (flags->merged_if0) {
    }
    flags->final = 1;
  }
  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
    // Untied task encountered causes the TSC algorithm to check entire deque of
    // the victim thread. If no untied task encountered, then checking the head
    // of the deque should be enough.
    KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
  }

#if OMP_45_ENABLED
  if (flags->proxy == TASK_PROXY) {
    flags->tiedness = TASK_UNTIED;
    flags->merged_if0 = 1;

    /* are we running in a sequential parallel or tskm_immediate_exec... we need
       tasking support enabled */
    if ((thread->th.th_task_team) == NULL) {
      /* This should only happen if the team is serialized
          setup a task team and propagate it to the thread */
      KMP_DEBUG_ASSERT(team->t.t_serialized);
      KA_TRACE(30,
               ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
                gtid));
      __kmp_task_team_setup(
          thread, team,
          1); // 1 indicates setup the current team regardless of nthreads
      thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
    }
    kmp_task_team_t *task_team = thread->th.th_task_team;

    /* tasking must be enabled now as the task might not be pushed */
    if (!KMP_TASKING_ENABLED(task_team)) {
      KA_TRACE(
          30,
          ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
      __kmp_enable_tasking(task_team, thread);
      kmp_int32 tid = thread->th.th_info.ds.ds_tid;
      kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
      // No lock needed since only owner can allocate
      if (thread_data->td.td_deque == NULL) {
        __kmp_alloc_task_deque(thread, thread_data);
      }
    }

    if (task_team->tt.tt_found_proxy_tasks == FALSE)
      TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
  }
#endif

  // Calculate shared structure offset including padding after kmp_task_t struct
  // to align pointers in shared struct
  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));

  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
                shareds_offset));
  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
                sizeof_shareds));

// Avoid double allocation here by combining shareds with taskdata
#if USE_FAST_MEMORY
  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
                                                               sizeof_shareds);
#else /* ! USE_FAST_MEMORY */
  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
                                                               sizeof_shareds);
#endif /* USE_FAST_MEMORY */
  ANNOTATE_HAPPENS_AFTER(taskdata);

  task = KMP_TASKDATA_TO_TASK(taskdata);

// Make sure task & taskdata are aligned appropriately
#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
#else
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
#endif
  if (sizeof_shareds > 0) {
    // Avoid double allocation here by combining shareds with taskdata
    task->shareds = &((char *)taskdata)[shareds_offset];
    // Make sure shareds struct is aligned to pointer size
    KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
                     0);
  } else {
    task->shareds = NULL;
  }
  task->routine = task_entry;
  task->part_id = 0; // AC: Always start with 0 part id

  taskdata->td_task_id = KMP_GEN_TASK_ID();
  taskdata->td_team = team;
  taskdata->td_alloc_thread = thread;
  taskdata->td_parent = parent_task;
  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
  taskdata->td_ident = loc_ref;
  taskdata->td_taskwait_ident = NULL;
  taskdata->td_taskwait_counter = 0;
  taskdata->td_taskwait_thread = 0;
  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
#if OMP_45_ENABLED
  // avoid copying icvs for proxy tasks
  if (flags->proxy == TASK_FULL)
#endif
    copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);

  taskdata->td_flags.tiedness = flags->tiedness;
  taskdata->td_flags.final = flags->final;
  taskdata->td_flags.merged_if0 = flags->merged_if0;
#if OMP_40_ENABLED
  taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
#endif // OMP_40_ENABLED
#if OMP_45_ENABLED
  taskdata->td_flags.proxy = flags->proxy;
  taskdata->td_task_team = thread->th.th_task_team;
  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
#endif
  taskdata->td_flags.tasktype = TASK_EXPLICIT;

  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);

  // GEH - TODO: fix this to copy parent task's value of team_serial flag
  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  // GEH - Note we serialize the task if the team is serialized to make sure
  // implicit parallel region tasks are not left until program termination to
  // execute. Also, it helps locality to execute immediately.
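  // A task is executed immediately (task_serial) when its parent is final,
  // the team is serialized, or tasking is disabled altogether.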

  taskdata->td_flags.task_serial =
      (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
       taskdata->td_flags.tasking_ser);

  taskdata->td_flags.started = 0;
  taskdata->td_flags.executing = 0;
  taskdata->td_flags.complete = 0;
  taskdata->td_flags.freed = 0;

  taskdata->td_flags.native = flags->native;

  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
  // start at one because counts current task and children
  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
#if OMP_40_ENABLED
  taskdata->td_taskgroup =
      parent_task->td_taskgroup; // task inherits taskgroup from the parent task
  taskdata->td_dephash = NULL;
  taskdata->td_depnode = NULL;
#endif
  if (flags->tiedness == TASK_UNTIED)
    taskdata->td_last_tied = NULL; // will be set when the task is scheduled
  else
    taskdata->td_last_tied = taskdata;

#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(taskdata, gtid);
#endif
// Only need to keep track of child task counts if team parallel and tasking not
// serialized or if it is a proxy task
#if OMP_45_ENABLED
  if (flags->proxy == TASK_PROXY ||
      !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
#else
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
#endif
  {
    KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
#if OMP_40_ENABLED
    if (parent_task->td_taskgroup)
      KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
#endif
    // Only need to keep track of allocated child tasks for explicit tasks since
    // implicit not deallocated
    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
      KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
    }
  }

  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
                gtid, taskdata, taskdata->td_parent));
  ANNOTATE_HAPPENS_BEFORE(task);

  return task;
}

kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_int32 flags, size_t sizeof_kmp_task_t,
                                  size_t sizeof_shareds,
                                  kmp_routine_entry_t task_entry) {
  kmp_task_t *retval;
  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;

  input_flags->native = FALSE;
// __kmp_task_alloc() sets up all other runtime flags

#if OMP_45_ENABLED
  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
                input_flags->proxy ? "proxy" : "", sizeof_kmp_task_t,
                sizeof_shareds, task_entry));
#else
  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
                sizeof_kmp_task_t, sizeof_shareds, task_entry));
#endif

  retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
                            sizeof_shareds, task_entry);

  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));

  return retval;
}

#if OMP_50_ENABLED
/*!
@ingroup TASKING
@param loc_ref location of the original task directive
@param gtid Global Thread ID of encountering thread
@param new_task task thunk allocated by __kmpc_omp_task_alloc() for the ''new
task''
@param naffins Number of affinity items
@param affin_list List of affinity items
@return Returns non-zero if registering affinity information was not successful.
Returns 0 if registration was successful
This entry registers the affinity information attached to a task with the task
thunk structure kmp_taskdata_t.
*/
kmp_int32
__kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *new_task, kmp_int32 naffins,
                                  kmp_task_affinity_info_t *affin_list) {
  return 0;
}
#endif

// __kmp_invoke_task: invoke the specified task
//
// gtid: global thread ID of caller
// task: the task to invoke
// current_task: the task to resume after task invocation
static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread;
#if OMP_40_ENABLED
  int discard = 0 /* false */;
#endif
  KA_TRACE(
      30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
           gtid, taskdata, current_task));
  KMP_DEBUG_ASSERT(task);
#if OMP_45_ENABLED
  if (taskdata->td_flags.proxy == TASK_PROXY &&
      taskdata->td_flags.complete == 1) {
    // This is a proxy task that was already completed but it needs to run
    // its bottom-half finish
    KA_TRACE(
        30,
        ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
         gtid, taskdata));

    __kmp_bottom_half_finish_proxy(gtid, task);

    KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
                  "proxy task %p, resuming task %p\n",
                  gtid, taskdata, current_task));

    return;
  }
#endif

#if OMPT_SUPPORT
  // For untied tasks, the first task executed only calls __kmpc_omp_task and
  // does not execute code.
  ompt_thread_info_t oldInfo;
  if (UNLIKELY(ompt_enabled.enabled)) {
    // Store the threads states and restore them after the task
    thread = __kmp_threads[gtid];
    oldInfo = thread->th.ompt_thread_info;
    thread->th.ompt_thread_info.wait_id = 0;
    thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
                                            ? ompt_state_work_serial
                                            : ompt_state_work_parallel;
    taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
#endif

#if OMP_45_ENABLED
  // Proxy tasks are not handled by the runtime
  if (taskdata->td_flags.proxy != TASK_PROXY) {
#endif
    ANNOTATE_HAPPENS_AFTER(task);
    __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
#if OMP_45_ENABLED
  }
#endif

#if OMP_40_ENABLED
  // TODO: cancel tasks if the parallel region has also been cancelled
  // TODO: check if this sequence can be hoisted above __kmp_task_start
  // if cancellation has been enabled for this run ...
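  // A pending cancel request on the taskgroup or the enclosing parallel region
  // causes the task body to be discarded below; completion bookkeeping in
  // __kmp_task_finish still runs.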
1489 if (__kmp_omp_cancellation) { 1490 thread = __kmp_threads[gtid]; 1491 kmp_team_t *this_team = thread->th.th_team; 1492 kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup; 1493 if ((taskgroup && taskgroup->cancel_request) || 1494 (this_team->t.t_cancel_request == cancel_parallel)) { 1495 #if OMPT_SUPPORT && OMPT_OPTIONAL 1496 ompt_data_t *task_data; 1497 if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) { 1498 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL); 1499 ompt_callbacks.ompt_callback(ompt_callback_cancel)( 1500 task_data, 1501 ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup 1502 : ompt_cancel_parallel) | 1503 ompt_cancel_discarded_task, 1504 NULL); 1505 } 1506 #endif 1507 KMP_COUNT_BLOCK(TASK_cancelled); 1508 // this task belongs to a task group and we need to cancel it 1509 discard = 1 /* true */; 1510 } 1511 } 1512 1513 // Invoke the task routine and pass in relevant data. 1514 // Thunks generated by gcc take a different argument list. 1515 if (!discard) { 1516 if (taskdata->td_flags.tiedness == TASK_UNTIED) { 1517 taskdata->td_last_tied = current_task->td_last_tied; 1518 KMP_DEBUG_ASSERT(taskdata->td_last_tied); 1519 } 1520 #if KMP_STATS_ENABLED 1521 KMP_COUNT_BLOCK(TASK_executed); 1522 switch (KMP_GET_THREAD_STATE()) { 1523 case FORK_JOIN_BARRIER: 1524 KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar); 1525 break; 1526 case PLAIN_BARRIER: 1527 KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar); 1528 break; 1529 case TASKYIELD: 1530 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield); 1531 break; 1532 case TASKWAIT: 1533 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait); 1534 break; 1535 case TASKGROUP: 1536 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup); 1537 break; 1538 default: 1539 KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate); 1540 break; 1541 } 1542 #endif // KMP_STATS_ENABLED 1543 #endif // OMP_40_ENABLED 1544 1545 // OMPT task begin 1546 #if OMPT_SUPPORT 1547 if (UNLIKELY(ompt_enabled.enabled)) 1548 __ompt_task_start(task, current_task, gtid); 1549 #endif 1550 1551 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1552 kmp_uint64 cur_time; 1553 kmp_int32 kmp_itt_count_task = 1554 __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial && 1555 current_task->td_flags.tasktype == TASK_IMPLICIT; 1556 if (kmp_itt_count_task) { 1557 thread = __kmp_threads[gtid]; 1558 // Time outer level explicit task on barrier for adjusting imbalance time 1559 if (thread->th.th_bar_arrive_time) 1560 cur_time = __itt_get_timestamp(); 1561 else 1562 kmp_itt_count_task = 0; // thread is not on a barrier - skip timing 1563 } 1564 #endif 1565 1566 #ifdef KMP_GOMP_COMPAT 1567 if (taskdata->td_flags.native) { 1568 ((void (*)(void *))(*(task->routine)))(task->shareds); 1569 } else 1570 #endif /* KMP_GOMP_COMPAT */ 1571 { 1572 (*(task->routine))(gtid, task); 1573 } 1574 KMP_POP_PARTITIONED_TIMER(); 1575 1576 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1577 if (kmp_itt_count_task) { 1578 // Barrier imbalance - adjust arrive time with the task duration 1579 thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time); 1580 } 1581 #endif 1582 1583 #if OMP_40_ENABLED 1584 } 1585 #endif // OMP_40_ENABLED 1586 1587 1588 #if OMP_45_ENABLED 1589 // Proxy tasks are not handled by the runtime 1590 if (taskdata->td_flags.proxy != TASK_PROXY) { 1591 #endif 1592 ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent); 1593 #if OMPT_SUPPORT 1594 if (UNLIKELY(ompt_enabled.enabled)) { 1595 thread->th.ompt_thread_info = oldInfo; 1596 if (taskdata->td_flags.tiedness == TASK_TIED) { 1597 
taskdata->ompt_task_info.frame.exit_frame = ompt_data_none; 1598 } 1599 __kmp_task_finish<true>(gtid, task, current_task); 1600 } else 1601 #endif 1602 __kmp_task_finish<false>(gtid, task, current_task); 1603 #if OMP_45_ENABLED 1604 } 1605 #endif 1606 1607 KA_TRACE( 1608 30, 1609 ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n", 1610 gtid, taskdata, current_task)); 1611 return; 1612 } 1613 1614 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution 1615 // 1616 // loc_ref: location of original task pragma (ignored) 1617 // gtid: Global Thread ID of encountering thread 1618 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task'' 1619 // Returns: 1620 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1621 // be resumed later. 1622 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1623 // resumed later. 1624 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid, 1625 kmp_task_t *new_task) { 1626 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1627 1628 KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid, 1629 loc_ref, new_taskdata)); 1630 1631 #if OMPT_SUPPORT 1632 kmp_taskdata_t *parent; 1633 if (UNLIKELY(ompt_enabled.enabled)) { 1634 parent = new_taskdata->td_parent; 1635 if (ompt_enabled.ompt_callback_task_create) { 1636 ompt_data_t task_data = ompt_data_none; 1637 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 1638 parent ? &(parent->ompt_task_info.task_data) : &task_data, 1639 parent ? &(parent->ompt_task_info.frame) : NULL, 1640 &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0, 1641 OMPT_GET_RETURN_ADDRESS(0)); 1642 } 1643 } 1644 #endif 1645 1646 /* Should we execute the new task or queue it? For now, let's just always try 1647 to queue it. If the queue fills up, then we'll execute it. */ 1648 1649 if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer 1650 { // Execute this task immediately 1651 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 1652 new_taskdata->td_flags.task_serial = 1; 1653 __kmp_invoke_task(gtid, new_task, current_task); 1654 } 1655 1656 KA_TRACE( 1657 10, 1658 ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: " 1659 "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n", 1660 gtid, loc_ref, new_taskdata)); 1661 1662 ANNOTATE_HAPPENS_BEFORE(new_task); 1663 #if OMPT_SUPPORT 1664 if (UNLIKELY(ompt_enabled.enabled)) { 1665 parent->ompt_task_info.frame.enter_frame = ompt_data_none; 1666 } 1667 #endif 1668 return TASK_CURRENT_NOT_QUEUED; 1669 } 1670 1671 // __kmp_omp_task: Schedule a non-thread-switchable task for execution 1672 // 1673 // gtid: Global Thread ID of encountering thread 1674 // new_task:non-thread-switchable task thunk allocated by __kmp_omp_task_alloc() 1675 // serialize_immediate: if TRUE then if the task is executed immediately its 1676 // execution will be serialized 1677 // Returns: 1678 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1679 // be resumed later. 1680 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1681 // resumed later. 1682 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task, 1683 bool serialize_immediate) { 1684 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1685 1686 /* Should we execute the new task or queue it? For now, let's just always try to 1687 queue it. If the queue fills up, then we'll execute it. 
*/ 1688 #if OMP_45_ENABLED 1689 if (new_taskdata->td_flags.proxy == TASK_PROXY || 1690 __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer 1691 #else 1692 if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer 1693 #endif 1694 { // Execute this task immediately 1695 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 1696 if (serialize_immediate) 1697 new_taskdata->td_flags.task_serial = 1; 1698 __kmp_invoke_task(gtid, new_task, current_task); 1699 } 1700 1701 ANNOTATE_HAPPENS_BEFORE(new_task); 1702 return TASK_CURRENT_NOT_QUEUED; 1703 } 1704 1705 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a 1706 // non-thread-switchable task from the parent thread only! 1707 // 1708 // loc_ref: location of original task pragma (ignored) 1709 // gtid: Global Thread ID of encountering thread 1710 // new_task: non-thread-switchable task thunk allocated by 1711 // __kmp_omp_task_alloc() 1712 // Returns: 1713 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1714 // be resumed later. 1715 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1716 // resumed later. 1717 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid, 1718 kmp_task_t *new_task) { 1719 kmp_int32 res; 1720 KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK); 1721 1722 #if KMP_DEBUG || OMPT_SUPPORT 1723 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1724 #endif 1725 KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref, 1726 new_taskdata)); 1727 1728 #if OMPT_SUPPORT 1729 kmp_taskdata_t *parent = NULL; 1730 if (UNLIKELY(ompt_enabled.enabled)) { 1731 if (!new_taskdata->td_flags.started) { 1732 OMPT_STORE_RETURN_ADDRESS(gtid); 1733 parent = new_taskdata->td_parent; 1734 if (!parent->ompt_task_info.frame.enter_frame.ptr) { 1735 parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1736 } 1737 if (ompt_enabled.ompt_callback_task_create) { 1738 ompt_data_t task_data = ompt_data_none; 1739 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 1740 parent ? &(parent->ompt_task_info.task_data) : &task_data, 1741 parent ? &(parent->ompt_task_info.frame) : NULL, 1742 &(new_taskdata->ompt_task_info.task_data), 1743 ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, 1744 OMPT_LOAD_RETURN_ADDRESS(gtid)); 1745 } 1746 } else { 1747 // We are scheduling the continuation of an UNTIED task. 1748 // Scheduling back to the parent task. 
1749 __ompt_task_finish(new_task, 1750 new_taskdata->ompt_task_info.scheduling_parent, 1751 ompt_task_switch); 1752 new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none; 1753 } 1754 } 1755 #endif 1756 1757 res = __kmp_omp_task(gtid, new_task, true); 1758 1759 KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning " 1760 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n", 1761 gtid, loc_ref, new_taskdata)); 1762 #if OMPT_SUPPORT 1763 if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) { 1764 parent->ompt_task_info.frame.enter_frame = ompt_data_none; 1765 } 1766 #endif 1767 return res; 1768 } 1769 1770 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule 1771 // a taskloop task with the correct OMPT return address 1772 // 1773 // loc_ref: location of original task pragma (ignored) 1774 // gtid: Global Thread ID of encountering thread 1775 // new_task: non-thread-switchable task thunk allocated by 1776 // __kmp_omp_task_alloc() 1777 // codeptr_ra: return address for OMPT callback 1778 // Returns: 1779 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1780 // be resumed later. 1781 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1782 // resumed later. 1783 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid, 1784 kmp_task_t *new_task, void *codeptr_ra) { 1785 kmp_int32 res; 1786 KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK); 1787 1788 #if KMP_DEBUG || OMPT_SUPPORT 1789 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1790 #endif 1791 KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref, 1792 new_taskdata)); 1793 1794 #if OMPT_SUPPORT 1795 kmp_taskdata_t *parent = NULL; 1796 if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) { 1797 parent = new_taskdata->td_parent; 1798 if (!parent->ompt_task_info.frame.enter_frame.ptr) 1799 parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1800 if (ompt_enabled.ompt_callback_task_create) { 1801 ompt_data_t task_data = ompt_data_none; 1802 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 1803 parent ? &(parent->ompt_task_info.task_data) : &task_data, 1804 parent ? 
&(parent->ompt_task_info.frame) : NULL, 1805 &(new_taskdata->ompt_task_info.task_data), 1806 ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, 1807 codeptr_ra); 1808 } 1809 } 1810 #endif 1811 1812 res = __kmp_omp_task(gtid, new_task, true); 1813 1814 KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning " 1815 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n", 1816 gtid, loc_ref, new_taskdata)); 1817 #if OMPT_SUPPORT 1818 if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) { 1819 parent->ompt_task_info.frame.enter_frame = ompt_data_none; 1820 } 1821 #endif 1822 return res; 1823 } 1824 1825 template <bool ompt> 1826 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid, 1827 void *frame_address, 1828 void *return_address) { 1829 kmp_taskdata_t *taskdata; 1830 kmp_info_t *thread; 1831 int thread_finished = FALSE; 1832 KMP_SET_THREAD_STATE_BLOCK(TASKWAIT); 1833 1834 KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref)); 1835 1836 if (__kmp_tasking_mode != tskm_immediate_exec) { 1837 thread = __kmp_threads[gtid]; 1838 taskdata = thread->th.th_current_task; 1839 1840 #if OMPT_SUPPORT && OMPT_OPTIONAL 1841 ompt_data_t *my_task_data; 1842 ompt_data_t *my_parallel_data; 1843 1844 if (ompt) { 1845 my_task_data = &(taskdata->ompt_task_info.task_data); 1846 my_parallel_data = OMPT_CUR_TEAM_DATA(thread); 1847 1848 taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address; 1849 1850 if (ompt_enabled.ompt_callback_sync_region) { 1851 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 1852 ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data, 1853 my_task_data, return_address); 1854 } 1855 1856 if (ompt_enabled.ompt_callback_sync_region_wait) { 1857 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 1858 ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data, 1859 my_task_data, return_address); 1860 } 1861 } 1862 #endif // OMPT_SUPPORT && OMPT_OPTIONAL 1863 1864 // Debugger: The taskwait is active. Store location and thread encountered the 1865 // taskwait. 1866 #if USE_ITT_BUILD 1867 // Note: These values are used by ITT events as well. 1868 #endif /* USE_ITT_BUILD */ 1869 taskdata->td_taskwait_counter += 1; 1870 taskdata->td_taskwait_ident = loc_ref; 1871 taskdata->td_taskwait_thread = gtid + 1; 1872 1873 #if USE_ITT_BUILD 1874 void *itt_sync_obj = __kmp_itt_taskwait_object(gtid); 1875 if (itt_sync_obj != NULL) 1876 __kmp_itt_taskwait_starting(gtid, itt_sync_obj); 1877 #endif /* USE_ITT_BUILD */ 1878 1879 bool must_wait = 1880 !taskdata->td_flags.team_serial && !taskdata->td_flags.final; 1881 1882 #if OMP_45_ENABLED 1883 must_wait = must_wait || (thread->th.th_task_team != NULL && 1884 thread->th.th_task_team->tt.tt_found_proxy_tasks); 1885 #endif 1886 if (must_wait) { 1887 kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *, 1888 &(taskdata->td_incomplete_child_tasks)), 1889 0U); 1890 while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) { 1891 flag.execute_tasks(thread, gtid, FALSE, 1892 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 1893 __kmp_task_stealing_constraint); 1894 } 1895 } 1896 #if USE_ITT_BUILD 1897 if (itt_sync_obj != NULL) 1898 __kmp_itt_taskwait_finished(gtid, itt_sync_obj); 1899 #endif /* USE_ITT_BUILD */ 1900 1901 // Debugger: The taskwait is completed. Location remains, but thread is 1902 // negated. 
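    // (The sign encodes state for a debugger: td_taskwait_thread holds gtid+1
    //  while the taskwait is active and -(gtid+1) once it has completed.)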
1903 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; 1904 1905 #if OMPT_SUPPORT && OMPT_OPTIONAL 1906 if (ompt) { 1907 if (ompt_enabled.ompt_callback_sync_region_wait) { 1908 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 1909 ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data, 1910 my_task_data, return_address); 1911 } 1912 if (ompt_enabled.ompt_callback_sync_region) { 1913 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 1914 ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data, 1915 my_task_data, return_address); 1916 } 1917 taskdata->ompt_task_info.frame.enter_frame = ompt_data_none; 1918 } 1919 #endif // OMPT_SUPPORT && OMPT_OPTIONAL 1920 1921 ANNOTATE_HAPPENS_AFTER(taskdata); 1922 } 1923 1924 KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, " 1925 "returning TASK_CURRENT_NOT_QUEUED\n", 1926 gtid, taskdata)); 1927 1928 return TASK_CURRENT_NOT_QUEUED; 1929 } 1930 1931 #if OMPT_SUPPORT && OMPT_OPTIONAL 1932 OMPT_NOINLINE 1933 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid, 1934 void *frame_address, 1935 void *return_address) { 1936 return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address, 1937 return_address); 1938 } 1939 #endif // OMPT_SUPPORT && OMPT_OPTIONAL 1940 1941 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are 1942 // complete 1943 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) { 1944 #if OMPT_SUPPORT && OMPT_OPTIONAL 1945 if (UNLIKELY(ompt_enabled.enabled)) { 1946 OMPT_STORE_RETURN_ADDRESS(gtid); 1947 return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0), 1948 OMPT_LOAD_RETURN_ADDRESS(gtid)); 1949 } 1950 #endif 1951 return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL); 1952 } 1953 1954 // __kmpc_omp_taskyield: switch to a different task 1955 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) { 1956 kmp_taskdata_t *taskdata; 1957 kmp_info_t *thread; 1958 int thread_finished = FALSE; 1959 1960 KMP_COUNT_BLOCK(OMP_TASKYIELD); 1961 KMP_SET_THREAD_STATE_BLOCK(TASKYIELD); 1962 1963 KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n", 1964 gtid, loc_ref, end_part)); 1965 1966 if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) { 1967 thread = __kmp_threads[gtid]; 1968 taskdata = thread->th.th_current_task; 1969 // Should we model this as a task wait or not? 1970 // Debugger: The taskwait is active. Store location and thread encountered the 1971 // taskwait. 1972 #if USE_ITT_BUILD 1973 // Note: These values are used by ITT events as well. 
1974 #endif /* USE_ITT_BUILD */ 1975 taskdata->td_taskwait_counter += 1; 1976 taskdata->td_taskwait_ident = loc_ref; 1977 taskdata->td_taskwait_thread = gtid + 1; 1978 1979 #if USE_ITT_BUILD 1980 void *itt_sync_obj = __kmp_itt_taskwait_object(gtid); 1981 if (itt_sync_obj != NULL) 1982 __kmp_itt_taskwait_starting(gtid, itt_sync_obj); 1983 #endif /* USE_ITT_BUILD */ 1984 if (!taskdata->td_flags.team_serial) { 1985 kmp_task_team_t *task_team = thread->th.th_task_team; 1986 if (task_team != NULL) { 1987 if (KMP_TASKING_ENABLED(task_team)) { 1988 #if OMPT_SUPPORT 1989 if (UNLIKELY(ompt_enabled.enabled)) 1990 thread->th.ompt_thread_info.ompt_task_yielded = 1; 1991 #endif 1992 __kmp_execute_tasks_32( 1993 thread, gtid, NULL, FALSE, 1994 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 1995 __kmp_task_stealing_constraint); 1996 #if OMPT_SUPPORT 1997 if (UNLIKELY(ompt_enabled.enabled)) 1998 thread->th.ompt_thread_info.ompt_task_yielded = 0; 1999 #endif 2000 } 2001 } 2002 } 2003 #if USE_ITT_BUILD 2004 if (itt_sync_obj != NULL) 2005 __kmp_itt_taskwait_finished(gtid, itt_sync_obj); 2006 #endif /* USE_ITT_BUILD */ 2007 2008 // Debugger: The taskwait is completed. Location remains, but thread is 2009 // negated. 2010 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; 2011 } 2012 2013 KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, " 2014 "returning TASK_CURRENT_NOT_QUEUED\n", 2015 gtid, taskdata)); 2016 2017 return TASK_CURRENT_NOT_QUEUED; 2018 } 2019 2020 #if OMP_50_ENABLED 2021 // Task Reduction implementation 2022 // 2023 // Note: initial implementation didn't take into account the possibility 2024 // to specify omp_orig for initializer of the UDR (user defined reduction). 2025 // Corrected implementation takes into account the omp_orig object. 2026 // Compiler is free to use old implementation if omp_orig is not specified. 2027 2028 /*! 2029 @ingroup BASIC_TYPES 2030 @{ 2031 */ 2032 2033 /*! 2034 Flags for special info per task reduction item. 2035 */ 2036 typedef struct kmp_taskred_flags { 2037 /*! 1 - use lazy alloc/init (e.g. big objects, #tasks < #threads) */ 2038 unsigned lazy_priv : 1; 2039 unsigned reserved31 : 31; 2040 } kmp_taskred_flags_t; 2041 2042 /*! 2043 Internal struct for reduction data item related info set up by compiler. 2044 */ 2045 typedef struct kmp_task_red_input { 2046 void *reduce_shar; /**< shared between tasks item to reduce into */ 2047 size_t reduce_size; /**< size of data item in bytes */ 2048 // three compiler-generated routines (init, fini are optional): 2049 void *reduce_init; /**< data initialization routine (single parameter) */ 2050 void *reduce_fini; /**< data finalization routine */ 2051 void *reduce_comb; /**< data combiner routine */ 2052 kmp_taskred_flags_t flags; /**< flags for additional info from compiler */ 2053 } kmp_task_red_input_t; 2054 2055 /*! 2056 Internal struct for reduction data item related info saved by the library. 
2057 */ 2058 typedef struct kmp_taskred_data { 2059 void *reduce_shar; /**< shared between tasks item to reduce into */ 2060 size_t reduce_size; /**< size of data item */ 2061 kmp_taskred_flags_t flags; /**< flags for additional info from compiler */ 2062 void *reduce_priv; /**< array of thread specific items */ 2063 void *reduce_pend; /**< end of private data for faster comparison op */ 2064 // three compiler-generated routines (init, fini are optional): 2065 void *reduce_comb; /**< data combiner routine */ 2066 void *reduce_init; /**< data initialization routine (two parameters) */ 2067 void *reduce_fini; /**< data finalization routine */ 2068 void *reduce_orig; /**< original item (can be used in UDR initializer) */ 2069 } kmp_taskred_data_t; 2070 2071 /*! 2072 Internal struct for reduction data item related info set up by compiler. 2073 2074 New interface: added reduce_orig field to provide omp_orig for UDR initializer. 2075 */ 2076 typedef struct kmp_taskred_input { 2077 void *reduce_shar; /**< shared between tasks item to reduce into */ 2078 void *reduce_orig; /**< original reduction item used for initialization */ 2079 size_t reduce_size; /**< size of data item */ 2080 // three compiler-generated routines (init, fini are optional): 2081 void *reduce_init; /**< data initialization routine (two parameters) */ 2082 void *reduce_fini; /**< data finalization routine */ 2083 void *reduce_comb; /**< data combiner routine */ 2084 kmp_taskred_flags_t flags; /**< flags for additional info from compiler */ 2085 } kmp_taskred_input_t; 2086 /*! 2087 @} 2088 */ 2089 2090 template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src); 2091 template <> 2092 void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item, 2093 kmp_task_red_input_t &src) { 2094 item.reduce_orig = NULL; 2095 } 2096 template <> 2097 void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item, 2098 kmp_taskred_input_t &src) { 2099 if (src.reduce_orig != NULL) { 2100 item.reduce_orig = src.reduce_orig; 2101 } else { 2102 item.reduce_orig = src.reduce_shar; 2103 } // non-NULL reduce_orig means new interface used 2104 } 2105 2106 template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, int j); 2107 template <> 2108 void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item, 2109 int offset) { 2110 ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset); 2111 } 2112 template <> 2113 void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item, 2114 int offset) { 2115 ((void (*)(void *, void *))item.reduce_init)( 2116 (char *)(item.reduce_priv) + offset, item.reduce_orig); 2117 } 2118 2119 template <typename T> 2120 void *__kmp_task_reduction_init(int gtid, int num, T *data) { 2121 kmp_info_t *thread = __kmp_threads[gtid]; 2122 kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup; 2123 kmp_int32 nth = thread->th.th_team_nproc; 2124 kmp_taskred_data_t *arr; 2125 2126 // check input data just in case 2127 KMP_ASSERT(tg != NULL); 2128 KMP_ASSERT(data != NULL); 2129 KMP_ASSERT(num > 0); 2130 if (nth == 1) { 2131 KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n", 2132 gtid, tg)); 2133 return (void *)tg; 2134 } 2135 KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n", 2136 gtid, tg, num)); 2137 arr = (kmp_taskred_data_t *)__kmp_thread_malloc( 2138 thread, num * sizeof(kmp_taskred_data_t)); 2139 for (int i = 0; i < num; ++i) { 2140 size_t size = data[i].reduce_size - 1; 2141 // round the size up to 
cache line per thread-specific item 2142 size += CACHE_LINE - size % CACHE_LINE; 2143 KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory 2144 arr[i].reduce_shar = data[i].reduce_shar; 2145 arr[i].reduce_size = size; 2146 arr[i].flags = data[i].flags; 2147 arr[i].reduce_comb = data[i].reduce_comb; 2148 arr[i].reduce_init = data[i].reduce_init; 2149 arr[i].reduce_fini = data[i].reduce_fini; 2150 __kmp_assign_orig<T>(arr[i], data[i]); 2151 if (!arr[i].flags.lazy_priv) { 2152 // allocate cache-line aligned block and fill it with zeros 2153 arr[i].reduce_priv = __kmp_allocate(nth * size); 2154 arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size; 2155 if (arr[i].reduce_init != NULL) { 2156 // initialize all thread-specific items 2157 for (int j = 0; j < nth; ++j) { 2158 __kmp_call_init<T>(arr[i], j * size); 2159 } 2160 } 2161 } else { 2162 // only allocate space for pointers now, 2163 // objects will be lazily allocated/initialized if/when requested 2164 // note that __kmp_allocate zeroes the allocated memory 2165 arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *)); 2166 } 2167 } 2168 tg->reduce_data = (void *)arr; 2169 tg->reduce_num_data = num; 2170 return (void *)tg; 2171 } 2172 2173 /*! 2174 @ingroup TASKING 2175 @param gtid Global thread ID 2176 @param num Number of data items to reduce 2177 @param data Array of data for reduction 2178 @return The taskgroup identifier 2179 2180 Initialize task reduction for the taskgroup. 2181 2182 Note: this entry supposes the optional compiler-generated initializer routine 2183 has single parameter - pointer to object to be initialized. That means 2184 the reduction either does not use omp_orig object, or the omp_orig is accessible 2185 without help of the runtime library. 2186 */ 2187 void *__kmpc_task_reduction_init(int gtid, int num, void *data) { 2188 return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data); 2189 } 2190 2191 /*! 2192 @ingroup TASKING 2193 @param gtid Global thread ID 2194 @param num Number of data items to reduce 2195 @param data Array of data for reduction 2196 @return The taskgroup identifier 2197 2198 Initialize task reduction for the taskgroup. 2199 2200 Note: this entry supposes the optional compiler-generated initializer routine 2201 has two parameters, pointer to object to be initialized and pointer to omp_orig 2202 */ 2203 void *__kmpc_taskred_init(int gtid, int num, void *data) { 2204 return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data); 2205 } 2206 2207 // Copy task reduction data (except for shared pointers). 2208 template <typename T> 2209 void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data, 2210 kmp_taskgroup_t *tg, void *reduce_data) { 2211 kmp_taskred_data_t *arr; 2212 KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p," 2213 " from data %p\n", 2214 thr, tg, reduce_data)); 2215 arr = (kmp_taskred_data_t *)__kmp_thread_malloc( 2216 thr, num * sizeof(kmp_taskred_data_t)); 2217 // threads will share private copies, thunk routines, sizes, flags, etc.: 2218 KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t)); 2219 for (int i = 0; i < num; ++i) { 2220 arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers 2221 } 2222 tg->reduce_data = (void *)arr; 2223 tg->reduce_num_data = num; 2224 } 2225 2226 /*! 
2227 @ingroup TASKING 2228 @param gtid Global thread ID 2229 @param tskgrp The taskgroup ID (optional) 2230 @param data Shared location of the item 2231 @return The pointer to per-thread data 2232 2233 Get thread-specific location of data item 2234 */ 2235 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) { 2236 kmp_info_t *thread = __kmp_threads[gtid]; 2237 kmp_int32 nth = thread->th.th_team_nproc; 2238 if (nth == 1) 2239 return data; // nothing to do 2240 2241 kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp; 2242 if (tg == NULL) 2243 tg = thread->th.th_current_task->td_taskgroup; 2244 KMP_ASSERT(tg != NULL); 2245 kmp_taskred_data_t *arr = (kmp_taskred_data_t *)(tg->reduce_data); 2246 kmp_int32 num = tg->reduce_num_data; 2247 kmp_int32 tid = thread->th.th_info.ds.ds_tid; 2248 2249 KMP_ASSERT(data != NULL); 2250 while (tg != NULL) { 2251 for (int i = 0; i < num; ++i) { 2252 if (!arr[i].flags.lazy_priv) { 2253 if (data == arr[i].reduce_shar || 2254 (data >= arr[i].reduce_priv && data < arr[i].reduce_pend)) 2255 return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size; 2256 } else { 2257 // check shared location first 2258 void **p_priv = (void **)(arr[i].reduce_priv); 2259 if (data == arr[i].reduce_shar) 2260 goto found; 2261 // check if we get some thread specific location as parameter 2262 for (int j = 0; j < nth; ++j) 2263 if (data == p_priv[j]) 2264 goto found; 2265 continue; // not found, continue search 2266 found: 2267 if (p_priv[tid] == NULL) { 2268 // allocate thread specific object lazily 2269 p_priv[tid] = __kmp_allocate(arr[i].reduce_size); 2270 if (arr[i].reduce_init != NULL) { 2271 if (arr[i].reduce_orig != NULL) { // new interface 2272 ((void (*)(void *, void *))arr[i].reduce_init)( 2273 p_priv[tid], arr[i].reduce_orig); 2274 } else { // old interface (single parameter) 2275 ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]); 2276 } 2277 } 2278 } 2279 return p_priv[tid]; 2280 } 2281 } 2282 tg = tg->parent; 2283 arr = (kmp_taskred_data_t *)(tg->reduce_data); 2284 num = tg->reduce_num_data; 2285 } 2286 KMP_ASSERT2(0, "Unknown task reduction item"); 2287 return NULL; // ERROR, this line never executed 2288 } 2289 2290 // Finalize task reduction. 
2291 // Called from __kmpc_end_taskgroup() 2292 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) { 2293 kmp_int32 nth = th->th.th_team_nproc; 2294 KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1 2295 kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data; 2296 kmp_int32 num = tg->reduce_num_data; 2297 for (int i = 0; i < num; ++i) { 2298 void *sh_data = arr[i].reduce_shar; 2299 void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini); 2300 void (*f_comb)(void *, void *) = 2301 (void (*)(void *, void *))(arr[i].reduce_comb); 2302 if (!arr[i].flags.lazy_priv) { 2303 void *pr_data = arr[i].reduce_priv; 2304 size_t size = arr[i].reduce_size; 2305 for (int j = 0; j < nth; ++j) { 2306 void *priv_data = (char *)pr_data + j * size; 2307 f_comb(sh_data, priv_data); // combine results 2308 if (f_fini) 2309 f_fini(priv_data); // finalize if needed 2310 } 2311 } else { 2312 void **pr_data = (void **)(arr[i].reduce_priv); 2313 for (int j = 0; j < nth; ++j) { 2314 if (pr_data[j] != NULL) { 2315 f_comb(sh_data, pr_data[j]); // combine results 2316 if (f_fini) 2317 f_fini(pr_data[j]); // finalize if needed 2318 __kmp_free(pr_data[j]); 2319 } 2320 } 2321 } 2322 __kmp_free(arr[i].reduce_priv); 2323 } 2324 __kmp_thread_free(th, arr); 2325 tg->reduce_data = NULL; 2326 tg->reduce_num_data = 0; 2327 } 2328 2329 // Cleanup task reduction data for parallel or worksharing, 2330 // do not touch task private data other threads still working with. 2331 // Called from __kmpc_end_taskgroup() 2332 static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) { 2333 __kmp_thread_free(th, tg->reduce_data); 2334 tg->reduce_data = NULL; 2335 tg->reduce_num_data = 0; 2336 } 2337 2338 template <typename T> 2339 void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws, 2340 int num, T *data) { 2341 kmp_info_t *thr = __kmp_threads[gtid]; 2342 kmp_int32 nth = thr->th.th_team_nproc; 2343 __kmpc_taskgroup(loc, gtid); // form new taskgroup first 2344 if (nth == 1) { 2345 KA_TRACE(10, 2346 ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n", 2347 gtid, thr->th.th_current_task->td_taskgroup)); 2348 return (void *)thr->th.th_current_task->td_taskgroup; 2349 } 2350 kmp_team_t *team = thr->th.th_team; 2351 void *reduce_data; 2352 kmp_taskgroup_t *tg; 2353 reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]); 2354 if (reduce_data == NULL && 2355 __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data, 2356 (void *)1)) { 2357 // single thread enters this block to initialize common reduction data 2358 KMP_DEBUG_ASSERT(reduce_data == NULL); 2359 // first initialize own data, then make a copy other threads can use 2360 tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data); 2361 reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t)); 2362 KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t)); 2363 // fini counters should be 0 at this point 2364 KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0); 2365 KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0); 2366 KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data); 2367 } else { 2368 while ( 2369 (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) == 2370 (void *)1) { // wait for task reduction initialization 2371 KMP_CPU_PAUSE(); 2372 } 2373 KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here 2374 tg = 
thr->th.th_current_task->td_taskgroup; 2375 __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data); 2376 } 2377 return tg; 2378 } 2379 2380 /*! 2381 @ingroup TASKING 2382 @param loc Source location info 2383 @param gtid Global thread ID 2384 @param is_ws Is 1 if the reduction is for worksharing, 0 otherwise 2385 @param num Number of data items to reduce 2386 @param data Array of data for reduction 2387 @return The taskgroup identifier 2388 2389 Initialize task reduction for a parallel or worksharing. 2390 2391 Note: this entry supposes the optional compiler-generated initializer routine 2392 has single parameter - pointer to object to be initialized. That means 2393 the reduction either does not use omp_orig object, or the omp_orig is accessible 2394 without help of the runtime library. 2395 */ 2396 void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws, 2397 int num, void *data) { 2398 return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num, 2399 (kmp_task_red_input_t *)data); 2400 } 2401 2402 /*! 2403 @ingroup TASKING 2404 @param loc Source location info 2405 @param gtid Global thread ID 2406 @param is_ws Is 1 if the reduction is for worksharing, 0 otherwise 2407 @param num Number of data items to reduce 2408 @param data Array of data for reduction 2409 @return The taskgroup identifier 2410 2411 Initialize task reduction for a parallel or worksharing. 2412 2413 Note: this entry supposes the optional compiler-generated initializer routine 2414 has two parameters, pointer to object to be initialized and pointer to omp_orig 2415 */ 2416 void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num, 2417 void *data) { 2418 return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num, 2419 (kmp_taskred_input_t *)data); 2420 } 2421 2422 /*! 2423 @ingroup TASKING 2424 @param loc Source location info 2425 @param gtid Global thread ID 2426 @param is_ws Is 1 if the reduction is for worksharing, 0 otherwise 2427 2428 Finalize task reduction for a parallel or worksharing. 2429 */ 2430 void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) { 2431 __kmpc_end_taskgroup(loc, gtid); 2432 } 2433 #endif 2434 2435 #if OMP_40_ENABLED 2436 // __kmpc_taskgroup: Start a new taskgroup 2437 void __kmpc_taskgroup(ident_t *loc, int gtid) { 2438 kmp_info_t *thread = __kmp_threads[gtid]; 2439 kmp_taskdata_t *taskdata = thread->th.th_current_task; 2440 kmp_taskgroup_t *tg_new = 2441 (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t)); 2442 KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new)); 2443 KMP_ATOMIC_ST_RLX(&tg_new->count, 0); 2444 KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq); 2445 tg_new->parent = taskdata->td_taskgroup; 2446 #if OMP_50_ENABLED 2447 tg_new->reduce_data = NULL; 2448 tg_new->reduce_num_data = 0; 2449 #endif 2450 taskdata->td_taskgroup = tg_new; 2451 2452 #if OMPT_SUPPORT && OMPT_OPTIONAL 2453 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) { 2454 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2455 if (!codeptr) 2456 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2457 kmp_team_t *team = thread->th.th_team; 2458 ompt_data_t my_task_data = taskdata->ompt_task_info.task_data; 2459 // FIXME: I think this is wrong for lwt! 
2460 ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data; 2461 2462 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 2463 ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data), 2464 &(my_task_data), codeptr); 2465 } 2466 #endif 2467 } 2468 2469 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task 2470 // and its descendants are complete 2471 void __kmpc_end_taskgroup(ident_t *loc, int gtid) { 2472 kmp_info_t *thread = __kmp_threads[gtid]; 2473 kmp_taskdata_t *taskdata = thread->th.th_current_task; 2474 kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup; 2475 int thread_finished = FALSE; 2476 2477 #if OMPT_SUPPORT && OMPT_OPTIONAL 2478 kmp_team_t *team; 2479 ompt_data_t my_task_data; 2480 ompt_data_t my_parallel_data; 2481 void *codeptr; 2482 if (UNLIKELY(ompt_enabled.enabled)) { 2483 team = thread->th.th_team; 2484 my_task_data = taskdata->ompt_task_info.task_data; 2485 // FIXME: I think this is wrong for lwt! 2486 my_parallel_data = team->t.ompt_team_info.parallel_data; 2487 codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2488 if (!codeptr) 2489 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2490 } 2491 #endif 2492 2493 KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc)); 2494 KMP_DEBUG_ASSERT(taskgroup != NULL); 2495 KMP_SET_THREAD_STATE_BLOCK(TASKGROUP); 2496 2497 if (__kmp_tasking_mode != tskm_immediate_exec) { 2498 // mark task as waiting not on a barrier 2499 taskdata->td_taskwait_counter += 1; 2500 taskdata->td_taskwait_ident = loc; 2501 taskdata->td_taskwait_thread = gtid + 1; 2502 #if USE_ITT_BUILD 2503 // For ITT the taskgroup wait is similar to taskwait until we need to 2504 // distinguish them 2505 void *itt_sync_obj = __kmp_itt_taskwait_object(gtid); 2506 if (itt_sync_obj != NULL) 2507 __kmp_itt_taskwait_starting(gtid, itt_sync_obj); 2508 #endif /* USE_ITT_BUILD */ 2509 2510 #if OMPT_SUPPORT && OMPT_OPTIONAL 2511 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) { 2512 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 2513 ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data), 2514 &(my_task_data), codeptr); 2515 } 2516 #endif 2517 2518 #if OMP_45_ENABLED 2519 if (!taskdata->td_flags.team_serial || 2520 (thread->th.th_task_team != NULL && 2521 thread->th.th_task_team->tt.tt_found_proxy_tasks)) 2522 #else 2523 if (!taskdata->td_flags.team_serial) 2524 #endif 2525 { 2526 kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 2527 0U); 2528 while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) { 2529 flag.execute_tasks(thread, gtid, FALSE, 2530 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 2531 __kmp_task_stealing_constraint); 2532 } 2533 } 2534 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting 2535 2536 #if OMPT_SUPPORT && OMPT_OPTIONAL 2537 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) { 2538 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 2539 ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data), 2540 &(my_task_data), codeptr); 2541 } 2542 #endif 2543 2544 #if USE_ITT_BUILD 2545 if (itt_sync_obj != NULL) 2546 __kmp_itt_taskwait_finished(gtid, itt_sync_obj); 2547 #endif /* USE_ITT_BUILD */ 2548 } 2549 KMP_DEBUG_ASSERT(taskgroup->count == 0); 2550 2551 #if OMP_50_ENABLED 2552 if (taskgroup->reduce_data != NULL) { // need to reduce? 
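    // Three cases are distinguished below: reduce_data may be the team-shared
    // copy installed by __kmpc_task_reduction_modifier_init() for a parallel
    // region (t_tg_reduce_data[0]), the copy installed for a worksharing
    // construct (t_tg_reduce_data[1]), or a plain per-taskgroup array. For the
    // team-shared cases only the last thread through this path finalizes the
    // reduction and frees the shared copy; earlier threads only clean up their
    // own copy via __kmp_task_reduction_clean().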
2553 int cnt; 2554 void *reduce_data; 2555 kmp_team_t *t = thread->th.th_team; 2556 kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data; 2557 // check if <priv> data of the first reduction variable shared for the team 2558 void *priv0 = arr[0].reduce_priv; 2559 if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL && 2560 ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) { 2561 // finishing task reduction on parallel 2562 cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]); 2563 if (cnt == thread->th.th_team_nproc - 1) { 2564 // we are the last thread passing __kmpc_reduction_modifier_fini() 2565 // finalize task reduction: 2566 __kmp_task_reduction_fini(thread, taskgroup); 2567 // cleanup fields in the team structure: 2568 // TODO: is relaxed store enough here (whole barrier should follow)? 2569 __kmp_thread_free(thread, reduce_data); 2570 KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL); 2571 KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0); 2572 } else { 2573 // we are not the last thread passing __kmpc_reduction_modifier_fini(), 2574 // so do not finalize reduction, just clean own copy of the data 2575 __kmp_task_reduction_clean(thread, taskgroup); 2576 } 2577 } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) != 2578 NULL && 2579 ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) { 2580 // finishing task reduction on worksharing 2581 cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]); 2582 if (cnt == thread->th.th_team_nproc - 1) { 2583 // we are the last thread passing __kmpc_reduction_modifier_fini() 2584 __kmp_task_reduction_fini(thread, taskgroup); 2585 // cleanup fields in team structure: 2586 // TODO: is relaxed store enough here (whole barrier should follow)? 2587 __kmp_thread_free(thread, reduce_data); 2588 KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL); 2589 KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0); 2590 } else { 2591 // we are not the last thread passing __kmpc_reduction_modifier_fini(), 2592 // so do not finalize reduction, just clean own copy of the data 2593 __kmp_task_reduction_clean(thread, taskgroup); 2594 } 2595 } else { 2596 // finishing task reduction on taskgroup 2597 __kmp_task_reduction_fini(thread, taskgroup); 2598 } 2599 } 2600 #endif 2601 // Restore parent taskgroup for the current task 2602 taskdata->td_taskgroup = taskgroup->parent; 2603 __kmp_thread_free(thread, taskgroup); 2604 2605 KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n", 2606 gtid, taskdata)); 2607 ANNOTATE_HAPPENS_AFTER(taskdata); 2608 2609 #if OMPT_SUPPORT && OMPT_OPTIONAL 2610 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) { 2611 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 2612 ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data), 2613 &(my_task_data), codeptr); 2614 } 2615 #endif 2616 } 2617 #endif 2618 2619 // __kmp_remove_my_task: remove a task from my own deque 2620 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid, 2621 kmp_task_team_t *task_team, 2622 kmp_int32 is_constrained) { 2623 kmp_task_t *task; 2624 kmp_taskdata_t *taskdata; 2625 kmp_thread_data_t *thread_data; 2626 kmp_uint32 tail; 2627 2628 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 2629 KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data != 2630 NULL); // Caller should check this condition 2631 2632 thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)]; 2633 2634 KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d 
head=%u tail=%u\n", 2635 gtid, thread_data->td.td_deque_ntasks, 2636 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2637 2638 if (TCR_4(thread_data->td.td_deque_ntasks) == 0) { 2639 KA_TRACE(10, 2640 ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: " 2641 "ntasks=%d head=%u tail=%u\n", 2642 gtid, thread_data->td.td_deque_ntasks, 2643 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2644 return NULL; 2645 } 2646 2647 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 2648 2649 if (TCR_4(thread_data->td.td_deque_ntasks) == 0) { 2650 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2651 KA_TRACE(10, 2652 ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: " 2653 "ntasks=%d head=%u tail=%u\n", 2654 gtid, thread_data->td.td_deque_ntasks, 2655 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2656 return NULL; 2657 } 2658 2659 tail = (thread_data->td.td_deque_tail - 1) & 2660 TASK_DEQUE_MASK(thread_data->td); // Wrap index. 2661 taskdata = thread_data->td.td_deque[tail]; 2662 2663 if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata, 2664 thread->th.th_current_task)) { 2665 // The TSC does not allow to steal victim task 2666 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2667 KA_TRACE(10, 2668 ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: " 2669 "ntasks=%d head=%u tail=%u\n", 2670 gtid, thread_data->td.td_deque_ntasks, 2671 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2672 return NULL; 2673 } 2674 2675 thread_data->td.td_deque_tail = tail; 2676 TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1); 2677 2678 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2679 2680 KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: " 2681 "ntasks=%d head=%u tail=%u\n", 2682 gtid, taskdata, thread_data->td.td_deque_ntasks, 2683 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2684 2685 task = KMP_TASKDATA_TO_TASK(taskdata); 2686 return task; 2687 } 2688 2689 // __kmp_steal_task: remove a task from another thread's deque 2690 // Assume that calling thread has already checked existence of 2691 // task_team thread_data before calling this routine. 
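// The per-thread deque behind these routines is a power-of-two ring buffer
// guarded by td_deque_lock: the owning thread pushes and pops at the tail,
// while thieves remove from the head. A minimal sketch of that discipline
// (illustrative only -- the struct and field names below are invented for the
// example and are not the runtime's types):
//
//   struct simple_deque_t {
//     kmp_taskdata_t **slots;              // capacity is a power of two
//     kmp_uint32 head, tail, ntasks, mask; // mask == capacity - 1
//   };
//   // owner, LIFO pop at the tail (cf. __kmp_remove_my_task above):
//   //   tail = (tail - 1) & mask;  task = slots[tail];  --ntasks;
//   // thief, FIFO steal at the head (cf. __kmp_steal_task below):
//   //   task = slots[head];  head = (head + 1) & mask;  --ntasks;
//
// Taking from opposite ends keeps recently pushed (cache-warm) tasks with the
// owner and hands thieves the oldest entries.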
2692 static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid, 2693 kmp_task_team_t *task_team, 2694 std::atomic<kmp_int32> *unfinished_threads, 2695 int *thread_finished, 2696 kmp_int32 is_constrained) { 2697 kmp_task_t *task; 2698 kmp_taskdata_t *taskdata; 2699 kmp_taskdata_t *current; 2700 kmp_thread_data_t *victim_td, *threads_data; 2701 kmp_int32 target; 2702 kmp_int32 victim_tid; 2703 2704 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 2705 2706 threads_data = task_team->tt.tt_threads_data; 2707 KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition 2708 2709 victim_tid = victim_thr->th.th_info.ds.ds_tid; 2710 victim_td = &threads_data[victim_tid]; 2711 2712 KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: " 2713 "task_team=%p ntasks=%d head=%u tail=%u\n", 2714 gtid, __kmp_gtid_from_thread(victim_thr), task_team, 2715 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head, 2716 victim_td->td.td_deque_tail)); 2717 2718 if (TCR_4(victim_td->td.td_deque_ntasks) == 0) { 2719 KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: " 2720 "task_team=%p ntasks=%d head=%u tail=%u\n", 2721 gtid, __kmp_gtid_from_thread(victim_thr), task_team, 2722 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head, 2723 victim_td->td.td_deque_tail)); 2724 return NULL; 2725 } 2726 2727 __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock); 2728 2729 int ntasks = TCR_4(victim_td->td.td_deque_ntasks); 2730 // Check again after we acquire the lock 2731 if (ntasks == 0) { 2732 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); 2733 KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: " 2734 "task_team=%p ntasks=%d head=%u tail=%u\n", 2735 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks, 2736 victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); 2737 return NULL; 2738 } 2739 2740 KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL); 2741 current = __kmp_threads[gtid]->th.th_current_task; 2742 taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head]; 2743 if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) { 2744 // Bump head pointer and Wrap. 
2745 victim_td->td.td_deque_head = 2746 (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td); 2747 } else { 2748 if (!task_team->tt.tt_untied_task_encountered) { 2749 // The TSC does not allow to steal victim task 2750 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); 2751 KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from " 2752 "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n", 2753 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks, 2754 victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); 2755 return NULL; 2756 } 2757 int i; 2758 // walk through victim's deque trying to steal any task 2759 target = victim_td->td.td_deque_head; 2760 taskdata = NULL; 2761 for (i = 1; i < ntasks; ++i) { 2762 target = (target + 1) & TASK_DEQUE_MASK(victim_td->td); 2763 taskdata = victim_td->td.td_deque[target]; 2764 if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) { 2765 break; // found victim task 2766 } else { 2767 taskdata = NULL; 2768 } 2769 } 2770 if (taskdata == NULL) { 2771 // No appropriate candidate to steal found 2772 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); 2773 KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from " 2774 "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n", 2775 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks, 2776 victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); 2777 return NULL; 2778 } 2779 int prev = target; 2780 for (i = i + 1; i < ntasks; ++i) { 2781 // shift remaining tasks in the deque left by 1 2782 target = (target + 1) & TASK_DEQUE_MASK(victim_td->td); 2783 victim_td->td.td_deque[prev] = victim_td->td.td_deque[target]; 2784 prev = target; 2785 } 2786 KMP_DEBUG_ASSERT( 2787 victim_td->td.td_deque_tail == 2788 (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td))); 2789 victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)) 2790 } 2791 if (*thread_finished) { 2792 // We need to un-mark this victim as a finished victim. This must be done 2793 // before releasing the lock, or else other threads (starting with the 2794 // master victim) might be prematurely released from the barrier!!! 2795 kmp_int32 count; 2796 2797 count = KMP_ATOMIC_INC(unfinished_threads); 2798 2799 KA_TRACE( 2800 20, 2801 ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n", 2802 gtid, count + 1, task_team)); 2803 2804 *thread_finished = FALSE; 2805 } 2806 TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1); 2807 2808 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); 2809 2810 KMP_COUNT_BLOCK(TASK_stolen); 2811 KA_TRACE(10, 2812 ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: " 2813 "task_team=%p ntasks=%d head=%u tail=%u\n", 2814 gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team, 2815 ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); 2816 2817 task = KMP_TASKDATA_TO_TASK(taskdata); 2818 return task; 2819 } 2820 2821 // __kmp_execute_tasks_template: Choose and execute tasks until either the 2822 // condition is statisfied (return true) or there are none left (return false). 2823 // 2824 // final_spin is TRUE if this is the spin at the release barrier. 2825 // thread_finished indicates whether the thread is finished executing all 2826 // the tasks it has on its deque, and is at the release barrier. 2827 // spinner is the location on which to spin. 2828 // spinner == NULL means only execute a single task and return. 2829 // checker is the value to check to terminate the spin. 
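// A typical caller wraps the value it is waiting on in a flag and then drains
// tasks until the flag's condition holds, as the taskwait path earlier in this
// file does (sketch of that existing pattern, not a separate entry point):
//
//   kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
//                          &(taskdata->td_incomplete_child_tasks)), 0U);
//   while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
//     flag.execute_tasks(thread, gtid, FALSE,
//                        &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
//                        __kmp_task_stealing_constraint);
//   }
//
// Passing a NULL flag instead executes at most one task and returns.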
2830 template <class C> 2831 static inline int __kmp_execute_tasks_template( 2832 kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin, 2833 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), 2834 kmp_int32 is_constrained) { 2835 kmp_task_team_t *task_team = thread->th.th_task_team; 2836 kmp_thread_data_t *threads_data; 2837 kmp_task_t *task; 2838 kmp_info_t *other_thread; 2839 kmp_taskdata_t *current_task = thread->th.th_current_task; 2840 std::atomic<kmp_int32> *unfinished_threads; 2841 kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0, 2842 tid = thread->th.th_info.ds.ds_tid; 2843 2844 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 2845 KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]); 2846 2847 if (task_team == NULL || current_task == NULL) 2848 return FALSE; 2849 2850 KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d " 2851 "*thread_finished=%d\n", 2852 gtid, final_spin, *thread_finished)); 2853 2854 thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 2855 threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data); 2856 KMP_DEBUG_ASSERT(threads_data != NULL); 2857 2858 nthreads = task_team->tt.tt_nproc; 2859 unfinished_threads = &(task_team->tt.tt_unfinished_threads); 2860 #if OMP_45_ENABLED 2861 KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks); 2862 #else 2863 KMP_DEBUG_ASSERT(nthreads > 1); 2864 #endif 2865 KMP_DEBUG_ASSERT(*unfinished_threads >= 0); 2866 2867 while (1) { // Outer loop keeps trying to find tasks in case of single thread 2868 // getting tasks from target constructs 2869 while (1) { // Inner loop to find a task and execute it 2870 task = NULL; 2871 if (use_own_tasks) { // check on own queue first 2872 task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained); 2873 } 2874 if ((task == NULL) && (nthreads > 1)) { // Steal a task 2875 int asleep = 1; 2876 use_own_tasks = 0; 2877 // Try to steal from the last place I stole from successfully. 2878 if (victim_tid == -2) { // haven't stolen anything yet 2879 victim_tid = threads_data[tid].td.td_deque_last_stolen; 2880 if (victim_tid != 2881 -1) // if we have a last stolen from victim, get the thread 2882 other_thread = threads_data[victim_tid].td.td_thr; 2883 } 2884 if (victim_tid != -1) { // found last victim 2885 asleep = 0; 2886 } else if (!new_victim) { // no recent steals and we haven't already 2887 // used a new victim; select a random thread 2888 do { // Find a different thread to steal work from. 2889 // Pick a random thread. Initial plan was to cycle through all the 2890 // threads, and only return if we tried to steal from every thread, 2891 // and failed. Arch says that's not such a great idea. 2892 victim_tid = __kmp_get_random(thread) % (nthreads - 1); 2893 if (victim_tid >= tid) { 2894 ++victim_tid; // Adjusts random distribution to exclude self 2895 } 2896 // Found a potential victim 2897 other_thread = threads_data[victim_tid].td.td_thr; 2898 // There is a slight chance that __kmp_enable_tasking() did not wake 2899 // up all threads waiting at the barrier. If victim is sleeping, 2900 // then wake it up. Since we were going to pay the cache miss 2901 // penalty for referencing another thread's kmp_info_t struct 2902 // anyway, 2903 // the check shouldn't cost too much performance at this point. In 2904 // extra barrier mode, tasks do not sleep at the separate tasking 2905 // barrier, so this isn't a problem. 
2906 asleep = 0; 2907 if ((__kmp_tasking_mode == tskm_task_teams) && 2908 (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) && 2909 (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) != 2910 NULL)) { 2911 asleep = 1; 2912 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread), 2913 other_thread->th.th_sleep_loc); 2914 // A sleeping thread should not have any tasks on it's queue. 2915 // There is a slight possibility that it resumes, steals a task 2916 // from another thread, which spawns more tasks, all in the time 2917 // that it takes this thread to check => don't write an assertion 2918 // that the victim's queue is empty. Try stealing from a 2919 // different thread. 2920 } 2921 } while (asleep); 2922 } 2923 2924 if (!asleep) { 2925 // We have a victim to try to steal from 2926 task = __kmp_steal_task(other_thread, gtid, task_team, 2927 unfinished_threads, thread_finished, 2928 is_constrained); 2929 } 2930 if (task != NULL) { // set last stolen to victim 2931 if (threads_data[tid].td.td_deque_last_stolen != victim_tid) { 2932 threads_data[tid].td.td_deque_last_stolen = victim_tid; 2933 // The pre-refactored code did not try more than 1 successful new 2934 // vicitm, unless the last one generated more local tasks; 2935 // new_victim keeps track of this 2936 new_victim = 1; 2937 } 2938 } else { // No tasks found; unset last_stolen 2939 KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1); 2940 victim_tid = -2; // no successful victim found 2941 } 2942 } 2943 2944 if (task == NULL) // break out of tasking loop 2945 break; 2946 2947 // Found a task; execute it 2948 #if USE_ITT_BUILD && USE_ITT_NOTIFY 2949 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { 2950 if (itt_sync_obj == NULL) { // we are at fork barrier where we could not 2951 // get the object reliably 2952 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 2953 } 2954 __kmp_itt_task_starting(itt_sync_obj); 2955 } 2956 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 2957 __kmp_invoke_task(gtid, task, current_task); 2958 #if USE_ITT_BUILD 2959 if (itt_sync_obj != NULL) 2960 __kmp_itt_task_finished(itt_sync_obj); 2961 #endif /* USE_ITT_BUILD */ 2962 // If this thread is only partway through the barrier and the condition is 2963 // met, then return now, so that the barrier gather/release pattern can 2964 // proceed. If this thread is in the last spin loop in the barrier, 2965 // waiting to be released, we know that the termination condition will not 2966 // be satisified, so don't waste any cycles checking it. 2967 if (flag == NULL || (!final_spin && flag->done_check())) { 2968 KA_TRACE( 2969 15, 2970 ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n", 2971 gtid)); 2972 return TRUE; 2973 } 2974 if (thread->th.th_task_team == NULL) { 2975 break; 2976 } 2977 KMP_YIELD(__kmp_library == library_throughput); // Yield before next task 2978 // If execution of a stolen task results in more tasks being placed on our 2979 // run queue, reset use_own_tasks 2980 if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) { 2981 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned " 2982 "other tasks, restart\n", 2983 gtid)); 2984 use_own_tasks = 1; 2985 new_victim = 0; 2986 } 2987 } 2988 2989 // The task source has been exhausted. If in final spin loop of barrier, check 2990 // if termination condition is satisfied. 
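    // A thread that runs dry counts itself out by decrementing
    // tt_unfinished_threads (guarded by *thread_finished); that decrement may
    // itself be the store that satisfies the flag, which is why done_check()
    // is re-evaluated only after it. If the thread later steals a task,
    // __kmp_steal_task() re-increments the counter and clears
    // *thread_finished (see above).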
#if OMP_45_ENABLED
    // The work queue may be empty but there might be proxy tasks still
    // executing
    if (final_spin &&
        KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0)
#else
    if (final_spin)
#endif
    {
      // First, decrement the #unfinished threads, if that has not already been
      // done. This decrement might be to the spin location, and result in the
      // termination condition being satisfied.
      if (!*thread_finished) {
        kmp_int32 count;

        count = KMP_ATOMIC_DEC(unfinished_threads) - 1;
        KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
                      "unfinished_threads to %d task_team=%p\n",
                      gtid, count, task_team));
        *thread_finished = TRUE;
      }

      // It is now unsafe to reference thread->th.th_team !!!
      // Decrementing task_team->tt.tt_unfinished_threads can allow the master
      // thread to pass through the barrier, where it might reset each thread's
      // th.th_team field for the next parallel region. If we can steal more
      // work, we know that this has not happened yet.
      if (flag != NULL && flag->done_check()) {
        KA_TRACE(
            15,
            ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
             gtid));
        return TRUE;
      }
    }

    // If this thread's task team is NULL, master has recognized that there are
    // no more tasks; bail out
    if (thread->th.th_task_team == NULL) {
      KA_TRACE(15,
               ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
      return FALSE;
    }

#if OMP_45_ENABLED
    // We could be getting tasks from target constructs; if this is the only
    // thread, keep trying to execute tasks from own queue
    if (nthreads == 1)
      use_own_tasks = 1;
    else
#endif
    {
      KA_TRACE(15,
               ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
      return FALSE;
    }
  }
}

int __kmp_execute_tasks_32(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

int __kmp_execute_tasks_64(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

int __kmp_execute_tasks_oncore(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

// __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
// next barrier so they can assist in executing enqueued tasks.
// First thread in allocates the task team atomically.
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
                                 kmp_info_t *this_thr) {
  kmp_thread_data_t *threads_data;
  int nthreads, i, is_init_thread;

  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
                __kmp_gtid_from_thread(this_thr)));

  KMP_DEBUG_ASSERT(task_team != NULL);
  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);

  nthreads = task_team->tt.tt_nproc;
  KMP_DEBUG_ASSERT(nthreads > 0);
  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);

  // Allocate or increase the size of threads_data if necessary
  is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);

  if (!is_init_thread) {
    // Some other thread already set up the array.
    KA_TRACE(
        20,
        ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
         __kmp_gtid_from_thread(this_thr)));
    return;
  }
  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
  KMP_DEBUG_ASSERT(threads_data != NULL);

  if (__kmp_tasking_mode == tskm_task_teams &&
      (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
    // Release any threads sleeping at the barrier, so that they can steal
    // tasks and execute them. In extra barrier mode, tasks do not sleep
    // at the separate tasking barrier, so this isn't a problem.
    for (i = 0; i < nthreads; i++) {
      volatile void *sleep_loc;
      kmp_info_t *thread = threads_data[i].td.td_thr;

      if (i == this_thr->th.th_info.ds.ds_tid) {
        continue;
      }
      // Since we haven't locked the thread's suspend mutex lock at this
      // point, there is a small window where a thread might be putting
      // itself to sleep, but hasn't set the th_sleep_loc field yet.
      // To work around this, __kmp_execute_tasks_template() periodically
      // checks to see if other threads are sleeping (using the same random
      // mechanism that is used for task stealing) and awakens them if they
      // are.
      if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
          NULL) {
        KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
                      __kmp_gtid_from_thread(this_thr),
                      __kmp_gtid_from_thread(thread)));
        __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
      } else {
        KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
                      __kmp_gtid_from_thread(this_thr),
                      __kmp_gtid_from_thread(thread)));
      }
    }
  }

  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
                __kmp_gtid_from_thread(this_thr)));
}

/* // TODO: Check the comment consistency
 * Utility routines for "task teams". A task team (kmp_task_team_t) is kind of
 * like a shadow of the kmp_team_t data struct, with a different lifetime.
 * After a child thread checks into a barrier and calls __kmp_release() from
 * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
 * longer assume that the kmp_team_t structure is intact (at any moment, the
 * master thread may exit the barrier code and free the team data structure,
 * and return the threads to the thread pool).
 *
 * This does not work with the tasking code, as the thread is still
 * expected to participate in the execution of any tasks that may have been
 * spawned by a member of the team, and the thread still needs access to each
 * thread in the team, so that it can steal work from it.
 *
 * Enter the existence of the kmp_task_team_t struct. It employs a reference
 * counting mechanism, and is allocated by the master thread before calling
 * __kmp_<barrier_kind>_release, and then is released by the last thread to
 * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
 * of the kmp_task_team_t structs for consecutive barriers can overlap
 * (and will, unless the master thread is the last thread to exit the barrier
 * release phase, which is not typical).
 *
 * The existence of such a struct is useful outside the context of tasking,
 * but for now, I'm trying to keep it specific to the OMP_30_ENABLED macro,
 * so that any performance differences show up when comparing the 2.5 vs. 3.0
 * libraries.
 *
 * We currently use the existence of the threads array as an indicator that
 * tasks were spawned since the last barrier. If the structure is to be
 * useful outside the context of tasking, then this will have to change, but
 * not setting the field minimizes the performance impact of tasking on
 * barriers, when no explicit tasks were spawned (pushed, actually).
 */

static kmp_task_team_t *__kmp_free_task_teams =
    NULL; // Free list for task_team data structures
// Lock for task team data structures
kmp_bootstrap_lock_t __kmp_task_team_lock =
    KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);

// __kmp_alloc_task_deque:
// Allocates a task deque for a particular thread, and initializes the
// necessary data structures relating to the deque. This only happens once
// per thread per task team since task teams are recycled. No lock is needed
// during allocation since each thread allocates its own deque.
static void __kmp_alloc_task_deque(kmp_info_t *thread,
                                   kmp_thread_data_t *thread_data) {
  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
  KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);

  // Initialize last stolen task field to "none"
  thread_data->td.td_deque_last_stolen = -1;

  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
  KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
  KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);

  KE_TRACE(
      10,
      ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
       __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
  // Allocate space for task deque, and zero the deque
  // Cannot use __kmp_thread_calloc() because threads not around for
  // kmp_reap_task_team( ).
  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
      INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
}

// __kmp_free_task_deque:
// Deallocates a task deque for a particular thread. Happens at library
// deallocation so don't need to reset all thread data fields.
3217 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) { 3218 if (thread_data->td.td_deque != NULL) { 3219 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 3220 TCW_4(thread_data->td.td_deque_ntasks, 0); 3221 __kmp_free(thread_data->td.td_deque); 3222 thread_data->td.td_deque = NULL; 3223 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 3224 } 3225 3226 #ifdef BUILD_TIED_TASK_STACK 3227 // GEH: Figure out what to do here for td_susp_tied_tasks 3228 if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) { 3229 __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data); 3230 } 3231 #endif // BUILD_TIED_TASK_STACK 3232 } 3233 3234 // __kmp_realloc_task_threads_data: 3235 // Allocates a threads_data array for a task team, either by allocating an 3236 // initial array or enlarging an existing array. Only the first thread to get 3237 // the lock allocs or enlarges the array and re-initializes the array eleemnts. 3238 // That thread returns "TRUE", the rest return "FALSE". 3239 // Assumes that the new array size is given by task_team -> tt.tt_nproc. 3240 // The current size is given by task_team -> tt.tt_max_threads. 3241 static int __kmp_realloc_task_threads_data(kmp_info_t *thread, 3242 kmp_task_team_t *task_team) { 3243 kmp_thread_data_t **threads_data_p; 3244 kmp_int32 nthreads, maxthreads; 3245 int is_init_thread = FALSE; 3246 3247 if (TCR_4(task_team->tt.tt_found_tasks)) { 3248 // Already reallocated and initialized. 3249 return FALSE; 3250 } 3251 3252 threads_data_p = &task_team->tt.tt_threads_data; 3253 nthreads = task_team->tt.tt_nproc; 3254 maxthreads = task_team->tt.tt_max_threads; 3255 3256 // All threads must lock when they encounter the first task of the implicit 3257 // task region to make sure threads_data fields are (re)initialized before 3258 // used. 3259 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock); 3260 3261 if (!TCR_4(task_team->tt.tt_found_tasks)) { 3262 // first thread to enable tasking 3263 kmp_team_t *team = thread->th.th_team; 3264 int i; 3265 3266 is_init_thread = TRUE; 3267 if (maxthreads < nthreads) { 3268 3269 if (*threads_data_p != NULL) { 3270 kmp_thread_data_t *old_data = *threads_data_p; 3271 kmp_thread_data_t *new_data = NULL; 3272 3273 KE_TRACE( 3274 10, 3275 ("__kmp_realloc_task_threads_data: T#%d reallocating " 3276 "threads data for task_team %p, new_size = %d, old_size = %d\n", 3277 __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads)); 3278 // Reallocate threads_data to have more elements than current array 3279 // Cannot use __kmp_thread_realloc() because threads not around for 3280 // kmp_reap_task_team( ). Note all new array entries are initialized 3281 // to zero by __kmp_allocate(). 
3282 new_data = (kmp_thread_data_t *)__kmp_allocate( 3283 nthreads * sizeof(kmp_thread_data_t)); 3284 // copy old data to new data 3285 KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t), 3286 (void *)old_data, maxthreads * sizeof(kmp_thread_data_t)); 3287 3288 #ifdef BUILD_TIED_TASK_STACK 3289 // GEH: Figure out if this is the right thing to do 3290 for (i = maxthreads; i < nthreads; i++) { 3291 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 3292 __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data); 3293 } 3294 #endif // BUILD_TIED_TASK_STACK 3295 // Install the new data and free the old data 3296 (*threads_data_p) = new_data; 3297 __kmp_free(old_data); 3298 } else { 3299 KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating " 3300 "threads data for task_team %p, size = %d\n", 3301 __kmp_gtid_from_thread(thread), task_team, nthreads)); 3302 // Make the initial allocate for threads_data array, and zero entries 3303 // Cannot use __kmp_thread_calloc() because threads not around for 3304 // kmp_reap_task_team( ). 3305 ANNOTATE_IGNORE_WRITES_BEGIN(); 3306 *threads_data_p = (kmp_thread_data_t *)__kmp_allocate( 3307 nthreads * sizeof(kmp_thread_data_t)); 3308 ANNOTATE_IGNORE_WRITES_END(); 3309 #ifdef BUILD_TIED_TASK_STACK 3310 // GEH: Figure out if this is the right thing to do 3311 for (i = 0; i < nthreads; i++) { 3312 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 3313 __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data); 3314 } 3315 #endif // BUILD_TIED_TASK_STACK 3316 } 3317 task_team->tt.tt_max_threads = nthreads; 3318 } else { 3319 // If array has (more than) enough elements, go ahead and use it 3320 KMP_DEBUG_ASSERT(*threads_data_p != NULL); 3321 } 3322 3323 // initialize threads_data pointers back to thread_info structures 3324 for (i = 0; i < nthreads; i++) { 3325 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 3326 thread_data->td.td_thr = team->t.t_threads[i]; 3327 3328 if (thread_data->td.td_deque_last_stolen >= nthreads) { 3329 // The last stolen field survives across teams / barrier, and the number 3330 // of threads may have changed. It's possible (likely?) that a new 3331 // parallel region will exhibit the same behavior as previous region. 3332 thread_data->td.td_deque_last_stolen = -1; 3333 } 3334 } 3335 3336 KMP_MB(); 3337 TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE); 3338 } 3339 3340 __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock); 3341 return is_init_thread; 3342 } 3343 3344 // __kmp_free_task_threads_data: 3345 // Deallocates a threads_data array for a task team, including any attached 3346 // tasking deques. Only occurs at library shutdown. 3347 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) { 3348 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock); 3349 if (task_team->tt.tt_threads_data != NULL) { 3350 int i; 3351 for (i = 0; i < task_team->tt.tt_max_threads; i++) { 3352 __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]); 3353 } 3354 __kmp_free(task_team->tt.tt_threads_data); 3355 task_team->tt.tt_threads_data = NULL; 3356 } 3357 __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock); 3358 } 3359 3360 // __kmp_allocate_task_team: 3361 // Allocates a task team associated with a specific team, taking it from 3362 // the global task team free list if possible. Also initializes data 3363 // structures. 
3364 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread, 3365 kmp_team_t *team) { 3366 kmp_task_team_t *task_team = NULL; 3367 int nthreads; 3368 3369 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n", 3370 (thread ? __kmp_gtid_from_thread(thread) : -1), team)); 3371 3372 if (TCR_PTR(__kmp_free_task_teams) != NULL) { 3373 // Take a task team from the task team pool 3374 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 3375 if (__kmp_free_task_teams != NULL) { 3376 task_team = __kmp_free_task_teams; 3377 TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next); 3378 task_team->tt.tt_next = NULL; 3379 } 3380 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 3381 } 3382 3383 if (task_team == NULL) { 3384 KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating " 3385 "task team for team %p\n", 3386 __kmp_gtid_from_thread(thread), team)); 3387 // Allocate a new task team if one is not available. 3388 // Cannot use __kmp_thread_malloc() because threads not around for 3389 // kmp_reap_task_team( ). 3390 task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t)); 3391 __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock); 3392 // AC: __kmp_allocate zeroes returned memory 3393 // task_team -> tt.tt_threads_data = NULL; 3394 // task_team -> tt.tt_max_threads = 0; 3395 // task_team -> tt.tt_next = NULL; 3396 } 3397 3398 TCW_4(task_team->tt.tt_found_tasks, FALSE); 3399 #if OMP_45_ENABLED 3400 TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); 3401 #endif 3402 task_team->tt.tt_nproc = nthreads = team->t.t_nproc; 3403 3404 KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads); 3405 TCW_4(task_team->tt.tt_active, TRUE); 3406 3407 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p " 3408 "unfinished_threads init'd to %d\n", 3409 (thread ? __kmp_gtid_from_thread(thread) : -1), task_team, 3410 KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads))); 3411 return task_team; 3412 } 3413 3414 // __kmp_free_task_team: 3415 // Frees the task team associated with a specific thread, and adds it 3416 // to the global task team free list. 3417 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) { 3418 KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n", 3419 thread ? __kmp_gtid_from_thread(thread) : -1, task_team)); 3420 3421 // Put task team back on free list 3422 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 3423 3424 KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL); 3425 task_team->tt.tt_next = __kmp_free_task_teams; 3426 TCW_PTR(__kmp_free_task_teams, task_team); 3427 3428 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 3429 } 3430 3431 // __kmp_reap_task_teams: 3432 // Free all the task teams on the task team free list. 3433 // Should only be done during library shutdown. 3434 // Cannot do anything that needs a thread structure or gtid since they are 3435 // already gone. 
3436 void __kmp_reap_task_teams(void) { 3437 kmp_task_team_t *task_team; 3438 3439 if (TCR_PTR(__kmp_free_task_teams) != NULL) { 3440 // Free all task_teams on the free list 3441 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 3442 while ((task_team = __kmp_free_task_teams) != NULL) { 3443 __kmp_free_task_teams = task_team->tt.tt_next; 3444 task_team->tt.tt_next = NULL; 3445 3446 // Free threads_data if necessary 3447 if (task_team->tt.tt_threads_data != NULL) { 3448 __kmp_free_task_threads_data(task_team); 3449 } 3450 __kmp_free(task_team); 3451 } 3452 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 3453 } 3454 } 3455 3456 // __kmp_wait_to_unref_task_teams: 3457 // Some threads could still be in the fork barrier release code, possibly 3458 // trying to steal tasks. Wait for each thread to unreference its task team. 3459 void __kmp_wait_to_unref_task_teams(void) { 3460 kmp_info_t *thread; 3461 kmp_uint32 spins; 3462 int done; 3463 3464 KMP_INIT_YIELD(spins); 3465 3466 for (;;) { 3467 done = TRUE; 3468 3469 // TODO: GEH - this may be is wrong because some sync would be necessary 3470 // in case threads are added to the pool during the traversal. Need to 3471 // verify that lock for thread pool is held when calling this routine. 3472 for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL; 3473 thread = thread->th.th_next_pool) { 3474 #if KMP_OS_WINDOWS 3475 DWORD exit_val; 3476 #endif 3477 if (TCR_PTR(thread->th.th_task_team) == NULL) { 3478 KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n", 3479 __kmp_gtid_from_thread(thread))); 3480 continue; 3481 } 3482 #if KMP_OS_WINDOWS 3483 // TODO: GEH - add this check for Linux* OS / OS X* as well? 3484 if (!__kmp_is_thread_alive(thread, &exit_val)) { 3485 thread->th.th_task_team = NULL; 3486 continue; 3487 } 3488 #endif 3489 3490 done = FALSE; // Because th_task_team pointer is not NULL for this thread 3491 3492 KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to " 3493 "unreference task_team\n", 3494 __kmp_gtid_from_thread(thread))); 3495 3496 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 3497 volatile void *sleep_loc; 3498 // If the thread is sleeping, awaken it. 3499 if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) != 3500 NULL) { 3501 KA_TRACE( 3502 10, 3503 ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n", 3504 __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread))); 3505 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc); 3506 } 3507 } 3508 } 3509 if (done) { 3510 break; 3511 } 3512 3513 // If oversubscribed or have waited a bit, yield. 3514 KMP_YIELD_OVERSUB_ELSE_SPIN(spins); 3515 } 3516 } 3517 3518 // __kmp_task_team_setup: Create a task_team for the current team, but use 3519 // an already created, unused one if it already exists. 3520 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) { 3521 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 3522 3523 // If this task_team hasn't been created yet, allocate it. It will be used in 3524 // the region after the next. 3525 // If it exists, it is the current task team and shouldn't be touched yet as 3526 // it may still be in use. 
3527 if (team->t.t_task_team[this_thr->th.th_task_state] == NULL && 3528 (always || team->t.t_nproc > 1)) { 3529 team->t.t_task_team[this_thr->th.th_task_state] = 3530 __kmp_allocate_task_team(this_thr, team); 3531 KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p " 3532 "for team %d at parity=%d\n", 3533 __kmp_gtid_from_thread(this_thr), 3534 team->t.t_task_team[this_thr->th.th_task_state], 3535 ((team != NULL) ? team->t.t_id : -1), 3536 this_thr->th.th_task_state)); 3537 } 3538 3539 // After threads exit the release, they will call sync, and then point to this 3540 // other task_team; make sure it is allocated and properly initialized. As 3541 // threads spin in the barrier release phase, they will continue to use the 3542 // previous task_team struct(above), until they receive the signal to stop 3543 // checking for tasks (they can't safely reference the kmp_team_t struct, 3544 // which could be reallocated by the master thread). No task teams are formed 3545 // for serialized teams. 3546 if (team->t.t_nproc > 1) { 3547 int other_team = 1 - this_thr->th.th_task_state; 3548 if (team->t.t_task_team[other_team] == NULL) { // setup other team as well 3549 team->t.t_task_team[other_team] = 3550 __kmp_allocate_task_team(this_thr, team); 3551 KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new " 3552 "task_team %p for team %d at parity=%d\n", 3553 __kmp_gtid_from_thread(this_thr), 3554 team->t.t_task_team[other_team], 3555 ((team != NULL) ? team->t.t_id : -1), other_team)); 3556 } else { // Leave the old task team struct in place for the upcoming region; 3557 // adjust as needed 3558 kmp_task_team_t *task_team = team->t.t_task_team[other_team]; 3559 if (!task_team->tt.tt_active || 3560 team->t.t_nproc != task_team->tt.tt_nproc) { 3561 TCW_4(task_team->tt.tt_nproc, team->t.t_nproc); 3562 TCW_4(task_team->tt.tt_found_tasks, FALSE); 3563 #if OMP_45_ENABLED 3564 TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); 3565 #endif 3566 KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, 3567 team->t.t_nproc); 3568 TCW_4(task_team->tt.tt_active, TRUE); 3569 } 3570 // if team size has changed, the first thread to enable tasking will 3571 // realloc threads_data if necessary 3572 KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team " 3573 "%p for team %d at parity=%d\n", 3574 __kmp_gtid_from_thread(this_thr), 3575 team->t.t_task_team[other_team], 3576 ((team != NULL) ? team->t.t_id : -1), other_team)); 3577 } 3578 } 3579 } 3580 3581 // __kmp_task_team_sync: Propagation of task team data from team to threads 3582 // which happens just after the release phase of a team barrier. This may be 3583 // called by any thread, but only for teams with # threads > 1. 3584 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) { 3585 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 3586 3587 // Toggle the th_task_state field, to switch which task_team this thread 3588 // refers to 3589 this_thr->th.th_task_state = 1 - this_thr->th.th_task_state; 3590 // It is now safe to propagate the task team pointer from the team struct to 3591 // the current thread. 3592 TCW_PTR(this_thr->th.th_task_team, 3593 team->t.t_task_team[this_thr->th.th_task_state]); 3594 KA_TRACE(20, 3595 ("__kmp_task_team_sync: Thread T#%d task team switched to task_team " 3596 "%p from Team #%d (parity=%d)\n", 3597 __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team, 3598 ((team != NULL) ? 
team->t.t_id : -1), this_thr->th.th_task_state)); 3599 } 3600 3601 // __kmp_task_team_wait: Master thread waits for outstanding tasks after the 3602 // barrier gather phase. Only called by master thread if #threads in team > 1 or 3603 // if proxy tasks were created. 3604 // 3605 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off 3606 // by passing in 0 optionally as the last argument. When wait is zero, master 3607 // thread does not wait for unfinished_threads to reach 0. 3608 void __kmp_task_team_wait( 3609 kmp_info_t *this_thr, 3610 kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) { 3611 kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state]; 3612 3613 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 3614 KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team); 3615 3616 if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) { 3617 if (wait) { 3618 KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks " 3619 "(for unfinished_threads to reach 0) on task_team = %p\n", 3620 __kmp_gtid_from_thread(this_thr), task_team)); 3621 // Worker threads may have dropped through to release phase, but could 3622 // still be executing tasks. Wait here for tasks to complete. To avoid 3623 // memory contention, only master thread checks termination condition. 3624 kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *, 3625 &task_team->tt.tt_unfinished_threads), 3626 0U); 3627 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 3628 } 3629 // Deactivate the old task team, so that the worker threads will stop 3630 // referencing it while spinning. 3631 KA_TRACE( 3632 20, 3633 ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: " 3634 "setting active to false, setting local and team's pointer to NULL\n", 3635 __kmp_gtid_from_thread(this_thr), task_team)); 3636 #if OMP_45_ENABLED 3637 KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 || 3638 task_team->tt.tt_found_proxy_tasks == TRUE); 3639 TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE); 3640 #else 3641 KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1); 3642 #endif 3643 KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0); 3644 TCW_SYNC_4(task_team->tt.tt_active, FALSE); 3645 KMP_MB(); 3646 3647 TCW_PTR(this_thr->th.th_task_team, NULL); 3648 } 3649 } 3650 3651 // __kmp_tasking_barrier: 3652 // This routine may only called when __kmp_tasking_mode == tskm_extra_barrier. 3653 // Internal function to execute all tasks prior to a regular barrier or a join 3654 // barrier. It is a full barrier itself, which unfortunately turns regular 3655 // barriers into double barriers and join barriers into 1 1/2 barriers. 3656 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) { 3657 std::atomic<kmp_uint32> *spin = RCAST( 3658 std::atomic<kmp_uint32> *, 3659 &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads); 3660 int flag = FALSE; 3661 KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier); 3662 3663 #if USE_ITT_BUILD 3664 KMP_FSYNC_SPIN_INIT(spin, NULL); 3665 #endif /* USE_ITT_BUILD */ 3666 kmp_flag_32 spin_flag(spin, 0U); 3667 while (!spin_flag.execute_tasks(thread, gtid, TRUE, 3668 &flag USE_ITT_BUILD_ARG(NULL), 0)) { 3669 #if USE_ITT_BUILD 3670 // TODO: What about itt_sync_obj?? 
3671 KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin)); 3672 #endif /* USE_ITT_BUILD */ 3673 3674 if (TCR_4(__kmp_global.g.g_done)) { 3675 if (__kmp_global.g.g_abort) 3676 __kmp_abort_thread(); 3677 break; 3678 } 3679 KMP_YIELD(TRUE); 3680 } 3681 #if USE_ITT_BUILD 3682 KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin)); 3683 #endif /* USE_ITT_BUILD */ 3684 } 3685 3686 #if OMP_45_ENABLED 3687 3688 // __kmp_give_task puts a task into a given thread queue if: 3689 // - the queue for that thread was created 3690 // - there's space in that queue 3691 // Because of this, __kmp_push_task needs to check if there's space after 3692 // getting the lock 3693 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task, 3694 kmp_int32 pass) { 3695 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 3696 kmp_task_team_t *task_team = taskdata->td_task_team; 3697 3698 KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n", 3699 taskdata, tid)); 3700 3701 // If task_team is NULL something went really bad... 3702 KMP_DEBUG_ASSERT(task_team != NULL); 3703 3704 bool result = false; 3705 kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid]; 3706 3707 if (thread_data->td.td_deque == NULL) { 3708 // There's no queue in this thread, go find another one 3709 // We're guaranteed that at least one thread has a queue 3710 KA_TRACE(30, 3711 ("__kmp_give_task: thread %d has no queue while giving task %p.\n", 3712 tid, taskdata)); 3713 return result; 3714 } 3715 3716 if (TCR_4(thread_data->td.td_deque_ntasks) >= 3717 TASK_DEQUE_SIZE(thread_data->td)) { 3718 KA_TRACE( 3719 30, 3720 ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", 3721 taskdata, tid)); 3722 3723 // if this deque is bigger than the pass ratio give a chance to another 3724 // thread 3725 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass) 3726 return result; 3727 3728 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 3729 __kmp_realloc_task_deque(thread, thread_data); 3730 3731 } else { 3732 3733 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 3734 3735 if (TCR_4(thread_data->td.td_deque_ntasks) >= 3736 TASK_DEQUE_SIZE(thread_data->td)) { 3737 KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to " 3738 "thread %d.\n", 3739 taskdata, tid)); 3740 3741 // if this deque is bigger than the pass ratio give a chance to another 3742 // thread 3743 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass) 3744 goto release_and_exit; 3745 3746 __kmp_realloc_task_deque(thread, thread_data); 3747 } 3748 } 3749 3750 // lock is held here, and there is space in the deque 3751 3752 thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata; 3753 // Wrap index. 
  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1);

  result = true;
  KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
                taskdata, tid));

release_and_exit:
  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  return result;
}

/* The finish of the proxy tasks is divided into two pieces:
   - the top half is the one that can be done from a thread outside the team
   - the bottom half must be run from a thread within the team

   In order to run the bottom half the task gets queued back into one of the
   threads of the team. Once the td_incomplete_child_tasks counter of the
   parent is decremented the threads can leave the barriers. So, the bottom
   half needs to be queued before the counter is decremented. The top half is
   therefore divided into two parts:
   - things that can be run before queuing the bottom half
   - things that must be run after queuing the bottom half

   This creates a second race as the bottom half can free the task before the
   second top half is executed. To avoid this we use the
   td_incomplete_child_tasks of the proxy task to synchronize the top and
   bottom half. */
static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  taskdata->td_flags.complete = 1; // mark the task as completed

  if (taskdata->td_taskgroup)
    KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);

  // Create an imaginary child for this task so the bottom half cannot
  // release the task before we have completed the second top half
  KMP_ATOMIC_INC(&taskdata->td_incomplete_child_tasks);
}

static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
  kmp_int32 children = 0;

  // Predecrement simulated by "- 1" calculation
  children =
      KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
  KMP_DEBUG_ASSERT(children >= 0);

  // Remove the imaginary child
  KMP_ATOMIC_DEC(&taskdata->td_incomplete_child_tasks);
}

static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
  kmp_info_t *thread = __kmp_threads[gtid];

  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
                   1); // top half must run before bottom half

  // We need to wait to make sure the top half is finished
  // Spinning here should be ok as this should happen quickly
  while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) > 0)
    ;

  __kmp_release_deps(gtid, taskdata);
  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
}

/*!
@ingroup TASKING
@param gtid Global Thread ID of encountering thread
@param ptask Task whose execution is completed

Execute the completion of a proxy task from a thread that is part of the
team. Run first and bottom halves directly.
*/
void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
  KMP_DEBUG_ASSERT(ptask != NULL);
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
  KA_TRACE(
      10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
           gtid, taskdata));

  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);

  __kmp_first_top_half_finish_proxy(taskdata);
  __kmp_second_top_half_finish_proxy(taskdata);
  __kmp_bottom_half_finish_proxy(gtid, ptask);

  KA_TRACE(10,
           ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
            gtid, taskdata));
}

/*!
@ingroup TASKING
@param ptask Task whose execution is completed

Execute the completion of a proxy task from a thread that may not belong to
the team.
*/
void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
  KMP_DEBUG_ASSERT(ptask != NULL);
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);

  KA_TRACE(
      10,
      ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
       taskdata));

  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);

  __kmp_first_top_half_finish_proxy(taskdata);

  // Enqueue task to complete bottom half completion from a thread within the
  // corresponding team
  kmp_team_t *team = taskdata->td_team;
  kmp_int32 nthreads = team->t.t_nproc;
  kmp_info_t *thread;

  // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
  // but we cannot use __kmp_get_random here
  kmp_int32 start_k = 0;
  kmp_int32 pass = 1;
  kmp_int32 k = start_k;

  do {
    // For now we're just linearly trying to find a thread
    thread = team->t.t_threads[k];
    k = (k + 1) % nthreads;

    // we did a full pass through all the threads
    if (k == start_k)
      pass = pass << 1;

  } while (!__kmp_give_task(thread, k, ptask, pass));

  __kmp_second_top_half_finish_proxy(taskdata);

  KA_TRACE(
      10,
      ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
       taskdata));
}

// __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
// for taskloop
//
// thread:   allocating thread
// task_src: pointer to source task to be duplicated
// returns:  a pointer to the allocated kmp_task_t structure (task).
kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_taskdata_t *taskdata_src;
  kmp_taskdata_t *parent_task = thread->th.th_current_task;
  size_t shareds_offset;
  size_t task_size;

  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
                task_src));
  taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
                   TASK_FULL); // it should not be proxy task
  KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
  task_size = taskdata_src->td_size_alloc;

  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
3930 KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread, 3931 task_size)); 3932 #if USE_FAST_MEMORY 3933 taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size); 3934 #else 3935 taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size); 3936 #endif /* USE_FAST_MEMORY */ 3937 KMP_MEMCPY(taskdata, taskdata_src, task_size); 3938 3939 task = KMP_TASKDATA_TO_TASK(taskdata); 3940 3941 // Initialize new task (only specific fields not affected by memcpy) 3942 taskdata->td_task_id = KMP_GEN_TASK_ID(); 3943 if (task->shareds != NULL) { // need setup shareds pointer 3944 shareds_offset = (char *)task_src->shareds - (char *)taskdata_src; 3945 task->shareds = &((char *)taskdata)[shareds_offset]; 3946 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) == 3947 0); 3948 } 3949 taskdata->td_alloc_thread = thread; 3950 taskdata->td_parent = parent_task; 3951 taskdata->td_taskgroup = 3952 parent_task 3953 ->td_taskgroup; // task inherits the taskgroup from the parent task 3954 3955 // Only need to keep track of child task counts if team parallel and tasking 3956 // not serialized 3957 if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) { 3958 KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks); 3959 if (parent_task->td_taskgroup) 3960 KMP_ATOMIC_INC(&parent_task->td_taskgroup->count); 3961 // Only need to keep track of allocated child tasks for explicit tasks since 3962 // implicit not deallocated 3963 if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) 3964 KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks); 3965 } 3966 3967 KA_TRACE(20, 3968 ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n", 3969 thread, taskdata, taskdata->td_parent)); 3970 #if OMPT_SUPPORT 3971 if (UNLIKELY(ompt_enabled.enabled)) 3972 __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid); 3973 #endif 3974 return task; 3975 } 3976 3977 // Routine optionally generated by the compiler for setting the lastprivate flag 3978 // and calling needed constructors for private/firstprivate objects 3979 // (used to form taskloop tasks from pattern task) 3980 // Parameters: dest task, src task, lastprivate flag. 3981 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32); 3982 3983 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8); 3984 3985 // class to encapsulate manipulating loop bounds in a taskloop task. 3986 // this abstracts away the Intel vs GOMP taskloop interface for setting/getting 3987 // the loop bound variables. 
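//
// Illustrative-only usage sketch (not taken from this file; the variable
// names are placeholders for what the taskloop code actually passes in):
//   kmp_taskloop_bounds_t bounds(pattern_task, lb_ptr, ub_ptr);
//   kmp_uint64 lo = bounds.get_lb();
//   kmp_uint64 hi = bounds.get_ub();
//   bounds.set_lb(lo); // store the chunk's lower bound back into the task
//   bounds.set_ub(hi); // store the chunk's upper bound back into the task
// The getters/setters hide whether the bounds live at a fixed offset in the
// task (Intel entry points) or in task->shareds as long/long long (GOMP).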
3988 class kmp_taskloop_bounds_t { 3989 kmp_task_t *task; 3990 const kmp_taskdata_t *taskdata; 3991 size_t lower_offset; 3992 size_t upper_offset; 3993 3994 public: 3995 kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub) 3996 : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)), 3997 lower_offset((char *)lb - (char *)task), 3998 upper_offset((char *)ub - (char *)task) { 3999 KMP_DEBUG_ASSERT((char *)lb > (char *)_task); 4000 KMP_DEBUG_ASSERT((char *)ub > (char *)_task); 4001 } 4002 kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds) 4003 : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)), 4004 lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {} 4005 size_t get_lower_offset() const { return lower_offset; } 4006 size_t get_upper_offset() const { return upper_offset; } 4007 kmp_uint64 get_lb() const { 4008 kmp_int64 retval; 4009 #if defined(KMP_GOMP_COMPAT) 4010 // Intel task just returns the lower bound normally 4011 if (!taskdata->td_flags.native) { 4012 retval = *(kmp_int64 *)((char *)task + lower_offset); 4013 } else { 4014 // GOMP task has to take into account the sizeof(long) 4015 if (taskdata->td_size_loop_bounds == 4) { 4016 kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds); 4017 retval = (kmp_int64)*lb; 4018 } else { 4019 kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds); 4020 retval = (kmp_int64)*lb; 4021 } 4022 } 4023 #else 4024 retval = *(kmp_int64 *)((char *)task + lower_offset); 4025 #endif // defined(KMP_GOMP_COMPAT) 4026 return retval; 4027 } 4028 kmp_uint64 get_ub() const { 4029 kmp_int64 retval; 4030 #if defined(KMP_GOMP_COMPAT) 4031 // Intel task just returns the upper bound normally 4032 if (!taskdata->td_flags.native) { 4033 retval = *(kmp_int64 *)((char *)task + upper_offset); 4034 } else { 4035 // GOMP task has to take into account the sizeof(long) 4036 if (taskdata->td_size_loop_bounds == 4) { 4037 kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1; 4038 retval = (kmp_int64)*ub; 4039 } else { 4040 kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1; 4041 retval = (kmp_int64)*ub; 4042 } 4043 } 4044 #else 4045 retval = *(kmp_int64 *)((char *)task + upper_offset); 4046 #endif // defined(KMP_GOMP_COMPAT) 4047 return retval; 4048 } 4049 void set_lb(kmp_uint64 lb) { 4050 #if defined(KMP_GOMP_COMPAT) 4051 // Intel task just sets the lower bound normally 4052 if (!taskdata->td_flags.native) { 4053 *(kmp_uint64 *)((char *)task + lower_offset) = lb; 4054 } else { 4055 // GOMP task has to take into account the sizeof(long) 4056 if (taskdata->td_size_loop_bounds == 4) { 4057 kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds); 4058 *lower = (kmp_uint32)lb; 4059 } else { 4060 kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds); 4061 *lower = (kmp_uint64)lb; 4062 } 4063 } 4064 #else 4065 *(kmp_uint64 *)((char *)task + lower_offset) = lb; 4066 #endif // defined(KMP_GOMP_COMPAT) 4067 } 4068 void set_ub(kmp_uint64 ub) { 4069 #if defined(KMP_GOMP_COMPAT) 4070 // Intel task just sets the upper bound normally 4071 if (!taskdata->td_flags.native) { 4072 *(kmp_uint64 *)((char *)task + upper_offset) = ub; 4073 } else { 4074 // GOMP task has to take into account the sizeof(long) 4075 if (taskdata->td_size_loop_bounds == 4) { 4076 kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1; 4077 *upper = (kmp_uint32)ub; 4078 } else { 4079 kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1; 4080 *upper = (kmp_uint64)ub; 4081 } 4082 } 4083 #else 4084 *(kmp_uint64 *)((char *)task + upper_offset) = ub; 4085 
#endif // defined(KMP_GOMP_COMPAT) 4086 } 4087 }; 4088 4089 // __kmp_taskloop_linear: Start tasks of the taskloop linearly 4090 // 4091 // loc Source location information 4092 // gtid Global thread ID 4093 // task Pattern task, exposes the loop iteration range 4094 // lb Pointer to loop lower bound in task structure 4095 // ub Pointer to loop upper bound in task structure 4096 // st Loop stride 4097 // ub_glob Global upper bound (used for lastprivate check) 4098 // num_tasks Number of tasks to execute 4099 // grainsize Number of loop iterations per task 4100 // extras Number of chunks with grainsize+1 iterations 4101 // tc Iterations count 4102 // task_dup Tasks duplication routine 4103 // codeptr_ra Return address for OMPT events 4104 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task, 4105 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, 4106 kmp_uint64 ub_glob, kmp_uint64 num_tasks, 4107 kmp_uint64 grainsize, kmp_uint64 extras, 4108 kmp_uint64 tc, 4109 #if OMPT_SUPPORT 4110 void *codeptr_ra, 4111 #endif 4112 void *task_dup) { 4113 KMP_COUNT_BLOCK(OMP_TASKLOOP); 4114 KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling); 4115 p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; 4116 // compiler provides global bounds here 4117 kmp_taskloop_bounds_t task_bounds(task, lb, ub); 4118 kmp_uint64 lower = task_bounds.get_lb(); 4119 kmp_uint64 upper = task_bounds.get_ub(); 4120 kmp_uint64 i; 4121 kmp_info_t *thread = __kmp_threads[gtid]; 4122 kmp_taskdata_t *current_task = thread->th.th_current_task; 4123 kmp_task_t *next_task; 4124 kmp_int32 lastpriv = 0; 4125 4126 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras); 4127 KMP_DEBUG_ASSERT(num_tasks > extras); 4128 KMP_DEBUG_ASSERT(num_tasks > 0); 4129 KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, " 4130 "extras %lld, i=%lld,%lld(%d)%lld, dup %p\n", 4131 gtid, num_tasks, grainsize, extras, lower, upper, ub_glob, st, 4132 task_dup)); 4133 4134 // Launch num_tasks tasks, assign grainsize iterations each task 4135 for (i = 0; i < num_tasks; ++i) { 4136 kmp_uint64 chunk_minus_1; 4137 if (extras == 0) { 4138 chunk_minus_1 = grainsize - 1; 4139 } else { 4140 chunk_minus_1 = grainsize; 4141 --extras; // first extras iterations get bigger chunk (grainsize+1) 4142 } 4143 upper = lower + st * chunk_minus_1; 4144 if (i == num_tasks - 1) { 4145 // schedule the last task, set lastprivate flag if needed 4146 if (st == 1) { // most common case 4147 KMP_DEBUG_ASSERT(upper == *ub); 4148 if (upper == ub_glob) 4149 lastpriv = 1; 4150 } else if (st > 0) { // positive loop stride 4151 KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper); 4152 if ((kmp_uint64)st > ub_glob - upper) 4153 lastpriv = 1; 4154 } else { // negative loop stride 4155 KMP_DEBUG_ASSERT(upper + st < *ub); 4156 if (upper - ub_glob < (kmp_uint64)(-st)) 4157 lastpriv = 1; 4158 } 4159 } 4160 next_task = __kmp_task_dup_alloc(thread, task); // allocate new task 4161 kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task); 4162 kmp_taskloop_bounds_t next_task_bounds = 4163 kmp_taskloop_bounds_t(next_task, task_bounds); 4164 4165 // adjust task-specific bounds 4166 next_task_bounds.set_lb(lower); 4167 if (next_taskdata->td_flags.native) { 4168 next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1)); 4169 } else { 4170 next_task_bounds.set_ub(upper); 4171 } 4172 if (ptask_dup != NULL) // set lastprivate flag, construct fistprivates, etc. 
      ptask_dup(next_task, task, lastpriv);
    KA_TRACE(40,
             ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
              "upper %lld stride %lld, (offsets %p %p)\n",
              gtid, i, next_task, lower, upper, st,
              next_task_bounds.get_lower_offset(),
              next_task_bounds.get_upper_offset()));
#if OMPT_SUPPORT
    __kmp_omp_taskloop_task(NULL, gtid, next_task,
                            codeptr_ra); // schedule new task
#else
    __kmp_omp_task(gtid, next_task, true); // schedule new task
#endif
    lower = upper + st; // adjust lower bound for the next iteration
  }
  // free the pattern task and exit
  __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
  // do not execute the pattern task, just do internal bookkeeping
  __kmp_task_finish<false>(gtid, task, current_task);
}

// Structure to keep taskloop parameters for auxiliary task
// kept in the shareds of the task structure.
typedef struct __taskloop_params {
  kmp_task_t *task;
  kmp_uint64 *lb;
  kmp_uint64 *ub;
  void *task_dup;
  kmp_int64 st;
  kmp_uint64 ub_glob;
  kmp_uint64 num_tasks;
  kmp_uint64 grainsize;
  kmp_uint64 extras;
  kmp_uint64 tc;
  kmp_uint64 num_t_min;
#if OMPT_SUPPORT
  void *codeptr_ra;
#endif
} __taskloop_params_t;

void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
                          kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
                          kmp_uint64, kmp_uint64, kmp_uint64, kmp_uint64,
#if OMPT_SUPPORT
                          void *,
#endif
                          void *);

// Execute part of the taskloop submitted as a task.
int __kmp_taskloop_task(int gtid, void *ptask) {
  __taskloop_params_t *p =
      (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
  kmp_task_t *task = p->task;
  kmp_uint64 *lb = p->lb;
  kmp_uint64 *ub = p->ub;
  void *task_dup = p->task_dup;
  // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
  kmp_int64 st = p->st;
  kmp_uint64 ub_glob = p->ub_glob;
  kmp_uint64 num_tasks = p->num_tasks;
  kmp_uint64 grainsize = p->grainsize;
  kmp_uint64 extras = p->extras;
  kmp_uint64 tc = p->tc;
  kmp_uint64 num_t_min = p->num_t_min;
#if OMPT_SUPPORT
  void *codeptr_ra = p->codeptr_ra;
#endif
#if KMP_DEBUG
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  KMP_DEBUG_ASSERT(task != NULL);
  KA_TRACE(20, ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
                " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n",
                gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
                task_dup));
#endif
  KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
  if (num_tasks > num_t_min)
    __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
                         grainsize, extras, tc, num_t_min,
#if OMPT_SUPPORT
                         codeptr_ra,
#endif
                         task_dup);
  else
    __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
                          grainsize, extras, tc,
#if OMPT_SUPPORT
                          codeptr_ra,
#endif
                          task_dup);

  KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
  return 0;
}

// Schedule part of the taskloop as a task,
// execute the rest of the taskloop.
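//
// Worked example (illustration added for clarity, not part of the original
// comment): with num_tasks=8, grainsize=3, extras=2 (so tc=26), the range is
// split into n_tsk0 = 4 tasks executed here and n_tsk1 = 4 tasks re-submitted
// via an auxiliary task; since n_tsk0 > extras, the first half keeps the two
// extra iterations (ext0=2, tc0 = 26 - 3*4 = 14) and the second half gets
// ext1=0, tc1 = 12.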
//
// loc        Source location information
// gtid       Global thread ID
// task       Pattern task, exposes the loop iteration range
// lb         Pointer to loop lower bound in task structure
// ub         Pointer to loop upper bound in task structure
// st         Loop stride
// ub_glob    Global upper bound (used for lastprivate check)
// num_tasks  Number of tasks to execute
// grainsize  Number of loop iterations per task
// extras     Number of chunks with grainsize+1 iterations
// tc         Iterations count
// num_t_min  Threshold to launch tasks recursively
// task_dup   Tasks duplication routine
// codeptr_ra Return address for OMPT events
void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
                          kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                          kmp_uint64 ub_glob, kmp_uint64 num_tasks,
                          kmp_uint64 grainsize, kmp_uint64 extras,
                          kmp_uint64 tc, kmp_uint64 num_t_min,
#if OMPT_SUPPORT
                          void *codeptr_ra,
#endif
                          void *task_dup) {
#if KMP_DEBUG
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  KMP_DEBUG_ASSERT(task != NULL);
  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
  KA_TRACE(20, ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
                " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n",
                gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
                task_dup));
#endif
  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
  kmp_uint64 lower = *lb;
  kmp_info_t *thread = __kmp_threads[gtid];
  // kmp_taskdata_t *current_task = thread->th.th_current_task;
  kmp_task_t *next_task;
  size_t lower_offset =
      (char *)lb - (char *)task; // remember offset of lb in the task structure
  size_t upper_offset =
      (char *)ub - (char *)task; // remember offset of ub in the task structure

  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
  KMP_DEBUG_ASSERT(num_tasks > extras);
  KMP_DEBUG_ASSERT(num_tasks > 0);

  // split the loop in two halves
  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
  kmp_uint64 gr_size0 = grainsize;
  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
  if (n_tsk0 <= extras) {
    gr_size0++; // integrate extras into grainsize
    ext0 = 0; // no extra iters in 1st half
    ext1 = extras - n_tsk0; // remaining extras
    tc0 = gr_size0 * n_tsk0;
    tc1 = tc - tc0;
  } else { // n_tsk0 > extras
    ext1 = 0; // no extra iters in 2nd half
    ext0 = extras;
    tc1 = grainsize * n_tsk1;
    tc0 = tc - tc1;
  }
  ub0 = lower + st * (tc0 - 1);
  lb1 = ub0 + st;

  // create pattern task for 2nd half of the loop
  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
  // adjust lower bound (upper bound is not changed) for the 2nd half
  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
  if (ptask_dup != NULL) // construct firstprivates, etc.
4342 ptask_dup(next_task, task, 0); 4343 *ub = ub0; // adjust upper bound for the 1st half 4344 4345 // create auxiliary task for 2nd half of the loop 4346 kmp_task_t *new_task = 4347 __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *), 4348 sizeof(__taskloop_params_t), &__kmp_taskloop_task); 4349 __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds; 4350 p->task = next_task; 4351 p->lb = (kmp_uint64 *)((char *)next_task + lower_offset); 4352 p->ub = (kmp_uint64 *)((char *)next_task + upper_offset); 4353 p->task_dup = task_dup; 4354 p->st = st; 4355 p->ub_glob = ub_glob; 4356 p->num_tasks = n_tsk1; 4357 p->grainsize = grainsize; 4358 p->extras = ext1; 4359 p->tc = tc1; 4360 p->num_t_min = num_t_min; 4361 #if OMPT_SUPPORT 4362 p->codeptr_ra = codeptr_ra; 4363 #endif 4364 4365 #if OMPT_SUPPORT 4366 // schedule new task with correct return address for OMPT events 4367 __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra); 4368 #else 4369 __kmp_omp_task(gtid, new_task, true); // schedule new task 4370 #endif 4371 4372 // execute the 1st half of current subrange 4373 if (n_tsk0 > num_t_min) 4374 __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0, 4375 ext0, tc0, num_t_min, 4376 #if OMPT_SUPPORT 4377 codeptr_ra, 4378 #endif 4379 task_dup); 4380 else 4381 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, 4382 gr_size0, ext0, tc0, 4383 #if OMPT_SUPPORT 4384 codeptr_ra, 4385 #endif 4386 task_dup); 4387 4388 KA_TRACE(40, ("__kmpc_taskloop_recur(exit): T#%d\n", gtid)); 4389 } 4390 4391 /*! 4392 @ingroup TASKING 4393 @param loc Source location information 4394 @param gtid Global thread ID 4395 @param task Task structure 4396 @param if_val Value of the if clause 4397 @param lb Pointer to loop lower bound in task structure 4398 @param ub Pointer to loop upper bound in task structure 4399 @param st Loop stride 4400 @param nogroup Flag, 1 if no taskgroup needs to be added, 0 otherwise 4401 @param sched Schedule specified 0/1/2 for none/grainsize/num_tasks 4402 @param grainsize Schedule value if specified 4403 @param task_dup Tasks duplication routine 4404 4405 Execute the taskloop construct. 
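
Illustration only (hand-written sketch, not generated by a compiler; names
such as pattern_task, lb, ub and task_dup are placeholders): a user loop like

    #pragma omp taskloop grainsize(4)
    for (int i = 0; i < 10; ++i)
      body(i);

is lowered to a pattern task plus a call of roughly this shape, where sched=1
selects the grainsize schedule:

    __kmpc_taskloop(loc, gtid, pattern_task, /*if_val=*/1, &lb, &ub, /*st=*/1,
                    /*nogroup=*/0, /*sched=*/1, /*grainsize=*/4, task_dup);

For tc=10 iterations and grainsize=4 the code below computes num_tasks=10/4=2,
then rebalances to grainsize=10/2=5 and extras=0, i.e. two tasks of five
iterations each.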
4406 */ 4407 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, 4408 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, 4409 int sched, kmp_uint64 grainsize, void *task_dup) { 4410 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 4411 KMP_DEBUG_ASSERT(task != NULL); 4412 4413 if (nogroup == 0) { 4414 #if OMPT_SUPPORT && OMPT_OPTIONAL 4415 OMPT_STORE_RETURN_ADDRESS(gtid); 4416 #endif 4417 __kmpc_taskgroup(loc, gtid); 4418 } 4419 4420 // ========================================================================= 4421 // calculate loop parameters 4422 kmp_taskloop_bounds_t task_bounds(task, lb, ub); 4423 kmp_uint64 tc; 4424 // compiler provides global bounds here 4425 kmp_uint64 lower = task_bounds.get_lb(); 4426 kmp_uint64 upper = task_bounds.get_ub(); 4427 kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag 4428 kmp_uint64 num_tasks = 0, extras = 0; 4429 kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks; 4430 kmp_info_t *thread = __kmp_threads[gtid]; 4431 kmp_taskdata_t *current_task = thread->th.th_current_task; 4432 4433 KA_TRACE(20, ("__kmpc_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, " 4434 "grain %llu(%d), dup %p\n", 4435 gtid, taskdata, lower, upper, st, grainsize, sched, task_dup)); 4436 4437 // compute trip count 4438 if (st == 1) { // most common case 4439 tc = upper - lower + 1; 4440 } else if (st < 0) { 4441 tc = (lower - upper) / (-st) + 1; 4442 } else { // st > 0 4443 tc = (upper - lower) / st + 1; 4444 } 4445 if (tc == 0) { 4446 KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d zero-trip loop\n", gtid)); 4447 // free the pattern task and exit 4448 __kmp_task_start(gtid, task, current_task); 4449 // do not execute anything for zero-trip loop 4450 __kmp_task_finish<false>(gtid, task, current_task); 4451 return; 4452 } 4453 4454 #if OMPT_SUPPORT && OMPT_OPTIONAL 4455 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 4456 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 4457 if (ompt_enabled.ompt_callback_work) { 4458 ompt_callbacks.ompt_callback(ompt_callback_work)( 4459 ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data), 4460 &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0)); 4461 } 4462 #endif 4463 4464 if (num_tasks_min == 0) 4465 // TODO: can we choose better default heuristic? 
4466 num_tasks_min = 4467 KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE); 4468 4469 // compute num_tasks/grainsize based on the input provided 4470 switch (sched) { 4471 case 0: // no schedule clause specified, we can choose the default 4472 // let's try to schedule (team_size*10) tasks 4473 grainsize = thread->th.th_team_nproc * 10; 4474 KMP_FALLTHROUGH(); 4475 case 2: // num_tasks provided 4476 if (grainsize > tc) { 4477 num_tasks = tc; // too big num_tasks requested, adjust values 4478 grainsize = 1; 4479 extras = 0; 4480 } else { 4481 num_tasks = grainsize; 4482 grainsize = tc / num_tasks; 4483 extras = tc % num_tasks; 4484 } 4485 break; 4486 case 1: // grainsize provided 4487 if (grainsize > tc) { 4488 num_tasks = 1; // too big grainsize requested, adjust values 4489 grainsize = tc; 4490 extras = 0; 4491 } else { 4492 num_tasks = tc / grainsize; 4493 // adjust grainsize for balanced distribution of iterations 4494 grainsize = tc / num_tasks; 4495 extras = tc % num_tasks; 4496 } 4497 break; 4498 default: 4499 KMP_ASSERT2(0, "unknown scheduling of taskloop"); 4500 } 4501 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras); 4502 KMP_DEBUG_ASSERT(num_tasks > extras); 4503 KMP_DEBUG_ASSERT(num_tasks > 0); 4504 // ========================================================================= 4505 4506 // check if clause value first 4507 // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native) 4508 if (if_val == 0) { // if(0) specified, mark task as serial 4509 taskdata->td_flags.task_serial = 1; 4510 taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied 4511 // always start serial tasks linearly 4512 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, 4513 grainsize, extras, tc, 4514 #if OMPT_SUPPORT 4515 OMPT_GET_RETURN_ADDRESS(0), 4516 #endif 4517 task_dup); 4518 // !taskdata->td_flags.native => currently force linear spawning of tasks 4519 // for GOMP_taskloop 4520 } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) { 4521 KA_TRACE(20, ("__kmpc_taskloop: T#%d, go recursive: tc %llu, #tasks %llu" 4522 "(%lld), grain %llu, extras %llu\n", 4523 gtid, tc, num_tasks, num_tasks_min, grainsize, extras)); 4524 __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, 4525 grainsize, extras, tc, num_tasks_min, 4526 #if OMPT_SUPPORT 4527 OMPT_GET_RETURN_ADDRESS(0), 4528 #endif 4529 task_dup); 4530 } else { 4531 KA_TRACE(20, ("__kmpc_taskloop: T#%d, go linear: tc %llu, #tasks %llu" 4532 "(%lld), grain %llu, extras %llu\n", 4533 gtid, tc, num_tasks, num_tasks_min, grainsize, extras)); 4534 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, 4535 grainsize, extras, tc, 4536 #if OMPT_SUPPORT 4537 OMPT_GET_RETURN_ADDRESS(0), 4538 #endif 4539 task_dup); 4540 } 4541 4542 #if OMPT_SUPPORT && OMPT_OPTIONAL 4543 if (ompt_enabled.ompt_callback_work) { 4544 ompt_callbacks.ompt_callback(ompt_callback_work)( 4545 ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data), 4546 &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0)); 4547 } 4548 #endif 4549 4550 if (nogroup == 0) { 4551 #if OMPT_SUPPORT && OMPT_OPTIONAL 4552 OMPT_STORE_RETURN_ADDRESS(gtid); 4553 #endif 4554 __kmpc_end_taskgroup(loc, gtid); 4555 } 4556 KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid)); 4557 } 4558 4559 #endif 4560