/*
 * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_wait_release.h"
#include "kmp_taskdeps.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

#include "tsan_annotations.h"

/* forward declaration */
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
                                 kmp_info_t *this_thr);
static void __kmp_alloc_task_deque(kmp_info_t *thread,
                                   kmp_thread_data_t *thread_data);
static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
                                           kmp_task_team_t *task_team);
static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);

#ifdef BUILD_TIED_TASK_STACK

// __kmp_trace_task_stack: print the tied tasks from the task stack in order
// from top to bottom
//
// gtid: global thread identifier for thread containing stack
// thread_data: thread data for task team thread containing stack
// threshold: value above which the trace statement triggers
// location: string identifying call site of this function (for trace)
static void __kmp_trace_task_stack(kmp_int32 gtid,
                                   kmp_thread_data_t *thread_data,
                                   int threshold, char *location) {
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_taskdata_t **stack_top = task_stack->ts_top;
  kmp_int32 entries = task_stack->ts_entries;
  kmp_taskdata_t *tied_task;

  KA_TRACE(
      threshold,
      ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
       "first_block = %p, stack_top = %p \n",
       location, gtid, entries, task_stack->ts_first_block, stack_top));

  KMP_DEBUG_ASSERT(stack_top != NULL);
  KMP_DEBUG_ASSERT(entries > 0);

  while (entries != 0) {
    KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
    // fix up ts_top if we need to pop from previous block
    if ((entries & TASK_STACK_INDEX_MASK) == 0) {
      kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);

      stack_block = stack_block->sb_prev;
      stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
    }

    // finish bookkeeping
    stack_top--;
    entries--;

    tied_task = *stack_top;

    KMP_DEBUG_ASSERT(tied_task != NULL);
    KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);

    KA_TRACE(threshold,
             ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, "
              "stack_top=%p, tied_task=%p\n",
              location, gtid, entries, stack_top, tied_task));
  }
  KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);

  KA_TRACE(threshold,
           ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
            location, gtid));
}
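
// Layout of the suspended-tied-task stack (BUILD_TIED_TASK_STACK only):
// ts_first_block holds TASK_STACK_BLOCK_SIZE entries inline; further blocks
// are heap-allocated and linked through sb_next/sb_prev. ts_top points at the
// next free slot and ts_entries counts the tied tasks currently on the stack.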
// __kmp_init_task_stack: initialize the task stack for the first time
// after a thread_data structure is created.
// It should not be necessary to do this again (assuming the stack works).
//
// gtid: global thread identifier of calling thread
// thread_data: thread data for task team thread containing stack
static void __kmp_init_task_stack(kmp_int32 gtid,
                                  kmp_thread_data_t *thread_data) {
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_stack_block_t *first_block;

  // set up the first block of the stack
  first_block = &task_stack->ts_first_block;
  task_stack->ts_top = (kmp_taskdata_t **)first_block;
  memset((void *)first_block, '\0',
         TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));

  // initialize the stack to be empty
  task_stack->ts_entries = TASK_STACK_EMPTY;
  first_block->sb_next = NULL;
  first_block->sb_prev = NULL;
}

// __kmp_free_task_stack: free the task stack when thread_data is destroyed.
//
// gtid: global thread identifier for calling thread
// thread_data: thread info for thread containing stack
static void __kmp_free_task_stack(kmp_int32 gtid,
                                  kmp_thread_data_t *thread_data) {
  kmp_info_t *thread = __kmp_threads[gtid]; // thread that owns the blocks
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_stack_block_t *stack_block = &task_stack->ts_first_block;

  KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
  // free from the second block of the stack
  while (stack_block != NULL) {
    kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;

    stack_block->sb_next = NULL;
    stack_block->sb_prev = NULL;
    if (stack_block != &task_stack->ts_first_block) {
      __kmp_thread_free(thread,
                        stack_block); // free the block, if not the first
    }
    stack_block = next_block;
  }
  // initialize the stack to be empty
  task_stack->ts_entries = 0;
  task_stack->ts_top = NULL;
}
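
// Push/pop cross a block boundary whenever ts_entries is a multiple of
// TASK_STACK_BLOCK_SIZE, i.e. when (ts_entries & TASK_STACK_INDEX_MASK) == 0:
// push then advances ts_top to the first slot of the next block (allocating
// one if needed), and pop rewinds ts_top to the last slot of the previous
// block before reading the entry.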
// __kmp_push_task_stack: Push the tied task onto the task stack.
// Grow the stack if necessary by allocating another block.
//
// gtid: global thread identifier for calling thread
// thread: thread info for thread containing stack
// tied_task: the task to push on the stack
static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
                                  kmp_taskdata_t *tied_task) {
  // GEH - need to consider what to do if tt_threads_data not allocated yet
  kmp_thread_data_t *thread_data =
      &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;

  if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
    return; // Don't push anything on stack if team or team tasks are serialized
  }

  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);

  KA_TRACE(20,
           ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
            gtid, thread, tied_task));
  // Store entry
  *(task_stack->ts_top) = tied_task;

  // Do bookkeeping for next push
  task_stack->ts_top++;
  task_stack->ts_entries++;

  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
    // Find beginning of this task block
    kmp_stack_block_t *stack_block =
        (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);

    // Check if we already have a block
    if (stack_block->sb_next !=
        NULL) { // reset ts_top to beginning of next block
      task_stack->ts_top = &stack_block->sb_next->sb_block[0];
    } else { // Alloc new block and link it up
      kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
          thread, sizeof(kmp_stack_block_t));

      task_stack->ts_top = &new_block->sb_block[0];
      stack_block->sb_next = new_block;
      new_block->sb_prev = stack_block;
      new_block->sb_next = NULL;

      KA_TRACE(
          30,
          ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
           gtid, tied_task, new_block));
    }
  }
  KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
                tied_task));
}
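
// Popping mirrors the push path: the entry is not returned to the caller, it
// is only compared (in debug builds) against the task that is ending, which
// verifies that tied tasks are resumed in LIFO order.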
// __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return
// the task, just check to make sure it matches the ending task passed in.
//
// gtid: global thread identifier for the calling thread
// thread: thread info structure containing stack
// tied_task: the task popped off the stack
// ending_task: the task that is ending (should match popped task)
static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
                                 kmp_taskdata_t *ending_task) {
  // GEH - need to consider what to do if tt_threads_data not allocated yet
  kmp_thread_data_t *thread_data =
      &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_taskdata_t *tied_task;

  if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
    // Don't pop anything from stack if team or team tasks are serialized
    return;
  }

  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
  KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);

  KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
                thread));

  // fix up ts_top if we need to pop from previous block
  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
    kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);

    stack_block = stack_block->sb_prev;
    task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
  }

  // finish bookkeeping
  task_stack->ts_top--;
  task_stack->ts_entries--;

  tied_task = *(task_stack->ts_top);

  KMP_DEBUG_ASSERT(tied_task != NULL);
  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
  KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly

  KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
                tied_task));
  return;
}
#endif /* BUILD_TIED_TASK_STACK */
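
// Task Scheduling Constraint (TSC) in brief: when stealing is constrained, a
// tied task may only start on this thread if it is a descendant of the
// deferred tied task most recently scheduled here (td_last_tied); since
// deferred tied tasks form a chain, checking the last one is sufficient.
// The mutexinoutset check below additionally tries to take every mtx lock of
// the new task, rolling back on failure, and records success by negating
// dn.mtx_num_locks.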
// returns 1 if new task is allowed to execute, 0 otherwise
// checks Task Scheduling constraint (if requested) and
// mutexinoutset dependencies if any
static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
                                  const kmp_taskdata_t *tasknew,
                                  const kmp_taskdata_t *taskcurr) {
  if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
    // Check if the candidate obeys the Task Scheduling Constraints (TSC)
    // only descendant of all deferred tied tasks can be scheduled, checking
    // the last one is enough, as it in turn is the descendant of all others
    kmp_taskdata_t *current = taskcurr->td_last_tied;
    KMP_DEBUG_ASSERT(current != NULL);
    // check if the task is not suspended on barrier
    if (current->td_flags.tasktype == TASK_EXPLICIT ||
        current->td_taskwait_thread > 0) { // <= 0 on barrier
      kmp_int32 level = current->td_level;
      kmp_taskdata_t *parent = tasknew->td_parent;
      while (parent != current && parent->td_level > level) {
        // check generation up to the level of the current task
        parent = parent->td_parent;
        KMP_DEBUG_ASSERT(parent != NULL);
      }
      if (parent != current)
        return false;
    }
  }
  // Check mutexinoutset dependencies, acquire locks
  kmp_depnode_t *node = tasknew->td_depnode;
  if (node && (node->dn.mtx_num_locks > 0)) {
    for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
      KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
      if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
        continue;
      // could not get the lock, release previous locks
      for (int j = i - 1; j >= 0; --j)
        __kmp_release_lock(node->dn.mtx_locks[j], gtid);
      return false;
    }
    // negative num_locks means all locks acquired successfully
    node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
  }
  return true;
}

// __kmp_realloc_task_deque:
// Re-allocates a task deque for a particular thread, copies the content from
// the old deque and adjusts the necessary data structures relating to the
// deque. This operation must be done with the deque_lock being held
static void __kmp_realloc_task_deque(kmp_info_t *thread,
                                     kmp_thread_data_t *thread_data) {
  kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
  kmp_int32 new_size = 2 * size;

  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
                "%d] for thread_data %p\n",
                __kmp_gtid_from_thread(thread), size, new_size, thread_data));

  kmp_taskdata_t **new_deque =
      (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));

  int i, j;
  for (i = thread_data->td.td_deque_head, j = 0; j < size;
       i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
    new_deque[j] = thread_data->td.td_deque[i];

  __kmp_free(thread_data->td.td_deque);

  thread_data->td.td_deque_head = 0;
  thread_data->td.td_deque_tail = size;
  thread_data->td.td_deque = new_deque;
  thread_data->td.td_deque_size = new_size;
}

// __kmp_push_task: Add a task to the thread's deque
static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_task_team_t *task_team = thread->th.th_task_team;
  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
  kmp_thread_data_t *thread_data;

  KA_TRACE(20,
           ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));

  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(
        20,
        ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
         gtid, counter, taskdata));
  }

  // The first check avoids building task_team thread data if serialized
  if (taskdata->td_flags.task_serial) {
    KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
                  "TASK_NOT_PUSHED for task %p\n",
                  gtid, taskdata));
    return TASK_NOT_PUSHED;
  }

  // Now that serialized tasks have returned, we can assume that we are not in
  // immediate exec mode
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  if (!KMP_TASKING_ENABLED(task_team)) {
    __kmp_enable_tasking(task_team, thread);
  }
  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);

  // Find tasking deque specific to encountering thread
  thread_data = &task_team->tt.tt_threads_data[tid];

  // No lock needed since only owner can allocate
  if (thread_data->td.td_deque == NULL) {
    __kmp_alloc_task_deque(thread, thread_data);
  }

  int locked = 0;
  // Check if deque is full
  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    if (__kmp_enable_task_throttling &&
        __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                              thread->th.th_current_task)) {
      KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
                    "TASK_NOT_PUSHED for task %p\n",
                    gtid, taskdata));
      return TASK_NOT_PUSHED;
    } else {
      __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
      locked = 1;
      if (TCR_4(thread_data->td.td_deque_ntasks) >=
          TASK_DEQUE_SIZE(thread_data->td)) {
        // expand deque to push the task which is not allowed to execute
        __kmp_realloc_task_deque(thread, thread_data);
      }
    }
  }
  // Lock the deque for the task push operation
  if (!locked) {
    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
    // Need to recheck as we can get a proxy task from thread outside of OpenMP
    if (TCR_4(thread_data->td.td_deque_ntasks) >=
        TASK_DEQUE_SIZE(thread_data->td)) {
      if (__kmp_enable_task_throttling &&
          __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                                thread->th.th_current_task)) {
        __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
        KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
                      "returning TASK_NOT_PUSHED for task %p\n",
                      gtid, taskdata));
        return TASK_NOT_PUSHED;
      } else {
        // expand deque to push the task which is not allowed to execute
        __kmp_realloc_task_deque(thread, thread_data);
      }
    }
  }
  // Must have room since no thread can add tasks but calling thread
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
                   TASK_DEQUE_SIZE(thread_data->td));

  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
      taskdata; // Push taskdata
  // Wrap index.
  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count

  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
                "task=%p ntasks=%d head=%u tail=%u\n",
                gtid, taskdata, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));

  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  return TASK_SUCCESSFULLY_PUSHED;
}
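
// Summary of the push path above: each thread owns a bounded deque protected
// by td_deque_lock. When the deque is full, the task is either rejected
// (TASK_NOT_PUSHED, leaving the caller to run it immediately) if throttling is
// enabled and the TSC allows it, or the deque is doubled via
// __kmp_realloc_task_deque.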
// __kmp_pop_current_task_from_thread: set up current task from called thread
// when team ends
//
// this_thr: thread structure to set current_task in.
void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));

  this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;

  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));
}

// __kmp_push_current_task_to_thread: set up current task in called thread for
// a new team
//
// this_thr: thread structure to set up
// team: team for implicit task data
// tid: thread within team to set up
void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
                                       int tid) {
  // current task of the thread is a parent of the new just created implicit
  // tasks of new team
  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));

  KMP_DEBUG_ASSERT(this_thr != NULL);

  if (tid == 0) {
    if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
      team->t.t_implicit_task_taskdata[0].td_parent =
          this_thr->th.th_current_task;
      this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
    }
  } else {
    team->t.t_implicit_task_taskdata[tid].td_parent =
        team->t.t_implicit_task_taskdata[0].td_parent;
    this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
  }

  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));
}

// __kmp_task_start: bookkeeping for a task starting execution
//
// GTID: global thread id of calling thread
// task: task starting execution
// current_task: task suspending
static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
                             kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];

  KA_TRACE(10,
           ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
            gtid, taskdata, current_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  // mark currently executing task as suspended
  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
  current_task->td_flags.executing = 0;

// Add task to stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_push_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  // mark starting task as executing and as current task
  thread->th.th_current_task = taskdata;

  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  taskdata->td_flags.started = 1;
  taskdata->td_flags.executing = 1;
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  // GEH TODO: shouldn't we pass some sort of location identifier here?
  // APT: yes, we will pass location here.
  // need to store current thread state (in a thread or taskdata structure)
  // before setting work_state, otherwise wrong state is set after end of task

  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));

  return;
}

#if OMPT_SUPPORT
//------------------------------------------------------------------------------
// __ompt_task_init:
// Initialize OMPT fields maintained by a task. This will only be called after
// ompt_start_tool, so we already know whether ompt is enabled or not.

static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
  // The calls to __ompt_task_init already have the ompt_enabled condition.
  task->ompt_task_info.task_data.value = 0;
  task->ompt_task_info.frame.exit_frame = ompt_data_none;
  task->ompt_task_info.frame.enter_frame = ompt_data_none;
  task->ompt_task_info.frame.exit_frame_flags =
      ompt_frame_runtime | ompt_frame_framepointer;
  task->ompt_task_info.frame.enter_frame_flags =
      ompt_frame_runtime | ompt_frame_framepointer;
  task->ompt_task_info.ndeps = 0;
  task->ompt_task_info.deps = NULL;
}

// __ompt_task_start:
// Build and trigger task-begin event
static inline void __ompt_task_start(kmp_task_t *task,
                                     kmp_taskdata_t *current_task,
                                     kmp_int32 gtid) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  ompt_task_status_t status = ompt_task_switch;
  if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
    status = ompt_task_yield;
    __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
  }
  /* let OMPT know that we're about to run this task */
  if (ompt_enabled.ompt_callback_task_schedule) {
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        &(current_task->ompt_task_info.task_data), status,
        &(taskdata->ompt_task_info.task_data));
  }
  taskdata->ompt_task_info.scheduling_parent = current_task;
}

// __ompt_task_finish:
// Build and trigger final task-schedule event
static inline void __ompt_task_finish(kmp_task_t *task,
                                      kmp_taskdata_t *resumed_task,
                                      ompt_task_status_t status) {
  if (ompt_enabled.ompt_callback_task_schedule) {
    kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
    if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
        taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
      status = ompt_task_cancel;
    }

    /* let OMPT know that we're returning to the callee task */
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        &(taskdata->ompt_task_info.task_data), status,
        (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
  }
}
#endif

template <bool ompt>
static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
                                               kmp_task_t *task,
                                               void *frame_address,
                                               void *return_address) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
                "current_task=%p\n",
                gtid, loc_ref, taskdata, current_task));

  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
                  "incremented for task %p\n",
                  gtid, counter, taskdata));
  }

  taskdata->td_flags.task_serial =
      1; // Execute this task immediately, not deferred.
  __kmp_task_start(gtid, task, current_task);

#if OMPT_SUPPORT
  if (ompt) {
    if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
      current_task->ompt_task_info.frame.enter_frame.ptr =
          taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
      current_task->ompt_task_info.frame.enter_frame_flags =
          taskdata->ompt_task_info.frame.exit_frame_flags =
              ompt_frame_application | ompt_frame_framepointer;
    }
    if (ompt_enabled.ompt_callback_task_create) {
      ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
          &(parent_info->task_data), &(parent_info->frame),
          &(taskdata->ompt_task_info.task_data),
          ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
          return_address);
    }
    __ompt_task_start(task, current_task, gtid);
  }
#endif // OMPT_SUPPORT

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, taskdata));
}

#if OMPT_SUPPORT
OMPT_NOINLINE
static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                           kmp_task_t *task,
                                           void *frame_address,
                                           void *return_address) {
  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
                                           return_address);
}
#endif // OMPT_SUPPORT

// __kmpc_omp_task_begin_if0: report that a given serialized task has started
// execution
//
// loc_ref: source location information; points to beginning of task block.
// gtid: global thread number.
// task: task thunk for the started task.
void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
                               kmp_task_t *task) {
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
    __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
                                   OMPT_GET_FRAME_ADDRESS(1),
                                   OMPT_LOAD_RETURN_ADDRESS(gtid));
    return;
  }
#endif
  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
}
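
// The *_if0 entry points are typically emitted by the compiler for undeferred
// tasks (e.g. "#pragma omp task if(0)"): the task body runs inline on the
// encountering thread between __kmpc_omp_task_begin_if0 and
// __kmpc_omp_task_complete_if0, so the task is forced onto the serial path
// (task_serial = 1) rather than being pushed to a deque.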
#ifdef TASK_UNUSED
// __kmpc_omp_task_begin: report that a given task has started execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(
      10,
      ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
       gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));

  __kmp_task_start(gtid, task, current_task);

  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED

// __kmp_free_task: free the current task space and the space for shareds
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
                            kmp_info_t *thread) {
  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
                taskdata));

  // Check to make sure all flags and counters have the correct values
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
                   taskdata->td_flags.task_serial == 1);
  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);

  taskdata->td_flags.freed = 1;
  ANNOTATE_HAPPENS_BEFORE(taskdata);
// deallocate the taskdata and shared variable blocks associated with this task
#if USE_FAST_MEMORY
  __kmp_fast_free(thread, taskdata);
#else /* ! USE_FAST_MEMORY */
  __kmp_thread_free(thread, taskdata);
#endif

  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
}

// __kmp_free_task_and_ancestors: free the current task and ancestors without
// children
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
                                          kmp_taskdata_t *taskdata,
                                          kmp_info_t *thread) {
  // Proxy tasks must always be allowed to free their parents
  // because they can be run in background even in serial mode.
  kmp_int32 team_serial =
      (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
      !taskdata->td_flags.proxy;
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
  KMP_DEBUG_ASSERT(children >= 0);

  // Now, go up the ancestor tree to see if any ancestors can now be freed.
  while (children == 0) {
    kmp_taskdata_t *parent_taskdata = taskdata->td_parent;

    KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
                  "and freeing itself\n",
                  gtid, taskdata));

    // --- Deallocate my ancestor task ---
    __kmp_free_task(gtid, taskdata, thread);

    taskdata = parent_taskdata;

    if (team_serial)
      return;
    // Stop checking ancestors at implicit task instead of walking up ancestor
    // tree to avoid premature deallocation of ancestors.
    if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
      if (taskdata->td_dephash) { // do we need to cleanup dephash?
        int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
        kmp_tasking_flags_t flags_old = taskdata->td_flags;
        if (children == 0 && flags_old.complete == 1) {
          kmp_tasking_flags_t flags_new = flags_old;
          flags_new.complete = 0;
          if (KMP_COMPARE_AND_STORE_ACQ32(
                  RCAST(kmp_int32 *, &taskdata->td_flags),
                  *RCAST(kmp_int32 *, &flags_old),
                  *RCAST(kmp_int32 *, &flags_new))) {
            KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
                           "dephash of implicit task %p\n",
                           gtid, taskdata));
            // cleanup dephash of finished implicit task
            __kmp_dephash_free_entries(thread, taskdata->td_dephash);
          }
        }
      }
      return;
    }
    // Predecrement simulated by "- 1" calculation
    children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
    KMP_DEBUG_ASSERT(children >= 0);
  }

  KA_TRACE(
      20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
           "not freeing it yet\n",
           gtid, taskdata, children));
}

// __kmp_task_finish: bookkeeping to do when a task finishes execution
//
// gtid: global thread ID for calling thread
// task: task to be finished
// resumed_task: task to be resumed. (may be NULL if task is serialized)
//
// template<ompt>: effectively ompt_enabled.enabled!=0
// the version with ompt=false is inlined, allowing the compiler to optimize
// away all OMPT code in this case
template <bool ompt>
static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *resumed_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_task_team_t *task_team =
      thread->th.th_task_team; // might be NULL for serial teams...
  kmp_int32 children = 0;

  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
                "task %p\n",
                gtid, taskdata, resumed_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

// Pop task from stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_pop_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
    // untied task needs to check the counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
    KA_TRACE(
        20,
        ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
         gtid, counter, taskdata));
    if (counter > 0) {
      // untied task is not done, to be continued possibly by other thread, do
      // not free it now
      if (resumed_task == NULL) {
        KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
        resumed_task = taskdata->td_parent; // In a serialized task, the resumed
        // task is the parent
      }
      thread->th.th_current_task = resumed_task; // restore current_task
      resumed_task->td_flags.executing = 1; // resume previous task
      KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
                    "resuming task %p\n",
                    gtid, taskdata, resumed_task));
      return;
    }
  }

  // Check mutexinoutset dependencies, release locks
  kmp_depnode_t *node = taskdata->td_depnode;
  if (node && (node->dn.mtx_num_locks < 0)) {
    // negative num_locks means all locks were acquired
    node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
    for (int i = node->dn.mtx_num_locks - 1; i >= 0; --i) {
      KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
      __kmp_release_lock(node->dn.mtx_locks[i], gtid);
    }
  }

  // bookkeeping for resuming task:
  // GEH - note tasking_ser => task_serial
  KMP_DEBUG_ASSERT(
      (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
      taskdata->td_flags.task_serial);
  if (taskdata->td_flags.task_serial) {
    if (resumed_task == NULL) {
      resumed_task = taskdata->td_parent; // In a serialized task, the resumed
      // task is the parent
    }
  } else {
    KMP_DEBUG_ASSERT(resumed_task !=
                     NULL); // verify that resumed task is passed as argument
  }

  /* If the tasks' destructor thunk flag has been set, we need to invoke the
     destructor thunk that has been generated by the compiler. The code is
     placed here, since at this point other tasks might have been released
     hence overlapping the destructor invocations with some other work in the
     released tasks. The OpenMP spec is not specific on when the destructors
     are invoked, so we should be free to choose. */
  if (taskdata->td_flags.destructors_thunk) {
    kmp_routine_entry_t destr_thunk = task->data1.destructors;
    KMP_ASSERT(destr_thunk);
    destr_thunk(gtid, task);
  }

  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  bool detach = false;
  if (taskdata->td_flags.detachable == TASK_DETACHABLE) {
    if (taskdata->td_allow_completion_event.type ==
        KMP_EVENT_ALLOW_COMPLETION) {
      // event hasn't been fulfilled yet. Try to detach task.
      __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
      if (taskdata->td_allow_completion_event.type ==
          KMP_EVENT_ALLOW_COMPLETION) {
        // task finished execution
        KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
        taskdata->td_flags.executing = 0; // suspend the finishing task

#if OMPT_SUPPORT
        // For a detached task that has not completed, we switch back to the
        // resumed task; omp_fulfill_event will signal completion later.
        // Locking is necessary to avoid a race with ompt_task_late_fulfill.
        if (ompt)
          __ompt_task_finish(task, resumed_task, ompt_task_detach);
#endif

        // no access to taskdata after this point!
        // __kmp_fulfill_event might free taskdata at any time from now

        taskdata->td_flags.proxy = TASK_PROXY; // proxify!
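        // From here on the task is treated like a proxy task: a later
        // omp_fulfill_event() on the allow-completion event is expected to
        // perform the completion (and eventual freeing) of this task.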
        detach = true;
      }
      __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
    }
  }

  if (!detach) {
    taskdata->td_flags.complete = 1; // mark the task as completed

#if OMPT_SUPPORT
    // This is not a detached task, we are done here
    if (ompt)
      __ompt_task_finish(task, resumed_task, ompt_task_complete);
#endif

    // Only need to keep track of count if team parallel and tasking not
    // serialized, or task is detachable and event has already been fulfilled
    if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
        taskdata->td_flags.detachable == TASK_DETACHABLE) {
      // Predecrement simulated by "- 1" calculation
      children =
          KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
      KMP_DEBUG_ASSERT(children >= 0);
      if (taskdata->td_taskgroup)
        KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
      __kmp_release_deps(gtid, taskdata);
    } else if (task_team && task_team->tt.tt_found_proxy_tasks) {
      // if we found proxy tasks there could exist a dependency chain
      // with the proxy task as origin
      __kmp_release_deps(gtid, taskdata);
    }
    // td_flags.executing must be marked as 0 after __kmp_release_deps has been
    // called. Otherwise, if a task is executed immediately from the
    // release_deps code, the flag will be reset to 1 again by this same
    // function
    KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
    taskdata->td_flags.executing = 0; // suspend the finishing task
  }

  KA_TRACE(
      20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
           gtid, taskdata, children));

  // Free this task and then ancestor tasks if they have no children.
  // Restore th_current_task first as suggested by John:
  // johnmc: if an asynchronous inquiry peers into the runtime system
  // it doesn't see the freed task as the current task.
  thread->th.th_current_task = resumed_task;
  if (!detach)
    __kmp_free_task_and_ancestors(gtid, taskdata, thread);

  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
  resumed_task->td_flags.executing = 1; // resume previous task

  KA_TRACE(
      10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
           gtid, taskdata, resumed_task));

  return;
}

template <bool ompt>
static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
                                                  kmp_int32 gtid,
                                                  kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
  // this routine will provide task to resume
  __kmp_task_finish<ompt>(gtid, task, NULL);

  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));

#if OMPT_SUPPORT
  if (ompt) {
    ompt_frame_t *ompt_frame;
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    ompt_frame->enter_frame = ompt_data_none;
    ompt_frame->enter_frame_flags =
        ompt_frame_runtime | ompt_frame_framepointer;
  }
#endif

  return;
}

#if OMPT_SUPPORT
OMPT_NOINLINE
void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                       kmp_task_t *task) {
  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
}
#endif // OMPT_SUPPORT

// __kmpc_omp_task_complete_if0: report that a task has completed execution
//
// loc_ref: source location information; points to end of task block.
// gtid: global thread number.
// task: task thunk for the completed task.
void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *task) {
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
    return;
  }
#endif
  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
}

#ifdef TASK_UNUSED
// __kmpc_omp_task_complete: report that a task has completed execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
                              kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));

  __kmp_task_finish<false>(gtid, task,
                           NULL); // Not sure how to find task to resume

  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED

// __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
// task for a given thread
//
// loc_ref: reference to source location of parallel region
// this_thr: thread data structure corresponding to implicit task
// team: team for this_thr
// tid: thread id of given thread within team
// set_curr_task: TRUE if need to push current task to thread
// NOTE: Routine does not set up the implicit task ICVS. This is assumed to
// have already been done elsewhere.
// TODO: Get better loc_ref. Value passed in may be NULL
void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
                              kmp_team_t *team, int tid, int set_curr_task) {
  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];

  KF_TRACE(
      10,
      ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
       tid, team, task, set_curr_task ? "TRUE" : "FALSE"));

  task->td_task_id = KMP_GEN_TASK_ID();
  task->td_team = team;
  //    task->td_parent = NULL;  // fix for CQ230101 (broken parent task info
  //    in debugger)
  task->td_ident = loc_ref;
  task->td_taskwait_ident = NULL;
  task->td_taskwait_counter = 0;
  task->td_taskwait_thread = 0;

  task->td_flags.tiedness = TASK_TIED;
  task->td_flags.tasktype = TASK_IMPLICIT;
  task->td_flags.proxy = TASK_FULL;

  // All implicit tasks are executed immediately, not deferred
  task->td_flags.task_serial = 1;
  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  task->td_flags.started = 1;
  task->td_flags.executing = 1;
  task->td_flags.complete = 0;
  task->td_flags.freed = 0;

  task->td_depnode = NULL;
  task->td_last_tied = task;
  task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;

  if (set_curr_task) { // only do this init first time thread is created
    KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
    // Not used: don't need to deallocate implicit task
    KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
    task->td_taskgroup = NULL; // An implicit task does not have taskgroup
    task->td_dephash = NULL;
    __kmp_push_current_task_to_thread(this_thr, team, tid);
  } else {
    KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
    KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
  }

#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(task, tid);
#endif

  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
                team, task));
}
// __kmp_finish_implicit_task: Release resources associated to implicit tasks
// at the end of parallel regions. Some resources are kept for reuse in the
// next parallel region.
//
// thread: thread data structure corresponding to implicit task
void __kmp_finish_implicit_task(kmp_info_t *thread) {
  kmp_taskdata_t *task = thread->th.th_current_task;
  if (task->td_dephash) {
    int children;
    task->td_flags.complete = 1;
    children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
    kmp_tasking_flags_t flags_old = task->td_flags;
    if (children == 0 && flags_old.complete == 1) {
      kmp_tasking_flags_t flags_new = flags_old;
      flags_new.complete = 0;
      if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
                                      *RCAST(kmp_int32 *, &flags_old),
                                      *RCAST(kmp_int32 *, &flags_new))) {
        KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
                       "dephash of implicit task %p\n",
                       thread->th.th_info.ds.ds_gtid, task));
        __kmp_dephash_free_entries(thread, task->td_dephash);
      }
    }
  }
}

// __kmp_free_implicit_task: Release resources associated to implicit tasks
// when these are destroyed
//
// thread: thread data structure corresponding to implicit task
void __kmp_free_implicit_task(kmp_info_t *thread) {
  kmp_taskdata_t *task = thread->th.th_current_task;
  if (task && task->td_dephash) {
    __kmp_dephash_free(thread, task->td_dephash);
    task->td_dephash = NULL;
  }
}

// Round up a size to a power of two specified by val: Used to insert padding
// between structures co-allocated using a single malloc() call
static size_t __kmp_round_up_to_val(size_t size, size_t val) {
  if (size & (val - 1)) {
    size &= ~(val - 1);
    if (size <= KMP_SIZE_T_MAX - val) {
      size += val; // Round up if there is no overflow.
    }
  }
  return size;
} // __kmp_round_up_to_val

// __kmp_task_alloc: Allocate the taskdata and task data structures for a task
//
// loc_ref: source location information
// gtid: global thread number.
// flags: include tiedness & task type (explicit vs. implicit) of the ''new''
// task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
// sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
// private vars accessed in task.
// sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
// in task.
// task_entry: Pointer to task code entry point generated by compiler.
// returns: a pointer to the allocated kmp_task_t structure (task).
kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                             kmp_tasking_flags_t *flags,
                             size_t sizeof_kmp_task_t, size_t sizeof_shareds,
                             kmp_routine_entry_t task_entry) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_team_t *team = thread->th.th_team;
  kmp_taskdata_t *parent_task = thread->th.th_current_task;
  size_t shareds_offset;

  if (!TCR_4(__kmp_init_middle))
    __kmp_middle_initialize();

  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
                sizeof_shareds, task_entry));

  if (parent_task->td_flags.final) {
    if (flags->merged_if0) {
    }
    flags->final = 1;
  }
  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
    // Untied task encountered causes the TSC algorithm to check entire deque
    // of the victim thread. If no untied task encountered, then checking the
    // head of the deque should be enough.
    KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
  }

  // Detachable tasks are not proxy tasks yet but could be in the future. Doing
  // the tasking setup
  // when that happens is too late.
  if (flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) {
    if (flags->proxy == TASK_PROXY) {
      flags->tiedness = TASK_UNTIED;
      flags->merged_if0 = 1;
    }
    /* are we running in a sequential parallel or tskm_immediate_exec... we need
       tasking support enabled */
    if ((thread->th.th_task_team) == NULL) {
      /* This should only happen if the team is serialized
          setup a task team and propagate it to the thread */
      KMP_DEBUG_ASSERT(team->t.t_serialized);
      KA_TRACE(30,
               ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
                gtid));
      __kmp_task_team_setup(
          thread, team,
          1); // 1 indicates setup the current team regardless of nthreads
      thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
    }
    kmp_task_team_t *task_team = thread->th.th_task_team;

    /* tasking must be enabled now as the task might not be pushed */
    if (!KMP_TASKING_ENABLED(task_team)) {
      KA_TRACE(
          30,
          ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
      __kmp_enable_tasking(task_team, thread);
      kmp_int32 tid = thread->th.th_info.ds.ds_tid;
      kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
      // No lock needed since only owner can allocate
      if (thread_data->td.td_deque == NULL) {
        __kmp_alloc_task_deque(thread, thread_data);
      }
    }

    if (task_team->tt.tt_found_proxy_tasks == FALSE)
      TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
  }

  // Calculate shared structure offset including padding after kmp_task_t struct
  // to align pointers in shared struct
  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));

  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
                shareds_offset));
  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
                sizeof_shareds));

// Avoid double allocation here by combining shareds with taskdata
#if USE_FAST_MEMORY
  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
                                                               sizeof_shareds);
#else /* ! USE_FAST_MEMORY */
  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
                                                               sizeof_shareds);
#endif /* USE_FAST_MEMORY */
  ANNOTATE_HAPPENS_AFTER(taskdata);

  task = KMP_TASKDATA_TO_TASK(taskdata);

// Make sure task & taskdata are aligned appropriately
#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
#else
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
#endif
  if (sizeof_shareds > 0) {
    // Avoid double allocation here by combining shareds with taskdata
    task->shareds = &((char *)taskdata)[shareds_offset];
    // Make sure shareds struct is aligned to pointer size
    KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
                     0);
  } else {
    task->shareds = NULL;
  }
  task->routine = task_entry;
  task->part_id = 0; // AC: Always start with 0 part id

  taskdata->td_task_id = KMP_GEN_TASK_ID();
  taskdata->td_team = team;
  taskdata->td_alloc_thread = thread;
  taskdata->td_parent = parent_task;
  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
  taskdata->td_ident = loc_ref;
  taskdata->td_taskwait_ident = NULL;
  taskdata->td_taskwait_counter = 0;
  taskdata->td_taskwait_thread = 0;
  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
  // avoid copying icvs for proxy tasks
  if (flags->proxy == TASK_FULL)
    copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);

  taskdata->td_flags.tiedness = flags->tiedness;
  taskdata->td_flags.final = flags->final;
  taskdata->td_flags.merged_if0 = flags->merged_if0;
  taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
  taskdata->td_flags.proxy = flags->proxy;
  taskdata->td_flags.detachable = flags->detachable;
  taskdata->td_task_team = thread->th.th_task_team;
  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
  taskdata->td_flags.tasktype = TASK_EXPLICIT;

  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);

  // GEH - TODO: fix this to copy parent task's value of team_serial flag
  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  // GEH - Note we serialize the task if the team is serialized to make sure
  // implicit parallel region tasks are not left until program termination to
  // execute. Also, it helps locality to execute immediately.
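  // A task with task_serial == 1 is undeferred: __kmp_push_task returns
  // TASK_NOT_PUSHED for it, and the encountering thread is expected to execute
  // it immediately instead of queuing it.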

  taskdata->td_flags.task_serial =
      (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
       taskdata->td_flags.tasking_ser);

  taskdata->td_flags.started = 0;
  taskdata->td_flags.executing = 0;
  taskdata->td_flags.complete = 0;
  taskdata->td_flags.freed = 0;

  taskdata->td_flags.native = flags->native;

  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
  // start at one because counts current task and children
  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
  taskdata->td_taskgroup =
      parent_task->td_taskgroup; // task inherits taskgroup from the parent task
  taskdata->td_dephash = NULL;
  taskdata->td_depnode = NULL;
  if (flags->tiedness == TASK_UNTIED)
    taskdata->td_last_tied = NULL; // will be set when the task is scheduled
  else
    taskdata->td_last_tied = taskdata;
  taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(taskdata, gtid);
#endif
  // Only need to keep track of child task counts if team parallel and tasking
  // not serialized or if it is a proxy or detachable task
  if (flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE ||
      !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
    KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
    if (parent_task->td_taskgroup)
      KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
    // Only need to keep track of allocated child tasks for explicit tasks
    // since implicit not deallocated
    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
      KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
    }
  }

  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
                gtid, taskdata, taskdata->td_parent));
  ANNOTATE_HAPPENS_BEFORE(task);

  return task;
}

kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_int32 flags, size_t sizeof_kmp_task_t,
                                  size_t sizeof_shareds,
                                  kmp_routine_entry_t task_entry) {
  kmp_task_t *retval;
  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;

  input_flags->native = FALSE;
  // __kmp_task_alloc() sets up all other runtime flags

  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
                input_flags->proxy ? "proxy" : "",
                input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
                sizeof_shareds, task_entry));

  retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
                            sizeof_shareds, task_entry);

  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));

  return retval;
}

kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                                         kmp_int32 flags,
                                         size_t sizeof_kmp_task_t,
                                         size_t sizeof_shareds,
                                         kmp_routine_entry_t task_entry,
                                         kmp_int64 device_id) {
  return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
                               sizeof_shareds, task_entry);
}
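
// Conceptually, the compiler lowers "#pragma omp task" into an allocation of
// the task thunk followed by a submission, roughly (illustrative sketch only;
// the exact flag encoding and data layout are produced by the compiler):
//
//   kmp_task_t *t = __kmpc_omp_task_alloc(loc, gtid, /*flags=*/1 /* tied */,
//                                         sizeof_kmp_task_t, sizeof_shareds,
//                                         &task_entry_fn);
//   /* ...copy firstprivate data and shared pointers into t... */
//   __kmpc_omp_task(loc, gtid, t);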

/*!
@ingroup TASKING
@param loc_ref location of the original task directive
@param gtid Global Thread ID of encountering thread
@param new_task task thunk allocated by __kmpc_omp_task_alloc() for the ''new
task''
@param naffins Number of affinity items
@param affin_list List of affinity items
@return Returns non-zero if registering affinity information was not successful.
Returns 0 if registration was successful
This entry registers the affinity information attached to a task with the task
thunk structure kmp_taskdata_t.
*/
kmp_int32
__kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *new_task, kmp_int32 naffins,
                                  kmp_task_affinity_info_t *affin_list) {
  return 0;
}

// __kmp_invoke_task: invoke the specified task
//
// gtid: global thread ID of caller
// task: the task to invoke
// current_task: the task to resume after task invocation
static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread;
  int discard = 0 /* false */;
  KA_TRACE(
      30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
           gtid, taskdata, current_task));
  KMP_DEBUG_ASSERT(task);
  if (taskdata->td_flags.proxy == TASK_PROXY &&
      taskdata->td_flags.complete == 1) {
    // This is a proxy task that was already completed but it needs to run
    // its bottom-half finish
    KA_TRACE(
        30,
        ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
         gtid, taskdata));

    __kmp_bottom_half_finish_proxy(gtid, task);

    KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
                  "proxy task %p, resuming task %p\n",
                  gtid, taskdata, current_task));

    return;
  }

#if OMPT_SUPPORT
  // For untied tasks, the first task executed only calls __kmpc_omp_task and
  // does not execute code.
  ompt_thread_info_t oldInfo;
  if (UNLIKELY(ompt_enabled.enabled)) {
    // Store the threads states and restore them after the task
    thread = __kmp_threads[gtid];
    oldInfo = thread->th.ompt_thread_info;
    thread->th.ompt_thread_info.wait_id = 0;
    thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
                                            ? ompt_state_work_serial
                                            : ompt_state_work_parallel;
    taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
#endif

  // Proxy tasks are not handled by the runtime
  if (taskdata->td_flags.proxy != TASK_PROXY) {
    ANNOTATE_HAPPENS_AFTER(task);
    __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
  }

  // TODO: cancel tasks if the parallel region has also been cancelled
  // TODO: check if this sequence can be hoisted above __kmp_task_start
  // if cancellation has been enabled for this run ...
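  // If the enclosing taskgroup or parallel region has been cancelled, the task
  // body below is skipped (discard == 1); the start/finish bookkeeping still
  // runs so child and taskgroup counters stay consistent.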
1491 if (__kmp_omp_cancellation) { 1492 thread = __kmp_threads[gtid]; 1493 kmp_team_t *this_team = thread->th.th_team; 1494 kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup; 1495 if ((taskgroup && taskgroup->cancel_request) || 1496 (this_team->t.t_cancel_request == cancel_parallel)) { 1497 #if OMPT_SUPPORT && OMPT_OPTIONAL 1498 ompt_data_t *task_data; 1499 if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) { 1500 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL); 1501 ompt_callbacks.ompt_callback(ompt_callback_cancel)( 1502 task_data, 1503 ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup 1504 : ompt_cancel_parallel) | 1505 ompt_cancel_discarded_task, 1506 NULL); 1507 } 1508 #endif 1509 KMP_COUNT_BLOCK(TASK_cancelled); 1510 // this task belongs to a task group and we need to cancel it 1511 discard = 1 /* true */; 1512 } 1513 } 1514 1515 // Invoke the task routine and pass in relevant data. 1516 // Thunks generated by gcc take a different argument list. 1517 if (!discard) { 1518 if (taskdata->td_flags.tiedness == TASK_UNTIED) { 1519 taskdata->td_last_tied = current_task->td_last_tied; 1520 KMP_DEBUG_ASSERT(taskdata->td_last_tied); 1521 } 1522 #if KMP_STATS_ENABLED 1523 KMP_COUNT_BLOCK(TASK_executed); 1524 switch (KMP_GET_THREAD_STATE()) { 1525 case FORK_JOIN_BARRIER: 1526 KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar); 1527 break; 1528 case PLAIN_BARRIER: 1529 KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar); 1530 break; 1531 case TASKYIELD: 1532 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield); 1533 break; 1534 case TASKWAIT: 1535 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait); 1536 break; 1537 case TASKGROUP: 1538 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup); 1539 break; 1540 default: 1541 KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate); 1542 break; 1543 } 1544 #endif // KMP_STATS_ENABLED 1545 1546 // OMPT task begin 1547 #if OMPT_SUPPORT 1548 if (UNLIKELY(ompt_enabled.enabled)) 1549 __ompt_task_start(task, current_task, gtid); 1550 #endif 1551 1552 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1553 kmp_uint64 cur_time; 1554 kmp_int32 kmp_itt_count_task = 1555 __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial && 1556 current_task->td_flags.tasktype == TASK_IMPLICIT; 1557 if (kmp_itt_count_task) { 1558 thread = __kmp_threads[gtid]; 1559 // Time outer level explicit task on barrier for adjusting imbalance time 1560 if (thread->th.th_bar_arrive_time) 1561 cur_time = __itt_get_timestamp(); 1562 else 1563 kmp_itt_count_task = 0; // thread is not on a barrier - skip timing 1564 } 1565 #endif 1566 1567 #ifdef KMP_GOMP_COMPAT 1568 if (taskdata->td_flags.native) { 1569 ((void (*)(void *))(*(task->routine)))(task->shareds); 1570 } else 1571 #endif /* KMP_GOMP_COMPAT */ 1572 { 1573 (*(task->routine))(gtid, task); 1574 } 1575 KMP_POP_PARTITIONED_TIMER(); 1576 1577 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1578 if (kmp_itt_count_task) { 1579 // Barrier imbalance - adjust arrive time with the task duration 1580 thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time); 1581 } 1582 #endif 1583 1584 } 1585 1586 1587 // Proxy tasks are not handled by the runtime 1588 if (taskdata->td_flags.proxy != TASK_PROXY) { 1589 ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent); 1590 #if OMPT_SUPPORT 1591 if (UNLIKELY(ompt_enabled.enabled)) { 1592 thread->th.ompt_thread_info = oldInfo; 1593 if (taskdata->td_flags.tiedness == TASK_TIED) { 1594 taskdata->ompt_task_info.frame.exit_frame = ompt_data_none; 1595 } 1596 __kmp_task_finish<true>(gtid, task, current_task); 1597 } else 1598 
#endif 1599 __kmp_task_finish<false>(gtid, task, current_task); 1600 } 1601 1602 KA_TRACE( 1603 30, 1604 ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n", 1605 gtid, taskdata, current_task)); 1606 return; 1607 } 1608 1609 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution 1610 // 1611 // loc_ref: location of original task pragma (ignored) 1612 // gtid: Global Thread ID of encountering thread 1613 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task'' 1614 // Returns: 1615 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1616 // be resumed later. 1617 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1618 // resumed later. 1619 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid, 1620 kmp_task_t *new_task) { 1621 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1622 1623 KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid, 1624 loc_ref, new_taskdata)); 1625 1626 #if OMPT_SUPPORT 1627 kmp_taskdata_t *parent; 1628 if (UNLIKELY(ompt_enabled.enabled)) { 1629 parent = new_taskdata->td_parent; 1630 if (ompt_enabled.ompt_callback_task_create) { 1631 ompt_data_t task_data = ompt_data_none; 1632 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 1633 parent ? &(parent->ompt_task_info.task_data) : &task_data, 1634 parent ? &(parent->ompt_task_info.frame) : NULL, 1635 &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0, 1636 OMPT_GET_RETURN_ADDRESS(0)); 1637 } 1638 } 1639 #endif 1640 1641 /* Should we execute the new task or queue it? For now, let's just always try 1642 to queue it. If the queue fills up, then we'll execute it. */ 1643 1644 if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer 1645 { // Execute this task immediately 1646 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 1647 new_taskdata->td_flags.task_serial = 1; 1648 __kmp_invoke_task(gtid, new_task, current_task); 1649 } 1650 1651 KA_TRACE( 1652 10, 1653 ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: " 1654 "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n", 1655 gtid, loc_ref, new_taskdata)); 1656 1657 ANNOTATE_HAPPENS_BEFORE(new_task); 1658 #if OMPT_SUPPORT 1659 if (UNLIKELY(ompt_enabled.enabled)) { 1660 parent->ompt_task_info.frame.enter_frame = ompt_data_none; 1661 } 1662 #endif 1663 return TASK_CURRENT_NOT_QUEUED; 1664 } 1665 1666 // __kmp_omp_task: Schedule a non-thread-switchable task for execution 1667 // 1668 // gtid: Global Thread ID of encountering thread 1669 // new_task:non-thread-switchable task thunk allocated by __kmp_omp_task_alloc() 1670 // serialize_immediate: if TRUE then if the task is executed immediately its 1671 // execution will be serialized 1672 // Returns: 1673 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1674 // be resumed later. 1675 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1676 // resumed later. 1677 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task, 1678 bool serialize_immediate) { 1679 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1680 1681 /* Should we execute the new task or queue it? For now, let's just always try 1682 to queue it. If the queue fills up, then we'll execute it. 
*/ 1683 if (new_taskdata->td_flags.proxy == TASK_PROXY || 1684 __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer 1685 { // Execute this task immediately 1686 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 1687 if (serialize_immediate) 1688 new_taskdata->td_flags.task_serial = 1; 1689 __kmp_invoke_task(gtid, new_task, current_task); 1690 } 1691 1692 ANNOTATE_HAPPENS_BEFORE(new_task); 1693 return TASK_CURRENT_NOT_QUEUED; 1694 } 1695 1696 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a 1697 // non-thread-switchable task from the parent thread only! 1698 // 1699 // loc_ref: location of original task pragma (ignored) 1700 // gtid: Global Thread ID of encountering thread 1701 // new_task: non-thread-switchable task thunk allocated by 1702 // __kmp_omp_task_alloc() 1703 // Returns: 1704 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1705 // be resumed later. 1706 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1707 // resumed later. 1708 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid, 1709 kmp_task_t *new_task) { 1710 kmp_int32 res; 1711 KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK); 1712 1713 #if KMP_DEBUG || OMPT_SUPPORT 1714 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1715 #endif 1716 KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref, 1717 new_taskdata)); 1718 1719 #if OMPT_SUPPORT 1720 kmp_taskdata_t *parent = NULL; 1721 if (UNLIKELY(ompt_enabled.enabled)) { 1722 if (!new_taskdata->td_flags.started) { 1723 OMPT_STORE_RETURN_ADDRESS(gtid); 1724 parent = new_taskdata->td_parent; 1725 if (!parent->ompt_task_info.frame.enter_frame.ptr) { 1726 parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1727 } 1728 if (ompt_enabled.ompt_callback_task_create) { 1729 ompt_data_t task_data = ompt_data_none; 1730 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 1731 parent ? &(parent->ompt_task_info.task_data) : &task_data, 1732 parent ? &(parent->ompt_task_info.frame) : NULL, 1733 &(new_taskdata->ompt_task_info.task_data), 1734 ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, 1735 OMPT_LOAD_RETURN_ADDRESS(gtid)); 1736 } 1737 } else { 1738 // We are scheduling the continuation of an UNTIED task. 1739 // Scheduling back to the parent task. 1740 __ompt_task_finish(new_task, 1741 new_taskdata->ompt_task_info.scheduling_parent, 1742 ompt_task_switch); 1743 new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none; 1744 } 1745 } 1746 #endif 1747 1748 res = __kmp_omp_task(gtid, new_task, true); 1749 1750 KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning " 1751 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n", 1752 gtid, loc_ref, new_taskdata)); 1753 #if OMPT_SUPPORT 1754 if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) { 1755 parent->ompt_task_info.frame.enter_frame = ompt_data_none; 1756 } 1757 #endif 1758 return res; 1759 } 1760 1761 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule 1762 // a taskloop task with the correct OMPT return address 1763 // 1764 // loc_ref: location of original task pragma (ignored) 1765 // gtid: Global Thread ID of encountering thread 1766 // new_task: non-thread-switchable task thunk allocated by 1767 // __kmp_omp_task_alloc() 1768 // codeptr_ra: return address for OMPT callback 1769 // Returns: 1770 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1771 // be resumed later. 
1772 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1773 // resumed later. 1774 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid, 1775 kmp_task_t *new_task, void *codeptr_ra) { 1776 kmp_int32 res; 1777 KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK); 1778 1779 #if KMP_DEBUG || OMPT_SUPPORT 1780 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1781 #endif 1782 KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref, 1783 new_taskdata)); 1784 1785 #if OMPT_SUPPORT 1786 kmp_taskdata_t *parent = NULL; 1787 if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) { 1788 parent = new_taskdata->td_parent; 1789 if (!parent->ompt_task_info.frame.enter_frame.ptr) 1790 parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1791 if (ompt_enabled.ompt_callback_task_create) { 1792 ompt_data_t task_data = ompt_data_none; 1793 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 1794 parent ? &(parent->ompt_task_info.task_data) : &task_data, 1795 parent ? &(parent->ompt_task_info.frame) : NULL, 1796 &(new_taskdata->ompt_task_info.task_data), 1797 ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, 1798 codeptr_ra); 1799 } 1800 } 1801 #endif 1802 1803 res = __kmp_omp_task(gtid, new_task, true); 1804 1805 KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning " 1806 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n", 1807 gtid, loc_ref, new_taskdata)); 1808 #if OMPT_SUPPORT 1809 if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) { 1810 parent->ompt_task_info.frame.enter_frame = ompt_data_none; 1811 } 1812 #endif 1813 return res; 1814 } 1815 1816 template <bool ompt> 1817 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid, 1818 void *frame_address, 1819 void *return_address) { 1820 kmp_taskdata_t *taskdata; 1821 kmp_info_t *thread; 1822 int thread_finished = FALSE; 1823 KMP_SET_THREAD_STATE_BLOCK(TASKWAIT); 1824 1825 KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref)); 1826 1827 if (__kmp_tasking_mode != tskm_immediate_exec) { 1828 thread = __kmp_threads[gtid]; 1829 taskdata = thread->th.th_current_task; 1830 1831 #if OMPT_SUPPORT && OMPT_OPTIONAL 1832 ompt_data_t *my_task_data; 1833 ompt_data_t *my_parallel_data; 1834 1835 if (ompt) { 1836 my_task_data = &(taskdata->ompt_task_info.task_data); 1837 my_parallel_data = OMPT_CUR_TEAM_DATA(thread); 1838 1839 taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address; 1840 1841 if (ompt_enabled.ompt_callback_sync_region) { 1842 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 1843 ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data, 1844 my_task_data, return_address); 1845 } 1846 1847 if (ompt_enabled.ompt_callback_sync_region_wait) { 1848 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 1849 ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data, 1850 my_task_data, return_address); 1851 } 1852 } 1853 #endif // OMPT_SUPPORT && OMPT_OPTIONAL 1854 1855 // Debugger: The taskwait is active. Store location and thread encountered the 1856 // taskwait. 1857 #if USE_ITT_BUILD 1858 // Note: These values are used by ITT events as well. 
1859 #endif /* USE_ITT_BUILD */ 1860 taskdata->td_taskwait_counter += 1; 1861 taskdata->td_taskwait_ident = loc_ref; 1862 taskdata->td_taskwait_thread = gtid + 1; 1863 1864 #if USE_ITT_BUILD 1865 void *itt_sync_obj = __kmp_itt_taskwait_object(gtid); 1866 if (itt_sync_obj != NULL) 1867 __kmp_itt_taskwait_starting(gtid, itt_sync_obj); 1868 #endif /* USE_ITT_BUILD */ 1869 1870 bool must_wait = 1871 !taskdata->td_flags.team_serial && !taskdata->td_flags.final; 1872 1873 must_wait = must_wait || (thread->th.th_task_team != NULL && 1874 thread->th.th_task_team->tt.tt_found_proxy_tasks); 1875 if (must_wait) { 1876 kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *, 1877 &(taskdata->td_incomplete_child_tasks)), 1878 0U); 1879 while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) { 1880 flag.execute_tasks(thread, gtid, FALSE, 1881 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 1882 __kmp_task_stealing_constraint); 1883 } 1884 } 1885 #if USE_ITT_BUILD 1886 if (itt_sync_obj != NULL) 1887 __kmp_itt_taskwait_finished(gtid, itt_sync_obj); 1888 #endif /* USE_ITT_BUILD */ 1889 1890 // Debugger: The taskwait is completed. Location remains, but thread is 1891 // negated. 1892 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; 1893 1894 #if OMPT_SUPPORT && OMPT_OPTIONAL 1895 if (ompt) { 1896 if (ompt_enabled.ompt_callback_sync_region_wait) { 1897 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 1898 ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data, 1899 my_task_data, return_address); 1900 } 1901 if (ompt_enabled.ompt_callback_sync_region) { 1902 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 1903 ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data, 1904 my_task_data, return_address); 1905 } 1906 taskdata->ompt_task_info.frame.enter_frame = ompt_data_none; 1907 } 1908 #endif // OMPT_SUPPORT && OMPT_OPTIONAL 1909 1910 ANNOTATE_HAPPENS_AFTER(taskdata); 1911 } 1912 1913 KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, " 1914 "returning TASK_CURRENT_NOT_QUEUED\n", 1915 gtid, taskdata)); 1916 1917 return TASK_CURRENT_NOT_QUEUED; 1918 } 1919 1920 #if OMPT_SUPPORT && OMPT_OPTIONAL 1921 OMPT_NOINLINE 1922 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid, 1923 void *frame_address, 1924 void *return_address) { 1925 return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address, 1926 return_address); 1927 } 1928 #endif // OMPT_SUPPORT && OMPT_OPTIONAL 1929 1930 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are 1931 // complete 1932 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) { 1933 #if OMPT_SUPPORT && OMPT_OPTIONAL 1934 if (UNLIKELY(ompt_enabled.enabled)) { 1935 OMPT_STORE_RETURN_ADDRESS(gtid); 1936 return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0), 1937 OMPT_LOAD_RETURN_ADDRESS(gtid)); 1938 } 1939 #endif 1940 return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL); 1941 } 1942 1943 // __kmpc_omp_taskyield: switch to a different task 1944 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) { 1945 kmp_taskdata_t *taskdata; 1946 kmp_info_t *thread; 1947 int thread_finished = FALSE; 1948 1949 KMP_COUNT_BLOCK(OMP_TASKYIELD); 1950 KMP_SET_THREAD_STATE_BLOCK(TASKYIELD); 1951 1952 KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n", 1953 gtid, loc_ref, end_part)); 1954 1955 if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) { 1956 thread = 
__kmp_threads[gtid]; 1957 taskdata = thread->th.th_current_task; 1958 // Should we model this as a task wait or not? 1959 // Debugger: The taskwait is active. Store location and thread encountered the 1960 // taskwait. 1961 #if USE_ITT_BUILD 1962 // Note: These values are used by ITT events as well. 1963 #endif /* USE_ITT_BUILD */ 1964 taskdata->td_taskwait_counter += 1; 1965 taskdata->td_taskwait_ident = loc_ref; 1966 taskdata->td_taskwait_thread = gtid + 1; 1967 1968 #if USE_ITT_BUILD 1969 void *itt_sync_obj = __kmp_itt_taskwait_object(gtid); 1970 if (itt_sync_obj != NULL) 1971 __kmp_itt_taskwait_starting(gtid, itt_sync_obj); 1972 #endif /* USE_ITT_BUILD */ 1973 if (!taskdata->td_flags.team_serial) { 1974 kmp_task_team_t *task_team = thread->th.th_task_team; 1975 if (task_team != NULL) { 1976 if (KMP_TASKING_ENABLED(task_team)) { 1977 #if OMPT_SUPPORT 1978 if (UNLIKELY(ompt_enabled.enabled)) 1979 thread->th.ompt_thread_info.ompt_task_yielded = 1; 1980 #endif 1981 __kmp_execute_tasks_32( 1982 thread, gtid, NULL, FALSE, 1983 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 1984 __kmp_task_stealing_constraint); 1985 #if OMPT_SUPPORT 1986 if (UNLIKELY(ompt_enabled.enabled)) 1987 thread->th.ompt_thread_info.ompt_task_yielded = 0; 1988 #endif 1989 } 1990 } 1991 } 1992 #if USE_ITT_BUILD 1993 if (itt_sync_obj != NULL) 1994 __kmp_itt_taskwait_finished(gtid, itt_sync_obj); 1995 #endif /* USE_ITT_BUILD */ 1996 1997 // Debugger: The taskwait is completed. Location remains, but thread is 1998 // negated. 1999 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; 2000 } 2001 2002 KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, " 2003 "returning TASK_CURRENT_NOT_QUEUED\n", 2004 gtid, taskdata)); 2005 2006 return TASK_CURRENT_NOT_QUEUED; 2007 } 2008 2009 // Task Reduction implementation 2010 // 2011 // Note: initial implementation didn't take into account the possibility 2012 // to specify omp_orig for initializer of the UDR (user defined reduction). 2013 // Corrected implementation takes into account the omp_orig object. 2014 // Compiler is free to use old implementation if omp_orig is not specified. 2015 2016 /*! 2017 @ingroup BASIC_TYPES 2018 @{ 2019 */ 2020 2021 /*! 2022 Flags for special info per task reduction item. 2023 */ 2024 typedef struct kmp_taskred_flags { 2025 /*! 1 - use lazy alloc/init (e.g. big objects, #tasks < #threads) */ 2026 unsigned lazy_priv : 1; 2027 unsigned reserved31 : 31; 2028 } kmp_taskred_flags_t; 2029 2030 /*! 2031 Internal struct for reduction data item related info set up by compiler. 2032 */ 2033 typedef struct kmp_task_red_input { 2034 void *reduce_shar; /**< shared between tasks item to reduce into */ 2035 size_t reduce_size; /**< size of data item in bytes */ 2036 // three compiler-generated routines (init, fini are optional): 2037 void *reduce_init; /**< data initialization routine (single parameter) */ 2038 void *reduce_fini; /**< data finalization routine */ 2039 void *reduce_comb; /**< data combiner routine */ 2040 kmp_taskred_flags_t flags; /**< flags for additional info from compiler */ 2041 } kmp_task_red_input_t; 2042 2043 /*! 2044 Internal struct for reduction data item related info saved by the library. 
2045 */ 2046 typedef struct kmp_taskred_data { 2047 void *reduce_shar; /**< shared between tasks item to reduce into */ 2048 size_t reduce_size; /**< size of data item */ 2049 kmp_taskred_flags_t flags; /**< flags for additional info from compiler */ 2050 void *reduce_priv; /**< array of thread specific items */ 2051 void *reduce_pend; /**< end of private data for faster comparison op */ 2052 // three compiler-generated routines (init, fini are optional): 2053 void *reduce_comb; /**< data combiner routine */ 2054 void *reduce_init; /**< data initialization routine (two parameters) */ 2055 void *reduce_fini; /**< data finalization routine */ 2056 void *reduce_orig; /**< original item (can be used in UDR initializer) */ 2057 } kmp_taskred_data_t; 2058 2059 /*! 2060 Internal struct for reduction data item related info set up by compiler. 2061 2062 New interface: added reduce_orig field to provide omp_orig for UDR initializer. 2063 */ 2064 typedef struct kmp_taskred_input { 2065 void *reduce_shar; /**< shared between tasks item to reduce into */ 2066 void *reduce_orig; /**< original reduction item used for initialization */ 2067 size_t reduce_size; /**< size of data item */ 2068 // three compiler-generated routines (init, fini are optional): 2069 void *reduce_init; /**< data initialization routine (two parameters) */ 2070 void *reduce_fini; /**< data finalization routine */ 2071 void *reduce_comb; /**< data combiner routine */ 2072 kmp_taskred_flags_t flags; /**< flags for additional info from compiler */ 2073 } kmp_taskred_input_t; 2074 /*! 2075 @} 2076 */ 2077 2078 template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src); 2079 template <> 2080 void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item, 2081 kmp_task_red_input_t &src) { 2082 item.reduce_orig = NULL; 2083 } 2084 template <> 2085 void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item, 2086 kmp_taskred_input_t &src) { 2087 if (src.reduce_orig != NULL) { 2088 item.reduce_orig = src.reduce_orig; 2089 } else { 2090 item.reduce_orig = src.reduce_shar; 2091 } // non-NULL reduce_orig means new interface used 2092 } 2093 2094 template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, int j); 2095 template <> 2096 void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item, 2097 int offset) { 2098 ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset); 2099 } 2100 template <> 2101 void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item, 2102 int offset) { 2103 ((void (*)(void *, void *))item.reduce_init)( 2104 (char *)(item.reduce_priv) + offset, item.reduce_orig); 2105 } 2106 2107 template <typename T> 2108 void *__kmp_task_reduction_init(int gtid, int num, T *data) { 2109 kmp_info_t *thread = __kmp_threads[gtid]; 2110 kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup; 2111 kmp_int32 nth = thread->th.th_team_nproc; 2112 kmp_taskred_data_t *arr; 2113 2114 // check input data just in case 2115 KMP_ASSERT(tg != NULL); 2116 KMP_ASSERT(data != NULL); 2117 KMP_ASSERT(num > 0); 2118 if (nth == 1) { 2119 KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n", 2120 gtid, tg)); 2121 return (void *)tg; 2122 } 2123 KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n", 2124 gtid, tg, num)); 2125 arr = (kmp_taskred_data_t *)__kmp_thread_malloc( 2126 thread, num * sizeof(kmp_taskred_data_t)); 2127 for (int i = 0; i < num; ++i) { 2128 size_t size = data[i].reduce_size - 1; 2129 // round the size up to 
cache line per thread-specific item 2130 size += CACHE_LINE - size % CACHE_LINE; 2131 KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory 2132 arr[i].reduce_shar = data[i].reduce_shar; 2133 arr[i].reduce_size = size; 2134 arr[i].flags = data[i].flags; 2135 arr[i].reduce_comb = data[i].reduce_comb; 2136 arr[i].reduce_init = data[i].reduce_init; 2137 arr[i].reduce_fini = data[i].reduce_fini; 2138 __kmp_assign_orig<T>(arr[i], data[i]); 2139 if (!arr[i].flags.lazy_priv) { 2140 // allocate cache-line aligned block and fill it with zeros 2141 arr[i].reduce_priv = __kmp_allocate(nth * size); 2142 arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size; 2143 if (arr[i].reduce_init != NULL) { 2144 // initialize all thread-specific items 2145 for (int j = 0; j < nth; ++j) { 2146 __kmp_call_init<T>(arr[i], j * size); 2147 } 2148 } 2149 } else { 2150 // only allocate space for pointers now, 2151 // objects will be lazily allocated/initialized if/when requested 2152 // note that __kmp_allocate zeroes the allocated memory 2153 arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *)); 2154 } 2155 } 2156 tg->reduce_data = (void *)arr; 2157 tg->reduce_num_data = num; 2158 return (void *)tg; 2159 } 2160 2161 /*! 2162 @ingroup TASKING 2163 @param gtid Global thread ID 2164 @param num Number of data items to reduce 2165 @param data Array of data for reduction 2166 @return The taskgroup identifier 2167 2168 Initialize task reduction for the taskgroup. 2169 2170 Note: this entry supposes the optional compiler-generated initializer routine 2171 has single parameter - pointer to object to be initialized. That means 2172 the reduction either does not use omp_orig object, or the omp_orig is accessible 2173 without help of the runtime library. 2174 */ 2175 void *__kmpc_task_reduction_init(int gtid, int num, void *data) { 2176 return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data); 2177 } 2178 2179 /*! 2180 @ingroup TASKING 2181 @param gtid Global thread ID 2182 @param num Number of data items to reduce 2183 @param data Array of data for reduction 2184 @return The taskgroup identifier 2185 2186 Initialize task reduction for the taskgroup. 2187 2188 Note: this entry supposes the optional compiler-generated initializer routine 2189 has two parameters, pointer to object to be initialized and pointer to omp_orig 2190 */ 2191 void *__kmpc_taskred_init(int gtid, int num, void *data) { 2192 return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data); 2193 } 2194 2195 // Copy task reduction data (except for shared pointers). 2196 template <typename T> 2197 void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data, 2198 kmp_taskgroup_t *tg, void *reduce_data) { 2199 kmp_taskred_data_t *arr; 2200 KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p," 2201 " from data %p\n", 2202 thr, tg, reduce_data)); 2203 arr = (kmp_taskred_data_t *)__kmp_thread_malloc( 2204 thr, num * sizeof(kmp_taskred_data_t)); 2205 // threads will share private copies, thunk routines, sizes, flags, etc.: 2206 KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t)); 2207 for (int i = 0; i < num; ++i) { 2208 arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers 2209 } 2210 tg->reduce_data = (void *)arr; 2211 tg->reduce_num_data = num; 2212 } 2213 2214 /*! 
2215 @ingroup TASKING 2216 @param gtid Global thread ID 2217 @param tskgrp The taskgroup ID (optional) 2218 @param data Shared location of the item 2219 @return The pointer to per-thread data 2220 2221 Get thread-specific location of data item 2222 */ 2223 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) { 2224 kmp_info_t *thread = __kmp_threads[gtid]; 2225 kmp_int32 nth = thread->th.th_team_nproc; 2226 if (nth == 1) 2227 return data; // nothing to do 2228 2229 kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp; 2230 if (tg == NULL) 2231 tg = thread->th.th_current_task->td_taskgroup; 2232 KMP_ASSERT(tg != NULL); 2233 kmp_taskred_data_t *arr = (kmp_taskred_data_t *)(tg->reduce_data); 2234 kmp_int32 num = tg->reduce_num_data; 2235 kmp_int32 tid = thread->th.th_info.ds.ds_tid; 2236 2237 KMP_ASSERT(data != NULL); 2238 while (tg != NULL) { 2239 for (int i = 0; i < num; ++i) { 2240 if (!arr[i].flags.lazy_priv) { 2241 if (data == arr[i].reduce_shar || 2242 (data >= arr[i].reduce_priv && data < arr[i].reduce_pend)) 2243 return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size; 2244 } else { 2245 // check shared location first 2246 void **p_priv = (void **)(arr[i].reduce_priv); 2247 if (data == arr[i].reduce_shar) 2248 goto found; 2249 // check if we get some thread specific location as parameter 2250 for (int j = 0; j < nth; ++j) 2251 if (data == p_priv[j]) 2252 goto found; 2253 continue; // not found, continue search 2254 found: 2255 if (p_priv[tid] == NULL) { 2256 // allocate thread specific object lazily 2257 p_priv[tid] = __kmp_allocate(arr[i].reduce_size); 2258 if (arr[i].reduce_init != NULL) { 2259 if (arr[i].reduce_orig != NULL) { // new interface 2260 ((void (*)(void *, void *))arr[i].reduce_init)( 2261 p_priv[tid], arr[i].reduce_orig); 2262 } else { // old interface (single parameter) 2263 ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]); 2264 } 2265 } 2266 } 2267 return p_priv[tid]; 2268 } 2269 } 2270 tg = tg->parent; 2271 arr = (kmp_taskred_data_t *)(tg->reduce_data); 2272 num = tg->reduce_num_data; 2273 } 2274 KMP_ASSERT2(0, "Unknown task reduction item"); 2275 return NULL; // ERROR, this line never executed 2276 } 2277 2278 // Finalize task reduction. 
2279 // Called from __kmpc_end_taskgroup() 2280 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) { 2281 kmp_int32 nth = th->th.th_team_nproc; 2282 KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1 2283 kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data; 2284 kmp_int32 num = tg->reduce_num_data; 2285 for (int i = 0; i < num; ++i) { 2286 void *sh_data = arr[i].reduce_shar; 2287 void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini); 2288 void (*f_comb)(void *, void *) = 2289 (void (*)(void *, void *))(arr[i].reduce_comb); 2290 if (!arr[i].flags.lazy_priv) { 2291 void *pr_data = arr[i].reduce_priv; 2292 size_t size = arr[i].reduce_size; 2293 for (int j = 0; j < nth; ++j) { 2294 void *priv_data = (char *)pr_data + j * size; 2295 f_comb(sh_data, priv_data); // combine results 2296 if (f_fini) 2297 f_fini(priv_data); // finalize if needed 2298 } 2299 } else { 2300 void **pr_data = (void **)(arr[i].reduce_priv); 2301 for (int j = 0; j < nth; ++j) { 2302 if (pr_data[j] != NULL) { 2303 f_comb(sh_data, pr_data[j]); // combine results 2304 if (f_fini) 2305 f_fini(pr_data[j]); // finalize if needed 2306 __kmp_free(pr_data[j]); 2307 } 2308 } 2309 } 2310 __kmp_free(arr[i].reduce_priv); 2311 } 2312 __kmp_thread_free(th, arr); 2313 tg->reduce_data = NULL; 2314 tg->reduce_num_data = 0; 2315 } 2316 2317 // Cleanup task reduction data for parallel or worksharing, 2318 // do not touch task private data other threads still working with. 2319 // Called from __kmpc_end_taskgroup() 2320 static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) { 2321 __kmp_thread_free(th, tg->reduce_data); 2322 tg->reduce_data = NULL; 2323 tg->reduce_num_data = 0; 2324 } 2325 2326 template <typename T> 2327 void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws, 2328 int num, T *data) { 2329 kmp_info_t *thr = __kmp_threads[gtid]; 2330 kmp_int32 nth = thr->th.th_team_nproc; 2331 __kmpc_taskgroup(loc, gtid); // form new taskgroup first 2332 if (nth == 1) { 2333 KA_TRACE(10, 2334 ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n", 2335 gtid, thr->th.th_current_task->td_taskgroup)); 2336 return (void *)thr->th.th_current_task->td_taskgroup; 2337 } 2338 kmp_team_t *team = thr->th.th_team; 2339 void *reduce_data; 2340 kmp_taskgroup_t *tg; 2341 reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]); 2342 if (reduce_data == NULL && 2343 __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data, 2344 (void *)1)) { 2345 // single thread enters this block to initialize common reduction data 2346 KMP_DEBUG_ASSERT(reduce_data == NULL); 2347 // first initialize own data, then make a copy other threads can use 2348 tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data); 2349 reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t)); 2350 KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t)); 2351 // fini counters should be 0 at this point 2352 KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0); 2353 KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0); 2354 KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data); 2355 } else { 2356 while ( 2357 (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) == 2358 (void *)1) { // wait for task reduction initialization 2359 KMP_CPU_PAUSE(); 2360 } 2361 KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here 2362 tg = 
thr->th.th_current_task->td_taskgroup; 2363 __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data); 2364 } 2365 return tg; 2366 } 2367 2368 /*! 2369 @ingroup TASKING 2370 @param loc Source location info 2371 @param gtid Global thread ID 2372 @param is_ws Is 1 if the reduction is for worksharing, 0 otherwise 2373 @param num Number of data items to reduce 2374 @param data Array of data for reduction 2375 @return The taskgroup identifier 2376 2377 Initialize task reduction for a parallel or worksharing. 2378 2379 Note: this entry supposes the optional compiler-generated initializer routine 2380 has single parameter - pointer to object to be initialized. That means 2381 the reduction either does not use omp_orig object, or the omp_orig is accessible 2382 without help of the runtime library. 2383 */ 2384 void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws, 2385 int num, void *data) { 2386 return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num, 2387 (kmp_task_red_input_t *)data); 2388 } 2389 2390 /*! 2391 @ingroup TASKING 2392 @param loc Source location info 2393 @param gtid Global thread ID 2394 @param is_ws Is 1 if the reduction is for worksharing, 0 otherwise 2395 @param num Number of data items to reduce 2396 @param data Array of data for reduction 2397 @return The taskgroup identifier 2398 2399 Initialize task reduction for a parallel or worksharing. 2400 2401 Note: this entry supposes the optional compiler-generated initializer routine 2402 has two parameters, pointer to object to be initialized and pointer to omp_orig 2403 */ 2404 void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num, 2405 void *data) { 2406 return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num, 2407 (kmp_taskred_input_t *)data); 2408 } 2409 2410 /*! 2411 @ingroup TASKING 2412 @param loc Source location info 2413 @param gtid Global thread ID 2414 @param is_ws Is 1 if the reduction is for worksharing, 0 otherwise 2415 2416 Finalize task reduction for a parallel or worksharing. 2417 */ 2418 void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) { 2419 __kmpc_end_taskgroup(loc, gtid); 2420 } 2421 2422 // __kmpc_taskgroup: Start a new taskgroup 2423 void __kmpc_taskgroup(ident_t *loc, int gtid) { 2424 kmp_info_t *thread = __kmp_threads[gtid]; 2425 kmp_taskdata_t *taskdata = thread->th.th_current_task; 2426 kmp_taskgroup_t *tg_new = 2427 (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t)); 2428 KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new)); 2429 KMP_ATOMIC_ST_RLX(&tg_new->count, 0); 2430 KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq); 2431 tg_new->parent = taskdata->td_taskgroup; 2432 tg_new->reduce_data = NULL; 2433 tg_new->reduce_num_data = 0; 2434 taskdata->td_taskgroup = tg_new; 2435 2436 #if OMPT_SUPPORT && OMPT_OPTIONAL 2437 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) { 2438 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2439 if (!codeptr) 2440 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2441 kmp_team_t *team = thread->th.th_team; 2442 ompt_data_t my_task_data = taskdata->ompt_task_info.task_data; 2443 // FIXME: I think this is wrong for lwt! 
2444 ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data; 2445 2446 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 2447 ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data), 2448 &(my_task_data), codeptr); 2449 } 2450 #endif 2451 } 2452 2453 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task 2454 // and its descendants are complete 2455 void __kmpc_end_taskgroup(ident_t *loc, int gtid) { 2456 kmp_info_t *thread = __kmp_threads[gtid]; 2457 kmp_taskdata_t *taskdata = thread->th.th_current_task; 2458 kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup; 2459 int thread_finished = FALSE; 2460 2461 #if OMPT_SUPPORT && OMPT_OPTIONAL 2462 kmp_team_t *team; 2463 ompt_data_t my_task_data; 2464 ompt_data_t my_parallel_data; 2465 void *codeptr; 2466 if (UNLIKELY(ompt_enabled.enabled)) { 2467 team = thread->th.th_team; 2468 my_task_data = taskdata->ompt_task_info.task_data; 2469 // FIXME: I think this is wrong for lwt! 2470 my_parallel_data = team->t.ompt_team_info.parallel_data; 2471 codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2472 if (!codeptr) 2473 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2474 } 2475 #endif 2476 2477 KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc)); 2478 KMP_DEBUG_ASSERT(taskgroup != NULL); 2479 KMP_SET_THREAD_STATE_BLOCK(TASKGROUP); 2480 2481 if (__kmp_tasking_mode != tskm_immediate_exec) { 2482 // mark task as waiting not on a barrier 2483 taskdata->td_taskwait_counter += 1; 2484 taskdata->td_taskwait_ident = loc; 2485 taskdata->td_taskwait_thread = gtid + 1; 2486 #if USE_ITT_BUILD 2487 // For ITT the taskgroup wait is similar to taskwait until we need to 2488 // distinguish them 2489 void *itt_sync_obj = __kmp_itt_taskwait_object(gtid); 2490 if (itt_sync_obj != NULL) 2491 __kmp_itt_taskwait_starting(gtid, itt_sync_obj); 2492 #endif /* USE_ITT_BUILD */ 2493 2494 #if OMPT_SUPPORT && OMPT_OPTIONAL 2495 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) { 2496 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 2497 ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data), 2498 &(my_task_data), codeptr); 2499 } 2500 #endif 2501 2502 if (!taskdata->td_flags.team_serial || 2503 (thread->th.th_task_team != NULL && 2504 thread->th.th_task_team->tt.tt_found_proxy_tasks)) { 2505 kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 2506 0U); 2507 while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) { 2508 flag.execute_tasks(thread, gtid, FALSE, 2509 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 2510 __kmp_task_stealing_constraint); 2511 } 2512 } 2513 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting 2514 2515 #if OMPT_SUPPORT && OMPT_OPTIONAL 2516 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) { 2517 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 2518 ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data), 2519 &(my_task_data), codeptr); 2520 } 2521 #endif 2522 2523 #if USE_ITT_BUILD 2524 if (itt_sync_obj != NULL) 2525 __kmp_itt_taskwait_finished(gtid, itt_sync_obj); 2526 #endif /* USE_ITT_BUILD */ 2527 } 2528 KMP_DEBUG_ASSERT(taskgroup->count == 0); 2529 2530 if (taskgroup->reduce_data != NULL) { // need to reduce? 
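// The branch below distinguishes three cases by comparing the <priv> pointer
// of the first reduction item against the team-wide descriptor arrays
// installed by __kmp_task_reduction_modifier_init (slot 0 is used for
// parallel, slot 1 for worksharing). If this taskgroup shares its descriptors
// with the team, only the last thread to arrive finalizes the reduction and
// clears the team fields; earlier threads just release their private copy of
// the descriptor array. Otherwise this is a plain taskgroup reduction and is
// finalized right here.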
2531 int cnt; 2532 void *reduce_data; 2533 kmp_team_t *t = thread->th.th_team; 2534 kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data; 2535 // check if <priv> data of the first reduction variable shared for the team 2536 void *priv0 = arr[0].reduce_priv; 2537 if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL && 2538 ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) { 2539 // finishing task reduction on parallel 2540 cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]); 2541 if (cnt == thread->th.th_team_nproc - 1) { 2542 // we are the last thread passing __kmpc_reduction_modifier_fini() 2543 // finalize task reduction: 2544 __kmp_task_reduction_fini(thread, taskgroup); 2545 // cleanup fields in the team structure: 2546 // TODO: is relaxed store enough here (whole barrier should follow)? 2547 __kmp_thread_free(thread, reduce_data); 2548 KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL); 2549 KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0); 2550 } else { 2551 // we are not the last thread passing __kmpc_reduction_modifier_fini(), 2552 // so do not finalize reduction, just clean own copy of the data 2553 __kmp_task_reduction_clean(thread, taskgroup); 2554 } 2555 } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) != 2556 NULL && 2557 ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) { 2558 // finishing task reduction on worksharing 2559 cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]); 2560 if (cnt == thread->th.th_team_nproc - 1) { 2561 // we are the last thread passing __kmpc_reduction_modifier_fini() 2562 __kmp_task_reduction_fini(thread, taskgroup); 2563 // cleanup fields in team structure: 2564 // TODO: is relaxed store enough here (whole barrier should follow)? 2565 __kmp_thread_free(thread, reduce_data); 2566 KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL); 2567 KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0); 2568 } else { 2569 // we are not the last thread passing __kmpc_reduction_modifier_fini(), 2570 // so do not finalize reduction, just clean own copy of the data 2571 __kmp_task_reduction_clean(thread, taskgroup); 2572 } 2573 } else { 2574 // finishing task reduction on taskgroup 2575 __kmp_task_reduction_fini(thread, taskgroup); 2576 } 2577 } 2578 // Restore parent taskgroup for the current task 2579 taskdata->td_taskgroup = taskgroup->parent; 2580 __kmp_thread_free(thread, taskgroup); 2581 2582 KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n", 2583 gtid, taskdata)); 2584 ANNOTATE_HAPPENS_AFTER(taskdata); 2585 2586 #if OMPT_SUPPORT && OMPT_OPTIONAL 2587 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) { 2588 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 2589 ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data), 2590 &(my_task_data), codeptr); 2591 } 2592 #endif 2593 } 2594 2595 // __kmp_remove_my_task: remove a task from my own deque 2596 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid, 2597 kmp_task_team_t *task_team, 2598 kmp_int32 is_constrained) { 2599 kmp_task_t *task; 2600 kmp_taskdata_t *taskdata; 2601 kmp_thread_data_t *thread_data; 2602 kmp_uint32 tail; 2603 2604 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 2605 KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data != 2606 NULL); // Caller should check this condition 2607 2608 thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)]; 2609 2610 KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n", 2611 gtid, 
thread_data->td.td_deque_ntasks, 2612 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2613 2614 if (TCR_4(thread_data->td.td_deque_ntasks) == 0) { 2615 KA_TRACE(10, 2616 ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: " 2617 "ntasks=%d head=%u tail=%u\n", 2618 gtid, thread_data->td.td_deque_ntasks, 2619 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2620 return NULL; 2621 } 2622 2623 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 2624 2625 if (TCR_4(thread_data->td.td_deque_ntasks) == 0) { 2626 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2627 KA_TRACE(10, 2628 ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: " 2629 "ntasks=%d head=%u tail=%u\n", 2630 gtid, thread_data->td.td_deque_ntasks, 2631 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2632 return NULL; 2633 } 2634 2635 tail = (thread_data->td.td_deque_tail - 1) & 2636 TASK_DEQUE_MASK(thread_data->td); // Wrap index. 2637 taskdata = thread_data->td.td_deque[tail]; 2638 2639 if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata, 2640 thread->th.th_current_task)) { 2641 // The TSC does not allow to steal victim task 2642 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2643 KA_TRACE(10, 2644 ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: " 2645 "ntasks=%d head=%u tail=%u\n", 2646 gtid, thread_data->td.td_deque_ntasks, 2647 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2648 return NULL; 2649 } 2650 2651 thread_data->td.td_deque_tail = tail; 2652 TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1); 2653 2654 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2655 2656 KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: " 2657 "ntasks=%d head=%u tail=%u\n", 2658 gtid, taskdata, thread_data->td.td_deque_ntasks, 2659 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2660 2661 task = KMP_TASKDATA_TO_TASK(taskdata); 2662 return task; 2663 } 2664 2665 // __kmp_steal_task: remove a task from another thread's deque 2666 // Assume that calling thread has already checked existence of 2667 // task_team thread_data before calling this routine. 
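// A minimal sketch of the ring-buffer index arithmetic shared by
// __kmp_remove_my_task (the owner pops at the tail) and __kmp_steal_task
// (thieves take from the head), assuming the deque size is a power of two so
// TASK_DEQUE_MASK can wrap indices with a bitwise AND. Locking, the ntasks
// bookkeeping and the task scheduling constraint are omitted, and the names
// ring, head, tail and DEQUE_SIZE are illustrative only, not runtime fields:
//
//   kmp_taskdata_t *ring[DEQUE_SIZE]; // DEQUE_SIZE is a power of two
//   kmp_uint32 head = 0, tail = 0;    // owner works at tail, thieves at head
//   const kmp_uint32 mask = DEQUE_SIZE - 1;
//
//   // owner (LIFO): favors locality for the most recently pushed task
//   tail = (tail - 1) & mask;
//   kmp_taskdata_t *mine = ring[tail];
//
//   // thief (FIFO): takes the oldest task, away from the owner's hot end
//   kmp_taskdata_t *stolen = ring[head];
//   head = (head + 1) & mask;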
2668 static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid, 2669 kmp_task_team_t *task_team, 2670 std::atomic<kmp_int32> *unfinished_threads, 2671 int *thread_finished, 2672 kmp_int32 is_constrained) { 2673 kmp_task_t *task; 2674 kmp_taskdata_t *taskdata; 2675 kmp_taskdata_t *current; 2676 kmp_thread_data_t *victim_td, *threads_data; 2677 kmp_int32 target; 2678 kmp_int32 victim_tid; 2679 2680 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 2681 2682 threads_data = task_team->tt.tt_threads_data; 2683 KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition 2684 2685 victim_tid = victim_thr->th.th_info.ds.ds_tid; 2686 victim_td = &threads_data[victim_tid]; 2687 2688 KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: " 2689 "task_team=%p ntasks=%d head=%u tail=%u\n", 2690 gtid, __kmp_gtid_from_thread(victim_thr), task_team, 2691 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head, 2692 victim_td->td.td_deque_tail)); 2693 2694 if (TCR_4(victim_td->td.td_deque_ntasks) == 0) { 2695 KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: " 2696 "task_team=%p ntasks=%d head=%u tail=%u\n", 2697 gtid, __kmp_gtid_from_thread(victim_thr), task_team, 2698 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head, 2699 victim_td->td.td_deque_tail)); 2700 return NULL; 2701 } 2702 2703 __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock); 2704 2705 int ntasks = TCR_4(victim_td->td.td_deque_ntasks); 2706 // Check again after we acquire the lock 2707 if (ntasks == 0) { 2708 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); 2709 KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: " 2710 "task_team=%p ntasks=%d head=%u tail=%u\n", 2711 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks, 2712 victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); 2713 return NULL; 2714 } 2715 2716 KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL); 2717 current = __kmp_threads[gtid]->th.th_current_task; 2718 taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head]; 2719 if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) { 2720 // Bump head pointer and Wrap. 
2721 victim_td->td.td_deque_head = 2722 (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td); 2723 } else { 2724 if (!task_team->tt.tt_untied_task_encountered) { 2725 // The TSC does not allow to steal victim task 2726 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); 2727 KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from " 2728 "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n", 2729 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks, 2730 victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); 2731 return NULL; 2732 } 2733 int i; 2734 // walk through victim's deque trying to steal any task 2735 target = victim_td->td.td_deque_head; 2736 taskdata = NULL; 2737 for (i = 1; i < ntasks; ++i) { 2738 target = (target + 1) & TASK_DEQUE_MASK(victim_td->td); 2739 taskdata = victim_td->td.td_deque[target]; 2740 if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) { 2741 break; // found victim task 2742 } else { 2743 taskdata = NULL; 2744 } 2745 } 2746 if (taskdata == NULL) { 2747 // No appropriate candidate to steal found 2748 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); 2749 KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from " 2750 "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n", 2751 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks, 2752 victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); 2753 return NULL; 2754 } 2755 int prev = target; 2756 for (i = i + 1; i < ntasks; ++i) { 2757 // shift remaining tasks in the deque left by 1 2758 target = (target + 1) & TASK_DEQUE_MASK(victim_td->td); 2759 victim_td->td.td_deque[prev] = victim_td->td.td_deque[target]; 2760 prev = target; 2761 } 2762 KMP_DEBUG_ASSERT( 2763 victim_td->td.td_deque_tail == 2764 (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td))); 2765 victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped) 2766 } 2767 if (*thread_finished) { 2768 // We need to un-mark this victim as a finished victim. This must be done 2769 // before releasing the lock, or else other threads (starting with the 2770 // master victim) might be prematurely released from the barrier!!! 2771 kmp_int32 count; 2772 2773 count = KMP_ATOMIC_INC(unfinished_threads); 2774 2775 KA_TRACE( 2776 20, 2777 ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n", 2778 gtid, count + 1, task_team)); 2779 2780 *thread_finished = FALSE; 2781 } 2782 TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1); 2783 2784 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); 2785 2786 KMP_COUNT_BLOCK(TASK_stolen); 2787 KA_TRACE(10, 2788 ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: " 2789 "task_team=%p ntasks=%d head=%u tail=%u\n", 2790 gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team, 2791 ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); 2792 2793 task = KMP_TASKDATA_TO_TASK(taskdata); 2794 return task; 2795 } 2796 2797 // __kmp_execute_tasks_template: Choose and execute tasks until either the 2798 // condition is satisfied (return true) or there are none left (return false). 2799 // 2800 // final_spin is TRUE if this is the spin at the release barrier. 2801 // thread_finished indicates whether the thread is finished executing all 2802 // the tasks it has on its deque, and is at the release barrier. 2803 // spinner is the location on which to spin. 2804 // spinner == NULL means only execute a single task and return. 2805 // checker is the value to check to terminate the spin.
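// A simplified outline of the template below (illustrative only; the ITT/OMPT
// hooks, yield/sleep handling and victim selection are omitted, and
// done_check() stands for the flag's termination test):
//
//   while (1) {                          // outer: retry for target-generated tasks
//     while (1) {                        // inner: find and execute one task
//       task = use_own_tasks ? pop own deque : steal from a victim;
//       if (task == NULL) break;
//       __kmp_invoke_task(gtid, task, current_task);
//       if (flag == NULL || (!final_spin && flag->done_check())) return TRUE;
//     }
//     if (final_spin && current task has no incomplete children) {
//       decrement unfinished_threads once;
//       if (flag != NULL && flag->done_check()) return TRUE;
//     }
//     if (thread->th.th_task_team == NULL) return FALSE;
//     if (nthreads == 1) continue;       // may still receive target tasks
//     return FALSE;
//   }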
2806 template <class C> 2807 static inline int __kmp_execute_tasks_template( 2808 kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin, 2809 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), 2810 kmp_int32 is_constrained) { 2811 kmp_task_team_t *task_team = thread->th.th_task_team; 2812 kmp_thread_data_t *threads_data; 2813 kmp_task_t *task; 2814 kmp_info_t *other_thread; 2815 kmp_taskdata_t *current_task = thread->th.th_current_task; 2816 std::atomic<kmp_int32> *unfinished_threads; 2817 kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0, 2818 tid = thread->th.th_info.ds.ds_tid; 2819 2820 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 2821 KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]); 2822 2823 if (task_team == NULL || current_task == NULL) 2824 return FALSE; 2825 2826 KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d " 2827 "*thread_finished=%d\n", 2828 gtid, final_spin, *thread_finished)); 2829 2830 thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 2831 threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data); 2832 KMP_DEBUG_ASSERT(threads_data != NULL); 2833 2834 nthreads = task_team->tt.tt_nproc; 2835 unfinished_threads = &(task_team->tt.tt_unfinished_threads); 2836 KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks); 2837 KMP_DEBUG_ASSERT(*unfinished_threads >= 0); 2838 2839 while (1) { // Outer loop keeps trying to find tasks in case of single thread 2840 // getting tasks from target constructs 2841 while (1) { // Inner loop to find a task and execute it 2842 task = NULL; 2843 if (use_own_tasks) { // check on own queue first 2844 task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained); 2845 } 2846 if ((task == NULL) && (nthreads > 1)) { // Steal a task 2847 int asleep = 1; 2848 use_own_tasks = 0; 2849 // Try to steal from the last place I stole from successfully. 2850 if (victim_tid == -2) { // haven't stolen anything yet 2851 victim_tid = threads_data[tid].td.td_deque_last_stolen; 2852 if (victim_tid != 2853 -1) // if we have a last stolen from victim, get the thread 2854 other_thread = threads_data[victim_tid].td.td_thr; 2855 } 2856 if (victim_tid != -1) { // found last victim 2857 asleep = 0; 2858 } else if (!new_victim) { // no recent steals and we haven't already 2859 // used a new victim; select a random thread 2860 do { // Find a different thread to steal work from. 2861 // Pick a random thread. Initial plan was to cycle through all the 2862 // threads, and only return if we tried to steal from every thread, 2863 // and failed. Arch says that's not such a great idea. 2864 victim_tid = __kmp_get_random(thread) % (nthreads - 1); 2865 if (victim_tid >= tid) { 2866 ++victim_tid; // Adjusts random distribution to exclude self 2867 } 2868 // Found a potential victim 2869 other_thread = threads_data[victim_tid].td.td_thr; 2870 // There is a slight chance that __kmp_enable_tasking() did not wake 2871 // up all threads waiting at the barrier. If victim is sleeping, 2872 // then wake it up. Since we were going to pay the cache miss 2873 // penalty for referencing another thread's kmp_info_t struct 2874 // anyway, 2875 // the check shouldn't cost too much performance at this point. In 2876 // extra barrier mode, tasks do not sleep at the separate tasking 2877 // barrier, so this isn't a problem. 
2878 asleep = 0; 2879 if ((__kmp_tasking_mode == tskm_task_teams) && 2880 (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) && 2881 (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) != 2882 NULL)) { 2883 asleep = 1; 2884 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread), 2885 other_thread->th.th_sleep_loc); 2886 // A sleeping thread should not have any tasks on its queue. 2887 // There is a slight possibility that it resumes, steals a task 2888 // from another thread, which spawns more tasks, all in the time 2889 // that it takes this thread to check => don't write an assertion 2890 // that the victim's queue is empty. Try stealing from a 2891 // different thread. 2892 } 2893 } while (asleep); 2894 } 2895 2896 if (!asleep) { 2897 // We have a victim to try to steal from 2898 task = __kmp_steal_task(other_thread, gtid, task_team, 2899 unfinished_threads, thread_finished, 2900 is_constrained); 2901 } 2902 if (task != NULL) { // set last stolen to victim 2903 if (threads_data[tid].td.td_deque_last_stolen != victim_tid) { 2904 threads_data[tid].td.td_deque_last_stolen = victim_tid; 2905 // The pre-refactored code did not try more than 1 successful new 2906 // victim, unless the last one generated more local tasks; 2907 // new_victim keeps track of this 2908 new_victim = 1; 2909 } 2910 } else { // No tasks found; unset last_stolen 2911 KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1); 2912 victim_tid = -2; // no successful victim found 2913 } 2914 } 2915 2916 if (task == NULL) // break out of tasking loop 2917 break; 2918 2919 // Found a task; execute it 2920 #if USE_ITT_BUILD && USE_ITT_NOTIFY 2921 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { 2922 if (itt_sync_obj == NULL) { // we are at fork barrier where we could not 2923 // get the object reliably 2924 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 2925 } 2926 __kmp_itt_task_starting(itt_sync_obj); 2927 } 2928 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 2929 __kmp_invoke_task(gtid, task, current_task); 2930 #if USE_ITT_BUILD 2931 if (itt_sync_obj != NULL) 2932 __kmp_itt_task_finished(itt_sync_obj); 2933 #endif /* USE_ITT_BUILD */ 2934 // If this thread is only partway through the barrier and the condition is 2935 // met, then return now, so that the barrier gather/release pattern can 2936 // proceed. If this thread is in the last spin loop in the barrier, 2937 // waiting to be released, we know that the termination condition will not 2938 // be satisfied, so don't waste any cycles checking it. 2939 if (flag == NULL || (!final_spin && flag->done_check())) { 2940 KA_TRACE( 2941 15, 2942 ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n", 2943 gtid)); 2944 return TRUE; 2945 } 2946 if (thread->th.th_task_team == NULL) { 2947 break; 2948 } 2949 KMP_YIELD(__kmp_library == library_throughput); // Yield before next task 2950 // If execution of a stolen task results in more tasks being placed on our 2951 // run queue, reset use_own_tasks 2952 if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) { 2953 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned " 2954 "other tasks, restart\n", 2955 gtid)); 2956 use_own_tasks = 1; 2957 new_victim = 0; 2958 } 2959 } 2960 2961 // The task source has been exhausted. If in final spin loop of barrier, 2962 // check if termination condition is satisfied. The work queue may be empty 2963 // but there might be proxy tasks still executing.
2964 if (final_spin &&
2965 KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {
2966 // First, decrement the #unfinished threads, if that has not already been
2967 // done. This decrement might be to the spin location, and result in the
2968 // termination condition being satisfied.
2969 if (!*thread_finished) {
2970 kmp_int32 count;
2971
2972 count = KMP_ATOMIC_DEC(unfinished_threads) - 1;
2973 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
2974 "unfinished_threads to %d task_team=%p\n",
2975 gtid, count, task_team));
2976 *thread_finished = TRUE;
2977 }
2978
2979 // It is now unsafe to reference thread->th.th_team !!!
2980 // Decrementing task_team->tt.tt_unfinished_threads can allow the master
2981 // thread to pass through the barrier, where it might reset each thread's
2982 // th.th_team field for the next parallel region. If we can steal more
2983 // work, we know that this has not happened yet.
2984 if (flag != NULL && flag->done_check()) {
2985 KA_TRACE(
2986 15,
2987 ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
2988 gtid));
2989 return TRUE;
2990 }
2991 }
2992
2993 // If this thread's task team is NULL, master has recognized that there are
2994 // no more tasks; bail out
2995 if (thread->th.th_task_team == NULL) {
2996 KA_TRACE(15,
2997 ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
2998 return FALSE;
2999 }
3000
3001 // We could be getting tasks from target constructs; if this is the only
3002 // thread, keep trying to execute tasks from own queue
3003 if (nthreads == 1)
3004 use_own_tasks = 1;
3005 else {
3006 KA_TRACE(15,
3007 ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
3008 return FALSE;
3009 }
3010 }
3011 }
3012
3013 int __kmp_execute_tasks_32(
3014 kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
3015 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3016 kmp_int32 is_constrained) {
3017 return __kmp_execute_tasks_template(
3018 thread, gtid, flag, final_spin,
3019 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3020 }
3021
3022 int __kmp_execute_tasks_64(
3023 kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
3024 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3025 kmp_int32 is_constrained) {
3026 return __kmp_execute_tasks_template(
3027 thread, gtid, flag, final_spin,
3028 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3029 }
3030
3031 int __kmp_execute_tasks_oncore(
3032 kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
3033 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3034 kmp_int32 is_constrained) {
3035 return __kmp_execute_tasks_template(
3036 thread, gtid, flag, final_spin,
3037 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3038 }
3039
3040 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
3041 // next barrier so they can assist in executing enqueued tasks.
3042 // First thread in allocates the task team atomically.
3043 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
3044 kmp_info_t *this_thr) {
3045 kmp_thread_data_t *threads_data;
3046 int nthreads, i, is_init_thread;
3047
3048 KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
3049 __kmp_gtid_from_thread(this_thr)));
3050
3051 KMP_DEBUG_ASSERT(task_team != NULL);
3052 KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
3053
3054 nthreads = task_team->tt.tt_nproc;
3055 KMP_DEBUG_ASSERT(nthreads > 0);
3056 KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
3057
3058 // Allocate or increase the size of threads_data if necessary
3059 is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
3060
3061 if (!is_init_thread) {
3062 // Some other thread already set up the array.
3063 KA_TRACE(
3064 20,
3065 ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
3066 __kmp_gtid_from_thread(this_thr)));
3067 return;
3068 }
3069 threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3070 KMP_DEBUG_ASSERT(threads_data != NULL);
3071
3072 if (__kmp_tasking_mode == tskm_task_teams &&
3073 (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
3074 // Release any threads sleeping at the barrier, so that they can steal
3075 // tasks and execute them. In extra barrier mode, tasks do not sleep
3076 // at the separate tasking barrier, so this isn't a problem.
3077 for (i = 0; i < nthreads; i++) {
3078 volatile void *sleep_loc;
3079 kmp_info_t *thread = threads_data[i].td.td_thr;
3080
3081 if (i == this_thr->th.th_info.ds.ds_tid) {
3082 continue;
3083 }
3084 // Since we haven't locked the thread's suspend mutex lock at this
3085 // point, there is a small window where a thread might be putting
3086 // itself to sleep, but hasn't set the th_sleep_loc field yet.
3087 // To work around this, __kmp_execute_tasks_template() periodically checks
3088 // to see if other threads are sleeping (using the same random mechanism
3089 // that is used for task stealing) and awakens them if they are.
3090 if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3091 NULL) {
3092 KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
3093 __kmp_gtid_from_thread(this_thr),
3094 __kmp_gtid_from_thread(thread)));
3095 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
3096 } else {
3097 KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
3098 __kmp_gtid_from_thread(this_thr),
3099 __kmp_gtid_from_thread(thread)));
3100 }
3101 }
3102 }
3103
3104 KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
3105 __kmp_gtid_from_thread(this_thr)));
3106 }
3107
3108 /* // TODO: Check the comment consistency
3109 * Utility routines for "task teams". A task team (kmp_task_team_t) is kind of
3110 * like a shadow of the kmp_team_t data struct, with a different lifetime.
3111 * After a child thread checks into a barrier and calls __kmp_release() from
3112 * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
3113 * longer assume that the kmp_team_t structure is intact (at any moment, the
3114 * master thread may exit the barrier code and free the team data structure,
3115 * and return the threads to the thread pool).
3116 *
3117 * This does not work with the tasking code, as the thread is still
3118 * expected to participate in the execution of any tasks that may have been
3119 * spawned by a member of the team, and the thread still needs access
3120 * to each thread in the team, so that it can steal work from it.
3121 *
3122 * Enter the existence of the kmp_task_team_t struct. It employs a reference
3123 * counting mechanism, and is allocated by the master thread before calling
3124 * __kmp_<barrier_kind>_release, and then is released by the last thread to
3125 * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
3126 * of the kmp_task_team_t structs for consecutive barriers can overlap
3127 * (and will, unless the master thread is the last thread to exit the barrier
3128 * release phase, which is not typical). The existence of such a struct is
3129 * useful outside the context of tasking.
3130 *
3131 * We currently use the existence of the threads array as an indicator that
3132 * tasks were spawned since the last barrier. If the structure is to be
3133 * useful outside the context of tasking, then this will have to change, but
3134 * not setting the field minimizes the performance impact of tasking on
3135 * barriers, when no explicit tasks were spawned (pushed, actually).
3136 */
3137
3138 static kmp_task_team_t *__kmp_free_task_teams =
3139 NULL; // Free list for task_team data structures
3140 // Lock for task team data structures
3141 kmp_bootstrap_lock_t __kmp_task_team_lock =
3142 KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
3143
3144 // __kmp_alloc_task_deque:
3145 // Allocates a task deque for a particular thread, and initializes the
3146 // necessary data structures relating to the deque. This only happens once per
3147 // thread per task team since task teams are recycled. No lock is needed during
3148 // allocation since each thread allocates its own deque.
3149 static void __kmp_alloc_task_deque(kmp_info_t *thread,
3150 kmp_thread_data_t *thread_data) {
3151 __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
3152 KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
3153
3154 // Initialize last stolen task field to "none"
3155 thread_data->td.td_deque_last_stolen = -1;
3156
3157 KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
3158 KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
3159 KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
3160
3161 KE_TRACE(
3162 10,
3163 ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
3164 __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
3165 // Allocate space for task deque, and zero the deque
3166 // Cannot use __kmp_thread_calloc() because threads not around for
3167 // kmp_reap_task_team( ).
3168 thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
3169 INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
3170 thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
3171 }
3172
3173 // __kmp_free_task_deque:
3174 // Deallocates a task deque for a particular thread. Happens at library
3175 // deallocation so don't need to reset all thread data fields.
3176 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) { 3177 if (thread_data->td.td_deque != NULL) { 3178 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 3179 TCW_4(thread_data->td.td_deque_ntasks, 0); 3180 __kmp_free(thread_data->td.td_deque); 3181 thread_data->td.td_deque = NULL; 3182 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 3183 } 3184 3185 #ifdef BUILD_TIED_TASK_STACK 3186 // GEH: Figure out what to do here for td_susp_tied_tasks 3187 if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) { 3188 __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data); 3189 } 3190 #endif // BUILD_TIED_TASK_STACK 3191 } 3192 3193 // __kmp_realloc_task_threads_data: 3194 // Allocates a threads_data array for a task team, either by allocating an 3195 // initial array or enlarging an existing array. Only the first thread to get 3196 // the lock allocs or enlarges the array and re-initializes the array elements. 3197 // That thread returns "TRUE", the rest return "FALSE". 3198 // Assumes that the new array size is given by task_team -> tt.tt_nproc. 3199 // The current size is given by task_team -> tt.tt_max_threads. 3200 static int __kmp_realloc_task_threads_data(kmp_info_t *thread, 3201 kmp_task_team_t *task_team) { 3202 kmp_thread_data_t **threads_data_p; 3203 kmp_int32 nthreads, maxthreads; 3204 int is_init_thread = FALSE; 3205 3206 if (TCR_4(task_team->tt.tt_found_tasks)) { 3207 // Already reallocated and initialized. 3208 return FALSE; 3209 } 3210 3211 threads_data_p = &task_team->tt.tt_threads_data; 3212 nthreads = task_team->tt.tt_nproc; 3213 maxthreads = task_team->tt.tt_max_threads; 3214 3215 // All threads must lock when they encounter the first task of the implicit 3216 // task region to make sure threads_data fields are (re)initialized before 3217 // used. 3218 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock); 3219 3220 if (!TCR_4(task_team->tt.tt_found_tasks)) { 3221 // first thread to enable tasking 3222 kmp_team_t *team = thread->th.th_team; 3223 int i; 3224 3225 is_init_thread = TRUE; 3226 if (maxthreads < nthreads) { 3227 3228 if (*threads_data_p != NULL) { 3229 kmp_thread_data_t *old_data = *threads_data_p; 3230 kmp_thread_data_t *new_data = NULL; 3231 3232 KE_TRACE( 3233 10, 3234 ("__kmp_realloc_task_threads_data: T#%d reallocating " 3235 "threads data for task_team %p, new_size = %d, old_size = %d\n", 3236 __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads)); 3237 // Reallocate threads_data to have more elements than current array 3238 // Cannot use __kmp_thread_realloc() because threads not around for 3239 // kmp_reap_task_team( ). Note all new array entries are initialized 3240 // to zero by __kmp_allocate(). 
3241 new_data = (kmp_thread_data_t *)__kmp_allocate( 3242 nthreads * sizeof(kmp_thread_data_t)); 3243 // copy old data to new data 3244 KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t), 3245 (void *)old_data, maxthreads * sizeof(kmp_thread_data_t)); 3246 3247 #ifdef BUILD_TIED_TASK_STACK 3248 // GEH: Figure out if this is the right thing to do 3249 for (i = maxthreads; i < nthreads; i++) { 3250 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 3251 __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data); 3252 } 3253 #endif // BUILD_TIED_TASK_STACK 3254 // Install the new data and free the old data 3255 (*threads_data_p) = new_data; 3256 __kmp_free(old_data); 3257 } else { 3258 KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating " 3259 "threads data for task_team %p, size = %d\n", 3260 __kmp_gtid_from_thread(thread), task_team, nthreads)); 3261 // Make the initial allocate for threads_data array, and zero entries 3262 // Cannot use __kmp_thread_calloc() because threads not around for 3263 // kmp_reap_task_team( ). 3264 ANNOTATE_IGNORE_WRITES_BEGIN(); 3265 *threads_data_p = (kmp_thread_data_t *)__kmp_allocate( 3266 nthreads * sizeof(kmp_thread_data_t)); 3267 ANNOTATE_IGNORE_WRITES_END(); 3268 #ifdef BUILD_TIED_TASK_STACK 3269 // GEH: Figure out if this is the right thing to do 3270 for (i = 0; i < nthreads; i++) { 3271 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 3272 __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data); 3273 } 3274 #endif // BUILD_TIED_TASK_STACK 3275 } 3276 task_team->tt.tt_max_threads = nthreads; 3277 } else { 3278 // If array has (more than) enough elements, go ahead and use it 3279 KMP_DEBUG_ASSERT(*threads_data_p != NULL); 3280 } 3281 3282 // initialize threads_data pointers back to thread_info structures 3283 for (i = 0; i < nthreads; i++) { 3284 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 3285 thread_data->td.td_thr = team->t.t_threads[i]; 3286 3287 if (thread_data->td.td_deque_last_stolen >= nthreads) { 3288 // The last stolen field survives across teams / barrier, and the number 3289 // of threads may have changed. It's possible (likely?) that a new 3290 // parallel region will exhibit the same behavior as previous region. 3291 thread_data->td.td_deque_last_stolen = -1; 3292 } 3293 } 3294 3295 KMP_MB(); 3296 TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE); 3297 } 3298 3299 __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock); 3300 return is_init_thread; 3301 } 3302 3303 // __kmp_free_task_threads_data: 3304 // Deallocates a threads_data array for a task team, including any attached 3305 // tasking deques. Only occurs at library shutdown. 3306 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) { 3307 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock); 3308 if (task_team->tt.tt_threads_data != NULL) { 3309 int i; 3310 for (i = 0; i < task_team->tt.tt_max_threads; i++) { 3311 __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]); 3312 } 3313 __kmp_free(task_team->tt.tt_threads_data); 3314 task_team->tt.tt_threads_data = NULL; 3315 } 3316 __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock); 3317 } 3318 3319 // __kmp_allocate_task_team: 3320 // Allocates a task team associated with a specific team, taking it from 3321 // the global task team free list if possible. Also initializes data 3322 // structures. 
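// Simplified sketch of a task team's life cycle, as implemented by the
// routines below (descriptive only, no additional logic):
//   __kmp_allocate_task_team() - pop a recycled struct off
//                                __kmp_free_task_teams, or __kmp_allocate() a
//                                fresh one, then reset its counters/flags
//   __kmp_free_task_team()     - push the struct back onto the free list
//   __kmp_reap_task_teams()    - at library shutdown, walk the free list and
//                                release each struct and its threads_data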
3323 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread, 3324 kmp_team_t *team) { 3325 kmp_task_team_t *task_team = NULL; 3326 int nthreads; 3327 3328 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n", 3329 (thread ? __kmp_gtid_from_thread(thread) : -1), team)); 3330 3331 if (TCR_PTR(__kmp_free_task_teams) != NULL) { 3332 // Take a task team from the task team pool 3333 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 3334 if (__kmp_free_task_teams != NULL) { 3335 task_team = __kmp_free_task_teams; 3336 TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next); 3337 task_team->tt.tt_next = NULL; 3338 } 3339 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 3340 } 3341 3342 if (task_team == NULL) { 3343 KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating " 3344 "task team for team %p\n", 3345 __kmp_gtid_from_thread(thread), team)); 3346 // Allocate a new task team if one is not available. 3347 // Cannot use __kmp_thread_malloc() because threads not around for 3348 // kmp_reap_task_team( ). 3349 task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t)); 3350 __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock); 3351 // AC: __kmp_allocate zeroes returned memory 3352 // task_team -> tt.tt_threads_data = NULL; 3353 // task_team -> tt.tt_max_threads = 0; 3354 // task_team -> tt.tt_next = NULL; 3355 } 3356 3357 TCW_4(task_team->tt.tt_found_tasks, FALSE); 3358 TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); 3359 task_team->tt.tt_nproc = nthreads = team->t.t_nproc; 3360 3361 KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads); 3362 TCW_4(task_team->tt.tt_active, TRUE); 3363 3364 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p " 3365 "unfinished_threads init'd to %d\n", 3366 (thread ? __kmp_gtid_from_thread(thread) : -1), task_team, 3367 KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads))); 3368 return task_team; 3369 } 3370 3371 // __kmp_free_task_team: 3372 // Frees the task team associated with a specific thread, and adds it 3373 // to the global task team free list. 3374 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) { 3375 KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n", 3376 thread ? __kmp_gtid_from_thread(thread) : -1, task_team)); 3377 3378 // Put task team back on free list 3379 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 3380 3381 KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL); 3382 task_team->tt.tt_next = __kmp_free_task_teams; 3383 TCW_PTR(__kmp_free_task_teams, task_team); 3384 3385 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 3386 } 3387 3388 // __kmp_reap_task_teams: 3389 // Free all the task teams on the task team free list. 3390 // Should only be done during library shutdown. 3391 // Cannot do anything that needs a thread structure or gtid since they are 3392 // already gone. 
3393 void __kmp_reap_task_teams(void) {
3394 kmp_task_team_t *task_team;
3395
3396 if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3397 // Free all task_teams on the free list
3398 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3399 while ((task_team = __kmp_free_task_teams) != NULL) {
3400 __kmp_free_task_teams = task_team->tt.tt_next;
3401 task_team->tt.tt_next = NULL;
3402
3403 // Free threads_data if necessary
3404 if (task_team->tt.tt_threads_data != NULL) {
3405 __kmp_free_task_threads_data(task_team);
3406 }
3407 __kmp_free(task_team);
3408 }
3409 __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3410 }
3411 }
3412
3413 // __kmp_wait_to_unref_task_teams:
3414 // Some threads could still be in the fork barrier release code, possibly
3415 // trying to steal tasks. Wait for each thread to unreference its task team.
3416 void __kmp_wait_to_unref_task_teams(void) {
3417 kmp_info_t *thread;
3418 kmp_uint32 spins;
3419 int done;
3420
3421 KMP_INIT_YIELD(spins);
3422
3423 for (;;) {
3424 done = TRUE;
3425
3426 // TODO: GEH - this may be wrong because some sync would be necessary
3427 // in case threads are added to the pool during the traversal. Need to
3428 // verify that lock for thread pool is held when calling this routine.
3429 for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
3430 thread = thread->th.th_next_pool) {
3431 #if KMP_OS_WINDOWS
3432 DWORD exit_val;
3433 #endif
3434 if (TCR_PTR(thread->th.th_task_team) == NULL) {
3435 KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
3436 __kmp_gtid_from_thread(thread)));
3437 continue;
3438 }
3439 #if KMP_OS_WINDOWS
3440 // TODO: GEH - add this check for Linux* OS / OS X* as well?
3441 if (!__kmp_is_thread_alive(thread, &exit_val)) {
3442 thread->th.th_task_team = NULL;
3443 continue;
3444 }
3445 #endif
3446
3447 done = FALSE; // Because th_task_team pointer is not NULL for this thread
3448
3449 KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
3450 "unreference task_team\n",
3451 __kmp_gtid_from_thread(thread)));
3452
3453 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
3454 volatile void *sleep_loc;
3455 // If the thread is sleeping, awaken it.
3456 if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3457 NULL) {
3458 KA_TRACE(
3459 10,
3460 ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
3461 __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
3462 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
3463 }
3464 }
3465 }
3466 if (done) {
3467 break;
3468 }
3469
3470 // If oversubscribed or have waited a bit, yield.
3471 KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
3472 }
3473 }
3474
3475 // __kmp_task_team_setup: Create a task_team for the current team, but use
3476 // an already created, unused one if it already exists.
3477 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
3478 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3479
3480 // If this task_team hasn't been created yet, allocate it. It will be used in
3481 // the region after the next.
3482 // If it exists, it is the current task team and shouldn't be touched yet as
3483 // it may still be in use.
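// Illustrative view of the double-buffering used below (a sketch, not extra
// logic): th_task_state acts as a parity bit s, so
//   t_task_team[s]     is the task team for the current region; it is only
//                      allocated here if it does not exist yet, and is
//                      otherwise left alone, while
//   t_task_team[1 - s] is (re)initialized now for the upcoming region and
//                      becomes current once threads toggle th_task_state in
//                      __kmp_task_team_sync().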
3484 if (team->t.t_task_team[this_thr->th.th_task_state] == NULL && 3485 (always || team->t.t_nproc > 1)) { 3486 team->t.t_task_team[this_thr->th.th_task_state] = 3487 __kmp_allocate_task_team(this_thr, team); 3488 KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p " 3489 "for team %d at parity=%d\n", 3490 __kmp_gtid_from_thread(this_thr), 3491 team->t.t_task_team[this_thr->th.th_task_state], 3492 ((team != NULL) ? team->t.t_id : -1), 3493 this_thr->th.th_task_state)); 3494 } 3495 3496 // After threads exit the release, they will call sync, and then point to this 3497 // other task_team; make sure it is allocated and properly initialized. As 3498 // threads spin in the barrier release phase, they will continue to use the 3499 // previous task_team struct(above), until they receive the signal to stop 3500 // checking for tasks (they can't safely reference the kmp_team_t struct, 3501 // which could be reallocated by the master thread). No task teams are formed 3502 // for serialized teams. 3503 if (team->t.t_nproc > 1) { 3504 int other_team = 1 - this_thr->th.th_task_state; 3505 if (team->t.t_task_team[other_team] == NULL) { // setup other team as well 3506 team->t.t_task_team[other_team] = 3507 __kmp_allocate_task_team(this_thr, team); 3508 KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new " 3509 "task_team %p for team %d at parity=%d\n", 3510 __kmp_gtid_from_thread(this_thr), 3511 team->t.t_task_team[other_team], 3512 ((team != NULL) ? team->t.t_id : -1), other_team)); 3513 } else { // Leave the old task team struct in place for the upcoming region; 3514 // adjust as needed 3515 kmp_task_team_t *task_team = team->t.t_task_team[other_team]; 3516 if (!task_team->tt.tt_active || 3517 team->t.t_nproc != task_team->tt.tt_nproc) { 3518 TCW_4(task_team->tt.tt_nproc, team->t.t_nproc); 3519 TCW_4(task_team->tt.tt_found_tasks, FALSE); 3520 TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); 3521 KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, 3522 team->t.t_nproc); 3523 TCW_4(task_team->tt.tt_active, TRUE); 3524 } 3525 // if team size has changed, the first thread to enable tasking will 3526 // realloc threads_data if necessary 3527 KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team " 3528 "%p for team %d at parity=%d\n", 3529 __kmp_gtid_from_thread(this_thr), 3530 team->t.t_task_team[other_team], 3531 ((team != NULL) ? team->t.t_id : -1), other_team)); 3532 } 3533 } 3534 } 3535 3536 // __kmp_task_team_sync: Propagation of task team data from team to threads 3537 // which happens just after the release phase of a team barrier. This may be 3538 // called by any thread, but only for teams with # threads > 1. 3539 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) { 3540 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 3541 3542 // Toggle the th_task_state field, to switch which task_team this thread 3543 // refers to 3544 this_thr->th.th_task_state = 1 - this_thr->th.th_task_state; 3545 // It is now safe to propagate the task team pointer from the team struct to 3546 // the current thread. 3547 TCW_PTR(this_thr->th.th_task_team, 3548 team->t.t_task_team[this_thr->th.th_task_state]); 3549 KA_TRACE(20, 3550 ("__kmp_task_team_sync: Thread T#%d task team switched to task_team " 3551 "%p from Team #%d (parity=%d)\n", 3552 __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team, 3553 ((team != NULL) ? 
team->t.t_id : -1), this_thr->th.th_task_state));
3554 }
3555
3556 // __kmp_task_team_wait: Master thread waits for outstanding tasks after the
3557 // barrier gather phase. Only called by master thread if #threads in team > 1 or
3558 // if proxy tasks were created.
3559 //
3560 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
3561 // by passing in 0 optionally as the last argument. When wait is zero, master
3562 // thread does not wait for unfinished_threads to reach 0.
3563 void __kmp_task_team_wait(
3564 kmp_info_t *this_thr,
3565 kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
3566 kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
3567
3568 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3569 KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
3570
3571 if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
3572 if (wait) {
3573 KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks "
3574 "(for unfinished_threads to reach 0) on task_team = %p\n",
3575 __kmp_gtid_from_thread(this_thr), task_team));
3576 // Worker threads may have dropped through to release phase, but could
3577 // still be executing tasks. Wait here for tasks to complete. To avoid
3578 // memory contention, only master thread checks termination condition.
3579 kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
3580 &task_team->tt.tt_unfinished_threads),
3581 0U);
3582 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
3583 }
3584 // Deactivate the old task team, so that the worker threads will stop
3585 // referencing it while spinning.
3586 KA_TRACE(
3587 20,
3588 ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: "
3589 "setting active to false, setting local and team's pointer to NULL\n",
3590 __kmp_gtid_from_thread(this_thr), task_team));
3591 KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
3592 task_team->tt.tt_found_proxy_tasks == TRUE);
3593 TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3594 KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
3595 TCW_SYNC_4(task_team->tt.tt_active, FALSE);
3596 KMP_MB();
3597
3598 TCW_PTR(this_thr->th.th_task_team, NULL);
3599 }
3600 }
3601
3602 // __kmp_tasking_barrier:
3603 // This routine may only be called when __kmp_tasking_mode == tskm_extra_barrier.
3604 // Internal function to execute all tasks prior to a regular barrier or a join
3605 // barrier. It is a full barrier itself, which unfortunately turns regular
3606 // barriers into double barriers and join barriers into 1 1/2 barriers.
3607 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
3608 std::atomic<kmp_uint32> *spin = RCAST(
3609 std::atomic<kmp_uint32> *,
3610 &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
3611 int flag = FALSE;
3612 KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
3613
3614 #if USE_ITT_BUILD
3615 KMP_FSYNC_SPIN_INIT(spin, NULL);
3616 #endif /* USE_ITT_BUILD */
3617 kmp_flag_32 spin_flag(spin, 0U);
3618 while (!spin_flag.execute_tasks(thread, gtid, TRUE,
3619 &flag USE_ITT_BUILD_ARG(NULL), 0)) {
3620 #if USE_ITT_BUILD
3621 // TODO: What about itt_sync_obj??
3622 KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin)); 3623 #endif /* USE_ITT_BUILD */ 3624 3625 if (TCR_4(__kmp_global.g.g_done)) { 3626 if (__kmp_global.g.g_abort) 3627 __kmp_abort_thread(); 3628 break; 3629 } 3630 KMP_YIELD(TRUE); 3631 } 3632 #if USE_ITT_BUILD 3633 KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin)); 3634 #endif /* USE_ITT_BUILD */ 3635 } 3636 3637 // __kmp_give_task puts a task into a given thread queue if: 3638 // - the queue for that thread was created 3639 // - there's space in that queue 3640 // Because of this, __kmp_push_task needs to check if there's space after 3641 // getting the lock 3642 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task, 3643 kmp_int32 pass) { 3644 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 3645 kmp_task_team_t *task_team = taskdata->td_task_team; 3646 3647 KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n", 3648 taskdata, tid)); 3649 3650 // If task_team is NULL something went really bad... 3651 KMP_DEBUG_ASSERT(task_team != NULL); 3652 3653 bool result = false; 3654 kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid]; 3655 3656 if (thread_data->td.td_deque == NULL) { 3657 // There's no queue in this thread, go find another one 3658 // We're guaranteed that at least one thread has a queue 3659 KA_TRACE(30, 3660 ("__kmp_give_task: thread %d has no queue while giving task %p.\n", 3661 tid, taskdata)); 3662 return result; 3663 } 3664 3665 if (TCR_4(thread_data->td.td_deque_ntasks) >= 3666 TASK_DEQUE_SIZE(thread_data->td)) { 3667 KA_TRACE( 3668 30, 3669 ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", 3670 taskdata, tid)); 3671 3672 // if this deque is bigger than the pass ratio give a chance to another 3673 // thread 3674 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass) 3675 return result; 3676 3677 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 3678 if (TCR_4(thread_data->td.td_deque_ntasks) >= 3679 TASK_DEQUE_SIZE(thread_data->td)) { 3680 // expand deque to push the task which is not allowed to execute 3681 __kmp_realloc_task_deque(thread, thread_data); 3682 } 3683 3684 } else { 3685 3686 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 3687 3688 if (TCR_4(thread_data->td.td_deque_ntasks) >= 3689 TASK_DEQUE_SIZE(thread_data->td)) { 3690 KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to " 3691 "thread %d.\n", 3692 taskdata, tid)); 3693 3694 // if this deque is bigger than the pass ratio give a chance to another 3695 // thread 3696 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass) 3697 goto release_and_exit; 3698 3699 __kmp_realloc_task_deque(thread, thread_data); 3700 } 3701 } 3702 3703 // lock is held here, and there is space in the deque 3704 3705 thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata; 3706 // Wrap index. 
3707 thread_data->td.td_deque_tail =
3708 (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
3709 TCW_4(thread_data->td.td_deque_ntasks,
3710 TCR_4(thread_data->td.td_deque_ntasks) + 1);
3711
3712 result = true;
3713 KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
3714 taskdata, tid));
3715
3716 release_and_exit:
3717 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3718
3719 return result;
3720 }
3721
3722 /* The finish of the proxy tasks is divided in two pieces:
3723 - the top half is the one that can be done from a thread outside the team
3724 - the bottom half must be run from a thread within the team
3725
3726 In order to run the bottom half the task gets queued back into one of the
3727 threads of the team. Once the td_incomplete_child_tasks counter of the parent
3728 is decremented the threads can leave the barriers. So, the bottom half needs
3729 to be queued before the counter is decremented. The top half is therefore
3730 divided in two parts:
3731 - things that can be run before queuing the bottom half
3732 - things that must be run after queuing the bottom half
3733
3734 This creates a second race as the bottom half can free the task before the
3735 second top half is executed. To avoid this we use the
3736 td_incomplete_child_tasks of the proxy task to synchronize the top and bottom
3737 half. */
3738 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3739 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
3740 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3741 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
3742 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
3743
3744 taskdata->td_flags.complete = 1; // mark the task as completed
3745
3746 if (taskdata->td_taskgroup)
3747 KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
3748
3749 // Create an imaginary child for this task so the bottom half cannot
3750 // release the task before we have completed the second top half
3751 KMP_ATOMIC_INC(&taskdata->td_incomplete_child_tasks);
3752 }
3753
3754 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3755 kmp_int32 children = 0;
3756
3757 // Predecrement simulated by "- 1" calculation
3758 children =
3759 KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
3760 KMP_DEBUG_ASSERT(children >= 0);
3761
3762 // Remove the imaginary child
3763 KMP_ATOMIC_DEC(&taskdata->td_incomplete_child_tasks);
3764 }
3765
3766 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
3767 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3768 kmp_info_t *thread = __kmp_threads[gtid];
3769
3770 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3771 KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
3772 1); // top half must run before bottom half
3773
3774 // We need to wait to make sure the top half is finished
3775 // Spinning here should be ok as this should happen quickly
3776 while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) > 0)
3777 ;
3778
3779 __kmp_release_deps(gtid, taskdata);
3780 __kmp_free_task_and_ancestors(gtid, taskdata, thread);
3781 }
3782
3783 /*!
3784 @ingroup TASKING
3785 @param gtid Global Thread ID of encountering thread
3786 @param ptask Task which execution is completed
3787
3788 Execute the completion of a proxy task from a thread that is part of the
3789 team. Run the top and bottom halves directly.
3790 */ 3791 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) { 3792 KMP_DEBUG_ASSERT(ptask != NULL); 3793 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 3794 KA_TRACE( 3795 10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n", 3796 gtid, taskdata)); 3797 3798 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); 3799 3800 __kmp_first_top_half_finish_proxy(taskdata); 3801 __kmp_second_top_half_finish_proxy(taskdata); 3802 __kmp_bottom_half_finish_proxy(gtid, ptask); 3803 3804 KA_TRACE(10, 3805 ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n", 3806 gtid, taskdata)); 3807 } 3808 3809 /*! 3810 @ingroup TASKING 3811 @param ptask Task which execution is completed 3812 3813 Execute the completion of a proxy task from a thread that could not belong to 3814 the team. 3815 */ 3816 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) { 3817 KMP_DEBUG_ASSERT(ptask != NULL); 3818 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 3819 3820 KA_TRACE( 3821 10, 3822 ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n", 3823 taskdata)); 3824 3825 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); 3826 3827 __kmp_first_top_half_finish_proxy(taskdata); 3828 3829 // Enqueue task to complete bottom half completion from a thread within the 3830 // corresponding team 3831 kmp_team_t *team = taskdata->td_team; 3832 kmp_int32 nthreads = team->t.t_nproc; 3833 kmp_info_t *thread; 3834 3835 // This should be similar to start_k = __kmp_get_random( thread ) % nthreads 3836 // but we cannot use __kmp_get_random here 3837 kmp_int32 start_k = 0; 3838 kmp_int32 pass = 1; 3839 kmp_int32 k = start_k; 3840 3841 do { 3842 // For now we're just linearly trying to find a thread 3843 thread = team->t.t_threads[k]; 3844 k = (k + 1) % nthreads; 3845 3846 // we did a full pass through all the threads 3847 if (k == start_k) 3848 pass = pass << 1; 3849 3850 } while (!__kmp_give_task(thread, k, ptask, pass)); 3851 3852 __kmp_second_top_half_finish_proxy(taskdata); 3853 3854 KA_TRACE( 3855 10, 3856 ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n", 3857 taskdata)); 3858 } 3859 3860 kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid, 3861 kmp_task_t *task) { 3862 kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task); 3863 if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) { 3864 td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION; 3865 td->td_allow_completion_event.ed.task = task; 3866 __kmp_init_tas_lock(&td->td_allow_completion_event.lock); 3867 } 3868 return &td->td_allow_completion_event; 3869 } 3870 3871 void __kmp_fulfill_event(kmp_event_t *event) { 3872 if (event->type == KMP_EVENT_ALLOW_COMPLETION) { 3873 kmp_task_t *ptask = event->ed.task; 3874 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 3875 bool detached = false; 3876 int gtid = __kmp_get_gtid(); 3877 3878 // The associated task might have completed or could be completing at this 3879 // point. 
3880 // We need to take the lock to avoid races 3881 __kmp_acquire_tas_lock(&event->lock, gtid); 3882 if (taskdata->td_flags.proxy == TASK_PROXY) { 3883 detached = true; 3884 } else { 3885 #if OMPT_SUPPORT 3886 // The OMPT event must occur under mutual exclusion, 3887 // otherwise the tool might access ptask after free 3888 if (UNLIKELY(ompt_enabled.enabled)) 3889 __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill); 3890 #endif 3891 } 3892 event->type = KMP_EVENT_UNINITIALIZED; 3893 __kmp_release_tas_lock(&event->lock, gtid); 3894 3895 if (detached) { 3896 #if OMPT_SUPPORT 3897 // We free ptask afterwards and know the task is finished, 3898 // so locking is not necessary 3899 if (UNLIKELY(ompt_enabled.enabled)) 3900 __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill); 3901 #endif 3902 // If the task detached complete the proxy task 3903 if (gtid >= 0) { 3904 kmp_team_t *team = taskdata->td_team; 3905 kmp_info_t *thread = __kmp_get_thread(); 3906 if (thread->th.th_team == team) { 3907 __kmpc_proxy_task_completed(gtid, ptask); 3908 return; 3909 } 3910 } 3911 3912 // fallback 3913 __kmpc_proxy_task_completed_ooo(ptask); 3914 } 3915 } 3916 } 3917 3918 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task 3919 // for taskloop 3920 // 3921 // thread: allocating thread 3922 // task_src: pointer to source task to be duplicated 3923 // returns: a pointer to the allocated kmp_task_t structure (task). 3924 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) { 3925 kmp_task_t *task; 3926 kmp_taskdata_t *taskdata; 3927 kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src); 3928 kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task 3929 size_t shareds_offset; 3930 size_t task_size; 3931 3932 KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread, 3933 task_src)); 3934 KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy == 3935 TASK_FULL); // it should not be proxy task 3936 KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT); 3937 task_size = taskdata_src->td_size_alloc; 3938 3939 // Allocate a kmp_taskdata_t block and a kmp_task_t block. 
3940 KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread, 3941 task_size)); 3942 #if USE_FAST_MEMORY 3943 taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size); 3944 #else 3945 taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size); 3946 #endif /* USE_FAST_MEMORY */ 3947 KMP_MEMCPY(taskdata, taskdata_src, task_size); 3948 3949 task = KMP_TASKDATA_TO_TASK(taskdata); 3950 3951 // Initialize new task (only specific fields not affected by memcpy) 3952 taskdata->td_task_id = KMP_GEN_TASK_ID(); 3953 if (task->shareds != NULL) { // need setup shareds pointer 3954 shareds_offset = (char *)task_src->shareds - (char *)taskdata_src; 3955 task->shareds = &((char *)taskdata)[shareds_offset]; 3956 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) == 3957 0); 3958 } 3959 taskdata->td_alloc_thread = thread; 3960 taskdata->td_parent = parent_task; 3961 // task inherits the taskgroup from the parent task 3962 taskdata->td_taskgroup = parent_task->td_taskgroup; 3963 // tied task needs to initialize the td_last_tied at creation, 3964 // untied one does this when it is scheduled for execution 3965 if (taskdata->td_flags.tiedness == TASK_TIED) 3966 taskdata->td_last_tied = taskdata; 3967 3968 // Only need to keep track of child task counts if team parallel and tasking 3969 // not serialized 3970 if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) { 3971 KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks); 3972 if (parent_task->td_taskgroup) 3973 KMP_ATOMIC_INC(&parent_task->td_taskgroup->count); 3974 // Only need to keep track of allocated child tasks for explicit tasks since 3975 // implicit not deallocated 3976 if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) 3977 KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks); 3978 } 3979 3980 KA_TRACE(20, 3981 ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n", 3982 thread, taskdata, taskdata->td_parent)); 3983 #if OMPT_SUPPORT 3984 if (UNLIKELY(ompt_enabled.enabled)) 3985 __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid); 3986 #endif 3987 return task; 3988 } 3989 3990 // Routine optionally generated by the compiler for setting the lastprivate flag 3991 // and calling needed constructors for private/firstprivate objects 3992 // (used to form taskloop tasks from pattern task) 3993 // Parameters: dest task, src task, lastprivate flag. 3994 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32); 3995 3996 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8); 3997 3998 // class to encapsulate manipulating loop bounds in a taskloop task. 3999 // this abstracts away the Intel vs GOMP taskloop interface for setting/getting 4000 // the loop bound variables. 
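// Illustrative note on the layout handled by this class (descriptive only):
// for an Intel-style task the 64-bit lower/upper bounds live at fixed offsets
// inside the kmp_task_t (lower_offset/upper_offset below); for a GOMP-native
// task (td_flags.native) they are the first two elements of task->shareds,
// stored as 32-bit or 64-bit values depending on td_size_loop_bounds, which
// reflects sizeof(long) for the GOMP interface.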
4001 class kmp_taskloop_bounds_t { 4002 kmp_task_t *task; 4003 const kmp_taskdata_t *taskdata; 4004 size_t lower_offset; 4005 size_t upper_offset; 4006 4007 public: 4008 kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub) 4009 : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)), 4010 lower_offset((char *)lb - (char *)task), 4011 upper_offset((char *)ub - (char *)task) { 4012 KMP_DEBUG_ASSERT((char *)lb > (char *)_task); 4013 KMP_DEBUG_ASSERT((char *)ub > (char *)_task); 4014 } 4015 kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds) 4016 : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)), 4017 lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {} 4018 size_t get_lower_offset() const { return lower_offset; } 4019 size_t get_upper_offset() const { return upper_offset; } 4020 kmp_uint64 get_lb() const { 4021 kmp_int64 retval; 4022 #if defined(KMP_GOMP_COMPAT) 4023 // Intel task just returns the lower bound normally 4024 if (!taskdata->td_flags.native) { 4025 retval = *(kmp_int64 *)((char *)task + lower_offset); 4026 } else { 4027 // GOMP task has to take into account the sizeof(long) 4028 if (taskdata->td_size_loop_bounds == 4) { 4029 kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds); 4030 retval = (kmp_int64)*lb; 4031 } else { 4032 kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds); 4033 retval = (kmp_int64)*lb; 4034 } 4035 } 4036 #else 4037 retval = *(kmp_int64 *)((char *)task + lower_offset); 4038 #endif // defined(KMP_GOMP_COMPAT) 4039 return retval; 4040 } 4041 kmp_uint64 get_ub() const { 4042 kmp_int64 retval; 4043 #if defined(KMP_GOMP_COMPAT) 4044 // Intel task just returns the upper bound normally 4045 if (!taskdata->td_flags.native) { 4046 retval = *(kmp_int64 *)((char *)task + upper_offset); 4047 } else { 4048 // GOMP task has to take into account the sizeof(long) 4049 if (taskdata->td_size_loop_bounds == 4) { 4050 kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1; 4051 retval = (kmp_int64)*ub; 4052 } else { 4053 kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1; 4054 retval = (kmp_int64)*ub; 4055 } 4056 } 4057 #else 4058 retval = *(kmp_int64 *)((char *)task + upper_offset); 4059 #endif // defined(KMP_GOMP_COMPAT) 4060 return retval; 4061 } 4062 void set_lb(kmp_uint64 lb) { 4063 #if defined(KMP_GOMP_COMPAT) 4064 // Intel task just sets the lower bound normally 4065 if (!taskdata->td_flags.native) { 4066 *(kmp_uint64 *)((char *)task + lower_offset) = lb; 4067 } else { 4068 // GOMP task has to take into account the sizeof(long) 4069 if (taskdata->td_size_loop_bounds == 4) { 4070 kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds); 4071 *lower = (kmp_uint32)lb; 4072 } else { 4073 kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds); 4074 *lower = (kmp_uint64)lb; 4075 } 4076 } 4077 #else 4078 *(kmp_uint64 *)((char *)task + lower_offset) = lb; 4079 #endif // defined(KMP_GOMP_COMPAT) 4080 } 4081 void set_ub(kmp_uint64 ub) { 4082 #if defined(KMP_GOMP_COMPAT) 4083 // Intel task just sets the upper bound normally 4084 if (!taskdata->td_flags.native) { 4085 *(kmp_uint64 *)((char *)task + upper_offset) = ub; 4086 } else { 4087 // GOMP task has to take into account the sizeof(long) 4088 if (taskdata->td_size_loop_bounds == 4) { 4089 kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1; 4090 *upper = (kmp_uint32)ub; 4091 } else { 4092 kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1; 4093 *upper = (kmp_uint64)ub; 4094 } 4095 } 4096 #else 4097 *(kmp_uint64 *)((char *)task + upper_offset) = ub; 4098 
#endif // defined(KMP_GOMP_COMPAT) 4099 } 4100 }; 4101 4102 // __kmp_taskloop_linear: Start tasks of the taskloop linearly 4103 // 4104 // loc Source location information 4105 // gtid Global thread ID 4106 // task Pattern task, exposes the loop iteration range 4107 // lb Pointer to loop lower bound in task structure 4108 // ub Pointer to loop upper bound in task structure 4109 // st Loop stride 4110 // ub_glob Global upper bound (used for lastprivate check) 4111 // num_tasks Number of tasks to execute 4112 // grainsize Number of loop iterations per task 4113 // extras Number of chunks with grainsize+1 iterations 4114 // tc Iterations count 4115 // task_dup Tasks duplication routine 4116 // codeptr_ra Return address for OMPT events 4117 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task, 4118 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, 4119 kmp_uint64 ub_glob, kmp_uint64 num_tasks, 4120 kmp_uint64 grainsize, kmp_uint64 extras, 4121 kmp_uint64 tc, 4122 #if OMPT_SUPPORT 4123 void *codeptr_ra, 4124 #endif 4125 void *task_dup) { 4126 KMP_COUNT_BLOCK(OMP_TASKLOOP); 4127 KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling); 4128 p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; 4129 // compiler provides global bounds here 4130 kmp_taskloop_bounds_t task_bounds(task, lb, ub); 4131 kmp_uint64 lower = task_bounds.get_lb(); 4132 kmp_uint64 upper = task_bounds.get_ub(); 4133 kmp_uint64 i; 4134 kmp_info_t *thread = __kmp_threads[gtid]; 4135 kmp_taskdata_t *current_task = thread->th.th_current_task; 4136 kmp_task_t *next_task; 4137 kmp_int32 lastpriv = 0; 4138 4139 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras); 4140 KMP_DEBUG_ASSERT(num_tasks > extras); 4141 KMP_DEBUG_ASSERT(num_tasks > 0); 4142 KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, " 4143 "extras %lld, i=%lld,%lld(%d)%lld, dup %p\n", 4144 gtid, num_tasks, grainsize, extras, lower, upper, ub_glob, st, 4145 task_dup)); 4146 4147 // Launch num_tasks tasks, assign grainsize iterations each task 4148 for (i = 0; i < num_tasks; ++i) { 4149 kmp_uint64 chunk_minus_1; 4150 if (extras == 0) { 4151 chunk_minus_1 = grainsize - 1; 4152 } else { 4153 chunk_minus_1 = grainsize; 4154 --extras; // first extras iterations get bigger chunk (grainsize+1) 4155 } 4156 upper = lower + st * chunk_minus_1; 4157 if (i == num_tasks - 1) { 4158 // schedule the last task, set lastprivate flag if needed 4159 if (st == 1) { // most common case 4160 KMP_DEBUG_ASSERT(upper == *ub); 4161 if (upper == ub_glob) 4162 lastpriv = 1; 4163 } else if (st > 0) { // positive loop stride 4164 KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper); 4165 if ((kmp_uint64)st > ub_glob - upper) 4166 lastpriv = 1; 4167 } else { // negative loop stride 4168 KMP_DEBUG_ASSERT(upper + st < *ub); 4169 if (upper - ub_glob < (kmp_uint64)(-st)) 4170 lastpriv = 1; 4171 } 4172 } 4173 next_task = __kmp_task_dup_alloc(thread, task); // allocate new task 4174 kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task); 4175 kmp_taskloop_bounds_t next_task_bounds = 4176 kmp_taskloop_bounds_t(next_task, task_bounds); 4177 4178 // adjust task-specific bounds 4179 next_task_bounds.set_lb(lower); 4180 if (next_taskdata->td_flags.native) { 4181 next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1)); 4182 } else { 4183 next_task_bounds.set_ub(upper); 4184 } 4185 if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates, 4186 // etc. 
4187 ptask_dup(next_task, task, lastpriv); 4188 KA_TRACE(40, 4189 ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, " 4190 "upper %lld stride %lld, (offsets %p %p)\n", 4191 gtid, i, next_task, lower, upper, st, 4192 next_task_bounds.get_lower_offset(), 4193 next_task_bounds.get_upper_offset())); 4194 #if OMPT_SUPPORT 4195 __kmp_omp_taskloop_task(NULL, gtid, next_task, 4196 codeptr_ra); // schedule new task 4197 #else 4198 __kmp_omp_task(gtid, next_task, true); // schedule new task 4199 #endif 4200 lower = upper + st; // adjust lower bound for the next iteration 4201 } 4202 // free the pattern task and exit 4203 __kmp_task_start(gtid, task, current_task); // make internal bookkeeping 4204 // do not execute the pattern task, just do internal bookkeeping 4205 __kmp_task_finish<false>(gtid, task, current_task); 4206 } 4207 4208 // Structure to keep taskloop parameters for auxiliary task 4209 // kept in the shareds of the task structure. 4210 typedef struct __taskloop_params { 4211 kmp_task_t *task; 4212 kmp_uint64 *lb; 4213 kmp_uint64 *ub; 4214 void *task_dup; 4215 kmp_int64 st; 4216 kmp_uint64 ub_glob; 4217 kmp_uint64 num_tasks; 4218 kmp_uint64 grainsize; 4219 kmp_uint64 extras; 4220 kmp_uint64 tc; 4221 kmp_uint64 num_t_min; 4222 #if OMPT_SUPPORT 4223 void *codeptr_ra; 4224 #endif 4225 } __taskloop_params_t; 4226 4227 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *, 4228 kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64, 4229 kmp_uint64, kmp_uint64, kmp_uint64, kmp_uint64, 4230 #if OMPT_SUPPORT 4231 void *, 4232 #endif 4233 void *); 4234 4235 // Execute part of the taskloop submitted as a task. 4236 int __kmp_taskloop_task(int gtid, void *ptask) { 4237 __taskloop_params_t *p = 4238 (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds; 4239 kmp_task_t *task = p->task; 4240 kmp_uint64 *lb = p->lb; 4241 kmp_uint64 *ub = p->ub; 4242 void *task_dup = p->task_dup; 4243 // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; 4244 kmp_int64 st = p->st; 4245 kmp_uint64 ub_glob = p->ub_glob; 4246 kmp_uint64 num_tasks = p->num_tasks; 4247 kmp_uint64 grainsize = p->grainsize; 4248 kmp_uint64 extras = p->extras; 4249 kmp_uint64 tc = p->tc; 4250 kmp_uint64 num_t_min = p->num_t_min; 4251 #if OMPT_SUPPORT 4252 void *codeptr_ra = p->codeptr_ra; 4253 #endif 4254 #if KMP_DEBUG 4255 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 4256 KMP_DEBUG_ASSERT(task != NULL); 4257 KA_TRACE(20, ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize" 4258 " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n", 4259 gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st, 4260 task_dup)); 4261 #endif 4262 KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min); 4263 if (num_tasks > num_t_min) 4264 __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks, 4265 grainsize, extras, tc, num_t_min, 4266 #if OMPT_SUPPORT 4267 codeptr_ra, 4268 #endif 4269 task_dup); 4270 else 4271 __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks, 4272 grainsize, extras, tc, 4273 #if OMPT_SUPPORT 4274 codeptr_ra, 4275 #endif 4276 task_dup); 4277 4278 KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid)); 4279 return 0; 4280 } 4281 4282 // Schedule part of the taskloop as a task, 4283 // execute the rest of the taskloop. 
4284 // 4285 // loc Source location information 4286 // gtid Global thread ID 4287 // task Pattern task, exposes the loop iteration range 4288 // lb Pointer to loop lower bound in task structure 4289 // ub Pointer to loop upper bound in task structure 4290 // st Loop stride 4291 // ub_glob Global upper bound (used for lastprivate check) 4292 // num_tasks Number of tasks to execute 4293 // grainsize Number of loop iterations per task 4294 // extras Number of chunks with grainsize+1 iterations 4295 // tc Iterations count 4296 // num_t_min Threshold to launch tasks recursively 4297 // task_dup Tasks duplication routine 4298 // codeptr_ra Return address for OMPT events 4299 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task, 4300 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, 4301 kmp_uint64 ub_glob, kmp_uint64 num_tasks, 4302 kmp_uint64 grainsize, kmp_uint64 extras, 4303 kmp_uint64 tc, kmp_uint64 num_t_min, 4304 #if OMPT_SUPPORT 4305 void *codeptr_ra, 4306 #endif 4307 void *task_dup) { 4308 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 4309 KMP_DEBUG_ASSERT(task != NULL); 4310 KMP_DEBUG_ASSERT(num_tasks > num_t_min); 4311 KA_TRACE(20, ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize" 4312 " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n", 4313 gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st, 4314 task_dup)); 4315 p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; 4316 kmp_uint64 lower = *lb; 4317 kmp_info_t *thread = __kmp_threads[gtid]; 4318 // kmp_taskdata_t *current_task = thread->th.th_current_task; 4319 kmp_task_t *next_task; 4320 size_t lower_offset = 4321 (char *)lb - (char *)task; // remember offset of lb in the task structure 4322 size_t upper_offset = 4323 (char *)ub - (char *)task; // remember offset of ub in the task structure 4324 4325 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras); 4326 KMP_DEBUG_ASSERT(num_tasks > extras); 4327 KMP_DEBUG_ASSERT(num_tasks > 0); 4328 4329 // split the loop in two halves 4330 kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1; 4331 kmp_uint64 gr_size0 = grainsize; 4332 kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute 4333 kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task 4334 if (n_tsk0 <= extras) { 4335 gr_size0++; // integrate extras into grainsize 4336 ext0 = 0; // no extra iters in 1st half 4337 ext1 = extras - n_tsk0; // remaining extras 4338 tc0 = gr_size0 * n_tsk0; 4339 tc1 = tc - tc0; 4340 } else { // n_tsk0 > extras 4341 ext1 = 0; // no extra iters in 2nd half 4342 ext0 = extras; 4343 tc1 = grainsize * n_tsk1; 4344 tc0 = tc - tc1; 4345 } 4346 ub0 = lower + st * (tc0 - 1); 4347 lb1 = ub0 + st; 4348 4349 // create pattern task for 2nd half of the loop 4350 next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task 4351 // adjust lower bound (upper bound is not changed) for the 2nd half 4352 *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1; 4353 if (ptask_dup != NULL) // construct firstprivates, etc. 
4354 ptask_dup(next_task, task, 0); 4355 *ub = ub0; // adjust upper bound for the 1st half 4356 4357 // create auxiliary task for 2nd half of the loop 4358 // make sure new task has same parent task as the pattern task 4359 kmp_taskdata_t *current_task = thread->th.th_current_task; 4360 thread->th.th_current_task = taskdata->td_parent; 4361 kmp_task_t *new_task = 4362 __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *), 4363 sizeof(__taskloop_params_t), &__kmp_taskloop_task); 4364 // restore current task 4365 thread->th.th_current_task = current_task; 4366 __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds; 4367 p->task = next_task; 4368 p->lb = (kmp_uint64 *)((char *)next_task + lower_offset); 4369 p->ub = (kmp_uint64 *)((char *)next_task + upper_offset); 4370 p->task_dup = task_dup; 4371 p->st = st; 4372 p->ub_glob = ub_glob; 4373 p->num_tasks = n_tsk1; 4374 p->grainsize = grainsize; 4375 p->extras = ext1; 4376 p->tc = tc1; 4377 p->num_t_min = num_t_min; 4378 #if OMPT_SUPPORT 4379 p->codeptr_ra = codeptr_ra; 4380 #endif 4381 4382 #if OMPT_SUPPORT 4383 // schedule new task with correct return address for OMPT events 4384 __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra); 4385 #else 4386 __kmp_omp_task(gtid, new_task, true); // schedule new task 4387 #endif 4388 4389 // execute the 1st half of current subrange 4390 if (n_tsk0 > num_t_min) 4391 __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0, 4392 ext0, tc0, num_t_min, 4393 #if OMPT_SUPPORT 4394 codeptr_ra, 4395 #endif 4396 task_dup); 4397 else 4398 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, 4399 gr_size0, ext0, tc0, 4400 #if OMPT_SUPPORT 4401 codeptr_ra, 4402 #endif 4403 task_dup); 4404 4405 KA_TRACE(40, ("__kmpc_taskloop_recur(exit): T#%d\n", gtid)); 4406 } 4407 4408 /*! 4409 @ingroup TASKING 4410 @param loc Source location information 4411 @param gtid Global thread ID 4412 @param task Task structure 4413 @param if_val Value of the if clause 4414 @param lb Pointer to loop lower bound in task structure 4415 @param ub Pointer to loop upper bound in task structure 4416 @param st Loop stride 4417 @param nogroup Flag, 1 if no taskgroup needs to be added, 0 otherwise 4418 @param sched Schedule specified 0/1/2 for none/grainsize/num_tasks 4419 @param grainsize Schedule value if specified 4420 @param task_dup Tasks duplication routine 4421 4422 Execute the taskloop construct. 
4423 */ 4424 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, 4425 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, 4426 int sched, kmp_uint64 grainsize, void *task_dup) { 4427 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 4428 KMP_DEBUG_ASSERT(task != NULL); 4429 4430 if (nogroup == 0) { 4431 #if OMPT_SUPPORT && OMPT_OPTIONAL 4432 OMPT_STORE_RETURN_ADDRESS(gtid); 4433 #endif 4434 __kmpc_taskgroup(loc, gtid); 4435 } 4436 4437 // ========================================================================= 4438 // calculate loop parameters 4439 kmp_taskloop_bounds_t task_bounds(task, lb, ub); 4440 kmp_uint64 tc; 4441 // compiler provides global bounds here 4442 kmp_uint64 lower = task_bounds.get_lb(); 4443 kmp_uint64 upper = task_bounds.get_ub(); 4444 kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag 4445 kmp_uint64 num_tasks = 0, extras = 0; 4446 kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks; 4447 kmp_info_t *thread = __kmp_threads[gtid]; 4448 kmp_taskdata_t *current_task = thread->th.th_current_task; 4449 4450 KA_TRACE(20, ("__kmpc_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, " 4451 "grain %llu(%d), dup %p\n", 4452 gtid, taskdata, lower, upper, st, grainsize, sched, task_dup)); 4453 4454 // compute trip count 4455 if (st == 1) { // most common case 4456 tc = upper - lower + 1; 4457 } else if (st < 0) { 4458 tc = (lower - upper) / (-st) + 1; 4459 } else { // st > 0 4460 tc = (upper - lower) / st + 1; 4461 } 4462 if (tc == 0) { 4463 KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d zero-trip loop\n", gtid)); 4464 // free the pattern task and exit 4465 __kmp_task_start(gtid, task, current_task); 4466 // do not execute anything for zero-trip loop 4467 __kmp_task_finish<false>(gtid, task, current_task); 4468 return; 4469 } 4470 4471 #if OMPT_SUPPORT && OMPT_OPTIONAL 4472 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 4473 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 4474 if (ompt_enabled.ompt_callback_work) { 4475 ompt_callbacks.ompt_callback(ompt_callback_work)( 4476 ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data), 4477 &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0)); 4478 } 4479 #endif 4480 4481 if (num_tasks_min == 0) 4482 // TODO: can we choose better default heuristic? 
4483 num_tasks_min = 4484 KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE); 4485 4486 // compute num_tasks/grainsize based on the input provided 4487 switch (sched) { 4488 case 0: // no schedule clause specified, we can choose the default 4489 // let's try to schedule (team_size*10) tasks 4490 grainsize = thread->th.th_team_nproc * 10; 4491 KMP_FALLTHROUGH(); 4492 case 2: // num_tasks provided 4493 if (grainsize > tc) { 4494 num_tasks = tc; // too big num_tasks requested, adjust values 4495 grainsize = 1; 4496 extras = 0; 4497 } else { 4498 num_tasks = grainsize; 4499 grainsize = tc / num_tasks; 4500 extras = tc % num_tasks; 4501 } 4502 break; 4503 case 1: // grainsize provided 4504 if (grainsize > tc) { 4505 num_tasks = 1; // too big grainsize requested, adjust values 4506 grainsize = tc; 4507 extras = 0; 4508 } else { 4509 num_tasks = tc / grainsize; 4510 // adjust grainsize for balanced distribution of iterations 4511 grainsize = tc / num_tasks; 4512 extras = tc % num_tasks; 4513 } 4514 break; 4515 default: 4516 KMP_ASSERT2(0, "unknown scheduling of taskloop"); 4517 } 4518 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras); 4519 KMP_DEBUG_ASSERT(num_tasks > extras); 4520 KMP_DEBUG_ASSERT(num_tasks > 0); 4521 // ========================================================================= 4522 4523 // check if clause value first 4524 // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native) 4525 if (if_val == 0) { // if(0) specified, mark task as serial 4526 taskdata->td_flags.task_serial = 1; 4527 taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied 4528 // always start serial tasks linearly 4529 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, 4530 grainsize, extras, tc, 4531 #if OMPT_SUPPORT 4532 OMPT_GET_RETURN_ADDRESS(0), 4533 #endif 4534 task_dup); 4535 // !taskdata->td_flags.native => currently force linear spawning of tasks 4536 // for GOMP_taskloop 4537 } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) { 4538 KA_TRACE(20, ("__kmpc_taskloop: T#%d, go recursive: tc %llu, #tasks %llu" 4539 "(%lld), grain %llu, extras %llu\n", 4540 gtid, tc, num_tasks, num_tasks_min, grainsize, extras)); 4541 __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, 4542 grainsize, extras, tc, num_tasks_min, 4543 #if OMPT_SUPPORT 4544 OMPT_GET_RETURN_ADDRESS(0), 4545 #endif 4546 task_dup); 4547 } else { 4548 KA_TRACE(20, ("__kmpc_taskloop: T#%d, go linear: tc %llu, #tasks %llu" 4549 "(%lld), grain %llu, extras %llu\n", 4550 gtid, tc, num_tasks, num_tasks_min, grainsize, extras)); 4551 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, 4552 grainsize, extras, tc, 4553 #if OMPT_SUPPORT 4554 OMPT_GET_RETURN_ADDRESS(0), 4555 #endif 4556 task_dup); 4557 } 4558 4559 #if OMPT_SUPPORT && OMPT_OPTIONAL 4560 if (ompt_enabled.ompt_callback_work) { 4561 ompt_callbacks.ompt_callback(ompt_callback_work)( 4562 ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data), 4563 &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0)); 4564 } 4565 #endif 4566 4567 if (nogroup == 0) { 4568 #if OMPT_SUPPORT && OMPT_OPTIONAL 4569 OMPT_STORE_RETURN_ADDRESS(gtid); 4570 #endif 4571 __kmpc_end_taskgroup(loc, gtid); 4572 } 4573 KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid)); 4574 } 4575
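// ---------------------------------------------------------------------------
// Editor's note (illustrative only): the chunking arithmetic in
// __kmpc_taskloop above can be hard to follow because the "grainsize"
// parameter carries the num_tasks value when sched == 2. The non-compiled
// sketch below (guarded by #if 0) restates that arithmetic with plain
// integers. The names taskloop_chunking and chunking_t are hypothetical and
// exist only for this sketch; it assumes tc > 0 and g > 0, which the caller
// guarantees (the zero-trip case returns early, and the clause values are
// positive).
#if 0
#include <cassert>
#include <cstdint>

struct chunking_t {
  uint64_t num_tasks; // number of explicit tasks to create
  uint64_t grainsize; // iterations per (smaller) chunk
  uint64_t extras;    // number of chunks that get grainsize + 1 iterations
};

// sched: 0 = no clause, 1 = grainsize(g), 2 = num_tasks(g); tc = trip count.
static chunking_t taskloop_chunking(int sched, uint64_t g, uint64_t tc,
                                    uint64_t team_nproc) {
  chunking_t c;
  if (sched == 0) { // no clause: default to roughly team_nproc * 10 tasks
    g = team_nproc * 10;
    sched = 2;
  }
  if (sched == 2) { // num_tasks(g) requested
    if (g > tc) {   // more tasks than iterations: one iteration per task
      c.num_tasks = tc;
      c.grainsize = 1;
      c.extras = 0;
    } else {
      c.num_tasks = g;
      c.grainsize = tc / g;
      c.extras = tc % g;
    }
  } else {        // sched == 1: grainsize(g) requested
    if (g > tc) { // grainsize exceeds the loop: a single task runs it all
      c.num_tasks = 1;
      c.grainsize = tc;
      c.extras = 0;
    } else {
      c.num_tasks = tc / g;
      c.grainsize = tc / c.num_tasks; // rebalance iterations per task
      c.extras = tc % c.num_tasks;
    }
  }
  // Same invariant as the KMP_DEBUG_ASSERTs in __kmpc_taskloop.
  assert(tc == c.num_tasks * c.grainsize + c.extras && c.num_tasks > c.extras);
  return c;
}
// Example: grainsize(10) on a 103-iteration loop gives num_tasks = 10,
// grainsize = 10, extras = 3, i.e. three tasks run 11 iterations and the
// remaining seven run 10.
#endif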
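// Similarly, a non-compiled sketch (hypothetical names split_half / half_t) of
// the half-splitting bookkeeping in __kmp_taskloop_recur above: the task set
// is divided into a first half executed by the current thread and a second
// half deferred as an auxiliary task, and the extra iterations are placed so
// that tc == num_tasks * grainsize + extras keeps holding for each half.
#if 0
#include <cassert>
#include <cstdint>

struct half_t {
  uint64_t num_tasks, grainsize, extras, tc;
};

static void split_half(uint64_t num_tasks, uint64_t grainsize, uint64_t extras,
                       uint64_t tc, half_t &h0, half_t &h1) {
  uint64_t n_tsk0 = num_tasks >> 1;     // 1st half: executed by this thread
  uint64_t n_tsk1 = num_tasks - n_tsk0; // 2nd half: deferred as a task
  uint64_t gr_size0 = grainsize;
  if (n_tsk0 <= extras) {
    gr_size0++;                  // every 1st-half chunk absorbs one extra iter
    h0.extras = 0;
    h1.extras = extras - n_tsk0; // leftover extras stay with the 2nd half
    h0.tc = gr_size0 * n_tsk0;
    h1.tc = tc - h0.tc;
  } else {
    h1.extras = 0; // all extras fit into the 1st half
    h0.extras = extras;
    h1.tc = grainsize * n_tsk1;
    h0.tc = tc - h1.tc;
  }
  h0.num_tasks = n_tsk0;
  h0.grainsize = gr_size0;
  h1.num_tasks = n_tsk1;
  h1.grainsize = grainsize;
  // Both halves preserve the invariant, so the recursion stays consistent.
  assert(h0.tc == h0.num_tasks * h0.grainsize + h0.extras);
  assert(h1.tc == h1.num_tasks * h1.grainsize + h1.extras);
}
// Example: {num_tasks = 10, grainsize = 10, extras = 3, tc = 103} splits into
// h0 = {5, 10, 3, 53} and h1 = {5, 10, 0, 50}; the iteration range is cut at
// ub0 = lower + st * (h0.tc - 1), and the 2nd half starts at ub0 + st.
#endif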