/*
 * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_wait_release.h"
#include "kmp_taskdeps.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

#include "tsan_annotations.h"

/* forward declaration */
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
                                 kmp_info_t *this_thr);
static void __kmp_alloc_task_deque(kmp_info_t *thread,
                                   kmp_thread_data_t *thread_data);
static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
                                           kmp_task_team_t *task_team);

#if OMP_45_ENABLED
static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
#endif

#ifdef BUILD_TIED_TASK_STACK

// __kmp_trace_task_stack: print the tied tasks from the task stack in order
// from top to bottom
//
// gtid: global thread identifier for thread containing stack
// thread_data: thread data for task team thread containing stack
// threshold: value above which the trace statement triggers
// location: string identifying call site of this function (for trace)
static void __kmp_trace_task_stack(kmp_int32 gtid,
                                   kmp_thread_data_t *thread_data,
                                   int threshold, char *location) {
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_taskdata_t **stack_top = task_stack->ts_top;
  kmp_int32 entries = task_stack->ts_entries;
  kmp_taskdata_t *tied_task;

  KA_TRACE(
      threshold,
      ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
       "first_block = %p, stack_top = %p \n",
       location, gtid, entries, task_stack->ts_first_block, stack_top));

  KMP_DEBUG_ASSERT(stack_top != NULL);
  KMP_DEBUG_ASSERT(entries > 0);

  while (entries != 0) {
    KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
    // fix up ts_top if we need to pop from previous block
    if ((entries & TASK_STACK_INDEX_MASK) == 0) {
      kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);

      stack_block = stack_block->sb_prev;
      stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
    }

    // finish bookkeeping
    stack_top--;
    entries--;

    tied_task = *stack_top;

    KMP_DEBUG_ASSERT(tied_task != NULL);
    KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);

    KA_TRACE(threshold,
             ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, "
              "stack_top=%p, tied_task=%p\n",
              location, gtid, entries, stack_top, tied_task));
  }
  KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);

  KA_TRACE(threshold,
           ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
            location, gtid));
}

// __kmp_init_task_stack: initialize the task stack for the first time
// after a thread_data structure is created.
// It should not be necessary to do this again (assuming the stack works).
//
// gtid: global thread identifier of calling thread
// thread_data: thread data for task team thread containing stack
static void __kmp_init_task_stack(kmp_int32 gtid,
                                  kmp_thread_data_t *thread_data) {
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_stack_block_t *first_block;

  // set up the first block of the stack
  first_block = &task_stack->ts_first_block;
  task_stack->ts_top = (kmp_taskdata_t **)first_block;
  memset((void *)first_block, '\0',
         TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));

  // initialize the stack to be empty
  task_stack->ts_entries = TASK_STACK_EMPTY;
  first_block->sb_next = NULL;
  first_block->sb_prev = NULL;
}

// __kmp_free_task_stack: free the task stack when thread_data is destroyed.
//
// gtid: global thread identifier for calling thread
// thread_data: thread info for thread containing stack
static void __kmp_free_task_stack(kmp_int32 gtid,
                                  kmp_thread_data_t *thread_data) {
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_stack_block_t *stack_block = &task_stack->ts_first_block;

  KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
  // free from the second block of the stack
  while (stack_block != NULL) {
    kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;

    stack_block->sb_next = NULL;
    stack_block->sb_prev = NULL;
    if (stack_block != &task_stack->ts_first_block) {
      __kmp_thread_free(__kmp_threads[gtid],
                        stack_block); // free the block, if not the first
    }
    stack_block = next_block;
  }
  // initialize the stack to be empty
  task_stack->ts_entries = 0;
  task_stack->ts_top = NULL;
}

// __kmp_push_task_stack: Push the tied task onto the task stack.
// Grow the stack if necessary by allocating another block.
//
// gtid: global thread identifier for calling thread
// thread: thread info for thread containing stack
// tied_task: the task to push on the stack
static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
                                  kmp_taskdata_t *tied_task) {
  // GEH - need to consider what to do if tt_threads_data not allocated yet
  kmp_thread_data_t *thread_data =
      &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;

  if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
    return; // Don't push anything on stack if team or team tasks are serialized
  }

  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);

  KA_TRACE(20,
           ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
            gtid, thread, tied_task));
  // Store entry
  *(task_stack->ts_top) = tied_task;

  // Do bookkeeping for next push
  task_stack->ts_top++;
  task_stack->ts_entries++;

  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
    // Find beginning of this task block
    kmp_stack_block_t *stack_block =
        (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);

    // Check if we already have a block
    if (stack_block->sb_next !=
        NULL) { // reset ts_top to beginning of next block
      task_stack->ts_top = &stack_block->sb_next->sb_block[0];
    } else { // Alloc new block and link it up
      kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
          thread, sizeof(kmp_stack_block_t));

      task_stack->ts_top = &new_block->sb_block[0];
      stack_block->sb_next = new_block;
      new_block->sb_prev = stack_block;
      new_block->sb_next = NULL;

      KA_TRACE(
          30,
          ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
           gtid, tied_task, new_block));
    }
  }
  KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
                tied_task));
}

// __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return
// the task, just check to make sure it matches the ending task passed in.
//
// gtid: global thread identifier for the calling thread
// thread: thread info structure containing stack
// tied_task: the task popped off the stack
// ending_task: the task that is ending (should match popped task)
static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
                                 kmp_taskdata_t *ending_task) {
  // GEH - need to consider what to do if tt_threads_data not allocated yet
  kmp_thread_data_t *thread_data =
      &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
  kmp_taskdata_t *tied_task;

  if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
    // Don't pop anything from stack if team or team tasks are serialized
    return;
  }

  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
  KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);

  KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
                thread));

  // fix up ts_top if we need to pop from previous block
  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
    kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);

    stack_block = stack_block->sb_prev;
    task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
  }

  // finish bookkeeping
  task_stack->ts_top--;
  task_stack->ts_entries--;

  tied_task = *(task_stack->ts_top);

  KMP_DEBUG_ASSERT(tied_task != NULL);
  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
  KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly

  KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
                tied_task));
  return;
}
#endif /* BUILD_TIED_TASK_STACK */

// returns 1 if new task is allowed to execute, 0 otherwise
// checks Task Scheduling constraint (if requested) and
// mutexinoutset dependencies if any
static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
                                  const kmp_taskdata_t *tasknew,
                                  const kmp_taskdata_t *taskcurr) {
  if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
    // Check if the candidate obeys the Task Scheduling Constraints (TSC)
    // only descendant of all deferred tied tasks can be scheduled, checking
    // the last one is enough, as it in turn is the descendant of all others
    kmp_taskdata_t *current = taskcurr->td_last_tied;
    KMP_DEBUG_ASSERT(current != NULL);
    // check if the task is not suspended on barrier
    if (current->td_flags.tasktype == TASK_EXPLICIT ||
        current->td_taskwait_thread > 0) { // <= 0 on barrier
      kmp_int32 level = current->td_level;
      kmp_taskdata_t *parent = tasknew->td_parent;
      while (parent != current && parent->td_level > level) {
        // check generation up to the level of the current task
        parent = parent->td_parent;
        KMP_DEBUG_ASSERT(parent != NULL);
      }
      if (parent != current)
        return false;
    }
  }
  // Check mutexinoutset dependencies, acquire locks
  kmp_depnode_t *node = tasknew->td_depnode;
  if (node && (node->dn.mtx_num_locks > 0)) {
    for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
      KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
      if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
        continue;
      // could not get the lock, release previous locks
      for (int j = i - 1; j >= 0; --j)
        __kmp_release_lock(node->dn.mtx_locks[j], gtid);
      return false;
    }
    // negative num_locks means all locks acquired successfully
    node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
  }
  return true;
}

// __kmp_realloc_task_deque:
// Re-allocates a task deque for a particular thread, copies the content from
// the old deque and adjusts the necessary data structures relating to the
// deque. This operation must be done with the deque_lock being held
static void __kmp_realloc_task_deque(kmp_info_t *thread,
                                     kmp_thread_data_t *thread_data) {
  kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
  kmp_int32 new_size = 2 * size;

  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
                "%d] for thread_data %p\n",
                __kmp_gtid_from_thread(thread), size, new_size, thread_data));

  kmp_taskdata_t **new_deque =
      (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));

  int i, j;
  for (i = thread_data->td.td_deque_head, j = 0; j < size;
       i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
    new_deque[j] = thread_data->td.td_deque[i];

  __kmp_free(thread_data->td.td_deque);

  thread_data->td.td_deque_head = 0;
  thread_data->td.td_deque_tail = size;
  thread_data->td.td_deque = new_deque;
  thread_data->td.td_deque_size = new_size;
}

// __kmp_push_task: Add a task to the thread's deque
static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_task_team_t *task_team = thread->th.th_task_team;
  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
  kmp_thread_data_t *thread_data;

  KA_TRACE(20,
           ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));

  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(
        20,
        ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
         gtid, counter, taskdata));
  }

  // The first check avoids building task_team thread data if serialized
  if (taskdata->td_flags.task_serial) {
    KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
                  "TASK_NOT_PUSHED for task %p\n",
                  gtid, taskdata));
    return TASK_NOT_PUSHED;
  }

  // Now that serialized tasks have returned, we can assume that we are not in
  // immediate exec mode
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  if (!KMP_TASKING_ENABLED(task_team)) {
    __kmp_enable_tasking(task_team, thread);
  }
  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);

  // Find tasking deque specific to encountering thread
  thread_data = &task_team->tt.tt_threads_data[tid];

  // No lock needed since only owner can allocate
  if (thread_data->td.td_deque == NULL) {
    __kmp_alloc_task_deque(thread, thread_data);
  }

  int locked = 0;
  // Check if deque is full
  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    if (__kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                              thread->th.th_current_task)) {
      KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
                    "TASK_NOT_PUSHED for task %p\n",
                    gtid, taskdata));
      return TASK_NOT_PUSHED;
    } else {
      __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
      locked = 1;
      // expand deque to push the task which is not allowed to execute
      __kmp_realloc_task_deque(thread, thread_data);
    }
  }
  // Lock the deque for the task push operation
  if (!locked) {
    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
#if OMP_45_ENABLED
    // Need to recheck as we can get a proxy task from thread outside of OpenMP
    if (TCR_4(thread_data->td.td_deque_ntasks) >=
        TASK_DEQUE_SIZE(thread_data->td)) {
      if (__kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                                thread->th.th_current_task)) {
        __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
        KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
                      "returning TASK_NOT_PUSHED for task %p\n",
                      gtid, taskdata));
        return TASK_NOT_PUSHED;
      } else {
        // expand deque to push the task which is not allowed to execute
        __kmp_realloc_task_deque(thread, thread_data);
      }
    }
#endif
  }
  // Must have room since no thread other than the calling thread can add tasks
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
                   TASK_DEQUE_SIZE(thread_data->td));

  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
      taskdata; // Push taskdata
  // Wrap index.
  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count

  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
                "task=%p ntasks=%d head=%u tail=%u\n",
                gtid, taskdata, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));

  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  return TASK_SUCCESSFULLY_PUSHED;
}

// __kmp_pop_current_task_from_thread: set up current task from called thread
// when team ends
//
// this_thr: thread structure to set current_task in.
void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));

  this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;

  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));
}

// __kmp_push_current_task_to_thread: set up current task in called thread for
// a new team
//
// this_thr: thread structure to set up
// team: team for implicit task data
// tid: thread within team to set up
void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
                                       int tid) {
  // current task of the thread is a parent of the new just created implicit
  // tasks of new team
  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));

  KMP_DEBUG_ASSERT(this_thr != NULL);

  if (tid == 0) {
    if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
      team->t.t_implicit_task_taskdata[0].td_parent =
          this_thr->th.th_current_task;
      this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
    }
  } else {
    team->t.t_implicit_task_taskdata[tid].td_parent =
        team->t.t_implicit_task_taskdata[0].td_parent;
    this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
  }

  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));
}

// __kmp_task_start: bookkeeping for a task starting execution
//
// GTID: global thread id of calling thread
// task: task starting execution
// current_task: task suspending
static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
                             kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];

  KA_TRACE(10,
           ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
            gtid, taskdata, current_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  // mark currently executing task as suspended
  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
  current_task->td_flags.executing = 0;

// Add task to stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_push_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  // mark starting task as executing and as current task
  thread->th.th_current_task = taskdata;

  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  taskdata->td_flags.started = 1;
  taskdata->td_flags.executing = 1;
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  // GEH TODO: shouldn't we pass some sort of location identifier here?
  // APT: yes, we will pass location here.
  // need to store current thread state (in a thread or taskdata structure)
  // before setting work_state, otherwise wrong state is set after end of task

  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));

  return;
}

#if OMPT_SUPPORT
//------------------------------------------------------------------------------
// __ompt_task_init:
// Initialize OMPT fields maintained by a task. This will only be called after
// ompt_start_tool, so we already know whether ompt is enabled or not.

static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
  // The calls to __ompt_task_init already have the ompt_enabled condition.
  task->ompt_task_info.task_data.value = 0;
  task->ompt_task_info.frame.exit_frame = ompt_data_none;
  task->ompt_task_info.frame.enter_frame = ompt_data_none;
  task->ompt_task_info.frame.exit_frame_flags =
      ompt_frame_runtime | ompt_frame_framepointer;
  task->ompt_task_info.frame.enter_frame_flags =
      ompt_frame_runtime | ompt_frame_framepointer;
#if OMP_40_ENABLED
  task->ompt_task_info.ndeps = 0;
  task->ompt_task_info.deps = NULL;
#endif /* OMP_40_ENABLED */
}

// __ompt_task_start:
// Build and trigger task-begin event
static inline void __ompt_task_start(kmp_task_t *task,
                                     kmp_taskdata_t *current_task,
                                     kmp_int32 gtid) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  ompt_task_status_t status = ompt_task_switch;
  if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
    status = ompt_task_yield;
    __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
  }
  /* let OMPT know that we're about to run this task */
  if (ompt_enabled.ompt_callback_task_schedule) {
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        &(current_task->ompt_task_info.task_data), status,
        &(taskdata->ompt_task_info.task_data));
  }
  taskdata->ompt_task_info.scheduling_parent = current_task;
}

// __ompt_task_finish:
// Build and trigger final task-schedule event
static inline void
__ompt_task_finish(kmp_task_t *task, kmp_taskdata_t *resumed_task,
                   ompt_task_status_t status = ompt_task_complete) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
      taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
    status = ompt_task_cancel;
  }

  /* let OMPT know that we're returning to the callee task */
  if (ompt_enabled.ompt_callback_task_schedule) {
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        &(taskdata->ompt_task_info.task_data), status,
        &((resumed_task ? resumed_task
                        : (taskdata->ompt_task_info.scheduling_parent
                               ? taskdata->ompt_task_info.scheduling_parent
                               : taskdata->td_parent))
              ->ompt_task_info.task_data));
  }
}
#endif

template <bool ompt>
static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
                                               kmp_task_t *task,
                                               void *frame_address,
                                               void *return_address) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
                "current_task=%p\n",
                gtid, loc_ref, taskdata, current_task));

  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
                  "incremented for task %p\n",
                  gtid, counter, taskdata));
  }

  taskdata->td_flags.task_serial =
      1; // Execute this task immediately, not deferred.
  __kmp_task_start(gtid, task, current_task);

#if OMPT_SUPPORT
  if (ompt) {
    if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
      current_task->ompt_task_info.frame.enter_frame.ptr =
          taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
      current_task->ompt_task_info.frame.enter_frame_flags =
          taskdata->ompt_task_info.frame.exit_frame_flags =
              ompt_frame_application | ompt_frame_framepointer;
    }
    if (ompt_enabled.ompt_callback_task_create) {
      ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
          &(parent_info->task_data), &(parent_info->frame),
          &(taskdata->ompt_task_info.task_data),
          ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
          return_address);
    }
    __ompt_task_start(task, current_task, gtid);
  }
#endif // OMPT_SUPPORT

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, taskdata));
}

#if OMPT_SUPPORT
OMPT_NOINLINE
static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                           kmp_task_t *task,
                                           void *frame_address,
                                           void *return_address) {
  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
                                           return_address);
}
#endif // OMPT_SUPPORT

// __kmpc_omp_task_begin_if0: report that a given serialized task has started
// execution
//
// loc_ref: source location information; points to beginning of task block.
// gtid: global thread number.
// task: task thunk for the started task.
void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
                               kmp_task_t *task) {
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
    __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
                                   OMPT_GET_FRAME_ADDRESS(1),
                                   OMPT_LOAD_RETURN_ADDRESS(gtid));
    return;
  }
#endif
  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
}

#ifdef TASK_UNUSED
// __kmpc_omp_task_begin: report that a given task has started execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(
      10,
      ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
       gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));

  __kmp_task_start(gtid, task, current_task);

  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED

// __kmp_free_task: free the current task space and the space for shareds
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
                            kmp_info_t *thread) {
  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
                taskdata));

  // Check to make sure all flags and counters have the correct values
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
                   taskdata->td_flags.task_serial == 1);
  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);

  taskdata->td_flags.freed = 1;
  ANNOTATE_HAPPENS_BEFORE(taskdata);
// deallocate the taskdata and shared variable blocks associated with this task
#if USE_FAST_MEMORY
  __kmp_fast_free(thread, taskdata);
#else /* ! USE_FAST_MEMORY */
  __kmp_thread_free(thread, taskdata);
#endif

  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
}

// __kmp_free_task_and_ancestors: free the current task and ancestors without
// children
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
                                          kmp_taskdata_t *taskdata,
                                          kmp_info_t *thread) {
#if OMP_45_ENABLED
  // Proxy tasks must always be allowed to free their parents
  // because they can be run in background even in serial mode.
  kmp_int32 team_serial =
      (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
      !taskdata->td_flags.proxy;
#else
  kmp_int32 team_serial =
      taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser;
#endif
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
  KMP_DEBUG_ASSERT(children >= 0);

  // Now, go up the ancestor tree to see if any ancestors can now be freed.
  while (children == 0) {
    kmp_taskdata_t *parent_taskdata = taskdata->td_parent;

    KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
                  "and freeing itself\n",
                  gtid, taskdata));

    // --- Deallocate my ancestor task ---
    __kmp_free_task(gtid, taskdata, thread);

    taskdata = parent_taskdata;

    if (team_serial)
      return;
    // Stop checking ancestors at implicit task instead of walking up ancestor
    // tree to avoid premature deallocation of ancestors.
    if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
      if (taskdata->td_dephash) { // do we need to cleanup dephash?
        int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
        kmp_tasking_flags_t flags_old = taskdata->td_flags;
        if (children == 0 && flags_old.complete == 1) {
          kmp_tasking_flags_t flags_new = flags_old;
          flags_new.complete = 0;
          if (KMP_COMPARE_AND_STORE_ACQ32(
                  RCAST(kmp_int32 *, &taskdata->td_flags),
                  *RCAST(kmp_int32 *, &flags_old),
                  *RCAST(kmp_int32 *, &flags_new))) {
            KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
                           "dephash of implicit task %p\n",
                           gtid, taskdata));
            // cleanup dephash of finished implicit task
            __kmp_dephash_free_entries(thread, taskdata->td_dephash);
          }
        }
      }
      return;
    }
    // Predecrement simulated by "- 1" calculation
    children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
    KMP_DEBUG_ASSERT(children >= 0);
  }

  KA_TRACE(
      20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
           "not freeing it yet\n",
           gtid, taskdata, children));
}

// __kmp_task_finish: bookkeeping to do when a task finishes execution
//
// gtid: global thread ID for calling thread
// task: task to be finished
// resumed_task: task to be resumed. (may be NULL if task is serialized)
template <bool ompt>
static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *resumed_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];
#if OMP_45_ENABLED
  kmp_task_team_t *task_team =
      thread->th.th_task_team; // might be NULL for serial teams...
#endif // OMP_45_ENABLED
  kmp_int32 children = 0;

  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
                "task %p\n",
                gtid, taskdata, resumed_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

// Pop task from stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_pop_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
    // untied task needs to check the counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
    KA_TRACE(
        20,
        ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
         gtid, counter, taskdata));
    if (counter > 0) {
      // untied task is not done, to be continued possibly by other thread, do
      // not free it now
      if (resumed_task == NULL) {
        KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
        resumed_task = taskdata->td_parent; // In a serialized task, the resumed
        // task is the parent
      }
      thread->th.th_current_task = resumed_task; // restore current_task
      resumed_task->td_flags.executing = 1; // resume previous task
      KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
                    "resuming task %p\n",
                    gtid, taskdata, resumed_task));
      return;
    }
  }
#if OMPT_SUPPORT
  if (ompt)
    __ompt_task_finish(task, resumed_task);
#endif

  // Check mutexinoutset dependencies, release locks
  kmp_depnode_t *node = taskdata->td_depnode;
  if (node && (node->dn.mtx_num_locks < 0)) {
    // negative num_locks means all locks were acquired
    node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
    for (int i = node->dn.mtx_num_locks - 1; i >= 0; --i) {
      KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
      __kmp_release_lock(node->dn.mtx_locks[i], gtid);
    }
  }

  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  taskdata->td_flags.complete = 1; // mark the task as completed
  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  // Only need to keep track of count if team parallel and tasking not
  // serialized
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
    // Predecrement simulated by "- 1" calculation
    children =
        KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
    KMP_DEBUG_ASSERT(children >= 0);
#if OMP_40_ENABLED
    if (taskdata->td_taskgroup)
      KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
    __kmp_release_deps(gtid, taskdata);
#if OMP_45_ENABLED
  } else if (task_team && task_team->tt.tt_found_proxy_tasks) {
    // if we found proxy tasks there could exist a dependency chain
    // with the proxy task as origin
    __kmp_release_deps(gtid, taskdata);
#endif // OMP_45_ENABLED
#endif // OMP_40_ENABLED
  }

  // td_flags.executing must be marked as 0 after __kmp_release_deps has been
  // called. Otherwise, if a task is executed immediately from the release_deps
  // code, the flag will be reset to 1 again by this same function
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
  taskdata->td_flags.executing = 0; // suspend the finishing task

  KA_TRACE(
      20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
           gtid, taskdata, children));

#if OMP_40_ENABLED
  /* If the tasks' destructor thunk flag has been set, we need to invoke the
     destructor thunk that has been generated by the compiler. The code is
     placed here, since at this point other tasks might have been released
     hence overlapping the destructor invocations with some other work in the
     released tasks. The OpenMP spec is not specific on when the destructors
     are invoked, so we should be free to choose. */
  if (taskdata->td_flags.destructors_thunk) {
    kmp_routine_entry_t destr_thunk = task->data1.destructors;
    KMP_ASSERT(destr_thunk);
    destr_thunk(gtid, task);
  }
#endif // OMP_40_ENABLED

  // bookkeeping for resuming task:
  // GEH - note tasking_ser => task_serial
  KMP_DEBUG_ASSERT(
      (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
      taskdata->td_flags.task_serial);
  if (taskdata->td_flags.task_serial) {
    if (resumed_task == NULL) {
      resumed_task = taskdata->td_parent; // In a serialized task, the resumed
      // task is the parent
    }
  } else {
    KMP_DEBUG_ASSERT(resumed_task !=
                     NULL); // verify that resumed task is passed as argument
  }

  // Free this task and then ancestor tasks if they have no children.
  // Restore th_current_task first as suggested by John:
  // johnmc: if an asynchronous inquiry peers into the runtime system
  // it doesn't see the freed task as the current task.
  thread->th.th_current_task = resumed_task;
  __kmp_free_task_and_ancestors(gtid, taskdata, thread);

  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
  resumed_task->td_flags.executing = 1; // resume previous task

  KA_TRACE(
      10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
           gtid, taskdata, resumed_task));

  return;
}

template <bool ompt>
static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
                                                  kmp_int32 gtid,
                                                  kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
  // this routine will provide task to resume
  __kmp_task_finish<ompt>(gtid, task, NULL);

  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));

#if OMPT_SUPPORT
  if (ompt) {
    ompt_frame_t *ompt_frame;
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    ompt_frame->enter_frame = ompt_data_none;
    ompt_frame->enter_frame_flags =
        ompt_frame_runtime | ompt_frame_framepointer;
  }
#endif

  return;
}

#if OMPT_SUPPORT
OMPT_NOINLINE
void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                       kmp_task_t *task) {
  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
}
#endif // OMPT_SUPPORT

// __kmpc_omp_task_complete_if0: report that a task has completed execution
//
// loc_ref: source location information; points to end of task block.
// gtid: global thread number.
// task: task thunk for the completed task.
void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *task) {
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
    return;
  }
#endif
  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
}

#ifdef TASK_UNUSED
// __kmpc_omp_task_complete: report that a task has completed execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
                              kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));

  __kmp_task_finish<false>(gtid, task,
                           NULL); // Not sure how to find task to resume

  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED

// __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
// task for a given thread
//
// loc_ref: reference to source location of parallel region
// this_thr: thread data structure corresponding to implicit task
// team: team for this_thr
// tid: thread id of given thread within team
// set_curr_task: TRUE if need to push current task to thread
// NOTE: Routine does not set up the implicit task ICVs. This is assumed to
// have already been done elsewhere.
// TODO: Get better loc_ref. Value passed in may be NULL
void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
                              kmp_team_t *team, int tid, int set_curr_task) {
  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];

  KF_TRACE(
      10,
      ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
       tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
"TRUE" : "FALSE")); 1042 1043 task->td_task_id = KMP_GEN_TASK_ID(); 1044 task->td_team = team; 1045 // task->td_parent = NULL; // fix for CQ230101 (broken parent task info 1046 // in debugger) 1047 task->td_ident = loc_ref; 1048 task->td_taskwait_ident = NULL; 1049 task->td_taskwait_counter = 0; 1050 task->td_taskwait_thread = 0; 1051 1052 task->td_flags.tiedness = TASK_TIED; 1053 task->td_flags.tasktype = TASK_IMPLICIT; 1054 #if OMP_45_ENABLED 1055 task->td_flags.proxy = TASK_FULL; 1056 #endif 1057 1058 // All implicit tasks are executed immediately, not deferred 1059 task->td_flags.task_serial = 1; 1060 task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec); 1061 task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0; 1062 1063 task->td_flags.started = 1; 1064 task->td_flags.executing = 1; 1065 task->td_flags.complete = 0; 1066 task->td_flags.freed = 0; 1067 1068 #if OMP_40_ENABLED 1069 task->td_depnode = NULL; 1070 #endif 1071 task->td_last_tied = task; 1072 1073 if (set_curr_task) { // only do this init first time thread is created 1074 KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0); 1075 // Not used: don't need to deallocate implicit task 1076 KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0); 1077 #if OMP_40_ENABLED 1078 task->td_taskgroup = NULL; // An implicit task does not have taskgroup 1079 task->td_dephash = NULL; 1080 #endif 1081 __kmp_push_current_task_to_thread(this_thr, team, tid); 1082 } else { 1083 KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0); 1084 KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0); 1085 } 1086 1087 #if OMPT_SUPPORT 1088 if (UNLIKELY(ompt_enabled.enabled)) 1089 __ompt_task_init(task, tid); 1090 #endif 1091 1092 KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid, 1093 team, task)); 1094 } 1095 1096 // __kmp_finish_implicit_task: Release resources associated to implicit tasks 1097 // at the end of parallel regions. Some resources are kept for reuse in the next 1098 // parallel region. 
//
// thread: thread data structure corresponding to implicit task
void __kmp_finish_implicit_task(kmp_info_t *thread) {
  kmp_taskdata_t *task = thread->th.th_current_task;
  if (task->td_dephash) {
    int children;
    task->td_flags.complete = 1;
    children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
    kmp_tasking_flags_t flags_old = task->td_flags;
    if (children == 0 && flags_old.complete == 1) {
      kmp_tasking_flags_t flags_new = flags_old;
      flags_new.complete = 0;
      if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
                                      *RCAST(kmp_int32 *, &flags_old),
                                      *RCAST(kmp_int32 *, &flags_new))) {
        KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
                       "dephash of implicit task %p\n",
                       thread->th.th_info.ds.ds_gtid, task));
        __kmp_dephash_free_entries(thread, task->td_dephash);
      }
    }
  }
}

// __kmp_free_implicit_task: Release resources associated with implicit tasks
// when these are destroyed
//
// thread: thread data structure corresponding to implicit task
void __kmp_free_implicit_task(kmp_info_t *thread) {
  kmp_taskdata_t *task = thread->th.th_current_task;
  if (task && task->td_dephash) {
    __kmp_dephash_free(thread, task->td_dephash);
    task->td_dephash = NULL;
  }
}

// Round up a size to a power of two specified by val: Used to insert padding
// between structures co-allocated using a single malloc() call
static size_t __kmp_round_up_to_val(size_t size, size_t val) {
  if (size & (val - 1)) {
    size &= ~(val - 1);
    if (size <= KMP_SIZE_T_MAX - val) {
      size += val; // Round up if there is no overflow.
    }
  }
  return size;
} // __kmp_round_up_to_val

// __kmp_task_alloc: Allocate the taskdata and task data structures for a task
//
// loc_ref: source location information
// gtid: global thread number.
// flags: include tiedness & task type (explicit vs. implicit) of the ''new''
// task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
// sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
// private vars accessed in task.
// sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
// in task.
// task_entry: Pointer to task code entry point generated by compiler.
// returns: a pointer to the allocated kmp_task_t structure (task).
kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                             kmp_tasking_flags_t *flags,
                             size_t sizeof_kmp_task_t, size_t sizeof_shareds,
                             kmp_routine_entry_t task_entry) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_team_t *team = thread->th.th_team;
  kmp_taskdata_t *parent_task = thread->th.th_current_task;
  size_t shareds_offset;

  if (!TCR_4(__kmp_init_middle))
    __kmp_middle_initialize();

  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
                sizeof_shareds, task_entry));

  if (parent_task->td_flags.final) {
    if (flags->merged_if0) {
    }
    flags->final = 1;
  }
  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
    // Untied task encountered causes the TSC algorithm to check entire deque
    // of the victim thread. If no untied task encountered, then checking the
    // head of the deque should be enough.
    KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
  }

#if OMP_45_ENABLED
  if (flags->proxy == TASK_PROXY) {
    flags->tiedness = TASK_UNTIED;
    flags->merged_if0 = 1;

    /* are we running in a sequential parallel or tskm_immediate_exec... we need
       tasking support enabled */
    if ((thread->th.th_task_team) == NULL) {
      /* This should only happen if the team is serialized
          setup a task team and propagate it to the thread */
      KMP_DEBUG_ASSERT(team->t.t_serialized);
      KA_TRACE(30,
               ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
                gtid));
      __kmp_task_team_setup(
          thread, team,
          1); // 1 indicates setup the current team regardless of nthreads
      thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
    }
    kmp_task_team_t *task_team = thread->th.th_task_team;

    /* tasking must be enabled now as the task might not be pushed */
    if (!KMP_TASKING_ENABLED(task_team)) {
      KA_TRACE(
          30,
          ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
      __kmp_enable_tasking(task_team, thread);
      kmp_int32 tid = thread->th.th_info.ds.ds_tid;
      kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
      // No lock needed since only owner can allocate
      if (thread_data->td.td_deque == NULL) {
        __kmp_alloc_task_deque(thread, thread_data);
      }
    }

    if (task_team->tt.tt_found_proxy_tasks == FALSE)
      TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
  }
#endif

  // Calculate shared structure offset including padding after kmp_task_t struct
  // to align pointers in shared struct
  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));

  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
                shareds_offset));
  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
                sizeof_shareds));

// Avoid double allocation here by combining shareds with taskdata
#if USE_FAST_MEMORY
  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
                                                               sizeof_shareds);
#else /* ! USE_FAST_MEMORY */
  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
                                                               sizeof_shareds);
#endif /* USE_FAST_MEMORY */
  ANNOTATE_HAPPENS_AFTER(taskdata);

  task = KMP_TASKDATA_TO_TASK(taskdata);

// Make sure task & taskdata are aligned appropriately
#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
#else
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
#endif
  if (sizeof_shareds > 0) {
    // Avoid double allocation here by combining shareds with taskdata
    task->shareds = &((char *)taskdata)[shareds_offset];
    // Make sure shareds struct is aligned to pointer size
    KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
                     0);
  } else {
    task->shareds = NULL;
  }
  task->routine = task_entry;
  task->part_id = 0; // AC: Always start with 0 part id

  taskdata->td_task_id = KMP_GEN_TASK_ID();
  taskdata->td_team = team;
  taskdata->td_alloc_thread = thread;
  taskdata->td_parent = parent_task;
  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
  taskdata->td_ident = loc_ref;
  taskdata->td_taskwait_ident = NULL;
  taskdata->td_taskwait_counter = 0;
  taskdata->td_taskwait_thread = 0;
  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
#if OMP_45_ENABLED
  // avoid copying icvs for proxy tasks
  if (flags->proxy == TASK_FULL)
#endif
    copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);

  taskdata->td_flags.tiedness = flags->tiedness;
  taskdata->td_flags.final = flags->final;
  taskdata->td_flags.merged_if0 = flags->merged_if0;
#if OMP_40_ENABLED
  taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
#endif // OMP_40_ENABLED
#if OMP_45_ENABLED
  taskdata->td_flags.proxy = flags->proxy;
  taskdata->td_task_team = thread->th.th_task_team;
  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
#endif
  taskdata->td_flags.tasktype = TASK_EXPLICIT;

  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);

  // GEH - TODO: fix this to copy parent task's value of team_serial flag
  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  // GEH - Note we serialize the task if the team is serialized to make sure
  // implicit parallel region tasks are not left until program termination to
  // execute. Also, it helps locality to execute immediately.

  taskdata->td_flags.task_serial =
      (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
       taskdata->td_flags.tasking_ser);

  taskdata->td_flags.started = 0;
  taskdata->td_flags.executing = 0;
  taskdata->td_flags.complete = 0;
  taskdata->td_flags.freed = 0;

  taskdata->td_flags.native = flags->native;

  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
  // start at one because counts current task and children
  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
#if OMP_40_ENABLED
  taskdata->td_taskgroup =
      parent_task->td_taskgroup; // task inherits taskgroup from the parent task
  taskdata->td_dephash = NULL;
  taskdata->td_depnode = NULL;
#endif
  if (flags->tiedness == TASK_UNTIED)
    taskdata->td_last_tied = NULL; // will be set when the task is scheduled
  else
    taskdata->td_last_tied = taskdata;

#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(taskdata, gtid);
#endif
// Only need to keep track of child task counts if team parallel and tasking
// not serialized or if it is a proxy task
#if OMP_45_ENABLED
  if (flags->proxy == TASK_PROXY ||
      !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
#else
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
#endif
  {
    KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
#if OMP_40_ENABLED
    if (parent_task->td_taskgroup)
      KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
#endif
    // Only need to keep track of allocated child tasks for explicit tasks since
    // implicit not deallocated
    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
      KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
    }
  }

  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
                gtid, taskdata, taskdata->td_parent));
  ANNOTATE_HAPPENS_BEFORE(task);

  return task;
}

kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_int32 flags, size_t sizeof_kmp_task_t,
                                  size_t sizeof_shareds,
                                  kmp_routine_entry_t task_entry) {
  kmp_task_t *retval;
  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;

  input_flags->native = FALSE;
// __kmp_task_alloc() sets up all other runtime flags

#if OMP_45_ENABLED
  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
                input_flags->proxy ? "proxy" : "", sizeof_kmp_task_t,
                sizeof_shareds, task_entry));
#else
  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
                sizeof_kmp_task_t, sizeof_shareds, task_entry));
#endif

  retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
                            sizeof_shareds, task_entry);

  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));

  return retval;
}

#if OMP_50_ENABLED
/*!
@ingroup TASKING
@param loc_ref location of the original task directive
@param gtid Global Thread ID of encountering thread
@param new_task task thunk allocated by __kmpc_omp_task_alloc() for the ''new
task''
@param naffins Number of affinity items
@param affin_list List of affinity items
@return Returns non-zero if registering affinity information was not successful.
Returns 0 if registration was successful
This entry registers the affinity information attached to a task with the task
thunk structure kmp_taskdata_t.
*/
kmp_int32
__kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *new_task, kmp_int32 naffins,
                                  kmp_task_affinity_info_t *affin_list) {
  return 0;
}
#endif

// __kmp_invoke_task: invoke the specified task
//
// gtid: global thread ID of caller
// task: the task to invoke
// current_task: the task to resume after task invocation
static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread;
#if OMP_40_ENABLED
  int discard = 0 /* false */;
#endif
  KA_TRACE(
      30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
           gtid, taskdata, current_task));
  KMP_DEBUG_ASSERT(task);
#if OMP_45_ENABLED
  if (taskdata->td_flags.proxy == TASK_PROXY &&
      taskdata->td_flags.complete == 1) {
    // This is a proxy task that was already completed but it needs to run
    // its bottom-half finish
    KA_TRACE(
        30,
        ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
         gtid, taskdata));

    __kmp_bottom_half_finish_proxy(gtid, task);

    KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
                  "proxy task %p, resuming task %p\n",
                  gtid, taskdata, current_task));

    return;
  }
#endif

#if OMPT_SUPPORT
  // For untied tasks, the first task executed only calls __kmpc_omp_task and
  // does not execute code.
  ompt_thread_info_t oldInfo;
  if (UNLIKELY(ompt_enabled.enabled)) {
    // Store the threads states and restore them after the task
    thread = __kmp_threads[gtid];
    oldInfo = thread->th.ompt_thread_info;
    thread->th.ompt_thread_info.wait_id = 0;
    thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
                                            ? ompt_state_work_serial
                                            : ompt_state_work_parallel;
    taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
#endif

#if OMP_45_ENABLED
  // Proxy tasks are not handled by the runtime
  if (taskdata->td_flags.proxy != TASK_PROXY) {
#endif
    ANNOTATE_HAPPENS_AFTER(task);
    __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
#if OMP_45_ENABLED
  }
#endif

#if OMP_40_ENABLED
  // TODO: cancel tasks if the parallel region has also been cancelled
  // TODO: check if this sequence can be hoisted above __kmp_task_start
  // if cancellation has been enabled for this run ...
1489 if (__kmp_omp_cancellation) { 1490 thread = __kmp_threads[gtid]; 1491 kmp_team_t *this_team = thread->th.th_team; 1492 kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup; 1493 if ((taskgroup && taskgroup->cancel_request) || 1494 (this_team->t.t_cancel_request == cancel_parallel)) { 1495 #if OMPT_SUPPORT && OMPT_OPTIONAL 1496 ompt_data_t *task_data; 1497 if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) { 1498 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL); 1499 ompt_callbacks.ompt_callback(ompt_callback_cancel)( 1500 task_data, 1501 ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup 1502 : ompt_cancel_parallel) | 1503 ompt_cancel_discarded_task, 1504 NULL); 1505 } 1506 #endif 1507 KMP_COUNT_BLOCK(TASK_cancelled); 1508 // this task belongs to a task group and we need to cancel it 1509 discard = 1 /* true */; 1510 } 1511 } 1512 1513 // Invoke the task routine and pass in relevant data. 1514 // Thunks generated by gcc take a different argument list. 1515 if (!discard) { 1516 if (taskdata->td_flags.tiedness == TASK_UNTIED) { 1517 taskdata->td_last_tied = current_task->td_last_tied; 1518 KMP_DEBUG_ASSERT(taskdata->td_last_tied); 1519 } 1520 #if KMP_STATS_ENABLED 1521 KMP_COUNT_BLOCK(TASK_executed); 1522 switch (KMP_GET_THREAD_STATE()) { 1523 case FORK_JOIN_BARRIER: 1524 KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar); 1525 break; 1526 case PLAIN_BARRIER: 1527 KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar); 1528 break; 1529 case TASKYIELD: 1530 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield); 1531 break; 1532 case TASKWAIT: 1533 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait); 1534 break; 1535 case TASKGROUP: 1536 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup); 1537 break; 1538 default: 1539 KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate); 1540 break; 1541 } 1542 #endif // KMP_STATS_ENABLED 1543 #endif // OMP_40_ENABLED 1544 1545 // OMPT task begin 1546 #if OMPT_SUPPORT 1547 if (UNLIKELY(ompt_enabled.enabled)) 1548 __ompt_task_start(task, current_task, gtid); 1549 #endif 1550 1551 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1552 kmp_uint64 cur_time; 1553 kmp_int32 kmp_itt_count_task = 1554 __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial && 1555 current_task->td_flags.tasktype == TASK_IMPLICIT; 1556 if (kmp_itt_count_task) { 1557 thread = __kmp_threads[gtid]; 1558 // Time outer level explicit task on barrier for adjusting imbalance time 1559 if (thread->th.th_bar_arrive_time) 1560 cur_time = __itt_get_timestamp(); 1561 else 1562 kmp_itt_count_task = 0; // thread is not on a barrier - skip timing 1563 } 1564 #endif 1565 1566 #ifdef KMP_GOMP_COMPAT 1567 if (taskdata->td_flags.native) { 1568 ((void (*)(void *))(*(task->routine)))(task->shareds); 1569 } else 1570 #endif /* KMP_GOMP_COMPAT */ 1571 { 1572 (*(task->routine))(gtid, task); 1573 } 1574 KMP_POP_PARTITIONED_TIMER(); 1575 1576 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1577 if (kmp_itt_count_task) { 1578 // Barrier imbalance - adjust arrive time with the task duration 1579 thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time); 1580 } 1581 #endif 1582 1583 #if OMP_40_ENABLED 1584 } 1585 #endif // OMP_40_ENABLED 1586 1587 1588 #if OMP_45_ENABLED 1589 // Proxy tasks are not handled by the runtime 1590 if (taskdata->td_flags.proxy != TASK_PROXY) { 1591 #endif 1592 ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent); 1593 #if OMPT_SUPPORT 1594 if (UNLIKELY(ompt_enabled.enabled)) { 1595 thread->th.ompt_thread_info = oldInfo; 1596 if (taskdata->td_flags.tiedness == TASK_TIED) { 1597 
taskdata->ompt_task_info.frame.exit_frame = ompt_data_none; 1598 } 1599 __kmp_task_finish<true>(gtid, task, current_task); 1600 } else 1601 #endif 1602 __kmp_task_finish<false>(gtid, task, current_task); 1603 #if OMP_45_ENABLED 1604 } 1605 #endif 1606 1607 KA_TRACE( 1608 30, 1609 ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n", 1610 gtid, taskdata, current_task)); 1611 return; 1612 } 1613 1614 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution 1615 // 1616 // loc_ref: location of original task pragma (ignored) 1617 // gtid: Global Thread ID of encountering thread 1618 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task'' 1619 // Returns: 1620 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1621 // be resumed later. 1622 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1623 // resumed later. 1624 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid, 1625 kmp_task_t *new_task) { 1626 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1627 1628 KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid, 1629 loc_ref, new_taskdata)); 1630 1631 #if OMPT_SUPPORT 1632 kmp_taskdata_t *parent; 1633 if (UNLIKELY(ompt_enabled.enabled)) { 1634 parent = new_taskdata->td_parent; 1635 if (ompt_enabled.ompt_callback_task_create) { 1636 ompt_data_t task_data = ompt_data_none; 1637 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 1638 parent ? &(parent->ompt_task_info.task_data) : &task_data, 1639 parent ? &(parent->ompt_task_info.frame) : NULL, 1640 &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0, 1641 OMPT_GET_RETURN_ADDRESS(0)); 1642 } 1643 } 1644 #endif 1645 1646 /* Should we execute the new task or queue it? For now, let's just always try 1647 to queue it. If the queue fills up, then we'll execute it. */ 1648 1649 if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer 1650 { // Execute this task immediately 1651 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 1652 new_taskdata->td_flags.task_serial = 1; 1653 __kmp_invoke_task(gtid, new_task, current_task); 1654 } 1655 1656 KA_TRACE( 1657 10, 1658 ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: " 1659 "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n", 1660 gtid, loc_ref, new_taskdata)); 1661 1662 ANNOTATE_HAPPENS_BEFORE(new_task); 1663 #if OMPT_SUPPORT 1664 if (UNLIKELY(ompt_enabled.enabled)) { 1665 parent->ompt_task_info.frame.enter_frame = ompt_data_none; 1666 } 1667 #endif 1668 return TASK_CURRENT_NOT_QUEUED; 1669 } 1670 1671 // __kmp_omp_task: Schedule a non-thread-switchable task for execution 1672 // 1673 // gtid: Global Thread ID of encountering thread 1674 // new_task:non-thread-switchable task thunk allocated by __kmp_omp_task_alloc() 1675 // serialize_immediate: if TRUE then if the task is executed immediately its 1676 // execution will be serialized 1677 // Returns: 1678 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1679 // be resumed later. 1680 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1681 // resumed later. 1682 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task, 1683 bool serialize_immediate) { 1684 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1685 1686 /* Should we execute the new task or queue it? For now, let's just always try to 1687 queue it. If the queue fills up, then we'll execute it. 
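   In practice (a paraphrase of the code below, not a normative statement):
   proxy tasks (OMP 4.5) and tasks that cannot be pushed because the
   per-thread deque is full (TASK_NOT_PUSHED) are invoked immediately on the
   encountering thread; when serialize_immediate is true such a task is also
   marked task_serial, i.e. treated as an undeferred, serialized task.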
*/ 1688 #if OMP_45_ENABLED 1689 if (new_taskdata->td_flags.proxy == TASK_PROXY || 1690 __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer 1691 #else 1692 if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer 1693 #endif 1694 { // Execute this task immediately 1695 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 1696 if (serialize_immediate) 1697 new_taskdata->td_flags.task_serial = 1; 1698 __kmp_invoke_task(gtid, new_task, current_task); 1699 } 1700 1701 ANNOTATE_HAPPENS_BEFORE(new_task); 1702 return TASK_CURRENT_NOT_QUEUED; 1703 } 1704 1705 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a 1706 // non-thread-switchable task from the parent thread only! 1707 // 1708 // loc_ref: location of original task pragma (ignored) 1709 // gtid: Global Thread ID of encountering thread 1710 // new_task: non-thread-switchable task thunk allocated by 1711 // __kmp_omp_task_alloc() 1712 // Returns: 1713 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1714 // be resumed later. 1715 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1716 // resumed later. 1717 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid, 1718 kmp_task_t *new_task) { 1719 kmp_int32 res; 1720 KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK); 1721 1722 #if KMP_DEBUG || OMPT_SUPPORT 1723 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1724 #endif 1725 KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref, 1726 new_taskdata)); 1727 1728 #if OMPT_SUPPORT 1729 kmp_taskdata_t *parent = NULL; 1730 if (UNLIKELY(ompt_enabled.enabled)) { 1731 if (!new_taskdata->td_flags.started) { 1732 OMPT_STORE_RETURN_ADDRESS(gtid); 1733 parent = new_taskdata->td_parent; 1734 if (!parent->ompt_task_info.frame.enter_frame.ptr) { 1735 parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1736 } 1737 if (ompt_enabled.ompt_callback_task_create) { 1738 ompt_data_t task_data = ompt_data_none; 1739 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 1740 parent ? &(parent->ompt_task_info.task_data) : &task_data, 1741 parent ? &(parent->ompt_task_info.frame) : NULL, 1742 &(new_taskdata->ompt_task_info.task_data), 1743 ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, 1744 OMPT_LOAD_RETURN_ADDRESS(gtid)); 1745 } 1746 } else { 1747 // We are scheduling the continuation of an UNTIED task. 1748 // Scheduling back to the parent task. 
1749 __ompt_task_finish(new_task, 1750 new_taskdata->ompt_task_info.scheduling_parent, 1751 ompt_task_switch); 1752 new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none; 1753 } 1754 } 1755 #endif 1756 1757 res = __kmp_omp_task(gtid, new_task, true); 1758 1759 KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning " 1760 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n", 1761 gtid, loc_ref, new_taskdata)); 1762 #if OMPT_SUPPORT 1763 if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) { 1764 parent->ompt_task_info.frame.enter_frame = ompt_data_none; 1765 } 1766 #endif 1767 return res; 1768 } 1769 1770 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule 1771 // a taskloop task with the correct OMPT return address 1772 // 1773 // loc_ref: location of original task pragma (ignored) 1774 // gtid: Global Thread ID of encountering thread 1775 // new_task: non-thread-switchable task thunk allocated by 1776 // __kmp_omp_task_alloc() 1777 // codeptr_ra: return address for OMPT callback 1778 // Returns: 1779 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1780 // be resumed later. 1781 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1782 // resumed later. 1783 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid, 1784 kmp_task_t *new_task, void *codeptr_ra) { 1785 kmp_int32 res; 1786 KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK); 1787 1788 #if KMP_DEBUG || OMPT_SUPPORT 1789 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1790 #endif 1791 KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref, 1792 new_taskdata)); 1793 1794 #if OMPT_SUPPORT 1795 kmp_taskdata_t *parent = NULL; 1796 if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) { 1797 parent = new_taskdata->td_parent; 1798 if (!parent->ompt_task_info.frame.enter_frame.ptr) 1799 parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1800 if (ompt_enabled.ompt_callback_task_create) { 1801 ompt_data_t task_data = ompt_data_none; 1802 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 1803 parent ? &(parent->ompt_task_info.task_data) : &task_data, 1804 parent ? 
&(parent->ompt_task_info.frame) : NULL, 1805 &(new_taskdata->ompt_task_info.task_data), 1806 ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, 1807 codeptr_ra); 1808 } 1809 } 1810 #endif 1811 1812 res = __kmp_omp_task(gtid, new_task, true); 1813 1814 KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning " 1815 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n", 1816 gtid, loc_ref, new_taskdata)); 1817 #if OMPT_SUPPORT 1818 if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) { 1819 parent->ompt_task_info.frame.enter_frame = ompt_data_none; 1820 } 1821 #endif 1822 return res; 1823 } 1824 1825 template <bool ompt> 1826 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid, 1827 void *frame_address, 1828 void *return_address) { 1829 kmp_taskdata_t *taskdata; 1830 kmp_info_t *thread; 1831 int thread_finished = FALSE; 1832 KMP_SET_THREAD_STATE_BLOCK(TASKWAIT); 1833 1834 KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref)); 1835 1836 if (__kmp_tasking_mode != tskm_immediate_exec) { 1837 thread = __kmp_threads[gtid]; 1838 taskdata = thread->th.th_current_task; 1839 1840 #if OMPT_SUPPORT && OMPT_OPTIONAL 1841 ompt_data_t *my_task_data; 1842 ompt_data_t *my_parallel_data; 1843 1844 if (ompt) { 1845 my_task_data = &(taskdata->ompt_task_info.task_data); 1846 my_parallel_data = OMPT_CUR_TEAM_DATA(thread); 1847 1848 taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address; 1849 1850 if (ompt_enabled.ompt_callback_sync_region) { 1851 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 1852 ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data, 1853 my_task_data, return_address); 1854 } 1855 1856 if (ompt_enabled.ompt_callback_sync_region_wait) { 1857 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 1858 ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data, 1859 my_task_data, return_address); 1860 } 1861 } 1862 #endif // OMPT_SUPPORT && OMPT_OPTIONAL 1863 1864 // Debugger: The taskwait is active. Store location and thread encountered the 1865 // taskwait. 1866 #if USE_ITT_BUILD 1867 // Note: These values are used by ITT events as well. 1868 #endif /* USE_ITT_BUILD */ 1869 taskdata->td_taskwait_counter += 1; 1870 taskdata->td_taskwait_ident = loc_ref; 1871 taskdata->td_taskwait_thread = gtid + 1; 1872 1873 #if USE_ITT_BUILD 1874 void *itt_sync_obj = __kmp_itt_taskwait_object(gtid); 1875 if (itt_sync_obj != NULL) 1876 __kmp_itt_taskwait_starting(gtid, itt_sync_obj); 1877 #endif /* USE_ITT_BUILD */ 1878 1879 bool must_wait = 1880 !taskdata->td_flags.team_serial && !taskdata->td_flags.final; 1881 1882 #if OMP_45_ENABLED 1883 must_wait = must_wait || (thread->th.th_task_team != NULL && 1884 thread->th.th_task_team->tt.tt_found_proxy_tasks); 1885 #endif 1886 if (must_wait) { 1887 kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *, 1888 &(taskdata->td_incomplete_child_tasks)), 1889 0U); 1890 while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) { 1891 flag.execute_tasks(thread, gtid, FALSE, 1892 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 1893 __kmp_task_stealing_constraint); 1894 } 1895 } 1896 #if USE_ITT_BUILD 1897 if (itt_sync_obj != NULL) 1898 __kmp_itt_taskwait_finished(gtid, itt_sync_obj); 1899 #endif /* USE_ITT_BUILD */ 1900 1901 // Debugger: The taskwait is completed. Location remains, but thread is 1902 // negated. 
1903 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; 1904 1905 #if OMPT_SUPPORT && OMPT_OPTIONAL 1906 if (ompt) { 1907 if (ompt_enabled.ompt_callback_sync_region_wait) { 1908 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 1909 ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data, 1910 my_task_data, return_address); 1911 } 1912 if (ompt_enabled.ompt_callback_sync_region) { 1913 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 1914 ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data, 1915 my_task_data, return_address); 1916 } 1917 taskdata->ompt_task_info.frame.enter_frame = ompt_data_none; 1918 } 1919 #endif // OMPT_SUPPORT && OMPT_OPTIONAL 1920 1921 ANNOTATE_HAPPENS_AFTER(taskdata); 1922 } 1923 1924 KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, " 1925 "returning TASK_CURRENT_NOT_QUEUED\n", 1926 gtid, taskdata)); 1927 1928 return TASK_CURRENT_NOT_QUEUED; 1929 } 1930 1931 #if OMPT_SUPPORT && OMPT_OPTIONAL 1932 OMPT_NOINLINE 1933 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid, 1934 void *frame_address, 1935 void *return_address) { 1936 return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address, 1937 return_address); 1938 } 1939 #endif // OMPT_SUPPORT && OMPT_OPTIONAL 1940 1941 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are 1942 // complete 1943 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) { 1944 #if OMPT_SUPPORT && OMPT_OPTIONAL 1945 if (UNLIKELY(ompt_enabled.enabled)) { 1946 OMPT_STORE_RETURN_ADDRESS(gtid); 1947 return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0), 1948 OMPT_LOAD_RETURN_ADDRESS(gtid)); 1949 } 1950 #endif 1951 return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL); 1952 } 1953 1954 // __kmpc_omp_taskyield: switch to a different task 1955 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) { 1956 kmp_taskdata_t *taskdata; 1957 kmp_info_t *thread; 1958 int thread_finished = FALSE; 1959 1960 KMP_COUNT_BLOCK(OMP_TASKYIELD); 1961 KMP_SET_THREAD_STATE_BLOCK(TASKYIELD); 1962 1963 KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n", 1964 gtid, loc_ref, end_part)); 1965 1966 if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) { 1967 thread = __kmp_threads[gtid]; 1968 taskdata = thread->th.th_current_task; 1969 // Should we model this as a task wait or not? 1970 // Debugger: The taskwait is active. Store location and thread encountered the 1971 // taskwait. 1972 #if USE_ITT_BUILD 1973 // Note: These values are used by ITT events as well. 
1974 #endif /* USE_ITT_BUILD */ 1975 taskdata->td_taskwait_counter += 1; 1976 taskdata->td_taskwait_ident = loc_ref; 1977 taskdata->td_taskwait_thread = gtid + 1; 1978 1979 #if USE_ITT_BUILD 1980 void *itt_sync_obj = __kmp_itt_taskwait_object(gtid); 1981 if (itt_sync_obj != NULL) 1982 __kmp_itt_taskwait_starting(gtid, itt_sync_obj); 1983 #endif /* USE_ITT_BUILD */ 1984 if (!taskdata->td_flags.team_serial) { 1985 kmp_task_team_t *task_team = thread->th.th_task_team; 1986 if (task_team != NULL) { 1987 if (KMP_TASKING_ENABLED(task_team)) { 1988 #if OMPT_SUPPORT 1989 if (UNLIKELY(ompt_enabled.enabled)) 1990 thread->th.ompt_thread_info.ompt_task_yielded = 1; 1991 #endif 1992 __kmp_execute_tasks_32( 1993 thread, gtid, NULL, FALSE, 1994 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 1995 __kmp_task_stealing_constraint); 1996 #if OMPT_SUPPORT 1997 if (UNLIKELY(ompt_enabled.enabled)) 1998 thread->th.ompt_thread_info.ompt_task_yielded = 0; 1999 #endif 2000 } 2001 } 2002 } 2003 #if USE_ITT_BUILD 2004 if (itt_sync_obj != NULL) 2005 __kmp_itt_taskwait_finished(gtid, itt_sync_obj); 2006 #endif /* USE_ITT_BUILD */ 2007 2008 // Debugger: The taskwait is completed. Location remains, but thread is 2009 // negated. 2010 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; 2011 } 2012 2013 KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, " 2014 "returning TASK_CURRENT_NOT_QUEUED\n", 2015 gtid, taskdata)); 2016 2017 return TASK_CURRENT_NOT_QUEUED; 2018 } 2019 2020 #if OMP_50_ENABLED 2021 // Task Reduction implementation 2022 2023 typedef struct kmp_task_red_flags { 2024 unsigned lazy_priv : 1; // hint: (1) use lazy allocation (big objects) 2025 unsigned reserved31 : 31; 2026 } kmp_task_red_flags_t; 2027 2028 // internal structure for reduction data item related info 2029 typedef struct kmp_task_red_data { 2030 void *reduce_shar; // shared reduction item 2031 size_t reduce_size; // size of data item 2032 void *reduce_priv; // thread specific data 2033 void *reduce_pend; // end of private data for comparison op 2034 void *reduce_init; // data initialization routine 2035 void *reduce_fini; // data finalization routine 2036 void *reduce_comb; // data combiner routine 2037 kmp_task_red_flags_t flags; // flags for additional info from compiler 2038 } kmp_task_red_data_t; 2039 2040 // structure sent us by compiler - one per reduction item 2041 typedef struct kmp_task_red_input { 2042 void *reduce_shar; // shared reduction item 2043 size_t reduce_size; // size of data item 2044 void *reduce_init; // data initialization routine 2045 void *reduce_fini; // data finalization routine 2046 void *reduce_comb; // data combiner routine 2047 kmp_task_red_flags_t flags; // flags for additional info from compiler 2048 } kmp_task_red_input_t; 2049 2050 /*! 2051 @ingroup TASKING 2052 @param gtid Global thread ID 2053 @param num Number of data items to reduce 2054 @param data Array of data for reduction 2055 @return The taskgroup identifier 2056 2057 Initialize task reduction for the taskgroup. 
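As an illustration only (not a normative interface description): for user code
like
  #pragma omp taskgroup task_reduction(+ : sum)
the compiler is expected to pass an array with one kmp_task_red_input_t entry,
roughly { &sum, sizeof(sum), init_fn, NULL, comb_fn, {0, 0} }, where init_fn
zeroes a thread-private copy and comb_fn adds a private copy into the shared
item; init_fn and comb_fn are placeholder names for compiler-generated
routines.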
2058 */ 2059 void *__kmpc_task_reduction_init(int gtid, int num, void *data) { 2060 kmp_info_t *thread = __kmp_threads[gtid]; 2061 kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup; 2062 kmp_int32 nth = thread->th.th_team_nproc; 2063 kmp_task_red_input_t *input = (kmp_task_red_input_t *)data; 2064 kmp_task_red_data_t *arr; 2065 2066 // check input data just in case 2067 KMP_ASSERT(tg != NULL); 2068 KMP_ASSERT(data != NULL); 2069 KMP_ASSERT(num > 0); 2070 if (nth == 1) { 2071 KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n", 2072 gtid, tg)); 2073 return (void *)tg; 2074 } 2075 KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n", 2076 gtid, tg, num)); 2077 arr = (kmp_task_red_data_t *)__kmp_thread_malloc( 2078 thread, num * sizeof(kmp_task_red_data_t)); 2079 for (int i = 0; i < num; ++i) { 2080 void (*f_init)(void *) = (void (*)(void *))(input[i].reduce_init); 2081 size_t size = input[i].reduce_size - 1; 2082 // round the size up to cache line per thread-specific item 2083 size += CACHE_LINE - size % CACHE_LINE; 2084 KMP_ASSERT(input[i].reduce_comb != NULL); // combiner is mandatory 2085 arr[i].reduce_shar = input[i].reduce_shar; 2086 arr[i].reduce_size = size; 2087 arr[i].reduce_init = input[i].reduce_init; 2088 arr[i].reduce_fini = input[i].reduce_fini; 2089 arr[i].reduce_comb = input[i].reduce_comb; 2090 arr[i].flags = input[i].flags; 2091 if (!input[i].flags.lazy_priv) { 2092 // allocate cache-line aligned block and fill it with zeros 2093 arr[i].reduce_priv = __kmp_allocate(nth * size); 2094 arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size; 2095 if (f_init != NULL) { 2096 // initialize thread-specific items 2097 for (int j = 0; j < nth; ++j) { 2098 f_init((char *)(arr[i].reduce_priv) + j * size); 2099 } 2100 } 2101 } else { 2102 // only allocate space for pointers now, 2103 // objects will be lazily allocated/initialized once requested 2104 arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *)); 2105 } 2106 } 2107 tg->reduce_data = (void *)arr; 2108 tg->reduce_num_data = num; 2109 return (void *)tg; 2110 } 2111 2112 /*! 
2113 @ingroup TASKING 2114 @param gtid Global thread ID 2115 @param tskgrp The taskgroup ID (optional) 2116 @param data Shared location of the item 2117 @return The pointer to per-thread data 2118 2119 Get thread-specific location of data item 2120 */ 2121 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) { 2122 kmp_info_t *thread = __kmp_threads[gtid]; 2123 kmp_int32 nth = thread->th.th_team_nproc; 2124 if (nth == 1) 2125 return data; // nothing to do 2126 2127 kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp; 2128 if (tg == NULL) 2129 tg = thread->th.th_current_task->td_taskgroup; 2130 KMP_ASSERT(tg != NULL); 2131 kmp_task_red_data_t *arr = (kmp_task_red_data_t *)(tg->reduce_data); 2132 kmp_int32 num = tg->reduce_num_data; 2133 kmp_int32 tid = thread->th.th_info.ds.ds_tid; 2134 2135 KMP_ASSERT(data != NULL); 2136 while (tg != NULL) { 2137 for (int i = 0; i < num; ++i) { 2138 if (!arr[i].flags.lazy_priv) { 2139 if (data == arr[i].reduce_shar || 2140 (data >= arr[i].reduce_priv && data < arr[i].reduce_pend)) 2141 return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size; 2142 } else { 2143 // check shared location first 2144 void **p_priv = (void **)(arr[i].reduce_priv); 2145 if (data == arr[i].reduce_shar) 2146 goto found; 2147 // check if we get some thread specific location as parameter 2148 for (int j = 0; j < nth; ++j) 2149 if (data == p_priv[j]) 2150 goto found; 2151 continue; // not found, continue search 2152 found: 2153 if (p_priv[tid] == NULL) { 2154 // allocate thread specific object lazily 2155 void (*f_init)(void *) = (void (*)(void *))(arr[i].reduce_init); 2156 p_priv[tid] = __kmp_allocate(arr[i].reduce_size); 2157 if (f_init != NULL) { 2158 f_init(p_priv[tid]); 2159 } 2160 } 2161 return p_priv[tid]; 2162 } 2163 } 2164 tg = tg->parent; 2165 arr = (kmp_task_red_data_t *)(tg->reduce_data); 2166 num = tg->reduce_num_data; 2167 } 2168 KMP_ASSERT2(0, "Unknown task reduction item"); 2169 return NULL; // ERROR, this line never executed 2170 } 2171 2172 // Finalize task reduction. 
2173 // Called from __kmpc_end_taskgroup() 2174 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) { 2175 kmp_int32 nth = th->th.th_team_nproc; 2176 KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1 2177 kmp_task_red_data_t *arr = (kmp_task_red_data_t *)tg->reduce_data; 2178 kmp_int32 num = tg->reduce_num_data; 2179 for (int i = 0; i < num; ++i) { 2180 void *sh_data = arr[i].reduce_shar; 2181 void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini); 2182 void (*f_comb)(void *, void *) = 2183 (void (*)(void *, void *))(arr[i].reduce_comb); 2184 if (!arr[i].flags.lazy_priv) { 2185 void *pr_data = arr[i].reduce_priv; 2186 size_t size = arr[i].reduce_size; 2187 for (int j = 0; j < nth; ++j) { 2188 void *priv_data = (char *)pr_data + j * size; 2189 f_comb(sh_data, priv_data); // combine results 2190 if (f_fini) 2191 f_fini(priv_data); // finalize if needed 2192 } 2193 } else { 2194 void **pr_data = (void **)(arr[i].reduce_priv); 2195 for (int j = 0; j < nth; ++j) { 2196 if (pr_data[j] != NULL) { 2197 f_comb(sh_data, pr_data[j]); // combine results 2198 if (f_fini) 2199 f_fini(pr_data[j]); // finalize if needed 2200 __kmp_free(pr_data[j]); 2201 } 2202 } 2203 } 2204 __kmp_free(arr[i].reduce_priv); 2205 } 2206 __kmp_thread_free(th, arr); 2207 tg->reduce_data = NULL; 2208 tg->reduce_num_data = 0; 2209 } 2210 #endif 2211 2212 #if OMP_40_ENABLED 2213 // __kmpc_taskgroup: Start a new taskgroup 2214 void __kmpc_taskgroup(ident_t *loc, int gtid) { 2215 kmp_info_t *thread = __kmp_threads[gtid]; 2216 kmp_taskdata_t *taskdata = thread->th.th_current_task; 2217 kmp_taskgroup_t *tg_new = 2218 (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t)); 2219 KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new)); 2220 KMP_ATOMIC_ST_RLX(&tg_new->count, 0); 2221 KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq); 2222 tg_new->parent = taskdata->td_taskgroup; 2223 #if OMP_50_ENABLED 2224 tg_new->reduce_data = NULL; 2225 tg_new->reduce_num_data = 0; 2226 #endif 2227 taskdata->td_taskgroup = tg_new; 2228 2229 #if OMPT_SUPPORT && OMPT_OPTIONAL 2230 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) { 2231 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2232 if (!codeptr) 2233 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2234 kmp_team_t *team = thread->th.th_team; 2235 ompt_data_t my_task_data = taskdata->ompt_task_info.task_data; 2236 // FIXME: I think this is wrong for lwt! 2237 ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data; 2238 2239 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 2240 ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data), 2241 &(my_task_data), codeptr); 2242 } 2243 #endif 2244 } 2245 2246 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task 2247 // and its descendants are complete 2248 void __kmpc_end_taskgroup(ident_t *loc, int gtid) { 2249 kmp_info_t *thread = __kmp_threads[gtid]; 2250 kmp_taskdata_t *taskdata = thread->th.th_current_task; 2251 kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup; 2252 int thread_finished = FALSE; 2253 2254 #if OMPT_SUPPORT && OMPT_OPTIONAL 2255 kmp_team_t *team; 2256 ompt_data_t my_task_data; 2257 ompt_data_t my_parallel_data; 2258 void *codeptr; 2259 if (UNLIKELY(ompt_enabled.enabled)) { 2260 team = thread->th.th_team; 2261 my_task_data = taskdata->ompt_task_info.task_data; 2262 // FIXME: I think this is wrong for lwt! 
2263 my_parallel_data = team->t.ompt_team_info.parallel_data; 2264 codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2265 if (!codeptr) 2266 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2267 } 2268 #endif 2269 2270 KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc)); 2271 KMP_DEBUG_ASSERT(taskgroup != NULL); 2272 KMP_SET_THREAD_STATE_BLOCK(TASKGROUP); 2273 2274 if (__kmp_tasking_mode != tskm_immediate_exec) { 2275 // mark task as waiting not on a barrier 2276 taskdata->td_taskwait_counter += 1; 2277 taskdata->td_taskwait_ident = loc; 2278 taskdata->td_taskwait_thread = gtid + 1; 2279 #if USE_ITT_BUILD 2280 // For ITT the taskgroup wait is similar to taskwait until we need to 2281 // distinguish them 2282 void *itt_sync_obj = __kmp_itt_taskwait_object(gtid); 2283 if (itt_sync_obj != NULL) 2284 __kmp_itt_taskwait_starting(gtid, itt_sync_obj); 2285 #endif /* USE_ITT_BUILD */ 2286 2287 #if OMPT_SUPPORT && OMPT_OPTIONAL 2288 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) { 2289 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 2290 ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data), 2291 &(my_task_data), codeptr); 2292 } 2293 #endif 2294 2295 #if OMP_45_ENABLED 2296 if (!taskdata->td_flags.team_serial || 2297 (thread->th.th_task_team != NULL && 2298 thread->th.th_task_team->tt.tt_found_proxy_tasks)) 2299 #else 2300 if (!taskdata->td_flags.team_serial) 2301 #endif 2302 { 2303 kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 2304 0U); 2305 while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) { 2306 flag.execute_tasks(thread, gtid, FALSE, 2307 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 2308 __kmp_task_stealing_constraint); 2309 } 2310 } 2311 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting 2312 2313 #if OMPT_SUPPORT && OMPT_OPTIONAL 2314 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) { 2315 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 2316 ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data), 2317 &(my_task_data), codeptr); 2318 } 2319 #endif 2320 2321 #if USE_ITT_BUILD 2322 if (itt_sync_obj != NULL) 2323 __kmp_itt_taskwait_finished(gtid, itt_sync_obj); 2324 #endif /* USE_ITT_BUILD */ 2325 } 2326 KMP_DEBUG_ASSERT(taskgroup->count == 0); 2327 2328 #if OMP_50_ENABLED 2329 if (taskgroup->reduce_data != NULL) // need to reduce? 
2330 __kmp_task_reduction_fini(thread, taskgroup); 2331 #endif 2332 // Restore parent taskgroup for the current task 2333 taskdata->td_taskgroup = taskgroup->parent; 2334 __kmp_thread_free(thread, taskgroup); 2335 2336 KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n", 2337 gtid, taskdata)); 2338 ANNOTATE_HAPPENS_AFTER(taskdata); 2339 2340 #if OMPT_SUPPORT && OMPT_OPTIONAL 2341 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) { 2342 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 2343 ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data), 2344 &(my_task_data), codeptr); 2345 } 2346 #endif 2347 } 2348 #endif 2349 2350 // __kmp_remove_my_task: remove a task from my own deque 2351 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid, 2352 kmp_task_team_t *task_team, 2353 kmp_int32 is_constrained) { 2354 kmp_task_t *task; 2355 kmp_taskdata_t *taskdata; 2356 kmp_thread_data_t *thread_data; 2357 kmp_uint32 tail; 2358 2359 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 2360 KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data != 2361 NULL); // Caller should check this condition 2362 2363 thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)]; 2364 2365 KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n", 2366 gtid, thread_data->td.td_deque_ntasks, 2367 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2368 2369 if (TCR_4(thread_data->td.td_deque_ntasks) == 0) { 2370 KA_TRACE(10, 2371 ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: " 2372 "ntasks=%d head=%u tail=%u\n", 2373 gtid, thread_data->td.td_deque_ntasks, 2374 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2375 return NULL; 2376 } 2377 2378 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 2379 2380 if (TCR_4(thread_data->td.td_deque_ntasks) == 0) { 2381 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2382 KA_TRACE(10, 2383 ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: " 2384 "ntasks=%d head=%u tail=%u\n", 2385 gtid, thread_data->td.td_deque_ntasks, 2386 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2387 return NULL; 2388 } 2389 2390 tail = (thread_data->td.td_deque_tail - 1) & 2391 TASK_DEQUE_MASK(thread_data->td); // Wrap index. 
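  // Illustrative note: td_deque_size is kept a power of two, so
  // TASK_DEQUE_MASK(td) == td_deque_size - 1 and the AND above implements the
  // wrap-around.  E.g. with a 256-entry deque and td_deque_tail == 0,
  // (0u - 1) & 255 == 255, i.e. the task is popped from the last slot.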
2392 taskdata = thread_data->td.td_deque[tail]; 2393 2394 if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata, 2395 thread->th.th_current_task)) { 2396 // The TSC does not allow to steal victim task 2397 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2398 KA_TRACE(10, 2399 ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: " 2400 "ntasks=%d head=%u tail=%u\n", 2401 gtid, thread_data->td.td_deque_ntasks, 2402 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2403 return NULL; 2404 } 2405 2406 thread_data->td.td_deque_tail = tail; 2407 TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1); 2408 2409 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2410 2411 KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: " 2412 "ntasks=%d head=%u tail=%u\n", 2413 gtid, taskdata, thread_data->td.td_deque_ntasks, 2414 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2415 2416 task = KMP_TASKDATA_TO_TASK(taskdata); 2417 return task; 2418 } 2419 2420 // __kmp_steal_task: remove a task from another thread's deque 2421 // Assume that calling thread has already checked existence of 2422 // task_team thread_data before calling this routine. 2423 static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid, 2424 kmp_task_team_t *task_team, 2425 std::atomic<kmp_int32> *unfinished_threads, 2426 int *thread_finished, 2427 kmp_int32 is_constrained) { 2428 kmp_task_t *task; 2429 kmp_taskdata_t *taskdata; 2430 kmp_taskdata_t *current; 2431 kmp_thread_data_t *victim_td, *threads_data; 2432 kmp_int32 target; 2433 kmp_int32 victim_tid; 2434 2435 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 2436 2437 threads_data = task_team->tt.tt_threads_data; 2438 KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition 2439 2440 victim_tid = victim_thr->th.th_info.ds.ds_tid; 2441 victim_td = &threads_data[victim_tid]; 2442 2443 KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: " 2444 "task_team=%p ntasks=%d head=%u tail=%u\n", 2445 gtid, __kmp_gtid_from_thread(victim_thr), task_team, 2446 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head, 2447 victim_td->td.td_deque_tail)); 2448 2449 if (TCR_4(victim_td->td.td_deque_ntasks) == 0) { 2450 KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: " 2451 "task_team=%p ntasks=%d head=%u tail=%u\n", 2452 gtid, __kmp_gtid_from_thread(victim_thr), task_team, 2453 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head, 2454 victim_td->td.td_deque_tail)); 2455 return NULL; 2456 } 2457 2458 __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock); 2459 2460 int ntasks = TCR_4(victim_td->td.td_deque_ntasks); 2461 // Check again after we acquire the lock 2462 if (ntasks == 0) { 2463 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); 2464 KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: " 2465 "task_team=%p ntasks=%d head=%u tail=%u\n", 2466 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks, 2467 victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); 2468 return NULL; 2469 } 2470 2471 KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL); 2472 current = __kmp_threads[gtid]->th.th_current_task; 2473 taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head]; 2474 if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) { 2475 // Bump head pointer and Wrap. 
    victim_td->td.td_deque_head =
        (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
  } else {
    if (!task_team->tt.tt_untied_task_encountered) {
      // The TSC does not allow stealing the victim's task
      __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
      KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
                    "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
                    gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
                    victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
      return NULL;
    }
    int i;
    // walk through the victim's deque trying to steal any task
    target = victim_td->td.td_deque_head;
    taskdata = NULL;
    for (i = 1; i < ntasks; ++i) {
      target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
      taskdata = victim_td->td.td_deque[target];
      if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
        break; // found victim task
      } else {
        taskdata = NULL;
      }
    }
    if (taskdata == NULL) {
      // No appropriate candidate to steal found
      __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
      KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
                    "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
                    gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
                    victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
      return NULL;
    }
    int prev = target;
    for (i = i + 1; i < ntasks; ++i) {
      // shift remaining tasks in the deque left by 1
      target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
      victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
      prev = target;
    }
    KMP_DEBUG_ASSERT(
        victim_td->td.td_deque_tail ==
        (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
    victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)
  }
  if (*thread_finished) {
    // We need to un-mark this victim as a finished victim.  This must be done
    // before releasing the lock, or else other threads (starting with the
    // master victim) might be prematurely released from the barrier!!!
    kmp_int32 count;

    count = KMP_ATOMIC_INC(unfinished_threads);

    KA_TRACE(
        20,
        ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
         gtid, count + 1, task_team));

    *thread_finished = FALSE;
  }
  TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);

  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);

  KMP_COUNT_BLOCK(TASK_stolen);
  KA_TRACE(10,
           ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
            "task_team=%p ntasks=%d head=%u tail=%u\n",
            gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
            ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));

  task = KMP_TASKDATA_TO_TASK(taskdata);
  return task;
}

// __kmp_execute_tasks_template: Choose and execute tasks until either the
// condition is satisfied (return true) or there are none left (return false).
//
// final_spin is TRUE if this is the spin at the release barrier.
// thread_finished indicates whether the thread is finished executing all
// the tasks it has on its deque, and is at the release barrier.
// spinner is the location on which to spin.
// spinner == NULL means only execute a single task and return.
// checker is the value to check to terminate the spin.
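// Illustrative note: callers reach this template through the typed wrappers
// below (__kmp_execute_tasks_32/64/oncore).  For example, the taskwait path
// earlier in this file spins on td_incomplete_child_tasks with a 32-bit flag,
// simplified:
//
//   kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
//                          &(taskdata->td_incomplete_child_tasks)), 0U);
//   while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0)
//     flag.execute_tasks(thread, gtid, FALSE,
//                        &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
//                        __kmp_task_stealing_constraint);
//
// which ends up in the final_spin == FALSE branches of the code below.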
2561 template <class C> 2562 static inline int __kmp_execute_tasks_template( 2563 kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin, 2564 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), 2565 kmp_int32 is_constrained) { 2566 kmp_task_team_t *task_team = thread->th.th_task_team; 2567 kmp_thread_data_t *threads_data; 2568 kmp_task_t *task; 2569 kmp_info_t *other_thread; 2570 kmp_taskdata_t *current_task = thread->th.th_current_task; 2571 std::atomic<kmp_int32> *unfinished_threads; 2572 kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0, 2573 tid = thread->th.th_info.ds.ds_tid; 2574 2575 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 2576 KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]); 2577 2578 if (task_team == NULL || current_task == NULL) 2579 return FALSE; 2580 2581 KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d " 2582 "*thread_finished=%d\n", 2583 gtid, final_spin, *thread_finished)); 2584 2585 thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 2586 threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data); 2587 KMP_DEBUG_ASSERT(threads_data != NULL); 2588 2589 nthreads = task_team->tt.tt_nproc; 2590 unfinished_threads = &(task_team->tt.tt_unfinished_threads); 2591 #if OMP_45_ENABLED 2592 KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks); 2593 #else 2594 KMP_DEBUG_ASSERT(nthreads > 1); 2595 #endif 2596 KMP_DEBUG_ASSERT(*unfinished_threads >= 0); 2597 2598 while (1) { // Outer loop keeps trying to find tasks in case of single thread 2599 // getting tasks from target constructs 2600 while (1) { // Inner loop to find a task and execute it 2601 task = NULL; 2602 if (use_own_tasks) { // check on own queue first 2603 task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained); 2604 } 2605 if ((task == NULL) && (nthreads > 1)) { // Steal a task 2606 int asleep = 1; 2607 use_own_tasks = 0; 2608 // Try to steal from the last place I stole from successfully. 2609 if (victim_tid == -2) { // haven't stolen anything yet 2610 victim_tid = threads_data[tid].td.td_deque_last_stolen; 2611 if (victim_tid != 2612 -1) // if we have a last stolen from victim, get the thread 2613 other_thread = threads_data[victim_tid].td.td_thr; 2614 } 2615 if (victim_tid != -1) { // found last victim 2616 asleep = 0; 2617 } else if (!new_victim) { // no recent steals and we haven't already 2618 // used a new victim; select a random thread 2619 do { // Find a different thread to steal work from. 2620 // Pick a random thread. Initial plan was to cycle through all the 2621 // threads, and only return if we tried to steal from every thread, 2622 // and failed. Arch says that's not such a great idea. 2623 victim_tid = __kmp_get_random(thread) % (nthreads - 1); 2624 if (victim_tid >= tid) { 2625 ++victim_tid; // Adjusts random distribution to exclude self 2626 } 2627 // Found a potential victim 2628 other_thread = threads_data[victim_tid].td.td_thr; 2629 // There is a slight chance that __kmp_enable_tasking() did not wake 2630 // up all threads waiting at the barrier. If victim is sleeping, 2631 // then wake it up. Since we were going to pay the cache miss 2632 // penalty for referencing another thread's kmp_info_t struct 2633 // anyway, 2634 // the check shouldn't cost too much performance at this point. In 2635 // extra barrier mode, tasks do not sleep at the separate tasking 2636 // barrier, so this isn't a problem. 
            asleep = 0;
            if ((__kmp_tasking_mode == tskm_task_teams) &&
                (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
                (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
                 NULL)) {
              asleep = 1;
              __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread),
                                        other_thread->th.th_sleep_loc);
              // A sleeping thread should not have any tasks on its queue.
              // There is a slight possibility that it resumes, steals a task
              // from another thread, which spawns more tasks, all in the time
              // that it takes this thread to check => don't write an assertion
              // that the victim's queue is empty.  Try stealing from a
              // different thread.
            }
          } while (asleep);
        }

        if (!asleep) {
          // We have a victim to try to steal from
          task = __kmp_steal_task(other_thread, gtid, task_team,
                                  unfinished_threads, thread_finished,
                                  is_constrained);
        }
        if (task != NULL) { // set last stolen to victim
          if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
            threads_data[tid].td.td_deque_last_stolen = victim_tid;
            // The pre-refactored code did not try more than 1 successful new
            // victim, unless the last one generated more local tasks;
            // new_victim keeps track of this
            new_victim = 1;
          }
        } else { // No tasks found; unset last_stolen
          KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
          victim_tid = -2; // no successful victim found
        }
      }

      if (task == NULL) // break out of tasking loop
        break;

      // Found a task; execute it
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
        if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
          // get the object reliably
          itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
        }
        __kmp_itt_task_starting(itt_sync_obj);
      }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
      __kmp_invoke_task(gtid, task, current_task);
#if USE_ITT_BUILD
      if (itt_sync_obj != NULL)
        __kmp_itt_task_finished(itt_sync_obj);
#endif /* USE_ITT_BUILD */
      // If this thread is only partway through the barrier and the condition is
      // met, then return now, so that the barrier gather/release pattern can
      // proceed. If this thread is in the last spin loop in the barrier,
      // waiting to be released, we know that the termination condition will not
      // be satisfied, so don't waste any cycles checking it.
      if (flag == NULL || (!final_spin && flag->done_check())) {
        KA_TRACE(
            15,
            ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
             gtid));
        return TRUE;
      }
      if (thread->th.th_task_team == NULL) {
        break;
      }
      KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
      // If execution of a stolen task results in more tasks being placed on our
      // run queue, reset use_own_tasks
      if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
        KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
                      "other tasks, restart\n",
                      gtid));
        use_own_tasks = 1;
        new_victim = 0;
      }
    }

    // The task source has been exhausted. If in final spin loop of barrier,
    // check if termination condition is satisfied.
#if OMP_45_ENABLED
    // The work queue may be empty but there might be proxy tasks still
    // executing
    if (final_spin &&
        KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0)
#else
    if (final_spin)
#endif
    {
      // First, decrement the #unfinished threads, if that has not already been
      // done.  This decrement might be to the spin location, and result in the
      // termination condition being satisfied.
      if (!*thread_finished) {
        kmp_int32 count;

        count = KMP_ATOMIC_DEC(unfinished_threads) - 1;
        KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
                      "unfinished_threads to %d task_team=%p\n",
                      gtid, count, task_team));
        *thread_finished = TRUE;
      }

      // It is now unsafe to reference thread->th.th_team !!!
      // Decrementing task_team->tt.tt_unfinished_threads can allow the master
      // thread to pass through the barrier, where it might reset each thread's
      // th.th_team field for the next parallel region.  If we can steal more
      // work, we know that this has not happened yet.
      if (flag != NULL && flag->done_check()) {
        KA_TRACE(
            15,
            ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
             gtid));
        return TRUE;
      }
    }

    // If this thread's task team is NULL, master has recognized that there are
    // no more tasks; bail out
    if (thread->th.th_task_team == NULL) {
      KA_TRACE(15,
               ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
      return FALSE;
    }

#if OMP_45_ENABLED
    // We could be getting tasks from target constructs; if this is the only
    // thread, keep trying to execute tasks from own queue
    if (nthreads == 1)
      use_own_tasks = 1;
    else
#endif
    {
      KA_TRACE(15,
               ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
      return FALSE;
    }
  }
}

int __kmp_execute_tasks_32(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

int __kmp_execute_tasks_64(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

int __kmp_execute_tasks_oncore(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

// __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
// next barrier so they can assist in executing enqueued tasks.
// First thread in allocates the task team atomically.
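// Illustrative note: the wake-ups attempted below only matter when worker
// threads can actually be asleep while waiting, i.e. when a finite blocktime
// is in effect (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME, e.g. with
// KMP_BLOCKTIME=0).  With KMP_BLOCKTIME=infinite the workers keep spinning at
// the barrier and pick up queued tasks without needing an explicit resume.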
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
                                 kmp_info_t *this_thr) {
  kmp_thread_data_t *threads_data;
  int nthreads, i, is_init_thread;

  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
                __kmp_gtid_from_thread(this_thr)));

  KMP_DEBUG_ASSERT(task_team != NULL);
  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);

  nthreads = task_team->tt.tt_nproc;
  KMP_DEBUG_ASSERT(nthreads > 0);
  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);

  // Allocate or increase the size of threads_data if necessary
  is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);

  if (!is_init_thread) {
    // Some other thread already set up the array.
    KA_TRACE(
        20,
        ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
         __kmp_gtid_from_thread(this_thr)));
    return;
  }
  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
  KMP_DEBUG_ASSERT(threads_data != NULL);

  if (__kmp_tasking_mode == tskm_task_teams &&
      (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
    // Release any threads sleeping at the barrier, so that they can steal
    // tasks and execute them.  In extra barrier mode, tasks do not sleep
    // at the separate tasking barrier, so this isn't a problem.
    for (i = 0; i < nthreads; i++) {
      volatile void *sleep_loc;
      kmp_info_t *thread = threads_data[i].td.td_thr;

      if (i == this_thr->th.th_info.ds.ds_tid) {
        continue;
      }
      // Since we haven't locked the thread's suspend mutex lock at this
      // point, there is a small window where a thread might be putting
      // itself to sleep, but hasn't set the th_sleep_loc field yet.
      // To work around this, __kmp_execute_tasks_template() periodically checks
      // to see if other threads are sleeping (using the same random mechanism
      // that is used for task stealing) and awakens them if they are.
      if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
          NULL) {
        KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
                      __kmp_gtid_from_thread(this_thr),
                      __kmp_gtid_from_thread(thread)));
        __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
      } else {
        KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
                      __kmp_gtid_from_thread(this_thr),
                      __kmp_gtid_from_thread(thread)));
      }
    }
  }

  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
                __kmp_gtid_from_thread(this_thr)));
}

/* // TODO: Check the comment consistency
 * Utility routines for "task teams".  A task team (kmp_task_team_t) is kind of
 * like a shadow of the kmp_team_t data struct, with a different lifetime.
 * After a child thread checks into a barrier and calls __kmp_release() from
 * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
 * longer assume that the kmp_team_t structure is intact (at any moment, the
 * master thread may exit the barrier code and free the team data structure,
 * and return the threads to the thread pool).
 *
 * This does not work with the tasking code, as the thread is still
 * expected to participate in the execution of any tasks that may have been
 * spawned by a member of the team, and the thread still needs access to
 * each thread in the team, so that it can steal work from it.
 *
 * Enter the existence of the kmp_task_team_t struct.  It employs a reference
 * counting mechanism, and is allocated by the master thread before calling
 * __kmp_<barrier_kind>_release, and then is released by the last thread to
 * exit __kmp_<barrier_kind>_release at the next barrier.  I.e. the lifetimes
 * of the kmp_task_team_t structs for consecutive barriers can overlap
 * (and will, unless the master thread is the last thread to exit the barrier
 * release phase, which is not typical).
 *
 * The existence of such a struct is useful outside the context of tasking,
 * but for now, I'm trying to keep it specific to the OMP_30_ENABLED macro,
 * so that any performance differences show up when comparing the 2.5 vs. 3.0
 * libraries.
 *
 * We currently use the existence of the threads array as an indicator that
 * tasks were spawned since the last barrier.  If the structure is to be
 * useful outside the context of tasking, then this will have to change, but
 * not setting the field minimizes the performance impact of tasking on
 * barriers, when no explicit tasks were spawned (pushed, actually).
 */

static kmp_task_team_t *__kmp_free_task_teams =
    NULL; // Free list for task_team data structures
// Lock for task team data structures
kmp_bootstrap_lock_t __kmp_task_team_lock =
    KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);

// __kmp_alloc_task_deque:
// Allocates a task deque for a particular thread, and initializes the
// necessary data structures relating to the deque.  This only happens once
// per thread per task team since task teams are recycled.  No lock is needed
// during allocation since each thread allocates its own deque.
static void __kmp_alloc_task_deque(kmp_info_t *thread,
                                   kmp_thread_data_t *thread_data) {
  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
  KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);

  // Initialize last stolen task field to "none"
  thread_data->td.td_deque_last_stolen = -1;

  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
  KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
  KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);

  KE_TRACE(
      10,
      ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
       __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
  // Allocate space for task deque, and zero the deque
  // Cannot use __kmp_thread_calloc() because threads not around for
  // kmp_reap_task_team( ).
  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
      INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
}

// __kmp_free_task_deque:
// Deallocates a task deque for a particular thread.  Happens at library
// deallocation so don't need to reset all thread data fields.
static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
  if (thread_data->td.td_deque != NULL) {
    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
    TCW_4(thread_data->td.td_deque_ntasks, 0);
    __kmp_free(thread_data->td.td_deque);
    thread_data->td.td_deque = NULL;
    __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
  }

#ifdef BUILD_TIED_TASK_STACK
  // GEH: Figure out what to do here for td_susp_tied_tasks
  if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
    __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
  }
#endif // BUILD_TIED_TASK_STACK
}

// __kmp_realloc_task_threads_data:
// Allocates a threads_data array for a task team, either by allocating an
// initial array or enlarging an existing array.  Only the first thread to get
// the lock allocates or enlarges the array and re-initializes the array
// elements.  That thread returns "TRUE", the rest return "FALSE".
// Assumes that the new array size is given by task_team -> tt.tt_nproc.
// The current size is given by task_team -> tt.tt_max_threads.
static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
                                           kmp_task_team_t *task_team) {
  kmp_thread_data_t **threads_data_p;
  kmp_int32 nthreads, maxthreads;
  int is_init_thread = FALSE;

  if (TCR_4(task_team->tt.tt_found_tasks)) {
    // Already reallocated and initialized.
    return FALSE;
  }

  threads_data_p = &task_team->tt.tt_threads_data;
  nthreads = task_team->tt.tt_nproc;
  maxthreads = task_team->tt.tt_max_threads;

  // All threads must lock when they encounter the first task of the implicit
  // task region to make sure threads_data fields are (re)initialized before
  // used.
  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);

  if (!TCR_4(task_team->tt.tt_found_tasks)) {
    // first thread to enable tasking
    kmp_team_t *team = thread->th.th_team;
    int i;

    is_init_thread = TRUE;
    if (maxthreads < nthreads) {

      if (*threads_data_p != NULL) {
        kmp_thread_data_t *old_data = *threads_data_p;
        kmp_thread_data_t *new_data = NULL;

        KE_TRACE(
            10,
            ("__kmp_realloc_task_threads_data: T#%d reallocating "
             "threads data for task_team %p, new_size = %d, old_size = %d\n",
             __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
        // Reallocate threads_data to have more elements than current array
        // Cannot use __kmp_thread_realloc() because threads not around for
        // kmp_reap_task_team( ).  Note all new array entries are initialized
        // to zero by __kmp_allocate().
3013 new_data = (kmp_thread_data_t *)__kmp_allocate( 3014 nthreads * sizeof(kmp_thread_data_t)); 3015 // copy old data to new data 3016 KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t), 3017 (void *)old_data, maxthreads * sizeof(kmp_thread_data_t)); 3018 3019 #ifdef BUILD_TIED_TASK_STACK 3020 // GEH: Figure out if this is the right thing to do 3021 for (i = maxthreads; i < nthreads; i++) { 3022 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 3023 __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data); 3024 } 3025 #endif // BUILD_TIED_TASK_STACK 3026 // Install the new data and free the old data 3027 (*threads_data_p) = new_data; 3028 __kmp_free(old_data); 3029 } else { 3030 KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating " 3031 "threads data for task_team %p, size = %d\n", 3032 __kmp_gtid_from_thread(thread), task_team, nthreads)); 3033 // Make the initial allocate for threads_data array, and zero entries 3034 // Cannot use __kmp_thread_calloc() because threads not around for 3035 // kmp_reap_task_team( ). 3036 ANNOTATE_IGNORE_WRITES_BEGIN(); 3037 *threads_data_p = (kmp_thread_data_t *)__kmp_allocate( 3038 nthreads * sizeof(kmp_thread_data_t)); 3039 ANNOTATE_IGNORE_WRITES_END(); 3040 #ifdef BUILD_TIED_TASK_STACK 3041 // GEH: Figure out if this is the right thing to do 3042 for (i = 0; i < nthreads; i++) { 3043 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 3044 __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data); 3045 } 3046 #endif // BUILD_TIED_TASK_STACK 3047 } 3048 task_team->tt.tt_max_threads = nthreads; 3049 } else { 3050 // If array has (more than) enough elements, go ahead and use it 3051 KMP_DEBUG_ASSERT(*threads_data_p != NULL); 3052 } 3053 3054 // initialize threads_data pointers back to thread_info structures 3055 for (i = 0; i < nthreads; i++) { 3056 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 3057 thread_data->td.td_thr = team->t.t_threads[i]; 3058 3059 if (thread_data->td.td_deque_last_stolen >= nthreads) { 3060 // The last stolen field survives across teams / barrier, and the number 3061 // of threads may have changed. It's possible (likely?) that a new 3062 // parallel region will exhibit the same behavior as previous region. 3063 thread_data->td.td_deque_last_stolen = -1; 3064 } 3065 } 3066 3067 KMP_MB(); 3068 TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE); 3069 } 3070 3071 __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock); 3072 return is_init_thread; 3073 } 3074 3075 // __kmp_free_task_threads_data: 3076 // Deallocates a threads_data array for a task team, including any attached 3077 // tasking deques. Only occurs at library shutdown. 3078 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) { 3079 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock); 3080 if (task_team->tt.tt_threads_data != NULL) { 3081 int i; 3082 for (i = 0; i < task_team->tt.tt_max_threads; i++) { 3083 __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]); 3084 } 3085 __kmp_free(task_team->tt.tt_threads_data); 3086 task_team->tt.tt_threads_data = NULL; 3087 } 3088 __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock); 3089 } 3090 3091 // __kmp_allocate_task_team: 3092 // Allocates a task team associated with a specific team, taking it from 3093 // the global task team free list if possible. Also initializes data 3094 // structures. 
3095 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread, 3096 kmp_team_t *team) { 3097 kmp_task_team_t *task_team = NULL; 3098 int nthreads; 3099 3100 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n", 3101 (thread ? __kmp_gtid_from_thread(thread) : -1), team)); 3102 3103 if (TCR_PTR(__kmp_free_task_teams) != NULL) { 3104 // Take a task team from the task team pool 3105 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 3106 if (__kmp_free_task_teams != NULL) { 3107 task_team = __kmp_free_task_teams; 3108 TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next); 3109 task_team->tt.tt_next = NULL; 3110 } 3111 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 3112 } 3113 3114 if (task_team == NULL) { 3115 KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating " 3116 "task team for team %p\n", 3117 __kmp_gtid_from_thread(thread), team)); 3118 // Allocate a new task team if one is not available. 3119 // Cannot use __kmp_thread_malloc() because threads not around for 3120 // kmp_reap_task_team( ). 3121 task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t)); 3122 __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock); 3123 // AC: __kmp_allocate zeroes returned memory 3124 // task_team -> tt.tt_threads_data = NULL; 3125 // task_team -> tt.tt_max_threads = 0; 3126 // task_team -> tt.tt_next = NULL; 3127 } 3128 3129 TCW_4(task_team->tt.tt_found_tasks, FALSE); 3130 #if OMP_45_ENABLED 3131 TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); 3132 #endif 3133 task_team->tt.tt_nproc = nthreads = team->t.t_nproc; 3134 3135 KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads); 3136 TCW_4(task_team->tt.tt_active, TRUE); 3137 3138 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p " 3139 "unfinished_threads init'd to %d\n", 3140 (thread ? __kmp_gtid_from_thread(thread) : -1), task_team, 3141 KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads))); 3142 return task_team; 3143 } 3144 3145 // __kmp_free_task_team: 3146 // Frees the task team associated with a specific thread, and adds it 3147 // to the global task team free list. 3148 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) { 3149 KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n", 3150 thread ? __kmp_gtid_from_thread(thread) : -1, task_team)); 3151 3152 // Put task team back on free list 3153 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 3154 3155 KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL); 3156 task_team->tt.tt_next = __kmp_free_task_teams; 3157 TCW_PTR(__kmp_free_task_teams, task_team); 3158 3159 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 3160 } 3161 3162 // __kmp_reap_task_teams: 3163 // Free all the task teams on the task team free list. 3164 // Should only be done during library shutdown. 3165 // Cannot do anything that needs a thread structure or gtid since they are 3166 // already gone. 
3167 void __kmp_reap_task_teams(void) { 3168 kmp_task_team_t *task_team; 3169 3170 if (TCR_PTR(__kmp_free_task_teams) != NULL) { 3171 // Free all task_teams on the free list 3172 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 3173 while ((task_team = __kmp_free_task_teams) != NULL) { 3174 __kmp_free_task_teams = task_team->tt.tt_next; 3175 task_team->tt.tt_next = NULL; 3176 3177 // Free threads_data if necessary 3178 if (task_team->tt.tt_threads_data != NULL) { 3179 __kmp_free_task_threads_data(task_team); 3180 } 3181 __kmp_free(task_team); 3182 } 3183 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 3184 } 3185 } 3186 3187 // __kmp_wait_to_unref_task_teams: 3188 // Some threads could still be in the fork barrier release code, possibly 3189 // trying to steal tasks. Wait for each thread to unreference its task team. 3190 void __kmp_wait_to_unref_task_teams(void) { 3191 kmp_info_t *thread; 3192 kmp_uint32 spins; 3193 int done; 3194 3195 KMP_INIT_YIELD(spins); 3196 3197 for (;;) { 3198 done = TRUE; 3199 3200 // TODO: GEH - this may be is wrong because some sync would be necessary 3201 // in case threads are added to the pool during the traversal. Need to 3202 // verify that lock for thread pool is held when calling this routine. 3203 for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL; 3204 thread = thread->th.th_next_pool) { 3205 #if KMP_OS_WINDOWS 3206 DWORD exit_val; 3207 #endif 3208 if (TCR_PTR(thread->th.th_task_team) == NULL) { 3209 KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n", 3210 __kmp_gtid_from_thread(thread))); 3211 continue; 3212 } 3213 #if KMP_OS_WINDOWS 3214 // TODO: GEH - add this check for Linux* OS / OS X* as well? 3215 if (!__kmp_is_thread_alive(thread, &exit_val)) { 3216 thread->th.th_task_team = NULL; 3217 continue; 3218 } 3219 #endif 3220 3221 done = FALSE; // Because th_task_team pointer is not NULL for this thread 3222 3223 KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to " 3224 "unreference task_team\n", 3225 __kmp_gtid_from_thread(thread))); 3226 3227 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 3228 volatile void *sleep_loc; 3229 // If the thread is sleeping, awaken it. 3230 if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) != 3231 NULL) { 3232 KA_TRACE( 3233 10, 3234 ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n", 3235 __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread))); 3236 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc); 3237 } 3238 } 3239 } 3240 if (done) { 3241 break; 3242 } 3243 3244 // If oversubscribed or have waited a bit, yield. 3245 KMP_YIELD_OVERSUB_ELSE_SPIN(spins); 3246 } 3247 } 3248 3249 // __kmp_task_team_setup: Create a task_team for the current team, but use 3250 // an already created, unused one if it already exists. 3251 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) { 3252 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 3253 3254 // If this task_team hasn't been created yet, allocate it. It will be used in 3255 // the region after the next. 3256 // If it exists, it is the current task team and shouldn't be touched yet as 3257 // it may still be in use. 
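// Added illustration (not in the original comments): the two entries of
// team->t.t_task_team[] are indexed by th_task_state, which each thread
// toggles in __kmp_task_team_sync() after the barrier release phase. For
// example, if the threads of region N work out of t_task_team[0], the master
// prepares t_task_team[1] below; when the threads pass the next barrier they
// toggle th_task_state and start using t_task_team[1], and slot 0 becomes the
// "other" slot to be reset or reallocated by a later call to this routine.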
3258 if (team->t.t_task_team[this_thr->th.th_task_state] == NULL && 3259 (always || team->t.t_nproc > 1)) { 3260 team->t.t_task_team[this_thr->th.th_task_state] = 3261 __kmp_allocate_task_team(this_thr, team); 3262 KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p " 3263 "for team %d at parity=%d\n", 3264 __kmp_gtid_from_thread(this_thr), 3265 team->t.t_task_team[this_thr->th.th_task_state], 3266 ((team != NULL) ? team->t.t_id : -1), 3267 this_thr->th.th_task_state)); 3268 } 3269 3270 // After threads exit the release, they will call sync, and then point to this 3271 // other task_team; make sure it is allocated and properly initialized. As 3272 // threads spin in the barrier release phase, they will continue to use the 3273 // previous task_team struct(above), until they receive the signal to stop 3274 // checking for tasks (they can't safely reference the kmp_team_t struct, 3275 // which could be reallocated by the master thread). No task teams are formed 3276 // for serialized teams. 3277 if (team->t.t_nproc > 1) { 3278 int other_team = 1 - this_thr->th.th_task_state; 3279 if (team->t.t_task_team[other_team] == NULL) { // setup other team as well 3280 team->t.t_task_team[other_team] = 3281 __kmp_allocate_task_team(this_thr, team); 3282 KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new " 3283 "task_team %p for team %d at parity=%d\n", 3284 __kmp_gtid_from_thread(this_thr), 3285 team->t.t_task_team[other_team], 3286 ((team != NULL) ? team->t.t_id : -1), other_team)); 3287 } else { // Leave the old task team struct in place for the upcoming region; 3288 // adjust as needed 3289 kmp_task_team_t *task_team = team->t.t_task_team[other_team]; 3290 if (!task_team->tt.tt_active || 3291 team->t.t_nproc != task_team->tt.tt_nproc) { 3292 TCW_4(task_team->tt.tt_nproc, team->t.t_nproc); 3293 TCW_4(task_team->tt.tt_found_tasks, FALSE); 3294 #if OMP_45_ENABLED 3295 TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); 3296 #endif 3297 KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, 3298 team->t.t_nproc); 3299 TCW_4(task_team->tt.tt_active, TRUE); 3300 } 3301 // if team size has changed, the first thread to enable tasking will 3302 // realloc threads_data if necessary 3303 KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team " 3304 "%p for team %d at parity=%d\n", 3305 __kmp_gtid_from_thread(this_thr), 3306 team->t.t_task_team[other_team], 3307 ((team != NULL) ? team->t.t_id : -1), other_team)); 3308 } 3309 } 3310 } 3311 3312 // __kmp_task_team_sync: Propagation of task team data from team to threads 3313 // which happens just after the release phase of a team barrier. This may be 3314 // called by any thread, but only for teams with # threads > 1. 3315 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) { 3316 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 3317 3318 // Toggle the th_task_state field, to switch which task_team this thread 3319 // refers to 3320 this_thr->th.th_task_state = 1 - this_thr->th.th_task_state; 3321 // It is now safe to propagate the task team pointer from the team struct to 3322 // the current thread. 3323 TCW_PTR(this_thr->th.th_task_team, 3324 team->t.t_task_team[this_thr->th.th_task_state]); 3325 KA_TRACE(20, 3326 ("__kmp_task_team_sync: Thread T#%d task team switched to task_team " 3327 "%p from Team #%d (parity=%d)\n", 3328 __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team, 3329 ((team != NULL) ? 
team->t.t_id : -1), this_thr->th.th_task_state)); 3330 } 3331 3332 // __kmp_task_team_wait: Master thread waits for outstanding tasks after the 3333 // barrier gather phase. Only called by master thread if #threads in team > 1 or 3334 // if proxy tasks were created. 3335 // 3336 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off 3337 // by passing in 0 optionally as the last argument. When wait is zero, master 3338 // thread does not wait for unfinished_threads to reach 0. 3339 void __kmp_task_team_wait( 3340 kmp_info_t *this_thr, 3341 kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) { 3342 kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state]; 3343 3344 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 3345 KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team); 3346 3347 if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) { 3348 if (wait) { 3349 KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks " 3350 "(for unfinished_threads to reach 0) on task_team = %p\n", 3351 __kmp_gtid_from_thread(this_thr), task_team)); 3352 // Worker threads may have dropped through to release phase, but could 3353 // still be executing tasks. Wait here for tasks to complete. To avoid 3354 // memory contention, only master thread checks termination condition. 3355 kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *, 3356 &task_team->tt.tt_unfinished_threads), 3357 0U); 3358 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 3359 } 3360 // Deactivate the old task team, so that the worker threads will stop 3361 // referencing it while spinning. 3362 KA_TRACE( 3363 20, 3364 ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: " 3365 "setting active to false, setting local and team's pointer to NULL\n", 3366 __kmp_gtid_from_thread(this_thr), task_team)); 3367 #if OMP_45_ENABLED 3368 KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 || 3369 task_team->tt.tt_found_proxy_tasks == TRUE); 3370 TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE); 3371 #else 3372 KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1); 3373 #endif 3374 KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0); 3375 TCW_SYNC_4(task_team->tt.tt_active, FALSE); 3376 KMP_MB(); 3377 3378 TCW_PTR(this_thr->th.th_task_team, NULL); 3379 } 3380 } 3381 3382 // __kmp_tasking_barrier: 3383 // This routine may only called when __kmp_tasking_mode == tskm_extra_barrier. 3384 // Internal function to execute all tasks prior to a regular barrier or a join 3385 // barrier. It is a full barrier itself, which unfortunately turns regular 3386 // barriers into double barriers and join barriers into 1 1/2 barriers. 3387 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) { 3388 std::atomic<kmp_uint32> *spin = RCAST( 3389 std::atomic<kmp_uint32> *, 3390 &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads); 3391 int flag = FALSE; 3392 KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier); 3393 3394 #if USE_ITT_BUILD 3395 KMP_FSYNC_SPIN_INIT(spin, NULL); 3396 #endif /* USE_ITT_BUILD */ 3397 kmp_flag_32 spin_flag(spin, 0U); 3398 while (!spin_flag.execute_tasks(thread, gtid, TRUE, 3399 &flag USE_ITT_BUILD_ARG(NULL), 0)) { 3400 #if USE_ITT_BUILD 3401 // TODO: What about itt_sync_obj?? 
3402 KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin)); 3403 #endif /* USE_ITT_BUILD */ 3404 3405 if (TCR_4(__kmp_global.g.g_done)) { 3406 if (__kmp_global.g.g_abort) 3407 __kmp_abort_thread(); 3408 break; 3409 } 3410 KMP_YIELD(TRUE); 3411 } 3412 #if USE_ITT_BUILD 3413 KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin)); 3414 #endif /* USE_ITT_BUILD */ 3415 } 3416 3417 #if OMP_45_ENABLED 3418 3419 // __kmp_give_task puts a task into a given thread queue if: 3420 // - the queue for that thread was created 3421 // - there's space in that queue 3422 // Because of this, __kmp_push_task needs to check if there's space after 3423 // getting the lock 3424 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task, 3425 kmp_int32 pass) { 3426 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 3427 kmp_task_team_t *task_team = taskdata->td_task_team; 3428 3429 KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n", 3430 taskdata, tid)); 3431 3432 // If task_team is NULL something went really bad... 3433 KMP_DEBUG_ASSERT(task_team != NULL); 3434 3435 bool result = false; 3436 kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid]; 3437 3438 if (thread_data->td.td_deque == NULL) { 3439 // There's no queue in this thread, go find another one 3440 // We're guaranteed that at least one thread has a queue 3441 KA_TRACE(30, 3442 ("__kmp_give_task: thread %d has no queue while giving task %p.\n", 3443 tid, taskdata)); 3444 return result; 3445 } 3446 3447 if (TCR_4(thread_data->td.td_deque_ntasks) >= 3448 TASK_DEQUE_SIZE(thread_data->td)) { 3449 KA_TRACE( 3450 30, 3451 ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", 3452 taskdata, tid)); 3453 3454 // if this deque is bigger than the pass ratio give a chance to another 3455 // thread 3456 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass) 3457 return result; 3458 3459 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 3460 __kmp_realloc_task_deque(thread, thread_data); 3461 3462 } else { 3463 3464 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 3465 3466 if (TCR_4(thread_data->td.td_deque_ntasks) >= 3467 TASK_DEQUE_SIZE(thread_data->td)) { 3468 KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to " 3469 "thread %d.\n", 3470 taskdata, tid)); 3471 3472 // if this deque is bigger than the pass ratio give a chance to another 3473 // thread 3474 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass) 3475 goto release_and_exit; 3476 3477 __kmp_realloc_task_deque(thread, thread_data); 3478 } 3479 } 3480 3481 // lock is held here, and there is space in the deque 3482 3483 thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata; 3484 // Wrap index. 
3485 thread_data->td.td_deque_tail = 3486 (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td); 3487 TCW_4(thread_data->td.td_deque_ntasks, 3488 TCR_4(thread_data->td.td_deque_ntasks) + 1); 3489 3490 result = true; 3491 KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n", 3492 taskdata, tid)); 3493 3494 release_and_exit: 3495 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 3496 3497 return result; 3498 } 3499 3500 /* The finish of the proxy tasks is divided in two pieces: 3501 - the top half is the one that can be done from a thread outside the team 3502 - the bottom half must be run from a thread within the team 3503 3504 In order to run the bottom half the task gets queued back into one of the 3505 threads of the team. Once the td_incomplete_child_task counter of the parent 3506 is decremented the threads can leave the barriers. So, the bottom half needs 3507 to be queued before the counter is decremented. The top half is therefore 3508 divided in two parts: 3509 - things that can be run before queuing the bottom half 3510 - things that must be run after queuing the bottom half 3511 3512 This creates a second race as the bottom half can free the task before the 3513 second top half is executed. To avoid this we use the 3514 td_incomplete_child_task of the proxy task to synchronize the top and bottom 3515 half. */ 3516 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) { 3517 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT); 3518 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); 3519 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0); 3520 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0); 3521 3522 taskdata->td_flags.complete = 1; // mark the task as completed 3523 3524 if (taskdata->td_taskgroup) 3525 KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count); 3526 3527 // Create an imaginary children for this task so the bottom half cannot 3528 // release the task before we have completed the second top half 3529 KMP_ATOMIC_INC(&taskdata->td_incomplete_child_tasks); 3530 } 3531 3532 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) { 3533 kmp_int32 children = 0; 3534 3535 // Predecrement simulated by "- 1" calculation 3536 children = 3537 KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1; 3538 KMP_DEBUG_ASSERT(children >= 0); 3539 3540 // Remove the imaginary children 3541 KMP_ATOMIC_DEC(&taskdata->td_incomplete_child_tasks); 3542 } 3543 3544 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) { 3545 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 3546 kmp_info_t *thread = __kmp_threads[gtid]; 3547 3548 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); 3549 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 3550 1); // top half must run before bottom half 3551 3552 // We need to wait to make sure the top half is finished 3553 // Spinning here should be ok as this should happen quickly 3554 while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) > 0) 3555 ; 3556 3557 __kmp_release_deps(gtid, taskdata); 3558 __kmp_free_task_and_ancestors(gtid, taskdata, thread); 3559 } 3560 3561 /*! 3562 @ingroup TASKING 3563 @param gtid Global Thread ID of encountering thread 3564 @param ptask Task which execution is completed 3565 3566 Execute the completation of a proxy task from a thread of that is part of the 3567 team. Run first and bottom halves directly. 
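As an added, illustrative sketch (not part of the original documentation): a
completion hook that already runs on a thread of the team could invoke this
entry point directly. The hook name and the way the gtid is obtained below are
hypothetical.

@code
// hypothetical completion hook running on a thread that belongs to the team
void on_async_work_done(kmp_task_t *proxy_task) {
  kmp_int32 gtid = __kmpc_global_thread_num(NULL); // gtid of the calling thread
  __kmpc_proxy_task_completed(gtid, proxy_task);
}
@endcode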
3568 */ 3569 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) { 3570 KMP_DEBUG_ASSERT(ptask != NULL); 3571 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 3572 KA_TRACE( 3573 10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n", 3574 gtid, taskdata)); 3575 3576 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); 3577 3578 __kmp_first_top_half_finish_proxy(taskdata); 3579 __kmp_second_top_half_finish_proxy(taskdata); 3580 __kmp_bottom_half_finish_proxy(gtid, ptask); 3581 3582 KA_TRACE(10, 3583 ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n", 3584 gtid, taskdata)); 3585 } 3586 3587 /*! 3588 @ingroup TASKING 3589 @param ptask Task which execution is completed 3590 3591 Execute the completation of a proxy task from a thread that could not belong to 3592 the team. 3593 */ 3594 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) { 3595 KMP_DEBUG_ASSERT(ptask != NULL); 3596 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 3597 3598 KA_TRACE( 3599 10, 3600 ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n", 3601 taskdata)); 3602 3603 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); 3604 3605 __kmp_first_top_half_finish_proxy(taskdata); 3606 3607 // Enqueue task to complete bottom half completion from a thread within the 3608 // corresponding team 3609 kmp_team_t *team = taskdata->td_team; 3610 kmp_int32 nthreads = team->t.t_nproc; 3611 kmp_info_t *thread; 3612 3613 // This should be similar to start_k = __kmp_get_random( thread ) % nthreads 3614 // but we cannot use __kmp_get_random here 3615 kmp_int32 start_k = 0; 3616 kmp_int32 pass = 1; 3617 kmp_int32 k = start_k; 3618 3619 do { 3620 // For now we're just linearly trying to find a thread 3621 thread = team->t.t_threads[k]; 3622 k = (k + 1) % nthreads; 3623 3624 // we did a full pass through all the threads 3625 if (k == start_k) 3626 pass = pass << 1; 3627 3628 } while (!__kmp_give_task(thread, k, ptask, pass)); 3629 3630 __kmp_second_top_half_finish_proxy(taskdata); 3631 3632 KA_TRACE( 3633 10, 3634 ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n", 3635 taskdata)); 3636 } 3637 3638 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task 3639 // for taskloop 3640 // 3641 // thread: allocating thread 3642 // task_src: pointer to source task to be duplicated 3643 // returns: a pointer to the allocated kmp_task_t structure (task). 3644 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) { 3645 kmp_task_t *task; 3646 kmp_taskdata_t *taskdata; 3647 kmp_taskdata_t *taskdata_src; 3648 kmp_taskdata_t *parent_task = thread->th.th_current_task; 3649 size_t shareds_offset; 3650 size_t task_size; 3651 3652 KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread, 3653 task_src)); 3654 taskdata_src = KMP_TASK_TO_TASKDATA(task_src); 3655 KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy == 3656 TASK_FULL); // it should not be proxy task 3657 KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT); 3658 task_size = taskdata_src->td_size_alloc; 3659 3660 // Allocate a kmp_taskdata_t block and a kmp_task_t block. 
3661 KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread, 3662 task_size)); 3663 #if USE_FAST_MEMORY 3664 taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size); 3665 #else 3666 taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size); 3667 #endif /* USE_FAST_MEMORY */ 3668 KMP_MEMCPY(taskdata, taskdata_src, task_size); 3669 3670 task = KMP_TASKDATA_TO_TASK(taskdata); 3671 3672 // Initialize new task (only specific fields not affected by memcpy) 3673 taskdata->td_task_id = KMP_GEN_TASK_ID(); 3674 if (task->shareds != NULL) { // need setup shareds pointer 3675 shareds_offset = (char *)task_src->shareds - (char *)taskdata_src; 3676 task->shareds = &((char *)taskdata)[shareds_offset]; 3677 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) == 3678 0); 3679 } 3680 taskdata->td_alloc_thread = thread; 3681 taskdata->td_parent = parent_task; 3682 taskdata->td_taskgroup = 3683 parent_task 3684 ->td_taskgroup; // task inherits the taskgroup from the parent task 3685 3686 // Only need to keep track of child task counts if team parallel and tasking 3687 // not serialized 3688 if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) { 3689 KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks); 3690 if (parent_task->td_taskgroup) 3691 KMP_ATOMIC_INC(&parent_task->td_taskgroup->count); 3692 // Only need to keep track of allocated child tasks for explicit tasks since 3693 // implicit not deallocated 3694 if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) 3695 KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks); 3696 } 3697 3698 KA_TRACE(20, 3699 ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n", 3700 thread, taskdata, taskdata->td_parent)); 3701 #if OMPT_SUPPORT 3702 if (UNLIKELY(ompt_enabled.enabled)) 3703 __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid); 3704 #endif 3705 return task; 3706 } 3707 3708 // Routine optionally generated by the compiler for setting the lastprivate flag 3709 // and calling needed constructors for private/firstprivate objects 3710 // (used to form taskloop tasks from pattern task) 3711 // Parameters: dest task, src task, lastprivate flag. 3712 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32); 3713 3714 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8); 3715 3716 // class to encapsulate manipulating loop bounds in a taskloop task. 3717 // this abstracts away the Intel vs GOMP taskloop interface for setting/getting 3718 // the loop bound variables. 
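// Added summary of the two layouts handled below: in the Intel task layout the
// bounds live inside the kmp_task_t itself, at the lower_offset/upper_offset
// computed from the lb/ub pointers the compiler passes in; in the GOMP
// (native) layout they are the first two elements of task->shareds, stored as
// C 'long' values whose width is recorded in td_size_loop_bounds (4 or 8).
// The accessors normalize both representations to kmp_uint64.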
3719 class kmp_taskloop_bounds_t { 3720 kmp_task_t *task; 3721 const kmp_taskdata_t *taskdata; 3722 size_t lower_offset; 3723 size_t upper_offset; 3724 3725 public: 3726 kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub) 3727 : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)), 3728 lower_offset((char *)lb - (char *)task), 3729 upper_offset((char *)ub - (char *)task) { 3730 KMP_DEBUG_ASSERT((char *)lb > (char *)_task); 3731 KMP_DEBUG_ASSERT((char *)ub > (char *)_task); 3732 } 3733 kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds) 3734 : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)), 3735 lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {} 3736 size_t get_lower_offset() const { return lower_offset; } 3737 size_t get_upper_offset() const { return upper_offset; } 3738 kmp_uint64 get_lb() const { 3739 kmp_int64 retval; 3740 #if defined(KMP_GOMP_COMPAT) 3741 // Intel task just returns the lower bound normally 3742 if (!taskdata->td_flags.native) { 3743 retval = *(kmp_int64 *)((char *)task + lower_offset); 3744 } else { 3745 // GOMP task has to take into account the sizeof(long) 3746 if (taskdata->td_size_loop_bounds == 4) { 3747 kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds); 3748 retval = (kmp_int64)*lb; 3749 } else { 3750 kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds); 3751 retval = (kmp_int64)*lb; 3752 } 3753 } 3754 #else 3755 retval = *(kmp_int64 *)((char *)task + lower_offset); 3756 #endif // defined(KMP_GOMP_COMPAT) 3757 return retval; 3758 } 3759 kmp_uint64 get_ub() const { 3760 kmp_int64 retval; 3761 #if defined(KMP_GOMP_COMPAT) 3762 // Intel task just returns the upper bound normally 3763 if (!taskdata->td_flags.native) { 3764 retval = *(kmp_int64 *)((char *)task + upper_offset); 3765 } else { 3766 // GOMP task has to take into account the sizeof(long) 3767 if (taskdata->td_size_loop_bounds == 4) { 3768 kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1; 3769 retval = (kmp_int64)*ub; 3770 } else { 3771 kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1; 3772 retval = (kmp_int64)*ub; 3773 } 3774 } 3775 #else 3776 retval = *(kmp_int64 *)((char *)task + upper_offset); 3777 #endif // defined(KMP_GOMP_COMPAT) 3778 return retval; 3779 } 3780 void set_lb(kmp_uint64 lb) { 3781 #if defined(KMP_GOMP_COMPAT) 3782 // Intel task just sets the lower bound normally 3783 if (!taskdata->td_flags.native) { 3784 *(kmp_uint64 *)((char *)task + lower_offset) = lb; 3785 } else { 3786 // GOMP task has to take into account the sizeof(long) 3787 if (taskdata->td_size_loop_bounds == 4) { 3788 kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds); 3789 *lower = (kmp_uint32)lb; 3790 } else { 3791 kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds); 3792 *lower = (kmp_uint64)lb; 3793 } 3794 } 3795 #else 3796 *(kmp_uint64 *)((char *)task + lower_offset) = lb; 3797 #endif // defined(KMP_GOMP_COMPAT) 3798 } 3799 void set_ub(kmp_uint64 ub) { 3800 #if defined(KMP_GOMP_COMPAT) 3801 // Intel task just sets the upper bound normally 3802 if (!taskdata->td_flags.native) { 3803 *(kmp_uint64 *)((char *)task + upper_offset) = ub; 3804 } else { 3805 // GOMP task has to take into account the sizeof(long) 3806 if (taskdata->td_size_loop_bounds == 4) { 3807 kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1; 3808 *upper = (kmp_uint32)ub; 3809 } else { 3810 kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1; 3811 *upper = (kmp_uint64)ub; 3812 } 3813 } 3814 #else 3815 *(kmp_uint64 *)((char *)task + upper_offset) = ub; 3816 
#endif // defined(KMP_GOMP_COMPAT) 3817 } 3818 }; 3819 3820 // __kmp_taskloop_linear: Start tasks of the taskloop linearly 3821 // 3822 // loc Source location information 3823 // gtid Global thread ID 3824 // task Pattern task, exposes the loop iteration range 3825 // lb Pointer to loop lower bound in task structure 3826 // ub Pointer to loop upper bound in task structure 3827 // st Loop stride 3828 // ub_glob Global upper bound (used for lastprivate check) 3829 // num_tasks Number of tasks to execute 3830 // grainsize Number of loop iterations per task 3831 // extras Number of chunks with grainsize+1 iterations 3832 // tc Iterations count 3833 // task_dup Tasks duplication routine 3834 // codeptr_ra Return address for OMPT events 3835 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task, 3836 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, 3837 kmp_uint64 ub_glob, kmp_uint64 num_tasks, 3838 kmp_uint64 grainsize, kmp_uint64 extras, 3839 kmp_uint64 tc, 3840 #if OMPT_SUPPORT 3841 void *codeptr_ra, 3842 #endif 3843 void *task_dup) { 3844 KMP_COUNT_BLOCK(OMP_TASKLOOP); 3845 KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling); 3846 p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; 3847 // compiler provides global bounds here 3848 kmp_taskloop_bounds_t task_bounds(task, lb, ub); 3849 kmp_uint64 lower = task_bounds.get_lb(); 3850 kmp_uint64 upper = task_bounds.get_ub(); 3851 kmp_uint64 i; 3852 kmp_info_t *thread = __kmp_threads[gtid]; 3853 kmp_taskdata_t *current_task = thread->th.th_current_task; 3854 kmp_task_t *next_task; 3855 kmp_int32 lastpriv = 0; 3856 3857 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras); 3858 KMP_DEBUG_ASSERT(num_tasks > extras); 3859 KMP_DEBUG_ASSERT(num_tasks > 0); 3860 KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, " 3861 "extras %lld, i=%lld,%lld(%d)%lld, dup %p\n", 3862 gtid, num_tasks, grainsize, extras, lower, upper, ub_glob, st, 3863 task_dup)); 3864 3865 // Launch num_tasks tasks, assign grainsize iterations each task 3866 for (i = 0; i < num_tasks; ++i) { 3867 kmp_uint64 chunk_minus_1; 3868 if (extras == 0) { 3869 chunk_minus_1 = grainsize - 1; 3870 } else { 3871 chunk_minus_1 = grainsize; 3872 --extras; // first extras iterations get bigger chunk (grainsize+1) 3873 } 3874 upper = lower + st * chunk_minus_1; 3875 if (i == num_tasks - 1) { 3876 // schedule the last task, set lastprivate flag if needed 3877 if (st == 1) { // most common case 3878 KMP_DEBUG_ASSERT(upper == *ub); 3879 if (upper == ub_glob) 3880 lastpriv = 1; 3881 } else if (st > 0) { // positive loop stride 3882 KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper); 3883 if ((kmp_uint64)st > ub_glob - upper) 3884 lastpriv = 1; 3885 } else { // negative loop stride 3886 KMP_DEBUG_ASSERT(upper + st < *ub); 3887 if (upper - ub_glob < (kmp_uint64)(-st)) 3888 lastpriv = 1; 3889 } 3890 } 3891 next_task = __kmp_task_dup_alloc(thread, task); // allocate new task 3892 kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task); 3893 kmp_taskloop_bounds_t next_task_bounds = 3894 kmp_taskloop_bounds_t(next_task, task_bounds); 3895 3896 // adjust task-specific bounds 3897 next_task_bounds.set_lb(lower); 3898 if (next_taskdata->td_flags.native) { 3899 next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1)); 3900 } else { 3901 next_task_bounds.set_ub(upper); 3902 } 3903 if (ptask_dup != NULL) // set lastprivate flag, construct fistprivates, etc. 
3904 ptask_dup(next_task, task, lastpriv); 3905 KA_TRACE(40, 3906 ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, " 3907 "upper %lld stride %lld, (offsets %p %p)\n", 3908 gtid, i, next_task, lower, upper, st, 3909 next_task_bounds.get_lower_offset(), 3910 next_task_bounds.get_upper_offset())); 3911 #if OMPT_SUPPORT 3912 __kmp_omp_taskloop_task(NULL, gtid, next_task, 3913 codeptr_ra); // schedule new task 3914 #else 3915 __kmp_omp_task(gtid, next_task, true); // schedule new task 3916 #endif 3917 lower = upper + st; // adjust lower bound for the next iteration 3918 } 3919 // free the pattern task and exit 3920 __kmp_task_start(gtid, task, current_task); // make internal bookkeeping 3921 // do not execute the pattern task, just do internal bookkeeping 3922 __kmp_task_finish<false>(gtid, task, current_task); 3923 } 3924 3925 // Structure to keep taskloop parameters for auxiliary task 3926 // kept in the shareds of the task structure. 3927 typedef struct __taskloop_params { 3928 kmp_task_t *task; 3929 kmp_uint64 *lb; 3930 kmp_uint64 *ub; 3931 void *task_dup; 3932 kmp_int64 st; 3933 kmp_uint64 ub_glob; 3934 kmp_uint64 num_tasks; 3935 kmp_uint64 grainsize; 3936 kmp_uint64 extras; 3937 kmp_uint64 tc; 3938 kmp_uint64 num_t_min; 3939 #if OMPT_SUPPORT 3940 void *codeptr_ra; 3941 #endif 3942 } __taskloop_params_t; 3943 3944 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *, 3945 kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64, 3946 kmp_uint64, kmp_uint64, kmp_uint64, kmp_uint64, 3947 #if OMPT_SUPPORT 3948 void *, 3949 #endif 3950 void *); 3951 3952 // Execute part of the the taskloop submitted as a task. 3953 int __kmp_taskloop_task(int gtid, void *ptask) { 3954 __taskloop_params_t *p = 3955 (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds; 3956 kmp_task_t *task = p->task; 3957 kmp_uint64 *lb = p->lb; 3958 kmp_uint64 *ub = p->ub; 3959 void *task_dup = p->task_dup; 3960 // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; 3961 kmp_int64 st = p->st; 3962 kmp_uint64 ub_glob = p->ub_glob; 3963 kmp_uint64 num_tasks = p->num_tasks; 3964 kmp_uint64 grainsize = p->grainsize; 3965 kmp_uint64 extras = p->extras; 3966 kmp_uint64 tc = p->tc; 3967 kmp_uint64 num_t_min = p->num_t_min; 3968 #if OMPT_SUPPORT 3969 void *codeptr_ra = p->codeptr_ra; 3970 #endif 3971 #if KMP_DEBUG 3972 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 3973 KMP_DEBUG_ASSERT(task != NULL); 3974 KA_TRACE(20, ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize" 3975 " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n", 3976 gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st, 3977 task_dup)); 3978 #endif 3979 KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min); 3980 if (num_tasks > num_t_min) 3981 __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks, 3982 grainsize, extras, tc, num_t_min, 3983 #if OMPT_SUPPORT 3984 codeptr_ra, 3985 #endif 3986 task_dup); 3987 else 3988 __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks, 3989 grainsize, extras, tc, 3990 #if OMPT_SUPPORT 3991 codeptr_ra, 3992 #endif 3993 task_dup); 3994 3995 KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid)); 3996 return 0; 3997 } 3998 3999 // Schedule part of the the taskloop as a task, 4000 // execute the rest of the the taskloop. 
4001 // 4002 // loc Source location information 4003 // gtid Global thread ID 4004 // task Pattern task, exposes the loop iteration range 4005 // lb Pointer to loop lower bound in task structure 4006 // ub Pointer to loop upper bound in task structure 4007 // st Loop stride 4008 // ub_glob Global upper bound (used for lastprivate check) 4009 // num_tasks Number of tasks to execute 4010 // grainsize Number of loop iterations per task 4011 // extras Number of chunks with grainsize+1 iterations 4012 // tc Iterations count 4013 // num_t_min Threashold to launch tasks recursively 4014 // task_dup Tasks duplication routine 4015 // codeptr_ra Return address for OMPT events 4016 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task, 4017 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, 4018 kmp_uint64 ub_glob, kmp_uint64 num_tasks, 4019 kmp_uint64 grainsize, kmp_uint64 extras, 4020 kmp_uint64 tc, kmp_uint64 num_t_min, 4021 #if OMPT_SUPPORT 4022 void *codeptr_ra, 4023 #endif 4024 void *task_dup) { 4025 #if KMP_DEBUG 4026 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 4027 KMP_DEBUG_ASSERT(task != NULL); 4028 KMP_DEBUG_ASSERT(num_tasks > num_t_min); 4029 KA_TRACE(20, ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize" 4030 " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n", 4031 gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st, 4032 task_dup)); 4033 #endif 4034 p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; 4035 kmp_uint64 lower = *lb; 4036 kmp_info_t *thread = __kmp_threads[gtid]; 4037 // kmp_taskdata_t *current_task = thread->th.th_current_task; 4038 kmp_task_t *next_task; 4039 size_t lower_offset = 4040 (char *)lb - (char *)task; // remember offset of lb in the task structure 4041 size_t upper_offset = 4042 (char *)ub - (char *)task; // remember offset of ub in the task structure 4043 4044 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras); 4045 KMP_DEBUG_ASSERT(num_tasks > extras); 4046 KMP_DEBUG_ASSERT(num_tasks > 0); 4047 4048 // split the loop in two halves 4049 kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1; 4050 kmp_uint64 gr_size0 = grainsize; 4051 kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute 4052 kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task 4053 if (n_tsk0 <= extras) { 4054 gr_size0++; // integrate extras into grainsize 4055 ext0 = 0; // no extra iters in 1st half 4056 ext1 = extras - n_tsk0; // remaining extras 4057 tc0 = gr_size0 * n_tsk0; 4058 tc1 = tc - tc0; 4059 } else { // n_tsk0 > extras 4060 ext1 = 0; // no extra iters in 2nd half 4061 ext0 = extras; 4062 tc1 = grainsize * n_tsk1; 4063 tc0 = tc - tc1; 4064 } 4065 ub0 = lower + st * (tc0 - 1); 4066 lb1 = ub0 + st; 4067 4068 // create pattern task for 2nd half of the loop 4069 next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task 4070 // adjust lower bound (upper bound is not changed) for the 2nd half 4071 *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1; 4072 if (ptask_dup != NULL) // construct fistprivates, etc. 
4073 ptask_dup(next_task, task, 0); 4074 *ub = ub0; // adjust upper bound for the 1st half 4075 4076 // create auxiliary task for 2nd half of the loop 4077 kmp_task_t *new_task = 4078 __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *), 4079 sizeof(__taskloop_params_t), &__kmp_taskloop_task); 4080 __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds; 4081 p->task = next_task; 4082 p->lb = (kmp_uint64 *)((char *)next_task + lower_offset); 4083 p->ub = (kmp_uint64 *)((char *)next_task + upper_offset); 4084 p->task_dup = task_dup; 4085 p->st = st; 4086 p->ub_glob = ub_glob; 4087 p->num_tasks = n_tsk1; 4088 p->grainsize = grainsize; 4089 p->extras = ext1; 4090 p->tc = tc1; 4091 p->num_t_min = num_t_min; 4092 #if OMPT_SUPPORT 4093 p->codeptr_ra = codeptr_ra; 4094 #endif 4095 4096 #if OMPT_SUPPORT 4097 // schedule new task with correct return address for OMPT events 4098 __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra); 4099 #else 4100 __kmp_omp_task(gtid, new_task, true); // schedule new task 4101 #endif 4102 4103 // execute the 1st half of current subrange 4104 if (n_tsk0 > num_t_min) 4105 __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0, 4106 ext0, tc0, num_t_min, 4107 #if OMPT_SUPPORT 4108 codeptr_ra, 4109 #endif 4110 task_dup); 4111 else 4112 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, 4113 gr_size0, ext0, tc0, 4114 #if OMPT_SUPPORT 4115 codeptr_ra, 4116 #endif 4117 task_dup); 4118 4119 KA_TRACE(40, ("__kmpc_taskloop_recur(exit): T#%d\n", gtid)); 4120 } 4121 4122 /*! 4123 @ingroup TASKING 4124 @param loc Source location information 4125 @param gtid Global thread ID 4126 @param task Task structure 4127 @param if_val Value of the if clause 4128 @param lb Pointer to loop lower bound in task structure 4129 @param ub Pointer to loop upper bound in task structure 4130 @param st Loop stride 4131 @param nogroup Flag, 1 if no taskgroup needs to be added, 0 otherwise 4132 @param sched Schedule specified 0/1/2 for none/grainsize/num_tasks 4133 @param grainsize Schedule value if specified 4134 @param task_dup Tasks duplication routine 4135 4136 Execute the taskloop construct. 
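For orientation, an added and simplified sketch of the call a compiler might
emit for a directive such as "#pragma omp taskloop grainsize(4)" applied to a
loop over [0, n); the variable names and the exact argument values are
illustrative only, not the output of any particular compiler.

@code
// pattern task previously created with __kmpc_omp_task_alloc(); lb_ptr and
// ub_ptr point at the loop-bound fields stored inside that task.
// arguments: if_val = 1, st = 1, nogroup = 0, sched = 1 (grainsize), grainsize = 4
__kmpc_taskloop(loc, gtid, task, 1, lb_ptr, ub_ptr, 1, 0, 1, 4, task_dup_fn);
@endcode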
4137 */ 4138 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, 4139 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, 4140 int sched, kmp_uint64 grainsize, void *task_dup) { 4141 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 4142 KMP_DEBUG_ASSERT(task != NULL); 4143 4144 if (nogroup == 0) { 4145 #if OMPT_SUPPORT && OMPT_OPTIONAL 4146 OMPT_STORE_RETURN_ADDRESS(gtid); 4147 #endif 4148 __kmpc_taskgroup(loc, gtid); 4149 } 4150 4151 // ========================================================================= 4152 // calculate loop parameters 4153 kmp_taskloop_bounds_t task_bounds(task, lb, ub); 4154 kmp_uint64 tc; 4155 // compiler provides global bounds here 4156 kmp_uint64 lower = task_bounds.get_lb(); 4157 kmp_uint64 upper = task_bounds.get_ub(); 4158 kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag 4159 kmp_uint64 num_tasks = 0, extras = 0; 4160 kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks; 4161 kmp_info_t *thread = __kmp_threads[gtid]; 4162 kmp_taskdata_t *current_task = thread->th.th_current_task; 4163 4164 KA_TRACE(20, ("__kmpc_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, " 4165 "grain %llu(%d), dup %p\n", 4166 gtid, taskdata, lower, upper, st, grainsize, sched, task_dup)); 4167 4168 // compute trip count 4169 if (st == 1) { // most common case 4170 tc = upper - lower + 1; 4171 } else if (st < 0) { 4172 tc = (lower - upper) / (-st) + 1; 4173 } else { // st > 0 4174 tc = (upper - lower) / st + 1; 4175 } 4176 if (tc == 0) { 4177 KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d zero-trip loop\n", gtid)); 4178 // free the pattern task and exit 4179 __kmp_task_start(gtid, task, current_task); 4180 // do not execute anything for zero-trip loop 4181 __kmp_task_finish<false>(gtid, task, current_task); 4182 return; 4183 } 4184 4185 #if OMPT_SUPPORT && OMPT_OPTIONAL 4186 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 4187 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 4188 if (ompt_enabled.ompt_callback_work) { 4189 ompt_callbacks.ompt_callback(ompt_callback_work)( 4190 ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data), 4191 &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0)); 4192 } 4193 #endif 4194 4195 if (num_tasks_min == 0) 4196 // TODO: can we choose better default heuristic? 
4197 num_tasks_min = 4198 KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE); 4199 4200 // compute num_tasks/grainsize based on the input provided 4201 switch (sched) { 4202 case 0: // no schedule clause specified, we can choose the default 4203 // let's try to schedule (team_size*10) tasks 4204 grainsize = thread->th.th_team_nproc * 10; 4205 KMP_FALLTHROUGH(); 4206 case 2: // num_tasks provided 4207 if (grainsize > tc) { 4208 num_tasks = tc; // too big num_tasks requested, adjust values 4209 grainsize = 1; 4210 extras = 0; 4211 } else { 4212 num_tasks = grainsize; 4213 grainsize = tc / num_tasks; 4214 extras = tc % num_tasks; 4215 } 4216 break; 4217 case 1: // grainsize provided 4218 if (grainsize > tc) { 4219 num_tasks = 1; // too big grainsize requested, adjust values 4220 grainsize = tc; 4221 extras = 0; 4222 } else { 4223 num_tasks = tc / grainsize; 4224 // adjust grainsize for balanced distribution of iterations 4225 grainsize = tc / num_tasks; 4226 extras = tc % num_tasks; 4227 } 4228 break; 4229 default: 4230 KMP_ASSERT2(0, "unknown scheduling of taskloop"); 4231 } 4232 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras); 4233 KMP_DEBUG_ASSERT(num_tasks > extras); 4234 KMP_DEBUG_ASSERT(num_tasks > 0); 4235 // ========================================================================= 4236 4237 // check if clause value first 4238 // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native) 4239 if (if_val == 0) { // if(0) specified, mark task as serial 4240 taskdata->td_flags.task_serial = 1; 4241 taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied 4242 // always start serial tasks linearly 4243 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, 4244 grainsize, extras, tc, 4245 #if OMPT_SUPPORT 4246 OMPT_GET_RETURN_ADDRESS(0), 4247 #endif 4248 task_dup); 4249 // !taskdata->td_flags.native => currently force linear spawning of tasks 4250 // for GOMP_taskloop 4251 } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) { 4252 KA_TRACE(20, ("__kmpc_taskloop: T#%d, go recursive: tc %llu, #tasks %llu" 4253 "(%lld), grain %llu, extras %llu\n", 4254 gtid, tc, num_tasks, num_tasks_min, grainsize, extras)); 4255 __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, 4256 grainsize, extras, tc, num_tasks_min, 4257 #if OMPT_SUPPORT 4258 OMPT_GET_RETURN_ADDRESS(0), 4259 #endif 4260 task_dup); 4261 } else { 4262 KA_TRACE(20, ("__kmpc_taskloop: T#%d, go linear: tc %llu, #tasks %llu" 4263 "(%lld), grain %llu, extras %llu\n", 4264 gtid, tc, num_tasks, num_tasks_min, grainsize, extras)); 4265 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, 4266 grainsize, extras, tc, 4267 #if OMPT_SUPPORT 4268 OMPT_GET_RETURN_ADDRESS(0), 4269 #endif 4270 task_dup); 4271 } 4272 4273 #if OMPT_SUPPORT && OMPT_OPTIONAL 4274 if (ompt_enabled.ompt_callback_work) { 4275 ompt_callbacks.ompt_callback(ompt_callback_work)( 4276 ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data), 4277 &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0)); 4278 } 4279 #endif 4280 4281 if (nogroup == 0) { 4282 #if OMPT_SUPPORT && OMPT_OPTIONAL 4283 OMPT_STORE_RETURN_ADDRESS(gtid); 4284 #endif 4285 __kmpc_end_taskgroup(loc, gtid); 4286 } 4287 KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid)); 4288 } 4289 4290 #endif 4291
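// Illustration only (not compiled): a minimal sketch of the chunking math used
// by __kmpc_taskloop above for the grainsize case (sched == 1), and of how
// __kmp_taskloop_linear hands out the 'extras'. Plain C++ types are used here
// instead of the kmp_* typedefs; this block is not part of the runtime.
#if 0
#include <cassert>
#include <cstdint>

static void taskloop_chunking_sketch(uint64_t tc, uint64_t requested_grainsize) {
  uint64_t num_tasks, grainsize = requested_grainsize, extras;
  if (grainsize > tc) {          // grainsize larger than trip count: one task
    num_tasks = 1;
    grainsize = tc;
    extras = 0;
  } else {
    num_tasks = tc / grainsize;  // number of tasks to create
    grainsize = tc / num_tasks;  // rebalance iterations per task
    extras = tc % num_tasks;     // first 'extras' tasks get one extra iteration
  }
  // The invariant checked by KMP_DEBUG_ASSERT in __kmpc_taskloop:
  assert(tc == num_tasks * grainsize + extras);
  // Example: tc = 10, requested grainsize = 3  ->  num_tasks = 3,
  // grainsize = 3, extras = 1, so the chunks are 4, 3 and 3 iterations.
}
#endif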