1 /* 2 * kmp_tasking.cpp -- OpenMP 3.0 tasking support. 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // The LLVM Compiler Infrastructure 8 // 9 // This file is dual licensed under the MIT and the University of Illinois Open 10 // Source Licenses. See LICENSE.txt for details. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "kmp.h" 15 #include "kmp_i18n.h" 16 #include "kmp_itt.h" 17 #include "kmp_stats.h" 18 #include "kmp_wait_release.h" 19 20 #if OMPT_SUPPORT 21 #include "ompt-specific.h" 22 #endif 23 24 #include "tsan_annotations.h" 25 26 /* forward declaration */ 27 static void __kmp_enable_tasking(kmp_task_team_t *task_team, 28 kmp_info_t *this_thr); 29 static void __kmp_alloc_task_deque(kmp_info_t *thread, 30 kmp_thread_data_t *thread_data); 31 static int __kmp_realloc_task_threads_data(kmp_info_t *thread, 32 kmp_task_team_t *task_team); 33 34 #ifdef OMP_45_ENABLED 35 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask); 36 #endif 37 38 #ifdef BUILD_TIED_TASK_STACK 39 40 // __kmp_trace_task_stack: print the tied tasks from the task stack in order 41 // from top do bottom 42 // 43 // gtid: global thread identifier for thread containing stack 44 // thread_data: thread data for task team thread containing stack 45 // threshold: value above which the trace statement triggers 46 // location: string identifying call site of this function (for trace) 47 static void __kmp_trace_task_stack(kmp_int32 gtid, 48 kmp_thread_data_t *thread_data, 49 int threshold, char *location) { 50 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 51 kmp_taskdata_t **stack_top = task_stack->ts_top; 52 kmp_int32 entries = task_stack->ts_entries; 53 kmp_taskdata_t *tied_task; 54 55 KA_TRACE( 56 threshold, 57 ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, " 58 "first_block = %p, stack_top = %p \n", 59 location, gtid, 
entries, task_stack->ts_first_block, stack_top)); 60 61 KMP_DEBUG_ASSERT(stack_top != NULL); 62 KMP_DEBUG_ASSERT(entries > 0); 63 64 while (entries != 0) { 65 KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]); 66 // fix up ts_top if we need to pop from previous block 67 if (entries & TASK_STACK_INDEX_MASK == 0) { 68 kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top); 69 70 stack_block = stack_block->sb_prev; 71 stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE]; 72 } 73 74 // finish bookkeeping 75 stack_top--; 76 entries--; 77 78 tied_task = *stack_top; 79 80 KMP_DEBUG_ASSERT(tied_task != NULL); 81 KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED); 82 83 KA_TRACE(threshold, 84 ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, " 85 "stack_top=%p, tied_task=%p\n", 86 location, gtid, entries, stack_top, tied_task)); 87 } 88 KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]); 89 90 KA_TRACE(threshold, 91 ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n", 92 location, gtid)); 93 } 94 95 // __kmp_init_task_stack: initialize the task stack for the first time 96 // after a thread_data structure is created. 97 // It should not be necessary to do this again (assuming the stack works). 
98 // 99 // gtid: global thread identifier of calling thread 100 // thread_data: thread data for task team thread containing stack 101 static void __kmp_init_task_stack(kmp_int32 gtid, 102 kmp_thread_data_t *thread_data) { 103 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 104 kmp_stack_block_t *first_block; 105 106 // set up the first block of the stack 107 first_block = &task_stack->ts_first_block; 108 task_stack->ts_top = (kmp_taskdata_t **)first_block; 109 memset((void *)first_block, '\0', 110 TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *)); 111 112 // initialize the stack to be empty 113 task_stack->ts_entries = TASK_STACK_EMPTY; 114 first_block->sb_next = NULL; 115 first_block->sb_prev = NULL; 116 } 117 118 // __kmp_free_task_stack: free the task stack when thread_data is destroyed. 119 // 120 // gtid: global thread identifier for calling thread 121 // thread_data: thread info for thread containing stack 122 static void __kmp_free_task_stack(kmp_int32 gtid, 123 kmp_thread_data_t *thread_data) { 124 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 125 kmp_stack_block_t *stack_block = &task_stack->ts_first_block; 126 127 KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY); 128 // free from the second block of the stack 129 while (stack_block != NULL) { 130 kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL; 131 132 stack_block->sb_next = NULL; 133 stack_block->sb_prev = NULL; 134 if (stack_block != &task_stack->ts_first_block) { 135 __kmp_thread_free(thread, 136 stack_block); // free the block, if not the first 137 } 138 stack_block = next_block; 139 } 140 // initialize the stack to be empty 141 task_stack->ts_entries = 0; 142 task_stack->ts_top = NULL; 143 } 144 145 // __kmp_push_task_stack: Push the tied task onto the task stack. 146 // Grow the stack if necessary by allocating another block. 
147 // 148 // gtid: global thread identifier for calling thread 149 // thread: thread info for thread containing stack 150 // tied_task: the task to push on the stack 151 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread, 152 kmp_taskdata_t *tied_task) { 153 // GEH - need to consider what to do if tt_threads_data not allocated yet 154 kmp_thread_data_t *thread_data = 155 &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)]; 156 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 157 158 if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) { 159 return; // Don't push anything on stack if team or team tasks are serialized 160 } 161 162 KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED); 163 KMP_DEBUG_ASSERT(task_stack->ts_top != NULL); 164 165 KA_TRACE(20, 166 ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n", 167 gtid, thread, tied_task)); 168 // Store entry 169 *(task_stack->ts_top) = tied_task; 170 171 // Do bookkeeping for next push 172 task_stack->ts_top++; 173 task_stack->ts_entries++; 174 175 if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) { 176 // Find beginning of this task block 177 kmp_stack_block_t *stack_block = 178 (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE); 179 180 // Check if we already have a block 181 if (stack_block->sb_next != 182 NULL) { // reset ts_top to beginning of next block 183 task_stack->ts_top = &stack_block->sb_next->sb_block[0]; 184 } else { // Alloc new block and link it up 185 kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc( 186 thread, sizeof(kmp_stack_block_t)); 187 188 task_stack->ts_top = &new_block->sb_block[0]; 189 stack_block->sb_next = new_block; 190 new_block->sb_prev = stack_block; 191 new_block->sb_next = NULL; 192 193 KA_TRACE( 194 30, 195 ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n", 196 gtid, tied_task, new_block)); 197 } 198 } 199 KA_TRACE(20, 
("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid, 200 tied_task)); 201 } 202 203 // __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return 204 // the task, just check to make sure it matches the ending task passed in. 205 // 206 // gtid: global thread identifier for the calling thread 207 // thread: thread info structure containing stack 208 // tied_task: the task popped off the stack 209 // ending_task: the task that is ending (should match popped task) 210 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread, 211 kmp_taskdata_t *ending_task) { 212 // GEH - need to consider what to do if tt_threads_data not allocated yet 213 kmp_thread_data_t *thread_data = 214 &thread->th.th_task_team->tt_threads_data[__kmp_tid_from_gtid(gtid)]; 215 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 216 kmp_taskdata_t *tied_task; 217 218 if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) { 219 // Don't pop anything from stack if team or team tasks are serialized 220 return; 221 } 222 223 KMP_DEBUG_ASSERT(task_stack->ts_top != NULL); 224 KMP_DEBUG_ASSERT(task_stack->ts_entries > 0); 225 226 KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid, 227 thread)); 228 229 // fix up ts_top if we need to pop from previous block 230 if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) { 231 kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top); 232 233 stack_block = stack_block->sb_prev; 234 task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE]; 235 } 236 237 // finish bookkeeping 238 task_stack->ts_top--; 239 task_stack->ts_entries--; 240 241 tied_task = *(task_stack->ts_top); 242 243 KMP_DEBUG_ASSERT(tied_task != NULL); 244 KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED); 245 KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly 246 247 KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid, 
248 tied_task)); 249 return; 250 } 251 #endif /* BUILD_TIED_TASK_STACK */ 252 253 // __kmp_push_task: Add a task to the thread's deque 254 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) { 255 kmp_info_t *thread = __kmp_threads[gtid]; 256 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 257 kmp_task_team_t *task_team = thread->th.th_task_team; 258 kmp_int32 tid = __kmp_tid_from_gtid(gtid); 259 kmp_thread_data_t *thread_data; 260 261 KA_TRACE(20, 262 ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata)); 263 264 if (taskdata->td_flags.tiedness == TASK_UNTIED) { 265 // untied task needs to increment counter so that the task structure is not 266 // freed prematurely 267 kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count); 268 KMP_DEBUG_USE_VAR(counter); 269 KA_TRACE( 270 20, 271 ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n", 272 gtid, counter, taskdata)); 273 } 274 275 // The first check avoids building task_team thread data if serialized 276 if (taskdata->td_flags.task_serial) { 277 KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning " 278 "TASK_NOT_PUSHED for task %p\n", 279 gtid, taskdata)); 280 return TASK_NOT_PUSHED; 281 } 282 283 // Now that serialized tasks have returned, we can assume that we are not in 284 // immediate exec mode 285 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 286 if (!KMP_TASKING_ENABLED(task_team)) { 287 __kmp_enable_tasking(task_team, thread); 288 } 289 KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE); 290 KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL); 291 292 // Find tasking deque specific to encountering thread 293 thread_data = &task_team->tt.tt_threads_data[tid]; 294 295 // No lock needed since only owner can allocate 296 if (thread_data->td.td_deque == NULL) { 297 __kmp_alloc_task_deque(thread, thread_data); 298 } 299 300 // Check if deque is full 301 if (TCR_4(thread_data->td.td_deque_ntasks) >= 
302 TASK_DEQUE_SIZE(thread_data->td)) { 303 KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning " 304 "TASK_NOT_PUSHED for task %p\n", 305 gtid, taskdata)); 306 return TASK_NOT_PUSHED; 307 } 308 309 // Lock the deque for the task push operation 310 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 311 312 #if OMP_45_ENABLED 313 // Need to recheck as we can get a proxy task from a thread outside of OpenMP 314 if (TCR_4(thread_data->td.td_deque_ntasks) >= 315 TASK_DEQUE_SIZE(thread_data->td)) { 316 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 317 KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; returning " 318 "TASK_NOT_PUSHED for task %p\n", 319 gtid, taskdata)); 320 return TASK_NOT_PUSHED; 321 } 322 #else 323 // Must have room since no thread can add tasks but calling thread 324 KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) < 325 TASK_DEQUE_SIZE(thread_data->td)); 326 #endif 327 328 thread_data->td.td_deque[thread_data->td.td_deque_tail] = 329 taskdata; // Push taskdata 330 // Wrap index. 331 thread_data->td.td_deque_tail = 332 (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td); 333 TCW_4(thread_data->td.td_deque_ntasks, 334 TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count 335 336 KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: " 337 "task=%p ntasks=%d head=%u tail=%u\n", 338 gtid, taskdata, thread_data->td.td_deque_ntasks, 339 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 340 341 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 342 343 return TASK_SUCCESSFULLY_PUSHED; 344 } 345 346 // __kmp_pop_current_task_from_thread: set up current task from called thread 347 // when team ends 348 // 349 // this_thr: thread structure to set current_task in. 
350 void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) { 351 KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d " 352 "this_thread=%p, curtask=%p, " 353 "curtask_parent=%p\n", 354 0, this_thr, this_thr->th.th_current_task, 355 this_thr->th.th_current_task->td_parent)); 356 357 this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent; 358 359 KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d " 360 "this_thread=%p, curtask=%p, " 361 "curtask_parent=%p\n", 362 0, this_thr, this_thr->th.th_current_task, 363 this_thr->th.th_current_task->td_parent)); 364 } 365 366 // __kmp_push_current_task_to_thread: set up current task in called thread for a 367 // new team 368 // 369 // this_thr: thread structure to set up 370 // team: team for implicit task data 371 // tid: thread within team to set up 372 void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team, 373 int tid) { 374 // current task of the thread is a parent of the new just created implicit 375 // tasks of new team 376 KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p " 377 "curtask=%p " 378 "parent_task=%p\n", 379 tid, this_thr, this_thr->th.th_current_task, 380 team->t.t_implicit_task_taskdata[tid].td_parent)); 381 382 KMP_DEBUG_ASSERT(this_thr != NULL); 383 384 if (tid == 0) { 385 if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) { 386 team->t.t_implicit_task_taskdata[0].td_parent = 387 this_thr->th.th_current_task; 388 this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0]; 389 } 390 } else { 391 team->t.t_implicit_task_taskdata[tid].td_parent = 392 team->t.t_implicit_task_taskdata[0].td_parent; 393 this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid]; 394 } 395 396 KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p " 397 "curtask=%p " 398 "parent_task=%p\n", 399 tid, this_thr, this_thr->th.th_current_task, 400 
team->t.t_implicit_task_taskdata[tid].td_parent)); 401 } 402 403 // __kmp_task_start: bookkeeping for a task starting execution 404 // 405 // GTID: global thread id of calling thread 406 // task: task starting execution 407 // current_task: task suspending 408 static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task, 409 kmp_taskdata_t *current_task) { 410 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 411 kmp_info_t *thread = __kmp_threads[gtid]; 412 413 KA_TRACE(10, 414 ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n", 415 gtid, taskdata, current_task)); 416 417 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT); 418 419 // mark currently executing task as suspended 420 // TODO: GEH - make sure root team implicit task is initialized properly. 421 // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 ); 422 current_task->td_flags.executing = 0; 423 424 // Add task to stack if tied 425 #ifdef BUILD_TIED_TASK_STACK 426 if (taskdata->td_flags.tiedness == TASK_TIED) { 427 __kmp_push_task_stack(gtid, thread, taskdata); 428 } 429 #endif /* BUILD_TIED_TASK_STACK */ 430 431 // mark starting task as executing and as current task 432 thread->th.th_current_task = taskdata; 433 434 KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 || 435 taskdata->td_flags.tiedness == TASK_UNTIED); 436 KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 || 437 taskdata->td_flags.tiedness == TASK_UNTIED); 438 taskdata->td_flags.started = 1; 439 taskdata->td_flags.executing = 1; 440 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0); 441 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0); 442 443 // GEH TODO: shouldn't we pass some sort of location identifier here? 444 // APT: yes, we will pass location here. 
445 // need to store current thread state (in a thread or taskdata structure) 446 // before setting work_state, otherwise wrong state is set after end of task 447 448 KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata)); 449 450 return; 451 } 452 453 #if OMPT_SUPPORT 454 //------------------------------------------------------------------------------ 455 // __ompt_task_init: 456 // Initialize OMPT fields maintained by a task. This will only be called after 457 // ompt_start_tool, so we already know whether ompt is enabled or not. 458 459 static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) { 460 // The calls to __ompt_task_init already have the ompt_enabled condition. 461 task->ompt_task_info.task_data.value = 0; 462 task->ompt_task_info.frame.exit_frame = NULL; 463 task->ompt_task_info.frame.enter_frame = NULL; 464 #if OMP_40_ENABLED 465 task->ompt_task_info.ndeps = 0; 466 task->ompt_task_info.deps = NULL; 467 #endif /* OMP_40_ENABLED */ 468 } 469 470 // __ompt_task_start: 471 // Build and trigger task-begin event 472 static inline void __ompt_task_start(kmp_task_t *task, 473 kmp_taskdata_t *current_task, 474 kmp_int32 gtid) { 475 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 476 ompt_task_status_t status = ompt_task_others; 477 if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) { 478 status = ompt_task_yield; 479 __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0; 480 } 481 /* let OMPT know that we're about to run this task */ 482 if (ompt_enabled.ompt_callback_task_schedule) { 483 ompt_callbacks.ompt_callback(ompt_callback_task_schedule)( 484 &(current_task->ompt_task_info.task_data), status, 485 &(taskdata->ompt_task_info.task_data)); 486 } 487 taskdata->ompt_task_info.scheduling_parent = current_task; 488 } 489 490 // __ompt_task_finish: 491 // Build and trigger final task-schedule event 492 static inline void 493 __ompt_task_finish(kmp_task_t *task, kmp_taskdata_t *resumed_task, 494 
ompt_task_status_t status = ompt_task_complete) { 495 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 496 if (__kmp_omp_cancellation && taskdata->td_taskgroup && 497 taskdata->td_taskgroup->cancel_request == cancel_taskgroup) { 498 status = ompt_task_cancel; 499 } 500 501 /* let OMPT know that we're returning to the callee task */ 502 if (ompt_enabled.ompt_callback_task_schedule) { 503 ompt_callbacks.ompt_callback(ompt_callback_task_schedule)( 504 &(taskdata->ompt_task_info.task_data), status, 505 &((resumed_task ? resumed_task 506 : (taskdata->ompt_task_info.scheduling_parent 507 ? taskdata->ompt_task_info.scheduling_parent 508 : taskdata->td_parent)) 509 ->ompt_task_info.task_data)); 510 } 511 } 512 #endif 513 514 template <bool ompt> 515 static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid, 516 kmp_task_t *task, 517 void *frame_address, 518 void *return_address) { 519 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 520 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 521 522 KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p " 523 "current_task=%p\n", 524 gtid, loc_ref, taskdata, current_task)); 525 526 if (taskdata->td_flags.tiedness == TASK_UNTIED) { 527 // untied task needs to increment counter so that the task structure is not 528 // freed prematurely 529 kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count); 530 KMP_DEBUG_USE_VAR(counter); 531 KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) " 532 "incremented for task %p\n", 533 gtid, counter, taskdata)); 534 } 535 536 taskdata->td_flags.task_serial = 537 1; // Execute this task immediately, not deferred. 
538 __kmp_task_start(gtid, task, current_task); 539 540 #if OMPT_SUPPORT 541 if (ompt) { 542 if (current_task->ompt_task_info.frame.enter_frame == NULL) { 543 current_task->ompt_task_info.frame.enter_frame = 544 taskdata->ompt_task_info.frame.exit_frame = frame_address; 545 } 546 if (ompt_enabled.ompt_callback_task_create) { 547 ompt_task_info_t *parent_info = &(current_task->ompt_task_info); 548 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 549 &(parent_info->task_data), &(parent_info->frame), 550 &(taskdata->ompt_task_info.task_data), 551 ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0, 552 return_address); 553 } 554 __ompt_task_start(task, current_task, gtid); 555 } 556 #endif // OMPT_SUPPORT 557 558 KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid, 559 loc_ref, taskdata)); 560 } 561 562 #if OMPT_SUPPORT 563 OMPT_NOINLINE 564 static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid, 565 kmp_task_t *task, 566 void *frame_address, 567 void *return_address) { 568 __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address, 569 return_address); 570 } 571 #endif // OMPT_SUPPORT 572 573 // __kmpc_omp_task_begin_if0: report that a given serialized task has started 574 // execution 575 // 576 // loc_ref: source location information; points to beginning of task block. 577 // gtid: global thread number. 578 // task: task thunk for the started task. 
void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
                               kmp_task_t *task) {
#if OMPT_SUPPORT
  // With a tool attached, take the OMPT-instrumented path and pass it the
  // caller's frame and return address.
  if (UNLIKELY(ompt_enabled.enabled)) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
    __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
                                   OMPT_GET_FRAME_ADDRESS(1),
                                   OMPT_LOAD_RETURN_ADDRESS(gtid));
    return;
  }
#endif
  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
}

#ifdef TASK_UNUSED
// __kmpc_omp_task_begin: report that a given task has started execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(
      10,
      ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
       gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));

  __kmp_task_start(gtid, task, current_task);

  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED

// __kmp_free_task: free the current task space and the space for shareds
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
                            kmp_info_t *thread) {
  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
                taskdata));

  // Check to make sure all flags and counters have the correct values
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
                   taskdata->td_flags.task_serial == 1);
  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);

  // Mark the task as freed before its storage is released.
  taskdata->td_flags.freed = 1;
  ANNOTATE_HAPPENS_BEFORE(taskdata);
// deallocate the taskdata and shared variable blocks associated with this task
#if USE_FAST_MEMORY
  __kmp_fast_free(thread, taskdata);
#else /* ! USE_FAST_MEMORY */
  __kmp_thread_free(thread, taskdata);
#endif

  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
}

// __kmp_free_task_and_ancestors: free the current task and ancestors without
// children
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
                                          kmp_taskdata_t *taskdata,
                                          kmp_info_t *thread) {
#if OMP_45_ENABLED
  // Proxy tasks must always be allowed to free their parents
  // because they can be run in background even in serial mode.
  kmp_int32 team_serial =
      (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
      !taskdata->td_flags.proxy;
#else
  kmp_int32 team_serial =
      taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser;
#endif
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  // Predecrement simulated by "- 1" calculation
  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
  KMP_DEBUG_ASSERT(children >= 0);

  // Now, go up the ancestor tree to see if any ancestors can now be freed.
  while (children == 0) {
    // Read the parent pointer before taskdata is freed below.
    kmp_taskdata_t *parent_taskdata = taskdata->td_parent;

    KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
                  "and freeing itself\n",
                  gtid, taskdata));

    // --- Deallocate my ancestor task ---
    __kmp_free_task(gtid, taskdata, thread);

    taskdata = parent_taskdata;

    // Stop checking ancestors at implicit task instead of walking up ancestor
    // tree to avoid premature deallocation of ancestors.
    if (team_serial || taskdata->td_flags.tasktype == TASK_IMPLICIT)
      return;

    // Predecrement simulated by "- 1" calculation
    children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
    KMP_DEBUG_ASSERT(children >= 0);
  }

  KA_TRACE(
      20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
           "not freeing it yet\n",
           gtid, taskdata, children));
}

// __kmp_task_finish: bookkeeping to do when a task finishes execution
//
// gtid: global thread ID for calling thread
// task: task to be finished
// resumed_task: task to be resumed.  (may be NULL if task is serialized)
//
// Template parameter ompt: when true (and OMPT_SUPPORT is set) the final
// task-schedule event is reported through __ompt_task_finish.
template <bool ompt>
static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *resumed_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_task_team_t *task_team =
      thread->th.th_task_team; // might be NULL for serial teams...
  kmp_int32 children = 0;

  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
                "task %p\n",
                gtid, taskdata, resumed_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

// Pop task from stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_pop_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
    // untied task needs to check the counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
    KA_TRACE(
        20,
        ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
         gtid, counter, taskdata));
    if (counter > 0) {
      // untied task is not done, to be continued possibly by other thread, do
      // not free it now
      if (resumed_task == NULL) {
        KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
        resumed_task = taskdata->td_parent; // In a serialized task, the resumed
        // task is the parent
      }
      thread->th.th_current_task = resumed_task; // restore current_task
      resumed_task->td_flags.executing = 1; // resume previous task
      KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
                    "resuming task %p\n",
                    gtid, taskdata, resumed_task));
      return;
    }
  }
#if OMPT_SUPPORT
  if (ompt)
    __ompt_task_finish(task, resumed_task);
#endif

  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  taskdata->td_flags.complete = 1; // mark the task as completed
  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  // Only need to keep track of count if team parallel and tasking not
  // serialized
  // NOTE: the braces below are interleaved with the OMP_40/OMP_45 conditionals;
  // with OMP_45_ENABLED the first "if" closes early and a second, wider
  // condition guards __kmp_release_deps.
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
    // Predecrement simulated by "- 1" calculation
    children =
        KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
    KMP_DEBUG_ASSERT(children >= 0);
#if OMP_40_ENABLED
    if (taskdata->td_taskgroup)
      KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
#if OMP_45_ENABLED
  }
  // if we found proxy tasks there could exist a dependency chain
  // with the proxy task as origin
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
      (task_team && task_team->tt.tt_found_proxy_tasks)) {
#endif
    __kmp_release_deps(gtid, taskdata);
#endif
  }

  // td_flags.executing must be marked as 0 after __kmp_release_deps has been
  // called. Otherwise, if a task is executed immediately from the release_deps
  // code, the flag will be reset to 1 again by this same function
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
  taskdata->td_flags.executing = 0; // suspend the finishing task

  KA_TRACE(
      20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
           gtid, taskdata, children));

#if OMP_40_ENABLED
  /* If the tasks' destructor thunk flag has been set, we need to invoke the
     destructor thunk that has been generated by the compiler. The code is
     placed here, since at this point other tasks might have been released
     hence overlapping the destructor invocations with some other work in the
     released tasks.  The OpenMP spec is not specific on when the destructors
     are invoked, so we should be free to choose. */
  if (taskdata->td_flags.destructors_thunk) {
    kmp_routine_entry_t destr_thunk = task->data1.destructors;
    KMP_ASSERT(destr_thunk);
    destr_thunk(gtid, task);
  }
#endif // OMP_40_ENABLED

  // bookkeeping for resuming task:
  // GEH - note tasking_ser => task_serial
  KMP_DEBUG_ASSERT(
      (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
      taskdata->td_flags.task_serial);
  if (taskdata->td_flags.task_serial) {
    if (resumed_task == NULL) {
      resumed_task = taskdata->td_parent; // In a serialized task, the resumed
      // task is the parent
    }
  } else {
    KMP_DEBUG_ASSERT(resumed_task !=
                     NULL); // verify that resumed task is passed as argument
  }

  // Free this task and then ancestor tasks if they have no children.
  // Restore th_current_task first as suggested by John:
  // johnmc: if an asynchronous inquiry peers into the runtime system
  // it doesn't see the freed task as the current task.
  thread->th.th_current_task = resumed_task;
  __kmp_free_task_and_ancestors(gtid, taskdata, thread);

  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
  resumed_task->td_flags.executing = 1; // resume previous task

  KA_TRACE(
      10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
           gtid, taskdata, resumed_task));

  return;
}

// __kmpc_omp_task_complete_if0_template: shared implementation of
// __kmpc_omp_task_complete_if0, instantiated with and without OMPT reporting.
//
// loc_ref: source location information
// gtid: global thread number
// task: task thunk for the completed task
template <bool ompt>
static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
                                                  kmp_int32 gtid,
                                                  kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
  // this routine will provide task to resume
  __kmp_task_finish<ompt>(gtid, task, NULL);

  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));

#if OMPT_SUPPORT
  if (ompt) {
    // Clear enter_frame on the frame returned by
    // __ompt_get_task_info_internal for level 0.
    omp_frame_t *ompt_frame;
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    ompt_frame->enter_frame = NULL;
  }
#endif

  return;
}

#if OMPT_SUPPORT
// __kmpc_omp_task_complete_if0_ompt: non-inlined wrapper that forwards to the
// OMPT-enabled instantiation of the template.
OMPT_NOINLINE
void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                       kmp_task_t *task) {
  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
}
#endif // OMPT_SUPPORT

// __kmpc_omp_task_complete_if0: report that a task has completed execution
//
// loc_ref: source location information; points to end of task block.
// gtid: global thread number.
// task: task thunk for the completed task.
871 void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid, 872 kmp_task_t *task) { 873 #if OMPT_SUPPORT 874 if (UNLIKELY(ompt_enabled.enabled)) { 875 __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task); 876 return; 877 } 878 #endif 879 __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task); 880 } 881 882 #ifdef TASK_UNUSED 883 // __kmpc_omp_task_complete: report that a task has completed execution 884 // NEVER GENERATED BY COMPILER, DEPRECATED!!! 885 void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid, 886 kmp_task_t *task) { 887 KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid, 888 loc_ref, KMP_TASK_TO_TASKDATA(task))); 889 890 __kmp_task_finish<false>(gtid, task, 891 NULL); // Not sure how to find task to resume 892 893 KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid, 894 loc_ref, KMP_TASK_TO_TASKDATA(task))); 895 return; 896 } 897 #endif // TASK_UNUSED 898 899 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit 900 // task for a given thread 901 // 902 // loc_ref: reference to source location of parallel region 903 // this_thr: thread data structure corresponding to implicit task 904 // team: team for this_thr 905 // tid: thread id of given thread within team 906 // set_curr_task: TRUE if need to push current task to thread 907 // NOTE: Routine does not set up the implicit task ICVS. This is assumed to 908 // have already been done elsewhere. 909 // TODO: Get better loc_ref. Value passed in may be NULL 910 void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr, 911 kmp_team_t *team, int tid, int set_curr_task) { 912 kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid]; 913 914 KF_TRACE( 915 10, 916 ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n", 917 tid, team, task, set_curr_task ? 
"TRUE" : "FALSE")); 918 919 task->td_task_id = KMP_GEN_TASK_ID(); 920 task->td_team = team; 921 // task->td_parent = NULL; // fix for CQ230101 (broken parent task info 922 // in debugger) 923 task->td_ident = loc_ref; 924 task->td_taskwait_ident = NULL; 925 task->td_taskwait_counter = 0; 926 task->td_taskwait_thread = 0; 927 928 task->td_flags.tiedness = TASK_TIED; 929 task->td_flags.tasktype = TASK_IMPLICIT; 930 #if OMP_45_ENABLED 931 task->td_flags.proxy = TASK_FULL; 932 #endif 933 934 // All implicit tasks are executed immediately, not deferred 935 task->td_flags.task_serial = 1; 936 task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec); 937 task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0; 938 939 task->td_flags.started = 1; 940 task->td_flags.executing = 1; 941 task->td_flags.complete = 0; 942 task->td_flags.freed = 0; 943 944 #if OMP_40_ENABLED 945 task->td_depnode = NULL; 946 #endif 947 task->td_last_tied = task; 948 949 if (set_curr_task) { // only do this init first time thread is created 950 KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0); 951 // Not used: don't need to deallocate implicit task 952 KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0); 953 #if OMP_40_ENABLED 954 task->td_taskgroup = NULL; // An implicit task does not have taskgroup 955 task->td_dephash = NULL; 956 #endif 957 __kmp_push_current_task_to_thread(this_thr, team, tid); 958 } else { 959 KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0); 960 KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0); 961 } 962 963 #if OMPT_SUPPORT 964 if (UNLIKELY(ompt_enabled.enabled)) 965 __ompt_task_init(task, tid); 966 #endif 967 968 KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid, 969 team, task)); 970 } 971 972 // __kmp_finish_implicit_task: Release resources associated to implicit tasks 973 // at the end of parallel regions. Some resources are kept for reuse in the next 974 // parallel region. 
975 // 976 // thread: thread data structure corresponding to implicit task 977 void __kmp_finish_implicit_task(kmp_info_t *thread) { 978 kmp_taskdata_t *task = thread->th.th_current_task; 979 if (task->td_dephash) 980 __kmp_dephash_free_entries(thread, task->td_dephash); 981 } 982 983 // __kmp_free_implicit_task: Release resources associated to implicit tasks 984 // when these are destroyed regions 985 // 986 // thread: thread data structure corresponding to implicit task 987 void __kmp_free_implicit_task(kmp_info_t *thread) { 988 kmp_taskdata_t *task = thread->th.th_current_task; 989 if (task && task->td_dephash) { 990 __kmp_dephash_free(thread, task->td_dephash); 991 task->td_dephash = NULL; 992 } 993 } 994 995 // Round up a size to a power of two specified by val: Used to insert padding 996 // between structures co-allocated using a single malloc() call 997 static size_t __kmp_round_up_to_val(size_t size, size_t val) { 998 if (size & (val - 1)) { 999 size &= ~(val - 1); 1000 if (size <= KMP_SIZE_T_MAX - val) { 1001 size += val; // Round up if there is no overflow. 1002 } 1003 } 1004 return size; 1005 } // __kmp_round_up_to_va 1006 1007 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task 1008 // 1009 // loc_ref: source location information 1010 // gtid: global thread number. 1011 // flags: include tiedness & task type (explicit vs. implicit) of the ''new'' 1012 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine. 1013 // sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including 1014 // private vars accessed in task. 1015 // sizeof_shareds: Size in bytes of array of pointers to shared vars accessed 1016 // in task. 1017 // task_entry: Pointer to task code entry point generated by compiler. 1018 // returns: a pointer to the allocated kmp_task_t structure (task). 
1019 kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, 1020 kmp_tasking_flags_t *flags, 1021 size_t sizeof_kmp_task_t, size_t sizeof_shareds, 1022 kmp_routine_entry_t task_entry) { 1023 kmp_task_t *task; 1024 kmp_taskdata_t *taskdata; 1025 kmp_info_t *thread = __kmp_threads[gtid]; 1026 kmp_team_t *team = thread->th.th_team; 1027 kmp_taskdata_t *parent_task = thread->th.th_current_task; 1028 size_t shareds_offset; 1029 1030 if (!TCR_4(__kmp_init_middle)) 1031 __kmp_middle_initialize(); 1032 1033 KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) " 1034 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n", 1035 gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t, 1036 sizeof_shareds, task_entry)); 1037 1038 if (parent_task->td_flags.final) { 1039 if (flags->merged_if0) { 1040 } 1041 flags->final = 1; 1042 } 1043 if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) { 1044 // Untied task encountered causes the TSC algorithm to check entire deque of 1045 // the victim thread. If no untied task encountered, then checking the head 1046 // of the deque should be enough. 1047 KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1); 1048 } 1049 1050 #if OMP_45_ENABLED 1051 if (flags->proxy == TASK_PROXY) { 1052 flags->tiedness = TASK_UNTIED; 1053 flags->merged_if0 = 1; 1054 1055 /* are we running in a sequential parallel or tskm_immediate_exec... 
we need 1056 tasking support enabled */ 1057 if ((thread->th.th_task_team) == NULL) { 1058 /* This should only happen if the team is serialized 1059 setup a task team and propagate it to the thread */ 1060 KMP_DEBUG_ASSERT(team->t.t_serialized); 1061 KA_TRACE(30, 1062 ("T#%d creating task team in __kmp_task_alloc for proxy task\n", 1063 gtid)); 1064 __kmp_task_team_setup( 1065 thread, team, 1066 1); // 1 indicates setup the current team regardless of nthreads 1067 thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state]; 1068 } 1069 kmp_task_team_t *task_team = thread->th.th_task_team; 1070 1071 /* tasking must be enabled now as the task might not be pushed */ 1072 if (!KMP_TASKING_ENABLED(task_team)) { 1073 KA_TRACE( 1074 30, 1075 ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid)); 1076 __kmp_enable_tasking(task_team, thread); 1077 kmp_int32 tid = thread->th.th_info.ds.ds_tid; 1078 kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid]; 1079 // No lock needed since only owner can allocate 1080 if (thread_data->td.td_deque == NULL) { 1081 __kmp_alloc_task_deque(thread, thread_data); 1082 } 1083 } 1084 1085 if (task_team->tt.tt_found_proxy_tasks == FALSE) 1086 TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE); 1087 } 1088 #endif 1089 1090 // Calculate shared structure offset including padding after kmp_task_t struct 1091 // to align pointers in shared struct 1092 shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t; 1093 shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *)); 1094 1095 // Allocate a kmp_taskdata_t block and a kmp_task_t block. 
1096 KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid, 1097 shareds_offset)); 1098 KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid, 1099 sizeof_shareds)); 1100 1101 // Avoid double allocation here by combining shareds with taskdata 1102 #if USE_FAST_MEMORY 1103 taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset + 1104 sizeof_shareds); 1105 #else /* ! USE_FAST_MEMORY */ 1106 taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset + 1107 sizeof_shareds); 1108 #endif /* USE_FAST_MEMORY */ 1109 ANNOTATE_HAPPENS_AFTER(taskdata); 1110 1111 task = KMP_TASKDATA_TO_TASK(taskdata); 1112 1113 // Make sure task & taskdata are aligned appropriately 1114 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD 1115 KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0); 1116 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0); 1117 #else 1118 KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0); 1119 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0); 1120 #endif 1121 if (sizeof_shareds > 0) { 1122 // Avoid double allocation here by combining shareds with taskdata 1123 task->shareds = &((char *)taskdata)[shareds_offset]; 1124 // Make sure shareds struct is aligned to pointer size 1125 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) == 1126 0); 1127 } else { 1128 task->shareds = NULL; 1129 } 1130 task->routine = task_entry; 1131 task->part_id = 0; // AC: Always start with 0 part id 1132 1133 taskdata->td_task_id = KMP_GEN_TASK_ID(); 1134 taskdata->td_team = team; 1135 taskdata->td_alloc_thread = thread; 1136 taskdata->td_parent = parent_task; 1137 taskdata->td_level = parent_task->td_level + 1; // increment nesting level 1138 KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0); 1139 taskdata->td_ident = loc_ref; 1140 taskdata->td_taskwait_ident = NULL; 1141 taskdata->td_taskwait_counter = 0; 1142 
taskdata->td_taskwait_thread = 0; 1143 KMP_DEBUG_ASSERT(taskdata->td_parent != NULL); 1144 #if OMP_45_ENABLED 1145 // avoid copying icvs for proxy tasks 1146 if (flags->proxy == TASK_FULL) 1147 #endif 1148 copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs); 1149 1150 taskdata->td_flags.tiedness = flags->tiedness; 1151 taskdata->td_flags.final = flags->final; 1152 taskdata->td_flags.merged_if0 = flags->merged_if0; 1153 #if OMP_40_ENABLED 1154 taskdata->td_flags.destructors_thunk = flags->destructors_thunk; 1155 #endif // OMP_40_ENABLED 1156 #if OMP_45_ENABLED 1157 taskdata->td_flags.proxy = flags->proxy; 1158 taskdata->td_task_team = thread->th.th_task_team; 1159 taskdata->td_size_alloc = shareds_offset + sizeof_shareds; 1160 #endif 1161 taskdata->td_flags.tasktype = TASK_EXPLICIT; 1162 1163 // GEH - TODO: fix this to copy parent task's value of tasking_ser flag 1164 taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec); 1165 1166 // GEH - TODO: fix this to copy parent task's value of team_serial flag 1167 taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0; 1168 1169 // GEH - Note we serialize the task if the team is serialized to make sure 1170 // implicit parallel region tasks are not left until program termination to 1171 // execute. Also, it helps locality to execute immediately. 
1172 1173 taskdata->td_flags.task_serial = 1174 (parent_task->td_flags.final || taskdata->td_flags.team_serial || 1175 taskdata->td_flags.tasking_ser); 1176 1177 taskdata->td_flags.started = 0; 1178 taskdata->td_flags.executing = 0; 1179 taskdata->td_flags.complete = 0; 1180 taskdata->td_flags.freed = 0; 1181 1182 taskdata->td_flags.native = flags->native; 1183 1184 KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0); 1185 // start at one because counts current task and children 1186 KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1); 1187 #if OMP_40_ENABLED 1188 taskdata->td_taskgroup = 1189 parent_task->td_taskgroup; // task inherits taskgroup from the parent task 1190 taskdata->td_dephash = NULL; 1191 taskdata->td_depnode = NULL; 1192 #endif 1193 if (flags->tiedness == TASK_UNTIED) 1194 taskdata->td_last_tied = NULL; // will be set when the task is scheduled 1195 else 1196 taskdata->td_last_tied = taskdata; 1197 1198 #if OMPT_SUPPORT 1199 if (UNLIKELY(ompt_enabled.enabled)) 1200 __ompt_task_init(taskdata, gtid); 1201 #endif 1202 // Only need to keep track of child task counts if team parallel and tasking not 1203 // serialized or if it is a proxy task 1204 #if OMP_45_ENABLED 1205 if (flags->proxy == TASK_PROXY || 1206 !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) 1207 #else 1208 if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) 1209 #endif 1210 { 1211 KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks); 1212 #if OMP_40_ENABLED 1213 if (parent_task->td_taskgroup) 1214 KMP_ATOMIC_INC(&parent_task->td_taskgroup->count); 1215 #endif 1216 // Only need to keep track of allocated child tasks for explicit tasks since 1217 // implicit not deallocated 1218 if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) { 1219 KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks); 1220 } 1221 } 1222 1223 KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n", 1224 gtid, taskdata, 
taskdata->td_parent)); 1225 ANNOTATE_HAPPENS_BEFORE(task); 1226 1227 return task; 1228 } 1229 1230 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, 1231 kmp_int32 flags, size_t sizeof_kmp_task_t, 1232 size_t sizeof_shareds, 1233 kmp_routine_entry_t task_entry) { 1234 kmp_task_t *retval; 1235 kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags; 1236 1237 input_flags->native = FALSE; 1238 // __kmp_task_alloc() sets up all other runtime flags 1239 1240 #if OMP_45_ENABLED 1241 KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s) " 1242 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n", 1243 gtid, loc_ref, input_flags->tiedness ? "tied " : "untied", 1244 input_flags->proxy ? "proxy" : "", sizeof_kmp_task_t, 1245 sizeof_shareds, task_entry)); 1246 #else 1247 KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s) " 1248 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n", 1249 gtid, loc_ref, input_flags->tiedness ? "tied " : "untied", 1250 sizeof_kmp_task_t, sizeof_shareds, task_entry)); 1251 #endif 1252 1253 retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t, 1254 sizeof_shareds, task_entry); 1255 1256 KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval)); 1257 1258 return retval; 1259 } 1260 1261 // __kmp_invoke_task: invoke the specified task 1262 // 1263 // gtid: global thread ID of caller 1264 // task: the task to invoke 1265 // current_task: the task to resume after task invokation 1266 static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task, 1267 kmp_taskdata_t *current_task) { 1268 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 1269 #if OMP_40_ENABLED 1270 int discard = 0 /* false */; 1271 #endif 1272 KA_TRACE( 1273 30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n", 1274 gtid, taskdata, current_task)); 1275 KMP_DEBUG_ASSERT(task); 1276 #if OMP_45_ENABLED 1277 if (taskdata->td_flags.proxy == TASK_PROXY && 1278 
taskdata->td_flags.complete == 1) { 1279 // This is a proxy task that was already completed but it needs to run 1280 // its bottom-half finish 1281 KA_TRACE( 1282 30, 1283 ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n", 1284 gtid, taskdata)); 1285 1286 __kmp_bottom_half_finish_proxy(gtid, task); 1287 1288 KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for " 1289 "proxy task %p, resuming task %p\n", 1290 gtid, taskdata, current_task)); 1291 1292 return; 1293 } 1294 #endif 1295 1296 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1297 kmp_uint64 cur_time; 1298 if (__kmp_forkjoin_frames_mode == 3) { 1299 // Get the current time stamp to measure task execution time to correct 1300 // barrier imbalance time 1301 cur_time = __itt_get_timestamp(); 1302 } 1303 #endif 1304 1305 #if OMPT_SUPPORT 1306 // For untied tasks, the first task executed only calls __kmpc_omp_task and 1307 // does not execute code. 1308 ompt_thread_info_t oldInfo; 1309 kmp_info_t *thread; 1310 if (UNLIKELY(ompt_enabled.enabled)) { 1311 // Store the threads states and restore them after the task 1312 thread = __kmp_threads[gtid]; 1313 oldInfo = thread->th.ompt_thread_info; 1314 thread->th.ompt_thread_info.wait_id = 0; 1315 thread->th.ompt_thread_info.state = (thread->th.th_team_serialized) 1316 ? omp_state_work_serial 1317 : omp_state_work_parallel; 1318 taskdata->ompt_task_info.frame.exit_frame = OMPT_GET_FRAME_ADDRESS(0); 1319 } 1320 #endif 1321 1322 #if OMP_45_ENABLED 1323 // Proxy tasks are not handled by the runtime 1324 if (taskdata->td_flags.proxy != TASK_PROXY) { 1325 #endif 1326 ANNOTATE_HAPPENS_AFTER(task); 1327 __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded 1328 #if OMP_45_ENABLED 1329 } 1330 #endif 1331 1332 #if OMP_40_ENABLED 1333 // TODO: cancel tasks if the parallel region has also been cancelled 1334 // TODO: check if this sequence can be hoisted above __kmp_task_start 1335 // if cancellation has been enabled for this run ... 
1336 if (__kmp_omp_cancellation) { 1337 kmp_info_t *this_thr = __kmp_threads[gtid]; 1338 kmp_team_t *this_team = this_thr->th.th_team; 1339 kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup; 1340 if ((taskgroup && taskgroup->cancel_request) || 1341 (this_team->t.t_cancel_request == cancel_parallel)) { 1342 #if OMPT_SUPPORT && OMPT_OPTIONAL 1343 ompt_data_t *task_data; 1344 if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) { 1345 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL); 1346 ompt_callbacks.ompt_callback(ompt_callback_cancel)( 1347 task_data, 1348 ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup 1349 : ompt_cancel_parallel) | 1350 ompt_cancel_discarded_task, 1351 NULL); 1352 } 1353 #endif 1354 KMP_COUNT_BLOCK(TASK_cancelled); 1355 // this task belongs to a task group and we need to cancel it 1356 discard = 1 /* true */; 1357 } 1358 } 1359 1360 // Invoke the task routine and pass in relevant data. 1361 // Thunks generated by gcc take a different argument list. 
1362 if (!discard) { 1363 if (taskdata->td_flags.tiedness == TASK_UNTIED) { 1364 taskdata->td_last_tied = current_task->td_last_tied; 1365 KMP_DEBUG_ASSERT(taskdata->td_last_tied); 1366 } 1367 #if KMP_STATS_ENABLED 1368 KMP_COUNT_BLOCK(TASK_executed); 1369 switch (KMP_GET_THREAD_STATE()) { 1370 case FORK_JOIN_BARRIER: 1371 KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar); 1372 break; 1373 case PLAIN_BARRIER: 1374 KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar); 1375 break; 1376 case TASKYIELD: 1377 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield); 1378 break; 1379 case TASKWAIT: 1380 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait); 1381 break; 1382 case TASKGROUP: 1383 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup); 1384 break; 1385 default: 1386 KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate); 1387 break; 1388 } 1389 #endif // KMP_STATS_ENABLED 1390 #endif // OMP_40_ENABLED 1391 1392 // OMPT task begin 1393 #if OMPT_SUPPORT 1394 if (UNLIKELY(ompt_enabled.enabled)) 1395 __ompt_task_start(task, current_task, gtid); 1396 #endif 1397 1398 #ifdef KMP_GOMP_COMPAT 1399 if (taskdata->td_flags.native) { 1400 ((void (*)(void *))(*(task->routine)))(task->shareds); 1401 } else 1402 #endif /* KMP_GOMP_COMPAT */ 1403 { 1404 (*(task->routine))(gtid, task); 1405 } 1406 KMP_POP_PARTITIONED_TIMER(); 1407 1408 #if OMP_40_ENABLED 1409 } 1410 #endif // OMP_40_ENABLED 1411 1412 1413 #if OMP_45_ENABLED 1414 // Proxy tasks are not handled by the runtime 1415 if (taskdata->td_flags.proxy != TASK_PROXY) { 1416 #endif 1417 ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent); 1418 #if OMPT_SUPPORT 1419 if (UNLIKELY(ompt_enabled.enabled)) { 1420 thread->th.ompt_thread_info = oldInfo; 1421 if (taskdata->td_flags.tiedness == TASK_TIED) { 1422 taskdata->ompt_task_info.frame.exit_frame = NULL; 1423 } 1424 __kmp_task_finish<true>(gtid, task, current_task); 1425 } else 1426 #endif 1427 __kmp_task_finish<false>(gtid, task, current_task); 1428 #if OMP_45_ENABLED 1429 } 1430 #endif 1431 1432 #if USE_ITT_BUILD && 
USE_ITT_NOTIFY 1433 // Barrier imbalance - correct arrive time after the task finished 1434 if (__kmp_forkjoin_frames_mode == 3) { 1435 kmp_info_t *this_thr = __kmp_threads[gtid]; 1436 if (this_thr->th.th_bar_arrive_time) { 1437 this_thr->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time); 1438 } 1439 } 1440 #endif 1441 KA_TRACE( 1442 30, 1443 ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n", 1444 gtid, taskdata, current_task)); 1445 return; 1446 } 1447 1448 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution 1449 // 1450 // loc_ref: location of original task pragma (ignored) 1451 // gtid: Global Thread ID of encountering thread 1452 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task'' 1453 // Returns: 1454 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1455 // be resumed later. 1456 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1457 // resumed later. 1458 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid, 1459 kmp_task_t *new_task) { 1460 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1461 1462 KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid, 1463 loc_ref, new_taskdata)); 1464 1465 #if OMPT_SUPPORT 1466 kmp_taskdata_t *parent; 1467 if (UNLIKELY(ompt_enabled.enabled)) { 1468 parent = new_taskdata->td_parent; 1469 if (ompt_enabled.ompt_callback_task_create) { 1470 ompt_data_t task_data = ompt_data_none; 1471 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 1472 parent ? &(parent->ompt_task_info.task_data) : &task_data, 1473 parent ? &(parent->ompt_task_info.frame) : NULL, 1474 &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0, 1475 OMPT_GET_RETURN_ADDRESS(0)); 1476 } 1477 } 1478 #endif 1479 1480 /* Should we execute the new task or queue it? For now, let's just always try 1481 to queue it. If the queue fills up, then we'll execute it. 
*/ 1482 1483 if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer 1484 { // Execute this task immediately 1485 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 1486 new_taskdata->td_flags.task_serial = 1; 1487 __kmp_invoke_task(gtid, new_task, current_task); 1488 } 1489 1490 KA_TRACE( 1491 10, 1492 ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: " 1493 "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n", 1494 gtid, loc_ref, new_taskdata)); 1495 1496 ANNOTATE_HAPPENS_BEFORE(new_task); 1497 #if OMPT_SUPPORT 1498 if (UNLIKELY(ompt_enabled.enabled)) { 1499 parent->ompt_task_info.frame.enter_frame = NULL; 1500 } 1501 #endif 1502 return TASK_CURRENT_NOT_QUEUED; 1503 } 1504 1505 // __kmp_omp_task: Schedule a non-thread-switchable task for execution 1506 // 1507 // gtid: Global Thread ID of encountering thread 1508 // new_task:non-thread-switchable task thunk allocated by __kmp_omp_task_alloc() 1509 // serialize_immediate: if TRUE then if the task is executed immediately its 1510 // execution will be serialized 1511 // Returns: 1512 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1513 // be resumed later. 1514 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1515 // resumed later. 1516 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task, 1517 bool serialize_immediate) { 1518 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1519 1520 /* Should we execute the new task or queue it? For now, let's just always try to 1521 queue it. If the queue fills up, then we'll execute it. 
*/ 1522 #if OMP_45_ENABLED 1523 if (new_taskdata->td_flags.proxy == TASK_PROXY || 1524 __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer 1525 #else 1526 if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer 1527 #endif 1528 { // Execute this task immediately 1529 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 1530 if (serialize_immediate) 1531 new_taskdata->td_flags.task_serial = 1; 1532 __kmp_invoke_task(gtid, new_task, current_task); 1533 } 1534 1535 ANNOTATE_HAPPENS_BEFORE(new_task); 1536 return TASK_CURRENT_NOT_QUEUED; 1537 } 1538 1539 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a 1540 // non-thread-switchable task from the parent thread only! 1541 // 1542 // loc_ref: location of original task pragma (ignored) 1543 // gtid: Global Thread ID of encountering thread 1544 // new_task: non-thread-switchable task thunk allocated by 1545 // __kmp_omp_task_alloc() 1546 // Returns: 1547 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1548 // be resumed later. 1549 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1550 // resumed later. 
1551 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid, 1552 kmp_task_t *new_task) { 1553 kmp_int32 res; 1554 KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK); 1555 1556 #if KMP_DEBUG || OMPT_SUPPORT 1557 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1558 #endif 1559 KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref, 1560 new_taskdata)); 1561 1562 #if OMPT_SUPPORT 1563 kmp_taskdata_t *parent = NULL; 1564 if (UNLIKELY(ompt_enabled.enabled)) { 1565 if (!new_taskdata->td_flags.started) { 1566 OMPT_STORE_RETURN_ADDRESS(gtid); 1567 parent = new_taskdata->td_parent; 1568 if (!parent->ompt_task_info.frame.enter_frame) { 1569 parent->ompt_task_info.frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1); 1570 } 1571 if (ompt_enabled.ompt_callback_task_create) { 1572 ompt_data_t task_data = ompt_data_none; 1573 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 1574 parent ? &(parent->ompt_task_info.task_data) : &task_data, 1575 parent ? &(parent->ompt_task_info.frame) : NULL, 1576 &(new_taskdata->ompt_task_info.task_data), 1577 ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, 1578 OMPT_LOAD_RETURN_ADDRESS(gtid)); 1579 } 1580 } else { 1581 // We are scheduling the continuation of an UNTIED task. 1582 // Scheduling back to the parent task. 
1583 __ompt_task_finish(new_task, 1584 new_taskdata->ompt_task_info.scheduling_parent, 1585 ompt_task_others); 1586 new_taskdata->ompt_task_info.frame.exit_frame = NULL; 1587 } 1588 } 1589 #endif 1590 1591 res = __kmp_omp_task(gtid, new_task, true); 1592 1593 KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning " 1594 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n", 1595 gtid, loc_ref, new_taskdata)); 1596 #if OMPT_SUPPORT 1597 if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) { 1598 parent->ompt_task_info.frame.enter_frame = NULL; 1599 } 1600 #endif 1601 return res; 1602 } 1603 1604 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule 1605 // a taskloop task with the correct OMPT return address 1606 // 1607 // loc_ref: location of original task pragma (ignored) 1608 // gtid: Global Thread ID of encountering thread 1609 // new_task: non-thread-switchable task thunk allocated by 1610 // __kmp_omp_task_alloc() 1611 // codeptr_ra: return address for OMPT callback 1612 // Returns: 1613 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1614 // be resumed later. 1615 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1616 // resumed later. 
1617 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid, 1618 kmp_task_t *new_task, void *codeptr_ra) { 1619 kmp_int32 res; 1620 KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK); 1621 1622 #if KMP_DEBUG || OMPT_SUPPORT 1623 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1624 #endif 1625 KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref, 1626 new_taskdata)); 1627 1628 #if OMPT_SUPPORT 1629 kmp_taskdata_t *parent = NULL; 1630 if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) { 1631 parent = new_taskdata->td_parent; 1632 if (!parent->ompt_task_info.frame.enter_frame) 1633 parent->ompt_task_info.frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1); 1634 if (ompt_enabled.ompt_callback_task_create) { 1635 ompt_data_t task_data = ompt_data_none; 1636 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 1637 parent ? &(parent->ompt_task_info.task_data) : &task_data, 1638 parent ? &(parent->ompt_task_info.frame) : NULL, 1639 &(new_taskdata->ompt_task_info.task_data), 1640 ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, 1641 codeptr_ra); 1642 } 1643 } 1644 #endif 1645 1646 res = __kmp_omp_task(gtid, new_task, true); 1647 1648 KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning " 1649 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n", 1650 gtid, loc_ref, new_taskdata)); 1651 #if OMPT_SUPPORT 1652 if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) { 1653 parent->ompt_task_info.frame.enter_frame = NULL; 1654 } 1655 #endif 1656 return res; 1657 } 1658 1659 template <bool ompt> 1660 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid, 1661 void *frame_address, 1662 void *return_address) { 1663 kmp_taskdata_t *taskdata; 1664 kmp_info_t *thread; 1665 int thread_finished = FALSE; 1666 KMP_SET_THREAD_STATE_BLOCK(TASKWAIT); 1667 1668 KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref)); 1669 1670 if (__kmp_tasking_mode != tskm_immediate_exec) { 1671 
thread = __kmp_threads[gtid]; 1672 taskdata = thread->th.th_current_task; 1673 1674 #if OMPT_SUPPORT && OMPT_OPTIONAL 1675 ompt_data_t *my_task_data; 1676 ompt_data_t *my_parallel_data; 1677 1678 if (ompt) { 1679 my_task_data = &(taskdata->ompt_task_info.task_data); 1680 my_parallel_data = OMPT_CUR_TEAM_DATA(thread); 1681 1682 taskdata->ompt_task_info.frame.enter_frame = frame_address; 1683 1684 if (ompt_enabled.ompt_callback_sync_region) { 1685 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 1686 ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data, 1687 my_task_data, return_address); 1688 } 1689 1690 if (ompt_enabled.ompt_callback_sync_region_wait) { 1691 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 1692 ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data, 1693 my_task_data, return_address); 1694 } 1695 } 1696 #endif // OMPT_SUPPORT && OMPT_OPTIONAL 1697 1698 // Debugger: The taskwait is active. Store location and thread encountered the 1699 // taskwait. 1700 #if USE_ITT_BUILD 1701 // Note: These values are used by ITT events as well. 
1702 #endif /* USE_ITT_BUILD */ 1703 taskdata->td_taskwait_counter += 1; 1704 taskdata->td_taskwait_ident = loc_ref; 1705 taskdata->td_taskwait_thread = gtid + 1; 1706 1707 #if USE_ITT_BUILD 1708 void *itt_sync_obj = __kmp_itt_taskwait_object(gtid); 1709 if (itt_sync_obj != NULL) 1710 __kmp_itt_taskwait_starting(gtid, itt_sync_obj); 1711 #endif /* USE_ITT_BUILD */ 1712 1713 bool must_wait = 1714 !taskdata->td_flags.team_serial && !taskdata->td_flags.final; 1715 1716 #if OMP_45_ENABLED 1717 must_wait = must_wait || (thread->th.th_task_team != NULL && 1718 thread->th.th_task_team->tt.tt_found_proxy_tasks); 1719 #endif 1720 if (must_wait) { 1721 kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *, 1722 &(taskdata->td_incomplete_child_tasks)), 1723 0U); 1724 while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) { 1725 flag.execute_tasks(thread, gtid, FALSE, 1726 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 1727 __kmp_task_stealing_constraint); 1728 } 1729 } 1730 #if USE_ITT_BUILD 1731 if (itt_sync_obj != NULL) 1732 __kmp_itt_taskwait_finished(gtid, itt_sync_obj); 1733 #endif /* USE_ITT_BUILD */ 1734 1735 // Debugger: The taskwait is completed. Location remains, but thread is 1736 // negated. 
1737 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; 1738 1739 #if OMPT_SUPPORT && OMPT_OPTIONAL 1740 if (ompt) { 1741 if (ompt_enabled.ompt_callback_sync_region_wait) { 1742 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 1743 ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data, 1744 my_task_data, return_address); 1745 } 1746 if (ompt_enabled.ompt_callback_sync_region) { 1747 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 1748 ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data, 1749 my_task_data, return_address); 1750 } 1751 taskdata->ompt_task_info.frame.enter_frame = NULL; 1752 } 1753 #endif // OMPT_SUPPORT && OMPT_OPTIONAL 1754 1755 ANNOTATE_HAPPENS_AFTER(taskdata); 1756 } 1757 1758 KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, " 1759 "returning TASK_CURRENT_NOT_QUEUED\n", 1760 gtid, taskdata)); 1761 1762 return TASK_CURRENT_NOT_QUEUED; 1763 } 1764 1765 #if OMPT_SUPPORT 1766 OMPT_NOINLINE 1767 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid, 1768 void *frame_address, 1769 void *return_address) { 1770 return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address, 1771 return_address); 1772 } 1773 #endif // OMPT_SUPPORT 1774 1775 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are 1776 // complete 1777 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) { 1778 #if OMPT_SUPPORT && OMPT_OPTIONAL 1779 if (UNLIKELY(ompt_enabled.enabled)) { 1780 OMPT_STORE_RETURN_ADDRESS(gtid); 1781 return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(1), 1782 OMPT_LOAD_RETURN_ADDRESS(gtid)); 1783 } 1784 #endif 1785 return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL); 1786 } 1787 1788 // __kmpc_omp_taskyield: switch to a different task 1789 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) { 1790 kmp_taskdata_t *taskdata; 1791 kmp_info_t *thread; 1792 int 
thread_finished = FALSE; 1793 1794 KMP_COUNT_BLOCK(OMP_TASKYIELD); 1795 KMP_SET_THREAD_STATE_BLOCK(TASKYIELD); 1796 1797 KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n", 1798 gtid, loc_ref, end_part)); 1799 1800 if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) { 1801 thread = __kmp_threads[gtid]; 1802 taskdata = thread->th.th_current_task; 1803 // Should we model this as a task wait or not? 1804 // Debugger: The taskwait is active. Store location and thread encountered the 1805 // taskwait. 1806 #if USE_ITT_BUILD 1807 // Note: These values are used by ITT events as well. 1808 #endif /* USE_ITT_BUILD */ 1809 taskdata->td_taskwait_counter += 1; 1810 taskdata->td_taskwait_ident = loc_ref; 1811 taskdata->td_taskwait_thread = gtid + 1; 1812 1813 #if USE_ITT_BUILD 1814 void *itt_sync_obj = __kmp_itt_taskwait_object(gtid); 1815 if (itt_sync_obj != NULL) 1816 __kmp_itt_taskwait_starting(gtid, itt_sync_obj); 1817 #endif /* USE_ITT_BUILD */ 1818 if (!taskdata->td_flags.team_serial) { 1819 kmp_task_team_t *task_team = thread->th.th_task_team; 1820 if (task_team != NULL) { 1821 if (KMP_TASKING_ENABLED(task_team)) { 1822 #if OMPT_SUPPORT 1823 if (UNLIKELY(ompt_enabled.enabled)) 1824 thread->th.ompt_thread_info.ompt_task_yielded = 1; 1825 #endif 1826 __kmp_execute_tasks_32( 1827 thread, gtid, NULL, FALSE, 1828 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 1829 __kmp_task_stealing_constraint); 1830 #if OMPT_SUPPORT 1831 if (UNLIKELY(ompt_enabled.enabled)) 1832 thread->th.ompt_thread_info.ompt_task_yielded = 0; 1833 #endif 1834 } 1835 } 1836 } 1837 #if USE_ITT_BUILD 1838 if (itt_sync_obj != NULL) 1839 __kmp_itt_taskwait_finished(gtid, itt_sync_obj); 1840 #endif /* USE_ITT_BUILD */ 1841 1842 // Debugger: The taskwait is completed. Location remains, but thread is 1843 // negated. 
1844 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; 1845 } 1846 1847 KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, " 1848 "returning TASK_CURRENT_NOT_QUEUED\n", 1849 gtid, taskdata)); 1850 1851 return TASK_CURRENT_NOT_QUEUED; 1852 } 1853 1854 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work 1855 #if OMP_45_ENABLED 1856 // Task Reduction implementation 1857 1858 typedef struct kmp_task_red_flags { 1859 unsigned lazy_priv : 1; // hint: (1) use lazy allocation (big objects) 1860 unsigned reserved31 : 31; 1861 } kmp_task_red_flags_t; 1862 1863 // internal structure for reduction data item related info 1864 typedef struct kmp_task_red_data { 1865 void *reduce_shar; // shared reduction item 1866 size_t reduce_size; // size of data item 1867 void *reduce_priv; // thread specific data 1868 void *reduce_pend; // end of private data for comparison op 1869 void *reduce_init; // data initialization routine 1870 void *reduce_fini; // data finalization routine 1871 void *reduce_comb; // data combiner routine 1872 kmp_task_red_flags_t flags; // flags for additional info from compiler 1873 } kmp_task_red_data_t; 1874 1875 // structure sent us by compiler - one per reduction item 1876 typedef struct kmp_task_red_input { 1877 void *reduce_shar; // shared reduction item 1878 size_t reduce_size; // size of data item 1879 void *reduce_init; // data initialization routine 1880 void *reduce_fini; // data finalization routine 1881 void *reduce_comb; // data combiner routine 1882 kmp_task_red_flags_t flags; // flags for additional info from compiler 1883 } kmp_task_red_input_t; 1884 1885 /*! 1886 @ingroup TASKING 1887 @param gtid Global thread ID 1888 @param num Number of data items to reduce 1889 @param data Array of data for reduction 1890 @return The taskgroup identifier 1891 1892 Initialize task reduction for the taskgroup. 
1893 */ 1894 void *__kmpc_task_reduction_init(int gtid, int num, void *data) { 1895 kmp_info_t *thread = __kmp_threads[gtid]; 1896 kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup; 1897 kmp_int32 nth = thread->th.th_team_nproc; 1898 kmp_task_red_input_t *input = (kmp_task_red_input_t *)data; 1899 kmp_task_red_data_t *arr; 1900 1901 // check input data just in case 1902 KMP_ASSERT(tg != NULL); 1903 KMP_ASSERT(data != NULL); 1904 KMP_ASSERT(num > 0); 1905 if (nth == 1) { 1906 KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n", 1907 gtid, tg)); 1908 return (void *)tg; 1909 } 1910 KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n", 1911 gtid, tg, num)); 1912 arr = (kmp_task_red_data_t *)__kmp_thread_malloc( 1913 thread, num * sizeof(kmp_task_red_data_t)); 1914 for (int i = 0; i < num; ++i) { 1915 void (*f_init)(void *) = (void (*)(void *))(input[i].reduce_init); 1916 size_t size = input[i].reduce_size - 1; 1917 // round the size up to cache line per thread-specific item 1918 size += CACHE_LINE - size % CACHE_LINE; 1919 KMP_ASSERT(input[i].reduce_comb != NULL); // combiner is mandatory 1920 arr[i].reduce_shar = input[i].reduce_shar; 1921 arr[i].reduce_size = size; 1922 arr[i].reduce_init = input[i].reduce_init; 1923 arr[i].reduce_fini = input[i].reduce_fini; 1924 arr[i].reduce_comb = input[i].reduce_comb; 1925 arr[i].flags = input[i].flags; 1926 if (!input[i].flags.lazy_priv) { 1927 // allocate cache-line aligned block and fill it with zeros 1928 arr[i].reduce_priv = __kmp_allocate(nth * size); 1929 arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size; 1930 if (f_init != NULL) { 1931 // initialize thread-specific items 1932 for (int j = 0; j < nth; ++j) { 1933 f_init((char *)(arr[i].reduce_priv) + j * size); 1934 } 1935 } 1936 } else { 1937 // only allocate space for pointers now, 1938 // objects will be lazily allocated/initialized once requested 1939 arr[i].reduce_priv = __kmp_allocate(nth * 
sizeof(void *)); 1940 } 1941 } 1942 tg->reduce_data = (void *)arr; 1943 tg->reduce_num_data = num; 1944 return (void *)tg; 1945 } 1946 1947 /*! 1948 @ingroup TASKING 1949 @param gtid Global thread ID 1950 @param tskgrp The taskgroup ID (optional) 1951 @param data Shared location of the item 1952 @return The pointer to per-thread data 1953 1954 Get thread-specific location of data item 1955 */ 1956 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) { 1957 kmp_info_t *thread = __kmp_threads[gtid]; 1958 kmp_int32 nth = thread->th.th_team_nproc; 1959 if (nth == 1) 1960 return data; // nothing to do 1961 1962 kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp; 1963 if (tg == NULL) 1964 tg = thread->th.th_current_task->td_taskgroup; 1965 KMP_ASSERT(tg != NULL); 1966 kmp_task_red_data_t *arr = (kmp_task_red_data_t *)(tg->reduce_data); 1967 kmp_int32 num = tg->reduce_num_data; 1968 kmp_int32 tid = thread->th.th_info.ds.ds_tid; 1969 1970 KMP_ASSERT(data != NULL); 1971 while (tg != NULL) { 1972 for (int i = 0; i < num; ++i) { 1973 if (!arr[i].flags.lazy_priv) { 1974 if (data == arr[i].reduce_shar || 1975 (data >= arr[i].reduce_priv && data < arr[i].reduce_pend)) 1976 return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size; 1977 } else { 1978 // check shared location first 1979 void **p_priv = (void **)(arr[i].reduce_priv); 1980 if (data == arr[i].reduce_shar) 1981 goto found; 1982 // check if we get some thread specific location as parameter 1983 for (int j = 0; j < nth; ++j) 1984 if (data == p_priv[j]) 1985 goto found; 1986 continue; // not found, continue search 1987 found: 1988 if (p_priv[tid] == NULL) { 1989 // allocate thread specific object lazily 1990 void (*f_init)(void *) = (void (*)(void *))(arr[i].reduce_init); 1991 p_priv[tid] = __kmp_allocate(arr[i].reduce_size); 1992 if (f_init != NULL) { 1993 f_init(p_priv[tid]); 1994 } 1995 } 1996 return p_priv[tid]; 1997 } 1998 } 1999 tg = tg->parent; 2000 arr = (kmp_task_red_data_t 
*)(tg->reduce_data); 2001 num = tg->reduce_num_data; 2002 } 2003 KMP_ASSERT2(0, "Unknown task reduction item"); 2004 return NULL; // ERROR, this line never executed 2005 } 2006 2007 // Finalize task reduction. 2008 // Called from __kmpc_end_taskgroup() 2009 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) { 2010 kmp_int32 nth = th->th.th_team_nproc; 2011 KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1 2012 kmp_task_red_data_t *arr = (kmp_task_red_data_t *)tg->reduce_data; 2013 kmp_int32 num = tg->reduce_num_data; 2014 for (int i = 0; i < num; ++i) { 2015 void *sh_data = arr[i].reduce_shar; 2016 void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini); 2017 void (*f_comb)(void *, void *) = 2018 (void (*)(void *, void *))(arr[i].reduce_comb); 2019 if (!arr[i].flags.lazy_priv) { 2020 void *pr_data = arr[i].reduce_priv; 2021 size_t size = arr[i].reduce_size; 2022 for (int j = 0; j < nth; ++j) { 2023 void *priv_data = (char *)pr_data + j * size; 2024 f_comb(sh_data, priv_data); // combine results 2025 if (f_fini) 2026 f_fini(priv_data); // finalize if needed 2027 } 2028 } else { 2029 void **pr_data = (void **)(arr[i].reduce_priv); 2030 for (int j = 0; j < nth; ++j) { 2031 if (pr_data[j] != NULL) { 2032 f_comb(sh_data, pr_data[j]); // combine results 2033 if (f_fini) 2034 f_fini(pr_data[j]); // finalize if needed 2035 __kmp_free(pr_data[j]); 2036 } 2037 } 2038 } 2039 __kmp_free(arr[i].reduce_priv); 2040 } 2041 __kmp_thread_free(th, arr); 2042 tg->reduce_data = NULL; 2043 tg->reduce_num_data = 0; 2044 } 2045 #endif 2046 2047 #if OMP_40_ENABLED 2048 // __kmpc_taskgroup: Start a new taskgroup 2049 void __kmpc_taskgroup(ident_t *loc, int gtid) { 2050 kmp_info_t *thread = __kmp_threads[gtid]; 2051 kmp_taskdata_t *taskdata = thread->th.th_current_task; 2052 kmp_taskgroup_t *tg_new = 2053 (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t)); 2054 KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, 
loc, tg_new)); 2055 KMP_ATOMIC_ST_RLX(&tg_new->count, 0); 2056 KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq); 2057 tg_new->parent = taskdata->td_taskgroup; 2058 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work 2059 #if OMP_45_ENABLED 2060 tg_new->reduce_data = NULL; 2061 tg_new->reduce_num_data = 0; 2062 #endif 2063 taskdata->td_taskgroup = tg_new; 2064 2065 #if OMPT_SUPPORT && OMPT_OPTIONAL 2066 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) { 2067 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2068 if (!codeptr) 2069 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2070 kmp_team_t *team = thread->th.th_team; 2071 ompt_data_t my_task_data = taskdata->ompt_task_info.task_data; 2072 // FIXME: I think this is wrong for lwt! 2073 ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data; 2074 2075 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 2076 ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data), 2077 &(my_task_data), codeptr); 2078 } 2079 #endif 2080 } 2081 2082 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task 2083 // and its descendants are complete 2084 void __kmpc_end_taskgroup(ident_t *loc, int gtid) { 2085 kmp_info_t *thread = __kmp_threads[gtid]; 2086 kmp_taskdata_t *taskdata = thread->th.th_current_task; 2087 kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup; 2088 int thread_finished = FALSE; 2089 2090 #if OMPT_SUPPORT && OMPT_OPTIONAL 2091 kmp_team_t *team; 2092 ompt_data_t my_task_data; 2093 ompt_data_t my_parallel_data; 2094 void *codeptr; 2095 if (UNLIKELY(ompt_enabled.enabled)) { 2096 team = thread->th.th_team; 2097 my_task_data = taskdata->ompt_task_info.task_data; 2098 // FIXME: I think this is wrong for lwt! 
2099 my_parallel_data = team->t.ompt_team_info.parallel_data; 2100 codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2101 if (!codeptr) 2102 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2103 } 2104 #endif 2105 2106 KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc)); 2107 KMP_DEBUG_ASSERT(taskgroup != NULL); 2108 KMP_SET_THREAD_STATE_BLOCK(TASKGROUP); 2109 2110 if (__kmp_tasking_mode != tskm_immediate_exec) { 2111 // mark task as waiting not on a barrier 2112 taskdata->td_taskwait_counter += 1; 2113 taskdata->td_taskwait_ident = loc; 2114 taskdata->td_taskwait_thread = gtid + 1; 2115 #if USE_ITT_BUILD 2116 // For ITT the taskgroup wait is similar to taskwait until we need to 2117 // distinguish them 2118 void *itt_sync_obj = __kmp_itt_taskwait_object(gtid); 2119 if (itt_sync_obj != NULL) 2120 __kmp_itt_taskwait_starting(gtid, itt_sync_obj); 2121 #endif /* USE_ITT_BUILD */ 2122 2123 #if OMPT_SUPPORT && OMPT_OPTIONAL 2124 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) { 2125 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 2126 ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data), 2127 &(my_task_data), codeptr); 2128 } 2129 #endif 2130 2131 #if OMP_45_ENABLED 2132 if (!taskdata->td_flags.team_serial || 2133 (thread->th.th_task_team != NULL && 2134 thread->th.th_task_team->tt.tt_found_proxy_tasks)) 2135 #else 2136 if (!taskdata->td_flags.team_serial) 2137 #endif 2138 { 2139 kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 2140 0U); 2141 while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) { 2142 flag.execute_tasks(thread, gtid, FALSE, 2143 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 2144 __kmp_task_stealing_constraint); 2145 } 2146 } 2147 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting 2148 2149 #if OMPT_SUPPORT && OMPT_OPTIONAL 2150 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) { 2151 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 2152 
ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data), 2153 &(my_task_data), codeptr); 2154 } 2155 #endif 2156 2157 #if USE_ITT_BUILD 2158 if (itt_sync_obj != NULL) 2159 __kmp_itt_taskwait_finished(gtid, itt_sync_obj); 2160 #endif /* USE_ITT_BUILD */ 2161 } 2162 KMP_DEBUG_ASSERT(taskgroup->count == 0); 2163 2164 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work 2165 #if OMP_45_ENABLED 2166 if (taskgroup->reduce_data != NULL) // need to reduce? 2167 __kmp_task_reduction_fini(thread, taskgroup); 2168 #endif 2169 // Restore parent taskgroup for the current task 2170 taskdata->td_taskgroup = taskgroup->parent; 2171 __kmp_thread_free(thread, taskgroup); 2172 2173 KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n", 2174 gtid, taskdata)); 2175 ANNOTATE_HAPPENS_AFTER(taskdata); 2176 2177 #if OMPT_SUPPORT && OMPT_OPTIONAL 2178 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) { 2179 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 2180 ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data), 2181 &(my_task_data), codeptr); 2182 } 2183 #endif 2184 } 2185 #endif 2186 2187 // __kmp_remove_my_task: remove a task from my own deque 2188 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid, 2189 kmp_task_team_t *task_team, 2190 kmp_int32 is_constrained) { 2191 kmp_task_t *task; 2192 kmp_taskdata_t *taskdata; 2193 kmp_thread_data_t *thread_data; 2194 kmp_uint32 tail; 2195 2196 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 2197 KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data != 2198 NULL); // Caller should check this condition 2199 2200 thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)]; 2201 2202 KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n", 2203 gtid, thread_data->td.td_deque_ntasks, 2204 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2205 2206 if (TCR_4(thread_data->td.td_deque_ntasks) == 0) 
{ 2207 KA_TRACE(10, 2208 ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: " 2209 "ntasks=%d head=%u tail=%u\n", 2210 gtid, thread_data->td.td_deque_ntasks, 2211 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2212 return NULL; 2213 } 2214 2215 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 2216 2217 if (TCR_4(thread_data->td.td_deque_ntasks) == 0) { 2218 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2219 KA_TRACE(10, 2220 ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: " 2221 "ntasks=%d head=%u tail=%u\n", 2222 gtid, thread_data->td.td_deque_ntasks, 2223 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2224 return NULL; 2225 } 2226 2227 tail = (thread_data->td.td_deque_tail - 1) & 2228 TASK_DEQUE_MASK(thread_data->td); // Wrap index. 2229 taskdata = thread_data->td.td_deque[tail]; 2230 2231 if (is_constrained && (taskdata->td_flags.tiedness == TASK_TIED)) { 2232 // we need to check if the candidate obeys task scheduling constraint (TSC) 2233 // only descendant of all deferred tied tasks can be scheduled, checking 2234 // the last one is enough, as it in turn is the descendant of all others 2235 kmp_taskdata_t *current = thread->th.th_current_task->td_last_tied; 2236 KMP_DEBUG_ASSERT(current != NULL); 2237 // check if last tied task is not suspended on barrier 2238 if (current->td_flags.tasktype == TASK_EXPLICIT || 2239 current->td_taskwait_thread > 0) { // <= 0 on barrier 2240 kmp_int32 level = current->td_level; 2241 kmp_taskdata_t *parent = taskdata->td_parent; 2242 while (parent != current && parent->td_level > level) { 2243 parent = parent->td_parent; // check generation up to the level of the 2244 // current task 2245 KMP_DEBUG_ASSERT(parent != NULL); 2246 } 2247 if (parent != current) { 2248 // The TSC does not allow to steal victim task 2249 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2250 KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: " 
2251 "ntasks=%d head=%u tail=%u\n", 2252 gtid, thread_data->td.td_deque_ntasks, 2253 thread_data->td.td_deque_head, 2254 thread_data->td.td_deque_tail)); 2255 return NULL; 2256 } 2257 } 2258 } 2259 2260 thread_data->td.td_deque_tail = tail; 2261 TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1); 2262 2263 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2264 2265 KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d task %p removed: " 2266 "ntasks=%d head=%u tail=%u\n", 2267 gtid, taskdata, thread_data->td.td_deque_ntasks, 2268 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2269 2270 task = KMP_TASKDATA_TO_TASK(taskdata); 2271 return task; 2272 } 2273 2274 // __kmp_steal_task: remove a task from another thread's deque 2275 // Assume that calling thread has already checked existence of 2276 // task_team thread_data before calling this routine. 2277 static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid, 2278 kmp_task_team_t *task_team, 2279 std::atomic<kmp_int32> *unfinished_threads, 2280 int *thread_finished, 2281 kmp_int32 is_constrained) { 2282 kmp_task_t *task; 2283 kmp_taskdata_t *taskdata; 2284 kmp_taskdata_t *current; 2285 kmp_thread_data_t *victim_td, *threads_data; 2286 kmp_int32 level, target; 2287 kmp_int32 victim_tid; 2288 2289 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 2290 2291 threads_data = task_team->tt.tt_threads_data; 2292 KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition 2293 2294 victim_tid = victim_thr->th.th_info.ds.ds_tid; 2295 victim_td = &threads_data[victim_tid]; 2296 2297 KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: " 2298 "task_team=%p ntasks=%d head=%u tail=%u\n", 2299 gtid, __kmp_gtid_from_thread(victim_thr), task_team, 2300 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head, 2301 victim_td->td.td_deque_tail)); 2302 2303 if (TCR_4(victim_td->td.td_deque_ntasks) == 0) { 2304 KA_TRACE(10, 
("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: " 2305 "task_team=%p ntasks=%d head=%u tail=%u\n", 2306 gtid, __kmp_gtid_from_thread(victim_thr), task_team, 2307 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head, 2308 victim_td->td.td_deque_tail)); 2309 return NULL; 2310 } 2311 2312 __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock); 2313 2314 int ntasks = TCR_4(victim_td->td.td_deque_ntasks); 2315 // Check again after we acquire the lock 2316 if (ntasks == 0) { 2317 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); 2318 KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: " 2319 "task_team=%p ntasks=%d head=%u tail=%u\n", 2320 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks, 2321 victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); 2322 return NULL; 2323 } 2324 2325 KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL); 2326 2327 taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head]; 2328 if (is_constrained && (taskdata->td_flags.tiedness == TASK_TIED)) { 2329 // we need to check if the candidate obeys task scheduling constraint (TSC) 2330 // only descendant of all deferred tied tasks can be scheduled, checking 2331 // the last one is enough, as it in turn is the descendant of all others 2332 current = __kmp_threads[gtid]->th.th_current_task->td_last_tied; 2333 KMP_DEBUG_ASSERT(current != NULL); 2334 // check if last tied task is not suspended on barrier 2335 if (current->td_flags.tasktype == TASK_EXPLICIT || 2336 current->td_taskwait_thread > 0) { // <= 0 on barrier 2337 level = current->td_level; 2338 kmp_taskdata_t *parent = taskdata->td_parent; 2339 while (parent != current && parent->td_level > level) { 2340 parent = parent->td_parent; // check generation up to the level of the 2341 // current task 2342 KMP_DEBUG_ASSERT(parent != NULL); 2343 } 2344 if (parent != current) { 2345 if (!task_team->tt.tt_untied_task_encountered) { 2346 // The TSC does not allow to steal victim task 
2347 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); 2348 KA_TRACE(10, 2349 ("__kmp_steal_task(exit #3): T#%d could not steal from " 2350 "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n", 2351 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks, 2352 victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); 2353 return NULL; 2354 } 2355 taskdata = NULL; // will check other tasks in victim's deque 2356 } 2357 } 2358 } 2359 if (taskdata != NULL) { 2360 // Bump head pointer and Wrap. 2361 victim_td->td.td_deque_head = 2362 (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td); 2363 } else { 2364 int i; 2365 // walk through victim's deque trying to steal any task 2366 target = victim_td->td.td_deque_head; 2367 for (i = 1; i < ntasks; ++i) { 2368 target = (target + 1) & TASK_DEQUE_MASK(victim_td->td); 2369 taskdata = victim_td->td.td_deque[target]; 2370 if (taskdata->td_flags.tiedness == TASK_TIED) { 2371 // check if the candidate obeys the TSC 2372 kmp_taskdata_t *parent = taskdata->td_parent; 2373 // check generation up to the level of the current task 2374 while (parent != current && parent->td_level > level) { 2375 parent = parent->td_parent; 2376 KMP_DEBUG_ASSERT(parent != NULL); 2377 } 2378 if (parent != current) { 2379 // The TSC does not allow to steal the candidate 2380 taskdata = NULL; 2381 continue; 2382 } else { 2383 // found victim tied task 2384 break; 2385 } 2386 } else { 2387 // found victim untied task 2388 break; 2389 } 2390 } 2391 if (taskdata == NULL) { 2392 // No appropriate candidate to steal found 2393 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); 2394 KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from " 2395 "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n", 2396 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks, 2397 victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); 2398 return NULL; 2399 } 2400 int prev = target; 2401 for (i = i + 1; i < ntasks; ++i) { 2402 // 
shift remaining tasks in the deque left by 1 2403 target = (target + 1) & TASK_DEQUE_MASK(victim_td->td); 2404 victim_td->td.td_deque[prev] = victim_td->td.td_deque[target]; 2405 prev = target; 2406 } 2407 KMP_DEBUG_ASSERT( 2408 victim_td->td.td_deque_tail == 2409 (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td))); 2410 victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)) 2411 } 2412 if (*thread_finished) { 2413 // We need to un-mark this victim as a finished victim. This must be done 2414 // before releasing the lock, or else other threads (starting with the 2415 // master victim) might be prematurely released from the barrier!!! 2416 kmp_int32 count; 2417 2418 count = KMP_ATOMIC_INC(unfinished_threads); 2419 2420 KA_TRACE( 2421 20, 2422 ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n", 2423 gtid, count + 1, task_team)); 2424 2425 *thread_finished = FALSE; 2426 } 2427 TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1); 2428 2429 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); 2430 2431 KMP_COUNT_BLOCK(TASK_stolen); 2432 KA_TRACE(10, 2433 ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: " 2434 "task_team=%p ntasks=%d head=%u tail=%u\n", 2435 gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team, 2436 ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); 2437 2438 task = KMP_TASKDATA_TO_TASK(taskdata); 2439 return task; 2440 } 2441 2442 // __kmp_execute_tasks_template: Choose and execute tasks until either the 2443 // condition is statisfied (return true) or there are none left (return false). 2444 // 2445 // final_spin is TRUE if this is the spin at the release barrier. 2446 // thread_finished indicates whether the thread is finished executing all 2447 // the tasks it has on its deque, and is at the release barrier. 2448 // spinner is the location on which to spin. 2449 // spinner == NULL means only execute a single task and return. 
2450 // checker is the value to check to terminate the spin. 2451 template <class C> 2452 static inline int __kmp_execute_tasks_template( 2453 kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin, 2454 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), 2455 kmp_int32 is_constrained) { 2456 kmp_task_team_t *task_team = thread->th.th_task_team; 2457 kmp_thread_data_t *threads_data; 2458 kmp_task_t *task; 2459 kmp_info_t *other_thread; 2460 kmp_taskdata_t *current_task = thread->th.th_current_task; 2461 std::atomic<kmp_int32> *unfinished_threads; 2462 kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0, 2463 tid = thread->th.th_info.ds.ds_tid; 2464 2465 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 2466 KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]); 2467 2468 if (task_team == NULL) 2469 return FALSE; 2470 2471 KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d " 2472 "*thread_finished=%d\n", 2473 gtid, final_spin, *thread_finished)); 2474 2475 thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 2476 threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data); 2477 KMP_DEBUG_ASSERT(threads_data != NULL); 2478 2479 nthreads = task_team->tt.tt_nproc; 2480 unfinished_threads = &(task_team->tt.tt_unfinished_threads); 2481 #if OMP_45_ENABLED 2482 KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks); 2483 #else 2484 KMP_DEBUG_ASSERT(nthreads > 1); 2485 #endif 2486 KMP_DEBUG_ASSERT(*unfinished_threads >= 0); 2487 2488 while (1) { // Outer loop keeps trying to find tasks in case of single thread 2489 // getting tasks from target constructs 2490 while (1) { // Inner loop to find a task and execute it 2491 task = NULL; 2492 if (use_own_tasks) { // check on own queue first 2493 task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained); 2494 } 2495 if ((task == NULL) && (nthreads > 1)) { // Steal a task 2496 int asleep = 1; 2497 use_own_tasks = 0; 2498 // Try to steal 
from the last place I stole from successfully. 2499 if (victim_tid == -2) { // haven't stolen anything yet 2500 victim_tid = threads_data[tid].td.td_deque_last_stolen; 2501 if (victim_tid != 2502 -1) // if we have a last stolen from victim, get the thread 2503 other_thread = threads_data[victim_tid].td.td_thr; 2504 } 2505 if (victim_tid != -1) { // found last victim 2506 asleep = 0; 2507 } else if (!new_victim) { // no recent steals and we haven't already 2508 // used a new victim; select a random thread 2509 do { // Find a different thread to steal work from. 2510 // Pick a random thread. Initial plan was to cycle through all the 2511 // threads, and only return if we tried to steal from every thread, 2512 // and failed. Arch says that's not such a great idea. 2513 victim_tid = __kmp_get_random(thread) % (nthreads - 1); 2514 if (victim_tid >= tid) { 2515 ++victim_tid; // Adjusts random distribution to exclude self 2516 } 2517 // Found a potential victim 2518 other_thread = threads_data[victim_tid].td.td_thr; 2519 // There is a slight chance that __kmp_enable_tasking() did not wake 2520 // up all threads waiting at the barrier. If victim is sleeping, 2521 // then wake it up. Since we were going to pay the cache miss 2522 // penalty for referencing another thread's kmp_info_t struct 2523 // anyway, 2524 // the check shouldn't cost too much performance at this point. In 2525 // extra barrier mode, tasks do not sleep at the separate tasking 2526 // barrier, so this isn't a problem. 2527 asleep = 0; 2528 if ((__kmp_tasking_mode == tskm_task_teams) && 2529 (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) && 2530 (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) != 2531 NULL)) { 2532 asleep = 1; 2533 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread), 2534 other_thread->th.th_sleep_loc); 2535 // A sleeping thread should not have any tasks on it's queue. 
2536 // There is a slight possibility that it resumes, steals a task 2537 // from another thread, which spawns more tasks, all in the time 2538 // that it takes this thread to check => don't write an assertion 2539 // that the victim's queue is empty. Try stealing from a 2540 // different thread. 2541 } 2542 } while (asleep); 2543 } 2544 2545 if (!asleep) { 2546 // We have a victim to try to steal from 2547 task = __kmp_steal_task(other_thread, gtid, task_team, 2548 unfinished_threads, thread_finished, 2549 is_constrained); 2550 } 2551 if (task != NULL) { // set last stolen to victim 2552 if (threads_data[tid].td.td_deque_last_stolen != victim_tid) { 2553 threads_data[tid].td.td_deque_last_stolen = victim_tid; 2554 // The pre-refactored code did not try more than 1 successful new 2555 // vicitm, unless the last one generated more local tasks; 2556 // new_victim keeps track of this 2557 new_victim = 1; 2558 } 2559 } else { // No tasks found; unset last_stolen 2560 KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1); 2561 victim_tid = -2; // no successful victim found 2562 } 2563 } 2564 2565 if (task == NULL) // break out of tasking loop 2566 break; 2567 2568 // Found a task; execute it 2569 #if USE_ITT_BUILD && USE_ITT_NOTIFY 2570 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { 2571 if (itt_sync_obj == NULL) { // we are at fork barrier where we could not 2572 // get the object reliably 2573 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 2574 } 2575 __kmp_itt_task_starting(itt_sync_obj); 2576 } 2577 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 2578 __kmp_invoke_task(gtid, task, current_task); 2579 #if USE_ITT_BUILD 2580 if (itt_sync_obj != NULL) 2581 __kmp_itt_task_finished(itt_sync_obj); 2582 #endif /* USE_ITT_BUILD */ 2583 // If this thread is only partway through the barrier and the condition is 2584 // met, then return now, so that the barrier gather/release pattern can 2585 // proceed. 
If this thread is in the last spin loop in the barrier, 2586 // waiting to be released, we know that the termination condition will not 2587 // be satisified, so don't waste any cycles checking it. 2588 if (flag == NULL || (!final_spin && flag->done_check())) { 2589 KA_TRACE( 2590 15, 2591 ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n", 2592 gtid)); 2593 return TRUE; 2594 } 2595 if (thread->th.th_task_team == NULL) { 2596 break; 2597 } 2598 // Yield before executing next task 2599 KMP_YIELD(__kmp_library == library_throughput); 2600 // If execution of a stolen task results in more tasks being placed on our 2601 // run queue, reset use_own_tasks 2602 if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) { 2603 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned " 2604 "other tasks, restart\n", 2605 gtid)); 2606 use_own_tasks = 1; 2607 new_victim = 0; 2608 } 2609 } 2610 2611 // The task source has been exhausted. If in final spin loop of barrier, check 2612 // if termination condition is satisfied. 2613 #if OMP_45_ENABLED 2614 // The work queue may be empty but there might be proxy tasks still 2615 // executing 2616 if (final_spin && 2617 KMP_ATOMIC_LD_ACQ(¤t_task->td_incomplete_child_tasks) == 0) 2618 #else 2619 if (final_spin) 2620 #endif 2621 { 2622 // First, decrement the #unfinished threads, if that has not already been 2623 // done. This decrement might be to the spin location, and result in the 2624 // termination condition being satisfied. 2625 if (!*thread_finished) { 2626 kmp_int32 count; 2627 2628 count = KMP_ATOMIC_DEC(unfinished_threads) - 1; 2629 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec " 2630 "unfinished_threads to %d task_team=%p\n", 2631 gtid, count, task_team)); 2632 *thread_finished = TRUE; 2633 } 2634 2635 // It is now unsafe to reference thread->th.th_team !!! 
2636 // Decrementing task_team->tt.tt_unfinished_threads can allow the master 2637 // thread to pass through the barrier, where it might reset each thread's 2638 // th.th_team field for the next parallel region. If we can steal more 2639 // work, we know that this has not happened yet. 2640 if (flag != NULL && flag->done_check()) { 2641 KA_TRACE( 2642 15, 2643 ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n", 2644 gtid)); 2645 return TRUE; 2646 } 2647 } 2648 2649 // If this thread's task team is NULL, master has recognized that there are 2650 // no more tasks; bail out 2651 if (thread->th.th_task_team == NULL) { 2652 KA_TRACE(15, 2653 ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid)); 2654 return FALSE; 2655 } 2656 2657 #if OMP_45_ENABLED 2658 // We could be getting tasks from target constructs; if this is the only 2659 // thread, keep trying to execute tasks from own queue 2660 if (nthreads == 1) 2661 use_own_tasks = 1; 2662 else 2663 #endif 2664 { 2665 KA_TRACE(15, 2666 ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid)); 2667 return FALSE; 2668 } 2669 } 2670 } 2671 2672 int __kmp_execute_tasks_32( 2673 kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin, 2674 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), 2675 kmp_int32 is_constrained) { 2676 return __kmp_execute_tasks_template( 2677 thread, gtid, flag, final_spin, 2678 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); 2679 } 2680 2681 int __kmp_execute_tasks_64( 2682 kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin, 2683 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), 2684 kmp_int32 is_constrained) { 2685 return __kmp_execute_tasks_template( 2686 thread, gtid, flag, final_spin, 2687 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); 2688 } 2689 2690 int __kmp_execute_tasks_oncore( 2691 kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin, 2692 int 
*thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), 2693 kmp_int32 is_constrained) { 2694 return __kmp_execute_tasks_template( 2695 thread, gtid, flag, final_spin, 2696 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); 2697 } 2698 2699 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the 2700 // next barrier so they can assist in executing enqueued tasks. 2701 // First thread in allocates the task team atomically. 2702 static void __kmp_enable_tasking(kmp_task_team_t *task_team, 2703 kmp_info_t *this_thr) { 2704 kmp_thread_data_t *threads_data; 2705 int nthreads, i, is_init_thread; 2706 2707 KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n", 2708 __kmp_gtid_from_thread(this_thr))); 2709 2710 KMP_DEBUG_ASSERT(task_team != NULL); 2711 KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL); 2712 2713 nthreads = task_team->tt.tt_nproc; 2714 KMP_DEBUG_ASSERT(nthreads > 0); 2715 KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc); 2716 2717 // Allocate or increase the size of threads_data if necessary 2718 is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team); 2719 2720 if (!is_init_thread) { 2721 // Some other thread already set up the array. 2722 KA_TRACE( 2723 20, 2724 ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n", 2725 __kmp_gtid_from_thread(this_thr))); 2726 return; 2727 } 2728 threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data); 2729 KMP_DEBUG_ASSERT(threads_data != NULL); 2730 2731 if ((__kmp_tasking_mode == tskm_task_teams) && 2732 (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) { 2733 // Release any threads sleeping at the barrier, so that they can steal 2734 // tasks and execute them. In extra barrier mode, tasks do not sleep 2735 // at the separate tasking barrier, so this isn't a problem. 
2736 for (i = 0; i < nthreads; i++) { 2737 volatile void *sleep_loc; 2738 kmp_info_t *thread = threads_data[i].td.td_thr; 2739 2740 if (i == this_thr->th.th_info.ds.ds_tid) { 2741 continue; 2742 } 2743 // Since we haven't locked the thread's suspend mutex lock at this 2744 // point, there is a small window where a thread might be putting 2745 // itself to sleep, but hasn't set the th_sleep_loc field yet. 2746 // To work around this, __kmp_execute_tasks_template() periodically checks 2747 // see if other threads are sleeping (using the same random mechanism that 2748 // is used for task stealing) and awakens them if they are. 2749 if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) != 2750 NULL) { 2751 KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n", 2752 __kmp_gtid_from_thread(this_thr), 2753 __kmp_gtid_from_thread(thread))); 2754 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc); 2755 } else { 2756 KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n", 2757 __kmp_gtid_from_thread(this_thr), 2758 __kmp_gtid_from_thread(thread))); 2759 } 2760 } 2761 } 2762 2763 KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n", 2764 __kmp_gtid_from_thread(this_thr))); 2765 } 2766 2767 /* // TODO: Check the comment consistency 2768 * Utility routines for "task teams". A task team (kmp_task_t) is kind of 2769 * like a shadow of the kmp_team_t data struct, with a different lifetime. 2770 * After a child * thread checks into a barrier and calls __kmp_release() from 2771 * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no 2772 * longer assume that the kmp_team_t structure is intact (at any moment, the 2773 * master thread may exit the barrier code and free the team data structure, 2774 * and return the threads to the thread pool). 
 *
 * This does not work with the tasking code, as the thread is still
 * expected to participate in the execution of any tasks that may have been
 * spawned by a member of the team, and the thread still needs access to
 * each thread in the team, so that it can steal work from it.
 *
 * Enter the existence of the kmp_task_team_t struct. It employs a reference
 * counting mechanism, and is allocated by the master thread before calling
 * __kmp_<barrier_kind>_release, and then is released by the last thread to
 * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
 * of the kmp_task_team_t structs for consecutive barriers can overlap
 * (and will, unless the master thread is the last thread to exit the barrier
 * release phase, which is not typical).
 *
 * The existence of such a struct is useful outside the context of tasking,
 * but for now, I'm trying to keep it specific to the OMP_30_ENABLED macro,
 * so that any performance differences show up when comparing the 2.5 vs. 3.0
 * libraries.
 *
 * We currently use the existence of the threads array as an indicator that
 * tasks were spawned since the last barrier. If the structure is to be
 * useful outside the context of tasking, then this will have to change, but
 * not setting the field minimizes the performance impact of tasking on
 * barriers, when no explicit tasks were spawned (pushed, actually).
 */

// Head of the singly linked (through tt.tt_next) list of recycled task teams.
// It is modified only while __kmp_task_team_lock is held.
static kmp_task_team_t *__kmp_free_task_teams =
    NULL; // Free list for task_team data structures
// Lock for task team data structures
kmp_bootstrap_lock_t __kmp_task_team_lock =
    KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);

// __kmp_alloc_task_deque:
// Allocates a task deque for a particular thread, and initialize the necessary
// data structures relating to the deque.
This only happens once per thread 2810 // per task team since task teams are recycled. No lock is needed during 2811 // allocation since each thread allocates its own deque. 2812 static void __kmp_alloc_task_deque(kmp_info_t *thread, 2813 kmp_thread_data_t *thread_data) { 2814 __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock); 2815 KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL); 2816 2817 // Initialize last stolen task field to "none" 2818 thread_data->td.td_deque_last_stolen = -1; 2819 2820 KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0); 2821 KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0); 2822 KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0); 2823 2824 KE_TRACE( 2825 10, 2826 ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n", 2827 __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data)); 2828 // Allocate space for task deque, and zero the deque 2829 // Cannot use __kmp_thread_calloc() because threads not around for 2830 // kmp_reap_task_team( ). 2831 thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate( 2832 INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *)); 2833 thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE; 2834 } 2835 2836 // __kmp_realloc_task_deque: 2837 // Re-allocates a task deque for a particular thread, copies the content from 2838 // the old deque and adjusts the necessary data structures relating to the 2839 // deque. 
This operation must be done with a the deque_lock being held 2840 static void __kmp_realloc_task_deque(kmp_info_t *thread, 2841 kmp_thread_data_t *thread_data) { 2842 kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td); 2843 kmp_int32 new_size = 2 * size; 2844 2845 KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to " 2846 "%d] for thread_data %p\n", 2847 __kmp_gtid_from_thread(thread), size, new_size, thread_data)); 2848 2849 kmp_taskdata_t **new_deque = 2850 (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *)); 2851 2852 int i, j; 2853 for (i = thread_data->td.td_deque_head, j = 0; j < size; 2854 i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++) 2855 new_deque[j] = thread_data->td.td_deque[i]; 2856 2857 __kmp_free(thread_data->td.td_deque); 2858 2859 thread_data->td.td_deque_head = 0; 2860 thread_data->td.td_deque_tail = size; 2861 thread_data->td.td_deque = new_deque; 2862 thread_data->td.td_deque_size = new_size; 2863 } 2864 2865 // __kmp_free_task_deque: 2866 // Deallocates a task deque for a particular thread. Happens at library 2867 // deallocation so don't need to reset all thread data fields. 
2868 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) { 2869 if (thread_data->td.td_deque != NULL) { 2870 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 2871 TCW_4(thread_data->td.td_deque_ntasks, 0); 2872 __kmp_free(thread_data->td.td_deque); 2873 thread_data->td.td_deque = NULL; 2874 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2875 } 2876 2877 #ifdef BUILD_TIED_TASK_STACK 2878 // GEH: Figure out what to do here for td_susp_tied_tasks 2879 if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) { 2880 __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data); 2881 } 2882 #endif // BUILD_TIED_TASK_STACK 2883 } 2884 2885 // __kmp_realloc_task_threads_data: 2886 // Allocates a threads_data array for a task team, either by allocating an 2887 // initial array or enlarging an existing array. Only the first thread to get 2888 // the lock allocs or enlarges the array and re-initializes the array eleemnts. 2889 // That thread returns "TRUE", the rest return "FALSE". 2890 // Assumes that the new array size is given by task_team -> tt.tt_nproc. 2891 // The current size is given by task_team -> tt.tt_max_threads. 2892 static int __kmp_realloc_task_threads_data(kmp_info_t *thread, 2893 kmp_task_team_t *task_team) { 2894 kmp_thread_data_t **threads_data_p; 2895 kmp_int32 nthreads, maxthreads; 2896 int is_init_thread = FALSE; 2897 2898 if (TCR_4(task_team->tt.tt_found_tasks)) { 2899 // Already reallocated and initialized. 2900 return FALSE; 2901 } 2902 2903 threads_data_p = &task_team->tt.tt_threads_data; 2904 nthreads = task_team->tt.tt_nproc; 2905 maxthreads = task_team->tt.tt_max_threads; 2906 2907 // All threads must lock when they encounter the first task of the implicit 2908 // task region to make sure threads_data fields are (re)initialized before 2909 // used. 
2910 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock); 2911 2912 if (!TCR_4(task_team->tt.tt_found_tasks)) { 2913 // first thread to enable tasking 2914 kmp_team_t *team = thread->th.th_team; 2915 int i; 2916 2917 is_init_thread = TRUE; 2918 if (maxthreads < nthreads) { 2919 2920 if (*threads_data_p != NULL) { 2921 kmp_thread_data_t *old_data = *threads_data_p; 2922 kmp_thread_data_t *new_data = NULL; 2923 2924 KE_TRACE( 2925 10, 2926 ("__kmp_realloc_task_threads_data: T#%d reallocating " 2927 "threads data for task_team %p, new_size = %d, old_size = %d\n", 2928 __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads)); 2929 // Reallocate threads_data to have more elements than current array 2930 // Cannot use __kmp_thread_realloc() because threads not around for 2931 // kmp_reap_task_team( ). Note all new array entries are initialized 2932 // to zero by __kmp_allocate(). 2933 new_data = (kmp_thread_data_t *)__kmp_allocate( 2934 nthreads * sizeof(kmp_thread_data_t)); 2935 // copy old data to new data 2936 KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t), 2937 (void *)old_data, maxthreads * sizeof(kmp_thread_data_t)); 2938 2939 #ifdef BUILD_TIED_TASK_STACK 2940 // GEH: Figure out if this is the right thing to do 2941 for (i = maxthreads; i < nthreads; i++) { 2942 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 2943 __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data); 2944 } 2945 #endif // BUILD_TIED_TASK_STACK 2946 // Install the new data and free the old data 2947 (*threads_data_p) = new_data; 2948 __kmp_free(old_data); 2949 } else { 2950 KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating " 2951 "threads data for task_team %p, size = %d\n", 2952 __kmp_gtid_from_thread(thread), task_team, nthreads)); 2953 // Make the initial allocate for threads_data array, and zero entries 2954 // Cannot use __kmp_thread_calloc() because threads not around for 2955 // kmp_reap_task_team( ). 
2956 ANNOTATE_IGNORE_WRITES_BEGIN(); 2957 *threads_data_p = (kmp_thread_data_t *)__kmp_allocate( 2958 nthreads * sizeof(kmp_thread_data_t)); 2959 ANNOTATE_IGNORE_WRITES_END(); 2960 #ifdef BUILD_TIED_TASK_STACK 2961 // GEH: Figure out if this is the right thing to do 2962 for (i = 0; i < nthreads; i++) { 2963 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 2964 __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data); 2965 } 2966 #endif // BUILD_TIED_TASK_STACK 2967 } 2968 task_team->tt.tt_max_threads = nthreads; 2969 } else { 2970 // If array has (more than) enough elements, go ahead and use it 2971 KMP_DEBUG_ASSERT(*threads_data_p != NULL); 2972 } 2973 2974 // initialize threads_data pointers back to thread_info structures 2975 for (i = 0; i < nthreads; i++) { 2976 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 2977 thread_data->td.td_thr = team->t.t_threads[i]; 2978 2979 if (thread_data->td.td_deque_last_stolen >= nthreads) { 2980 // The last stolen field survives across teams / barrier, and the number 2981 // of threads may have changed. It's possible (likely?) that a new 2982 // parallel region will exhibit the same behavior as previous region. 2983 thread_data->td.td_deque_last_stolen = -1; 2984 } 2985 } 2986 2987 KMP_MB(); 2988 TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE); 2989 } 2990 2991 __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock); 2992 return is_init_thread; 2993 } 2994 2995 // __kmp_free_task_threads_data: 2996 // Deallocates a threads_data array for a task team, including any attached 2997 // tasking deques. Only occurs at library shutdown. 
2998 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) { 2999 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock); 3000 if (task_team->tt.tt_threads_data != NULL) { 3001 int i; 3002 for (i = 0; i < task_team->tt.tt_max_threads; i++) { 3003 __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]); 3004 } 3005 __kmp_free(task_team->tt.tt_threads_data); 3006 task_team->tt.tt_threads_data = NULL; 3007 } 3008 __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock); 3009 } 3010 3011 // __kmp_allocate_task_team: 3012 // Allocates a task team associated with a specific team, taking it from 3013 // the global task team free list if possible. Also initializes data 3014 // structures. 3015 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread, 3016 kmp_team_t *team) { 3017 kmp_task_team_t *task_team = NULL; 3018 int nthreads; 3019 3020 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n", 3021 (thread ? __kmp_gtid_from_thread(thread) : -1), team)); 3022 3023 if (TCR_PTR(__kmp_free_task_teams) != NULL) { 3024 // Take a task team from the task team pool 3025 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 3026 if (__kmp_free_task_teams != NULL) { 3027 task_team = __kmp_free_task_teams; 3028 TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next); 3029 task_team->tt.tt_next = NULL; 3030 } 3031 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 3032 } 3033 3034 if (task_team == NULL) { 3035 KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating " 3036 "task team for team %p\n", 3037 __kmp_gtid_from_thread(thread), team)); 3038 // Allocate a new task team if one is not available. 3039 // Cannot use __kmp_thread_malloc() because threads not around for 3040 // kmp_reap_task_team( ). 
3041 task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t)); 3042 __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock); 3043 // AC: __kmp_allocate zeroes returned memory 3044 // task_team -> tt.tt_threads_data = NULL; 3045 // task_team -> tt.tt_max_threads = 0; 3046 // task_team -> tt.tt_next = NULL; 3047 } 3048 3049 TCW_4(task_team->tt.tt_found_tasks, FALSE); 3050 #if OMP_45_ENABLED 3051 TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); 3052 #endif 3053 task_team->tt.tt_nproc = nthreads = team->t.t_nproc; 3054 3055 KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads); 3056 TCW_4(task_team->tt.tt_active, TRUE); 3057 3058 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p " 3059 "unfinished_threads init'd to %d\n", 3060 (thread ? __kmp_gtid_from_thread(thread) : -1), task_team, 3061 KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads))); 3062 return task_team; 3063 } 3064 3065 // __kmp_free_task_team: 3066 // Frees the task team associated with a specific thread, and adds it 3067 // to the global task team free list. 3068 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) { 3069 KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n", 3070 thread ? __kmp_gtid_from_thread(thread) : -1, task_team)); 3071 3072 // Put task team back on free list 3073 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 3074 3075 KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL); 3076 task_team->tt.tt_next = __kmp_free_task_teams; 3077 TCW_PTR(__kmp_free_task_teams, task_team); 3078 3079 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 3080 } 3081 3082 // __kmp_reap_task_teams: 3083 // Free all the task teams on the task team free list. 3084 // Should only be done during library shutdown. 3085 // Cannot do anything that needs a thread structure or gtid since they are 3086 // already gone. 
void __kmp_reap_task_teams(void) {
  kmp_task_team_t *task_team;

  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
    // Free all task_teams on the free list
    __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
    // Pop one entry at a time: unlink it from the list head, release its
    // threads_data (and deques), then release the task team itself.
    while ((task_team = __kmp_free_task_teams) != NULL) {
      __kmp_free_task_teams = task_team->tt.tt_next;
      task_team->tt.tt_next = NULL;

      // Free threads_data if necessary
      if (task_team->tt.tt_threads_data != NULL) {
        __kmp_free_task_threads_data(task_team);
      }
      __kmp_free(task_team);
    }
    __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
  }
}

// __kmp_wait_to_unref_task_teams:
// Some threads could still be in the fork barrier release code, possibly
// trying to steal tasks.  Wait for each thread to unreference its task team.
//
// Repeatedly walks __kmp_thread_pool until every thread on it has a NULL
// th_task_team, waking up sleeping threads and yielding between passes.
void __kmp_wait_to_unref_task_teams(void) {
  kmp_info_t *thread;
  kmp_uint32 spins;
  int done;

  KMP_INIT_YIELD(spins);

  for (;;) {
    done = TRUE;

    // TODO: GEH - this may be wrong because some sync would be necessary
    // in case threads are added to the pool during the traversal. Need to
    // verify that lock for thread pool is held when calling this routine.
    for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
         thread = thread->th.th_next_pool) {
#if KMP_OS_WINDOWS
      DWORD exit_val;
#endif
      if (TCR_PTR(thread->th.th_task_team) == NULL) {
        KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
                      __kmp_gtid_from_thread(thread)));
        continue;
      }
#if KMP_OS_WINDOWS
      // TODO: GEH - add this check for Linux* OS / OS X* as well?
      // A thread that is no longer alive will never clear its own pointer, so
      // clear it on its behalf.
      if (!__kmp_is_thread_alive(thread, &exit_val)) {
        thread->th.th_task_team = NULL;
        continue;
      }
#endif

      done = FALSE; // Because th_task_team pointer is not NULL for this thread

      KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
                    "unreference task_team\n",
                    __kmp_gtid_from_thread(thread)));

      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
        volatile void *sleep_loc;
        // If the thread is sleeping, awaken it.
        if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
            NULL) {
          // NOTE(review): both T#%d arguments below are the gtid of the thread
          // being woken; the first was presumably meant to identify the
          // caller -- confirm.
          KA_TRACE(
              10,
              ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
               __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
          __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
        }
      }
    }
    if (done) {
      break;
    }

    // If we are oversubscribed, or have waited a bit (and library mode is
    // throughput), yield. Pause is in the following code.
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins); // Yields only if KMP_LIBRARY=throughput
  }
}

// __kmp_task_team_setup:  Create a task_team for the current team, but use
// an already created, unused one if it already exists.
//
// this_thr: calling thread; its th_task_state selects which of the team's two
//           task team slots is "current" and which is the "other" one.
// team:     team whose t_task_team[] slots are populated.
// always:   when non-zero, the current slot is allocated even if the team has
//           only one thread.
void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);

  // If this task_team hasn't been created yet, allocate it. It will be used in
  // the region after the next.
  // If it exists, it is the current task team and shouldn't be touched yet as
  // it may still be in use.
  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
      (always || team->t.t_nproc > 1)) {
    team->t.t_task_team[this_thr->th.th_task_state] =
        __kmp_allocate_task_team(this_thr, team);
    KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p "
                  "for team %d at parity=%d\n",
                  __kmp_gtid_from_thread(this_thr),
                  team->t.t_task_team[this_thr->th.th_task_state],
                  ((team != NULL) ? team->t.t_id : -1),
                  this_thr->th.th_task_state));
  }

  // After threads exit the release, they will call sync, and then point to this
  // other task_team; make sure it is allocated and properly initialized. As
  // threads spin in the barrier release phase, they will continue to use the
  // previous task_team struct(above), until they receive the signal to stop
  // checking for tasks (they can't safely reference the kmp_team_t struct,
  // which could be reallocated by the master thread). No task teams are formed
  // for serialized teams.
  if (team->t.t_nproc > 1) {
    int other_team = 1 - this_thr->th.th_task_state;
    if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
      team->t.t_task_team[other_team] =
          __kmp_allocate_task_team(this_thr, team);
      KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new "
                    "task_team %p for team %d at parity=%d\n",
                    __kmp_gtid_from_thread(this_thr),
                    team->t.t_task_team[other_team],
                    ((team != NULL) ? team->t.t_id : -1), other_team));
    } else { // Leave the old task team struct in place for the upcoming region;
      // adjust as needed
      kmp_task_team_t *task_team = team->t.t_task_team[other_team];
      // Re-arm the existing struct only if it was deactivated or the team
      // size changed; otherwise it is left untouched.
      if (!task_team->tt.tt_active ||
          team->t.t_nproc != task_team->tt.tt_nproc) {
        TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
        TCW_4(task_team->tt.tt_found_tasks, FALSE);
#if OMP_45_ENABLED
        TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
#endif
        KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
                          team->t.t_nproc);
        TCW_4(task_team->tt.tt_active, TRUE);
      }
      // if team size has changed, the first thread to enable tasking will
      // realloc threads_data if necessary
      KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team "
                    "%p for team %d at parity=%d\n",
                    __kmp_gtid_from_thread(this_thr),
                    team->t.t_task_team[other_team],
                    ((team != NULL) ? team->t.t_id : -1), other_team));
    }
  }
}

// __kmp_task_team_sync: Propagation of task team data from team to threads
// which happens just after the release phase of a team barrier.  This may be
// called by any thread, but only for teams with # threads > 1.
//
// this_thr: calling thread; its th_task_state is flipped and its th_task_team
//           pointer is updated.
// team:     team whose t_task_team[] slot (selected by the new state) is
//           copied into the thread.
void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);

  // Toggle the th_task_state field, to switch which task_team this thread
  // refers to
  this_thr->th.th_task_state = 1 - this_thr->th.th_task_state;
  // It is now safe to propagate the task team pointer from the team struct to
  // the current thread.
  TCW_PTR(this_thr->th.th_task_team,
          team->t.t_task_team[this_thr->th.th_task_state]);
  KA_TRACE(20,
           ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
            "%p from Team #%d (parity=%d)\n",
            __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
            ((team != NULL) ? team->t.t_id : -1), this_thr->th.th_task_state));
}

// __kmp_task_team_wait: Master thread waits for outstanding tasks after the
// barrier gather phase. Only called by master thread if #threads in team > 1 or
// if proxy tasks were created.
//
// wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
// by passing in 0 optionally as the last argument. When wait is zero, master
// thread does not wait for unfinished_threads to reach 0.
//
// Regardless of `wait`, when tasking was enabled on the current task team the
// team is deactivated (tt_active = FALSE) and the caller's th_task_team pointer
// is cleared.
void __kmp_task_team_wait(
    kmp_info_t *this_thr,
    kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
  kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];

  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);

  if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
    if (wait) {
      KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks "
                    "(for unfinished_threads to reach 0) on task_team = %p\n",
                    __kmp_gtid_from_thread(this_thr), task_team));
      // Worker threads may have dropped through to release phase, but could
      // still be executing tasks. Wait here for tasks to complete. To avoid
      // memory contention, only master thread checks termination condition.
      kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
                             &task_team->tt.tt_unfinished_threads),
                       0U);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    // Deactivate the old task team, so that the worker threads will stop
    // referencing it while spinning.
    KA_TRACE(
        20,
        ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: "
         "setting active to false, setting local and team's pointer to NULL\n",
         __kmp_gtid_from_thread(this_thr), task_team));
#if OMP_45_ENABLED
    KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
                     task_team->tt.tt_found_proxy_tasks == TRUE);
    TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
#else
    KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1);
#endif
    KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
    TCW_SYNC_4(task_team->tt.tt_active, FALSE);
    KMP_MB();

    // Only the calling thread's pointer is cleared here.
    TCW_PTR(this_thr->th.th_task_team, NULL);
  }
}

// __kmp_tasking_barrier:
// This routine may only be called when __kmp_tasking_mode ==
// tskm_extra_barrier. Internal function to execute all tasks prior to a
// regular barrier or a join barrier. It is a full barrier itself, which
// unfortunately turns regular barriers into double barriers and join barriers
// into 1 1/2 barriers.
//
// team:   team whose current task team's tt_unfinished_threads counter is the
//         spin location (the flag is built to check it against 0).
// thread: calling thread.
// gtid:   global thread id of the calling thread.
void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
  std::atomic<kmp_uint32> *spin = RCAST(
      std::atomic<kmp_uint32> *,
      &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
  int flag = FALSE;
  KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);

#if USE_ITT_BUILD
  KMP_FSYNC_SPIN_INIT(spin, NULL);
#endif /* USE_ITT_BUILD */
  kmp_flag_32 spin_flag(spin, 0U);
  // Keep executing tasks until execute_tasks() reports completion.
  // NOTE(review): the TRUE / &flag / 0 arguments presumably map to
  // final_spin / thread_finished / is_constrained of the
  // __kmp_execute_tasks_* routines -- confirm against kmp_wait_release.h.
  while (!spin_flag.execute_tasks(thread, gtid, TRUE,
                                  &flag USE_ITT_BUILD_ARG(NULL), 0)) {
#if USE_ITT_BUILD
    // TODO: What about itt_sync_obj??
    KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
#endif /* USE_ITT_BUILD */

    // Stop spinning if the library is shutting down; abort the thread if an
    // abort was requested.
    if (TCR_4(__kmp_global.g.g_done)) {
      if (__kmp_global.g.g_abort)
        __kmp_abort_thread();
      break;
    }
    KMP_YIELD(TRUE); // GH: We always yield here
  }
#if USE_ITT_BUILD
  KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
#endif /* USE_ITT_BUILD */
}

#if OMP_45_ENABLED

// __kmp_give_task puts a task into a given thread queue if:
//  - the queue for that thread was created
//  - there's space in that queue
// Because of this, __kmp_push_task needs to check if there's space after
// getting the lock
static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
                            kmp_int32 pass) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_task_team_t *task_team = taskdata->td_task_team;

  KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
                taskdata, tid));

  // If task_team is NULL something went really bad...
3355 KMP_DEBUG_ASSERT(task_team != NULL); 3356 3357 bool result = false; 3358 kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid]; 3359 3360 if (thread_data->td.td_deque == NULL) { 3361 // There's no queue in this thread, go find another one 3362 // We're guaranteed that at least one thread has a queue 3363 KA_TRACE(30, 3364 ("__kmp_give_task: thread %d has no queue while giving task %p.\n", 3365 tid, taskdata)); 3366 return result; 3367 } 3368 3369 if (TCR_4(thread_data->td.td_deque_ntasks) >= 3370 TASK_DEQUE_SIZE(thread_data->td)) { 3371 KA_TRACE( 3372 30, 3373 ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", 3374 taskdata, tid)); 3375 3376 // if this deque is bigger than the pass ratio give a chance to another 3377 // thread 3378 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass) 3379 return result; 3380 3381 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 3382 __kmp_realloc_task_deque(thread, thread_data); 3383 3384 } else { 3385 3386 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 3387 3388 if (TCR_4(thread_data->td.td_deque_ntasks) >= 3389 TASK_DEQUE_SIZE(thread_data->td)) { 3390 KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to " 3391 "thread %d.\n", 3392 taskdata, tid)); 3393 3394 // if this deque is bigger than the pass ratio give a chance to another 3395 // thread 3396 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass) 3397 goto release_and_exit; 3398 3399 __kmp_realloc_task_deque(thread, thread_data); 3400 } 3401 } 3402 3403 // lock is held here, and there is space in the deque 3404 3405 thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata; 3406 // Wrap index. 
3407 thread_data->td.td_deque_tail = 3408 (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td); 3409 TCW_4(thread_data->td.td_deque_ntasks, 3410 TCR_4(thread_data->td.td_deque_ntasks) + 1); 3411 3412 result = true; 3413 KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n", 3414 taskdata, tid)); 3415 3416 release_and_exit: 3417 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 3418 3419 return result; 3420 } 3421 3422 /* The finish of the proxy tasks is divided in two pieces: 3423 - the top half is the one that can be done from a thread outside the team 3424 - the bottom half must be run from a them within the team 3425 3426 In order to run the bottom half the task gets queued back into one of the 3427 threads of the team. Once the td_incomplete_child_task counter of the parent 3428 is decremented the threads can leave the barriers. So, the bottom half needs 3429 to be queued before the counter is decremented. The top half is therefore 3430 divided in two parts: 3431 - things that can be run before queuing the bottom half 3432 - things that must be run after queuing the bottom half 3433 3434 This creates a second race as the bottom half can free the task before the 3435 second top half is executed. To avoid this we use the 3436 td_incomplete_child_task of the proxy task to synchronize the top and bottom 3437 half. 
/* The finish of the proxy tasks is divided in two pieces:
    - the top half is the one that can be done from a thread outside the team
    - the bottom half must be run from a thread within the team

   In order to run the bottom half the task gets queued back into one of the
   threads of the team. Once the td_incomplete_child_task counter of the parent
   is decremented the threads can leave the barriers. So, the bottom half needs
   to be queued before the counter is decremented. The top half is therefore
   divided in two parts:
    - things that can be run before queuing the bottom half
    - things that must be run after queuing the bottom half

   This creates a second race as the bottom half can free the task before the
   second top half is executed. To avoid this we use the
   td_incomplete_child_task of the proxy task to synchronize the top and bottom
   half. */

// First part of the top half: marks the proxy task complete, removes it from
// its taskgroup count (if it has a taskgroup) and adds one artificial child so
// the bottom half cannot proceed until the second top half has run.
static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  taskdata->td_flags.complete = 1; // mark the task as completed

  if (taskdata->td_taskgroup)
    KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);

  // Create an imaginary child for this task so the bottom half cannot
  // release the task before we have completed the second top half
  KMP_ATOMIC_INC(&taskdata->td_incomplete_child_tasks);
}

// Second part of the top half: decrements the parent's incomplete-child
// counter and then removes the artificial child added by the first part.
static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
  kmp_int32 children = 0;

  // Predecrement simulated by "- 1" calculation
  children =
      KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
  // children is only consumed by this debug assertion
  KMP_DEBUG_ASSERT(children >= 0);

  // Remove the imaginary child
  KMP_ATOMIC_DEC(&taskdata->td_incomplete_child_tasks);
}

// Bottom half: spins until the proxy task's incomplete-child counter is no
// longer positive, then releases the task's dependences and frees the task
// (and ancestors) on behalf of thread gtid.
//
// gtid:  global thread id used to look up the executing thread
// ptask: proxy task being finished
static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
  kmp_info_t *thread = __kmp_threads[gtid];

  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
                   1); // top half must run before bottom half

  // We need to wait to make sure the top half is finished
  // Spinning here should be ok as this should happen quickly
  while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) > 0)
    ;

  __kmp_release_deps(gtid, taskdata);
  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
}
/*!
@ingroup TASKING
@param gtid Global Thread ID of encountering thread
@param ptask Task whose execution is completed

Execute the completion of a proxy task from a thread that is part of the
team. Both top halves and the bottom half are run directly, in that order, on
the calling thread.
*/
void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
  KMP_DEBUG_ASSERT(ptask != NULL);
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
  KA_TRACE(
      10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
           gtid, taskdata));

  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);

  // No re-queuing is needed here, so the three phases run back to back.
  __kmp_first_top_half_finish_proxy(taskdata);
  __kmp_second_top_half_finish_proxy(taskdata);
  __kmp_bottom_half_finish_proxy(gtid, ptask);

  KA_TRACE(10,
           ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
            gtid, taskdata));
}
3515 */ 3516 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) { 3517 KMP_DEBUG_ASSERT(ptask != NULL); 3518 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 3519 3520 KA_TRACE( 3521 10, 3522 ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n", 3523 taskdata)); 3524 3525 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); 3526 3527 __kmp_first_top_half_finish_proxy(taskdata); 3528 3529 // Enqueue task to complete bottom half completion from a thread within the 3530 // corresponding team 3531 kmp_team_t *team = taskdata->td_team; 3532 kmp_int32 nthreads = team->t.t_nproc; 3533 kmp_info_t *thread; 3534 3535 // This should be similar to start_k = __kmp_get_random( thread ) % nthreads 3536 // but we cannot use __kmp_get_random here 3537 kmp_int32 start_k = 0; 3538 kmp_int32 pass = 1; 3539 kmp_int32 k = start_k; 3540 3541 do { 3542 // For now we're just linearly trying to find a thread 3543 thread = team->t.t_threads[k]; 3544 k = (k + 1) % nthreads; 3545 3546 // we did a full pass through all the threads 3547 if (k == start_k) 3548 pass = pass << 1; 3549 3550 } while (!__kmp_give_task(thread, k, ptask, pass)); 3551 3552 __kmp_second_top_half_finish_proxy(taskdata); 3553 3554 KA_TRACE( 3555 10, 3556 ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n", 3557 taskdata)); 3558 } 3559 3560 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task 3561 // for taskloop 3562 // 3563 // thread: allocating thread 3564 // task_src: pointer to source task to be duplicated 3565 // returns: a pointer to the allocated kmp_task_t structure (task). 
3566 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) { 3567 kmp_task_t *task; 3568 kmp_taskdata_t *taskdata; 3569 kmp_taskdata_t *taskdata_src; 3570 kmp_taskdata_t *parent_task = thread->th.th_current_task; 3571 size_t shareds_offset; 3572 size_t task_size; 3573 3574 KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread, 3575 task_src)); 3576 taskdata_src = KMP_TASK_TO_TASKDATA(task_src); 3577 KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy == 3578 TASK_FULL); // it should not be proxy task 3579 KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT); 3580 task_size = taskdata_src->td_size_alloc; 3581 3582 // Allocate a kmp_taskdata_t block and a kmp_task_t block. 3583 KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread, 3584 task_size)); 3585 #if USE_FAST_MEMORY 3586 taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size); 3587 #else 3588 taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size); 3589 #endif /* USE_FAST_MEMORY */ 3590 KMP_MEMCPY(taskdata, taskdata_src, task_size); 3591 3592 task = KMP_TASKDATA_TO_TASK(taskdata); 3593 3594 // Initialize new task (only specific fields not affected by memcpy) 3595 taskdata->td_task_id = KMP_GEN_TASK_ID(); 3596 if (task->shareds != NULL) { // need setup shareds pointer 3597 shareds_offset = (char *)task_src->shareds - (char *)taskdata_src; 3598 task->shareds = &((char *)taskdata)[shareds_offset]; 3599 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) == 3600 0); 3601 } 3602 taskdata->td_alloc_thread = thread; 3603 taskdata->td_parent = parent_task; 3604 taskdata->td_taskgroup = 3605 parent_task 3606 ->td_taskgroup; // task inherits the taskgroup from the parent task 3607 3608 // Only need to keep track of child task counts if team parallel and tasking 3609 // not serialized 3610 if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) { 3611 
KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks); 3612 if (parent_task->td_taskgroup) 3613 KMP_ATOMIC_INC(&parent_task->td_taskgroup->count); 3614 // Only need to keep track of allocated child tasks for explicit tasks since 3615 // implicit not deallocated 3616 if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) 3617 KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks); 3618 } 3619 3620 KA_TRACE(20, 3621 ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n", 3622 thread, taskdata, taskdata->td_parent)); 3623 #if OMPT_SUPPORT 3624 if (UNLIKELY(ompt_enabled.enabled)) 3625 __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid); 3626 #endif 3627 return task; 3628 } 3629 3630 // Routine optionally generated by the compiler for setting the lastprivate flag 3631 // and calling needed constructors for private/firstprivate objects 3632 // (used to form taskloop tasks from pattern task) 3633 // Parameters: dest task, src task, lastprivate flag. 3634 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32); 3635 3636 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8); 3637 3638 // class to encapsulate manipulating loop bounds in a taskloop task. 3639 // this abstracts away the Intel vs GOMP taskloop interface for setting/getting 3640 // the loop bound variables. 
3641 class kmp_taskloop_bounds_t { 3642 kmp_task_t *task; 3643 const kmp_taskdata_t *taskdata; 3644 size_t lower_offset; 3645 size_t upper_offset; 3646 3647 public: 3648 kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub) 3649 : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)), 3650 lower_offset((char *)lb - (char *)task), 3651 upper_offset((char *)ub - (char *)task) { 3652 KMP_DEBUG_ASSERT((char *)lb > (char *)_task); 3653 KMP_DEBUG_ASSERT((char *)ub > (char *)_task); 3654 } 3655 kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds) 3656 : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)), 3657 lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {} 3658 size_t get_lower_offset() const { return lower_offset; } 3659 size_t get_upper_offset() const { return upper_offset; } 3660 kmp_uint64 get_lb() const { 3661 kmp_int64 retval; 3662 #if defined(KMP_GOMP_COMPAT) 3663 // Intel task just returns the lower bound normally 3664 if (!taskdata->td_flags.native) { 3665 retval = *(kmp_int64 *)((char *)task + lower_offset); 3666 } else { 3667 // GOMP task has to take into account the sizeof(long) 3668 if (taskdata->td_size_loop_bounds == 4) { 3669 kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds); 3670 retval = (kmp_int64)*lb; 3671 } else { 3672 kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds); 3673 retval = (kmp_int64)*lb; 3674 } 3675 } 3676 #else 3677 retval = *(kmp_int64 *)((char *)task + lower_offset); 3678 #endif // defined(KMP_GOMP_COMPAT) 3679 return retval; 3680 } 3681 kmp_uint64 get_ub() const { 3682 kmp_int64 retval; 3683 #if defined(KMP_GOMP_COMPAT) 3684 // Intel task just returns the upper bound normally 3685 if (!taskdata->td_flags.native) { 3686 retval = *(kmp_int64 *)((char *)task + upper_offset); 3687 } else { 3688 // GOMP task has to take into account the sizeof(long) 3689 if (taskdata->td_size_loop_bounds == 4) { 3690 kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1; 3691 retval = 
(kmp_int64)*ub; 3692 } else { 3693 kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1; 3694 retval = (kmp_int64)*ub; 3695 } 3696 } 3697 #else 3698 retval = *(kmp_int64 *)((char *)task + upper_offset); 3699 #endif // defined(KMP_GOMP_COMPAT) 3700 return retval; 3701 } 3702 void set_lb(kmp_uint64 lb) { 3703 #if defined(KMP_GOMP_COMPAT) 3704 // Intel task just sets the lower bound normally 3705 if (!taskdata->td_flags.native) { 3706 *(kmp_uint64 *)((char *)task + lower_offset) = lb; 3707 } else { 3708 // GOMP task has to take into account the sizeof(long) 3709 if (taskdata->td_size_loop_bounds == 4) { 3710 kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds); 3711 *lower = (kmp_uint32)lb; 3712 } else { 3713 kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds); 3714 *lower = (kmp_uint64)lb; 3715 } 3716 } 3717 #else 3718 *(kmp_uint64 *)((char *)task + lower_offset) = lb; 3719 #endif // defined(KMP_GOMP_COMPAT) 3720 } 3721 void set_ub(kmp_uint64 ub) { 3722 #if defined(KMP_GOMP_COMPAT) 3723 // Intel task just sets the upper bound normally 3724 if (!taskdata->td_flags.native) { 3725 *(kmp_uint64 *)((char *)task + upper_offset) = ub; 3726 } else { 3727 // GOMP task has to take into account the sizeof(long) 3728 if (taskdata->td_size_loop_bounds == 4) { 3729 kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1; 3730 *upper = (kmp_uint32)ub; 3731 } else { 3732 kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1; 3733 *upper = (kmp_uint64)ub; 3734 } 3735 } 3736 #else 3737 *(kmp_uint64 *)((char *)task + upper_offset) = ub; 3738 #endif // defined(KMP_GOMP_COMPAT) 3739 } 3740 }; 3741 3742 // __kmp_taskloop_linear: Start tasks of the taskloop linearly 3743 // 3744 // loc Source location information 3745 // gtid Global thread ID 3746 // task Pattern task, exposes the loop iteration range 3747 // lb Pointer to loop lower bound in task structure 3748 // ub Pointer to loop upper bound in task structure 3749 // st Loop stride 3750 // ub_glob Global upper bound (used 
for lastprivate check) 3751 // num_tasks Number of tasks to execute 3752 // grainsize Number of loop iterations per task 3753 // extras Number of chunks with grainsize+1 iterations 3754 // tc Iterations count 3755 // task_dup Tasks duplication routine 3756 // codeptr_ra Return address for OMPT events 3757 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task, 3758 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, 3759 kmp_uint64 ub_glob, kmp_uint64 num_tasks, 3760 kmp_uint64 grainsize, kmp_uint64 extras, 3761 kmp_uint64 tc, 3762 #if OMPT_SUPPORT 3763 void *codeptr_ra, 3764 #endif 3765 void *task_dup) { 3766 KMP_COUNT_BLOCK(OMP_TASKLOOP); 3767 KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling); 3768 p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; 3769 // compiler provides global bounds here 3770 kmp_taskloop_bounds_t task_bounds(task, lb, ub); 3771 kmp_uint64 lower = task_bounds.get_lb(); 3772 kmp_uint64 upper = task_bounds.get_ub(); 3773 kmp_uint64 i; 3774 kmp_info_t *thread = __kmp_threads[gtid]; 3775 kmp_taskdata_t *current_task = thread->th.th_current_task; 3776 kmp_task_t *next_task; 3777 kmp_int32 lastpriv = 0; 3778 3779 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras); 3780 KMP_DEBUG_ASSERT(num_tasks > extras); 3781 KMP_DEBUG_ASSERT(num_tasks > 0); 3782 KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, " 3783 "extras %lld, i=%lld,%lld(%d)%lld, dup %p\n", 3784 gtid, num_tasks, grainsize, extras, lower, upper, ub_glob, st, 3785 task_dup)); 3786 3787 // Launch num_tasks tasks, assign grainsize iterations each task 3788 for (i = 0; i < num_tasks; ++i) { 3789 kmp_uint64 chunk_minus_1; 3790 if (extras == 0) { 3791 chunk_minus_1 = grainsize - 1; 3792 } else { 3793 chunk_minus_1 = grainsize; 3794 --extras; // first extras iterations get bigger chunk (grainsize+1) 3795 } 3796 upper = lower + st * chunk_minus_1; 3797 if (i == num_tasks - 1) { 3798 // schedule the last task, set lastprivate flag if needed 3799 if (st == 1) { // 
most common case 3800 KMP_DEBUG_ASSERT(upper == *ub); 3801 if (upper == ub_glob) 3802 lastpriv = 1; 3803 } else if (st > 0) { // positive loop stride 3804 KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper); 3805 if ((kmp_uint64)st > ub_glob - upper) 3806 lastpriv = 1; 3807 } else { // negative loop stride 3808 KMP_DEBUG_ASSERT(upper + st < *ub); 3809 if (upper - ub_glob < (kmp_uint64)(-st)) 3810 lastpriv = 1; 3811 } 3812 } 3813 next_task = __kmp_task_dup_alloc(thread, task); // allocate new task 3814 kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task); 3815 kmp_taskloop_bounds_t next_task_bounds = 3816 kmp_taskloop_bounds_t(next_task, task_bounds); 3817 3818 // adjust task-specific bounds 3819 next_task_bounds.set_lb(lower); 3820 if (next_taskdata->td_flags.native) { 3821 next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1)); 3822 } else { 3823 next_task_bounds.set_ub(upper); 3824 } 3825 if (ptask_dup != NULL) // set lastprivate flag, construct fistprivates, etc. 3826 ptask_dup(next_task, task, lastpriv); 3827 KA_TRACE(40, 3828 ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, " 3829 "upper %lld stride %lld, (offsets %p %p)\n", 3830 gtid, i, next_task, lower, upper, st, 3831 next_task_bounds.get_lower_offset(), 3832 next_task_bounds.get_upper_offset())); 3833 #if OMPT_SUPPORT 3834 __kmp_omp_taskloop_task(NULL, gtid, next_task, 3835 codeptr_ra); // schedule new task 3836 #else 3837 __kmp_omp_task(gtid, next_task, true); // schedule new task 3838 #endif 3839 lower = upper + st; // adjust lower bound for the next iteration 3840 } 3841 // free the pattern task and exit 3842 __kmp_task_start(gtid, task, current_task); // make internal bookkeeping 3843 // do not execute the pattern task, just do internal bookkeeping 3844 __kmp_task_finish<false>(gtid, task, current_task); 3845 } 3846 3847 // Structure to keep taskloop parameters for auxiliary task 3848 // kept in the shareds of the task structure. 
3849 typedef struct __taskloop_params { 3850 kmp_task_t *task; 3851 kmp_uint64 *lb; 3852 kmp_uint64 *ub; 3853 void *task_dup; 3854 kmp_int64 st; 3855 kmp_uint64 ub_glob; 3856 kmp_uint64 num_tasks; 3857 kmp_uint64 grainsize; 3858 kmp_uint64 extras; 3859 kmp_uint64 tc; 3860 kmp_uint64 num_t_min; 3861 #if OMPT_SUPPORT 3862 void *codeptr_ra; 3863 #endif 3864 } __taskloop_params_t; 3865 3866 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *, 3867 kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64, 3868 kmp_uint64, kmp_uint64, kmp_uint64, kmp_uint64, 3869 #if OMPT_SUPPORT 3870 void *, 3871 #endif 3872 void *); 3873 3874 // Execute part of the the taskloop submitted as a task. 3875 int __kmp_taskloop_task(int gtid, void *ptask) { 3876 __taskloop_params_t *p = 3877 (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds; 3878 kmp_task_t *task = p->task; 3879 kmp_uint64 *lb = p->lb; 3880 kmp_uint64 *ub = p->ub; 3881 void *task_dup = p->task_dup; 3882 // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; 3883 kmp_int64 st = p->st; 3884 kmp_uint64 ub_glob = p->ub_glob; 3885 kmp_uint64 num_tasks = p->num_tasks; 3886 kmp_uint64 grainsize = p->grainsize; 3887 kmp_uint64 extras = p->extras; 3888 kmp_uint64 tc = p->tc; 3889 kmp_uint64 num_t_min = p->num_t_min; 3890 #if OMPT_SUPPORT 3891 void *codeptr_ra = p->codeptr_ra; 3892 #endif 3893 #if KMP_DEBUG 3894 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 3895 KMP_DEBUG_ASSERT(task != NULL); 3896 KA_TRACE(20, ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize" 3897 " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n", 3898 gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st, 3899 task_dup)); 3900 #endif 3901 KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min); 3902 if (num_tasks > num_t_min) 3903 __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks, 3904 grainsize, extras, tc, num_t_min, 3905 #if OMPT_SUPPORT 3906 codeptr_ra, 3907 #endif 3908 task_dup); 3909 else 3910 
__kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks, 3911 grainsize, extras, tc, 3912 #if OMPT_SUPPORT 3913 codeptr_ra, 3914 #endif 3915 task_dup); 3916 3917 KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid)); 3918 return 0; 3919 } 3920 3921 // Schedule part of the the taskloop as a task, 3922 // execute the rest of the the taskloop. 3923 // 3924 // loc Source location information 3925 // gtid Global thread ID 3926 // task Pattern task, exposes the loop iteration range 3927 // lb Pointer to loop lower bound in task structure 3928 // ub Pointer to loop upper bound in task structure 3929 // st Loop stride 3930 // ub_glob Global upper bound (used for lastprivate check) 3931 // num_tasks Number of tasks to execute 3932 // grainsize Number of loop iterations per task 3933 // extras Number of chunks with grainsize+1 iterations 3934 // tc Iterations count 3935 // num_t_min Threashold to launch tasks recursively 3936 // task_dup Tasks duplication routine 3937 // codeptr_ra Return address for OMPT events 3938 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task, 3939 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, 3940 kmp_uint64 ub_glob, kmp_uint64 num_tasks, 3941 kmp_uint64 grainsize, kmp_uint64 extras, 3942 kmp_uint64 tc, kmp_uint64 num_t_min, 3943 #if OMPT_SUPPORT 3944 void *codeptr_ra, 3945 #endif 3946 void *task_dup) { 3947 #if KMP_DEBUG 3948 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 3949 KMP_DEBUG_ASSERT(task != NULL); 3950 KMP_DEBUG_ASSERT(num_tasks > num_t_min); 3951 KA_TRACE(20, ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize" 3952 " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n", 3953 gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st, 3954 task_dup)); 3955 #endif 3956 p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; 3957 kmp_uint64 lower = *lb; 3958 kmp_info_t *thread = __kmp_threads[gtid]; 3959 // kmp_taskdata_t *current_task = thread->th.th_current_task; 3960 kmp_task_t *next_task; 3961 
size_t lower_offset = 3962 (char *)lb - (char *)task; // remember offset of lb in the task structure 3963 size_t upper_offset = 3964 (char *)ub - (char *)task; // remember offset of ub in the task structure 3965 3966 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras); 3967 KMP_DEBUG_ASSERT(num_tasks > extras); 3968 KMP_DEBUG_ASSERT(num_tasks > 0); 3969 3970 // split the loop in two halves 3971 kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1; 3972 kmp_uint64 gr_size0 = grainsize; 3973 kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute 3974 kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task 3975 if (n_tsk0 <= extras) { 3976 gr_size0++; // integrate extras into grainsize 3977 ext0 = 0; // no extra iters in 1st half 3978 ext1 = extras - n_tsk0; // remaining extras 3979 tc0 = gr_size0 * n_tsk0; 3980 tc1 = tc - tc0; 3981 } else { // n_tsk0 > extras 3982 ext1 = 0; // no extra iters in 2nd half 3983 ext0 = extras; 3984 tc1 = grainsize * n_tsk1; 3985 tc0 = tc - tc1; 3986 } 3987 ub0 = lower + st * (tc0 - 1); 3988 lb1 = ub0 + st; 3989 3990 // create pattern task for 2nd half of the loop 3991 next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task 3992 // adjust lower bound (upper bound is not changed) for the 2nd half 3993 *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1; 3994 if (ptask_dup != NULL) // construct fistprivates, etc. 
3995 ptask_dup(next_task, task, 0); 3996 *ub = ub0; // adjust upper bound for the 1st half 3997 3998 // create auxiliary task for 2nd half of the loop 3999 kmp_task_t *new_task = 4000 __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *), 4001 sizeof(__taskloop_params_t), &__kmp_taskloop_task); 4002 __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds; 4003 p->task = next_task; 4004 p->lb = (kmp_uint64 *)((char *)next_task + lower_offset); 4005 p->ub = (kmp_uint64 *)((char *)next_task + upper_offset); 4006 p->task_dup = task_dup; 4007 p->st = st; 4008 p->ub_glob = ub_glob; 4009 p->num_tasks = n_tsk1; 4010 p->grainsize = grainsize; 4011 p->extras = ext1; 4012 p->tc = tc1; 4013 p->num_t_min = num_t_min; 4014 #if OMPT_SUPPORT 4015 p->codeptr_ra = codeptr_ra; 4016 #endif 4017 4018 #if OMPT_SUPPORT 4019 // schedule new task with correct return address for OMPT events 4020 __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra); 4021 #else 4022 __kmp_omp_task(gtid, new_task, true); // schedule new task 4023 #endif 4024 4025 // execute the 1st half of current subrange 4026 if (n_tsk0 > num_t_min) 4027 __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0, 4028 ext0, tc0, num_t_min, 4029 #if OMPT_SUPPORT 4030 codeptr_ra, 4031 #endif 4032 task_dup); 4033 else 4034 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, 4035 gr_size0, ext0, tc0, 4036 #if OMPT_SUPPORT 4037 codeptr_ra, 4038 #endif 4039 task_dup); 4040 4041 KA_TRACE(40, ("__kmpc_taskloop_recur(exit): T#%d\n", gtid)); 4042 } 4043 4044 /*! 
4045 @ingroup TASKING 4046 @param loc Source location information 4047 @param gtid Global thread ID 4048 @param task Task structure 4049 @param if_val Value of the if clause 4050 @param lb Pointer to loop lower bound in task structure 4051 @param ub Pointer to loop upper bound in task structure 4052 @param st Loop stride 4053 @param nogroup Flag, 1 if nogroup clause specified, 0 otherwise 4054 @param sched Schedule specified 0/1/2 for none/grainsize/num_tasks 4055 @param grainsize Schedule value if specified 4056 @param task_dup Tasks duplication routine 4057 4058 Execute the taskloop construct. 4059 */ 4060 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, 4061 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, 4062 int sched, kmp_uint64 grainsize, void *task_dup) { 4063 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 4064 KMP_DEBUG_ASSERT(task != NULL); 4065 4066 if (nogroup == 0) { 4067 #if OMPT_SUPPORT && OMPT_OPTIONAL 4068 OMPT_STORE_RETURN_ADDRESS(gtid); 4069 #endif 4070 __kmpc_taskgroup(loc, gtid); 4071 } 4072 4073 // ========================================================================= 4074 // calculate loop parameters 4075 kmp_taskloop_bounds_t task_bounds(task, lb, ub); 4076 kmp_uint64 tc; 4077 // compiler provides global bounds here 4078 kmp_uint64 lower = task_bounds.get_lb(); 4079 kmp_uint64 upper = task_bounds.get_ub(); 4080 kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag 4081 kmp_uint64 num_tasks = 0, extras = 0; 4082 kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks; 4083 kmp_info_t *thread = __kmp_threads[gtid]; 4084 kmp_taskdata_t *current_task = thread->th.th_current_task; 4085 4086 KA_TRACE(20, ("__kmpc_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, " 4087 "grain %llu(%d), dup %p\n", 4088 gtid, taskdata, lower, upper, st, grainsize, sched, task_dup)); 4089 4090 // compute trip count 4091 if (st == 1) { // most common case 4092 tc = upper - lower + 1; 4093 } else if (st 
< 0) { 4094 tc = (lower - upper) / (-st) + 1; 4095 } else { // st > 0 4096 tc = (upper - lower) / st + 1; 4097 } 4098 if (tc == 0) { 4099 KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d zero-trip loop\n", gtid)); 4100 // free the pattern task and exit 4101 __kmp_task_start(gtid, task, current_task); 4102 // do not execute anything for zero-trip loop 4103 __kmp_task_finish<false>(gtid, task, current_task); 4104 return; 4105 } 4106 4107 #if OMPT_SUPPORT && OMPT_OPTIONAL 4108 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 4109 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 4110 if (ompt_enabled.ompt_callback_work) { 4111 ompt_callbacks.ompt_callback(ompt_callback_work)( 4112 ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data), 4113 &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0)); 4114 } 4115 #endif 4116 4117 if (num_tasks_min == 0) 4118 // TODO: can we choose better default heuristic? 4119 num_tasks_min = 4120 KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE); 4121 4122 // compute num_tasks/grainsize based on the input provided 4123 switch (sched) { 4124 case 0: // no schedule clause specified, we can choose the default 4125 // let's try to schedule (team_size*10) tasks 4126 grainsize = thread->th.th_team_nproc * 10; 4127 case 2: // num_tasks provided 4128 if (grainsize > tc) { 4129 num_tasks = tc; // too big num_tasks requested, adjust values 4130 grainsize = 1; 4131 extras = 0; 4132 } else { 4133 num_tasks = grainsize; 4134 grainsize = tc / num_tasks; 4135 extras = tc % num_tasks; 4136 } 4137 break; 4138 case 1: // grainsize provided 4139 if (grainsize > tc) { 4140 num_tasks = 1; // too big grainsize requested, adjust values 4141 grainsize = tc; 4142 extras = 0; 4143 } else { 4144 num_tasks = tc / grainsize; 4145 // adjust grainsize for balanced distribution of iterations 4146 grainsize = tc / num_tasks; 4147 extras = tc % num_tasks; 4148 } 4149 break; 4150 default: 4151 KMP_ASSERT2(0, "unknown scheduling 
of taskloop"); 4152 } 4153 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras); 4154 KMP_DEBUG_ASSERT(num_tasks > extras); 4155 KMP_DEBUG_ASSERT(num_tasks > 0); 4156 // ========================================================================= 4157 4158 // check if clause value first 4159 // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native) 4160 if (if_val == 0) { // if(0) specified, mark task as serial 4161 taskdata->td_flags.task_serial = 1; 4162 taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied 4163 // always start serial tasks linearly 4164 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, 4165 grainsize, extras, tc, 4166 #if OMPT_SUPPORT 4167 OMPT_GET_RETURN_ADDRESS(0), 4168 #endif 4169 task_dup); 4170 // !taskdata->td_flags.native => currently force linear spawning of tasks 4171 // for GOMP_taskloop 4172 } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) { 4173 KA_TRACE(20, ("__kmpc_taskloop: T#%d, go recursive: tc %llu, #tasks %llu" 4174 "(%lld), grain %llu, extras %llu\n", 4175 gtid, tc, num_tasks, num_tasks_min, grainsize, extras)); 4176 __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, 4177 grainsize, extras, tc, num_tasks_min, 4178 #if OMPT_SUPPORT 4179 OMPT_GET_RETURN_ADDRESS(0), 4180 #endif 4181 task_dup); 4182 } else { 4183 KA_TRACE(20, ("__kmpc_taskloop: T#%d, go linear: tc %llu, #tasks %llu" 4184 "(%lld), grain %llu, extras %llu\n", 4185 gtid, tc, num_tasks, num_tasks_min, grainsize, extras)); 4186 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, 4187 grainsize, extras, tc, 4188 #if OMPT_SUPPORT 4189 OMPT_GET_RETURN_ADDRESS(0), 4190 #endif 4191 task_dup); 4192 } 4193 4194 #if OMPT_SUPPORT && OMPT_OPTIONAL 4195 if (ompt_enabled.ompt_callback_work) { 4196 ompt_callbacks.ompt_callback(ompt_callback_work)( 4197 ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data), 4198 &(task_info->task_data), tc, 
OMPT_GET_RETURN_ADDRESS(0)); 4199 } 4200 #endif 4201 4202 if (nogroup == 0) { 4203 #if OMPT_SUPPORT && OMPT_OPTIONAL 4204 OMPT_STORE_RETURN_ADDRESS(gtid); 4205 #endif 4206 __kmpc_end_taskgroup(loc, gtid); 4207 } 4208 KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid)); 4209 } 4210 4211 #endif 4212