1 /* 2 * kmp_tasking.cpp -- OpenMP 3.0 tasking support. 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // The LLVM Compiler Infrastructure 8 // 9 // This file is dual licensed under the MIT and the University of Illinois Open 10 // Source Licenses. See LICENSE.txt for details. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "kmp.h" 15 #include "kmp_i18n.h" 16 #include "kmp_itt.h" 17 #include "kmp_stats.h" 18 #include "kmp_wait_release.h" 19 20 #if OMPT_SUPPORT 21 #include "ompt-specific.h" 22 #endif 23 24 #include "tsan_annotations.h" 25 26 /* forward declaration */ 27 static void __kmp_enable_tasking(kmp_task_team_t *task_team, 28 kmp_info_t *this_thr); 29 static void __kmp_alloc_task_deque(kmp_info_t *thread, 30 kmp_thread_data_t *thread_data); 31 static int __kmp_realloc_task_threads_data(kmp_info_t *thread, 32 kmp_task_team_t *task_team); 33 34 #ifdef OMP_45_ENABLED 35 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask); 36 #endif 37 38 #ifdef BUILD_TIED_TASK_STACK 39 40 // __kmp_trace_task_stack: print the tied tasks from the task stack in order 41 // from top do bottom 42 // 43 // gtid: global thread identifier for thread containing stack 44 // thread_data: thread data for task team thread containing stack 45 // threshold: value above which the trace statement triggers 46 // location: string identifying call site of this function (for trace) 47 static void __kmp_trace_task_stack(kmp_int32 gtid, 48 kmp_thread_data_t *thread_data, 49 int threshold, char *location) { 50 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 51 kmp_taskdata_t **stack_top = task_stack->ts_top; 52 kmp_int32 entries = task_stack->ts_entries; 53 kmp_taskdata_t *tied_task; 54 55 KA_TRACE( 56 threshold, 57 ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, " 58 "first_block = %p, stack_top = %p \n", 59 location, gtid, 
entries, task_stack->ts_first_block, stack_top)); 60 61 KMP_DEBUG_ASSERT(stack_top != NULL); 62 KMP_DEBUG_ASSERT(entries > 0); 63 64 while (entries != 0) { 65 KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]); 66 // fix up ts_top if we need to pop from previous block 67 if (entries & TASK_STACK_INDEX_MASK == 0) { 68 kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top); 69 70 stack_block = stack_block->sb_prev; 71 stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE]; 72 } 73 74 // finish bookkeeping 75 stack_top--; 76 entries--; 77 78 tied_task = *stack_top; 79 80 KMP_DEBUG_ASSERT(tied_task != NULL); 81 KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED); 82 83 KA_TRACE(threshold, 84 ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, " 85 "stack_top=%p, tied_task=%p\n", 86 location, gtid, entries, stack_top, tied_task)); 87 } 88 KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]); 89 90 KA_TRACE(threshold, 91 ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n", 92 location, gtid)); 93 } 94 95 // __kmp_init_task_stack: initialize the task stack for the first time 96 // after a thread_data structure is created. 97 // It should not be necessary to do this again (assuming the stack works). 
98 // 99 // gtid: global thread identifier of calling thread 100 // thread_data: thread data for task team thread containing stack 101 static void __kmp_init_task_stack(kmp_int32 gtid, 102 kmp_thread_data_t *thread_data) { 103 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 104 kmp_stack_block_t *first_block; 105 106 // set up the first block of the stack 107 first_block = &task_stack->ts_first_block; 108 task_stack->ts_top = (kmp_taskdata_t **)first_block; 109 memset((void *)first_block, '\0', 110 TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *)); 111 112 // initialize the stack to be empty 113 task_stack->ts_entries = TASK_STACK_EMPTY; 114 first_block->sb_next = NULL; 115 first_block->sb_prev = NULL; 116 } 117 118 // __kmp_free_task_stack: free the task stack when thread_data is destroyed. 119 // 120 // gtid: global thread identifier for calling thread 121 // thread_data: thread info for thread containing stack 122 static void __kmp_free_task_stack(kmp_int32 gtid, 123 kmp_thread_data_t *thread_data) { 124 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 125 kmp_stack_block_t *stack_block = &task_stack->ts_first_block; 126 127 KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY); 128 // free from the second block of the stack 129 while (stack_block != NULL) { 130 kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL; 131 132 stack_block->sb_next = NULL; 133 stack_block->sb_prev = NULL; 134 if (stack_block != &task_stack->ts_first_block) { 135 __kmp_thread_free(thread, 136 stack_block); // free the block, if not the first 137 } 138 stack_block = next_block; 139 } 140 // initialize the stack to be empty 141 task_stack->ts_entries = 0; 142 task_stack->ts_top = NULL; 143 } 144 145 // __kmp_push_task_stack: Push the tied task onto the task stack. 146 // Grow the stack if necessary by allocating another block. 
147 // 148 // gtid: global thread identifier for calling thread 149 // thread: thread info for thread containing stack 150 // tied_task: the task to push on the stack 151 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread, 152 kmp_taskdata_t *tied_task) { 153 // GEH - need to consider what to do if tt_threads_data not allocated yet 154 kmp_thread_data_t *thread_data = 155 &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)]; 156 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 157 158 if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) { 159 return; // Don't push anything on stack if team or team tasks are serialized 160 } 161 162 KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED); 163 KMP_DEBUG_ASSERT(task_stack->ts_top != NULL); 164 165 KA_TRACE(20, 166 ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n", 167 gtid, thread, tied_task)); 168 // Store entry 169 *(task_stack->ts_top) = tied_task; 170 171 // Do bookkeeping for next push 172 task_stack->ts_top++; 173 task_stack->ts_entries++; 174 175 if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) { 176 // Find beginning of this task block 177 kmp_stack_block_t *stack_block = 178 (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE); 179 180 // Check if we already have a block 181 if (stack_block->sb_next != 182 NULL) { // reset ts_top to beginning of next block 183 task_stack->ts_top = &stack_block->sb_next->sb_block[0]; 184 } else { // Alloc new block and link it up 185 kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc( 186 thread, sizeof(kmp_stack_block_t)); 187 188 task_stack->ts_top = &new_block->sb_block[0]; 189 stack_block->sb_next = new_block; 190 new_block->sb_prev = stack_block; 191 new_block->sb_next = NULL; 192 193 KA_TRACE( 194 30, 195 ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n", 196 gtid, tied_task, new_block)); 197 } 198 } 199 KA_TRACE(20, 
("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid, 200 tied_task)); 201 } 202 203 // __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return 204 // the task, just check to make sure it matches the ending task passed in. 205 // 206 // gtid: global thread identifier for the calling thread 207 // thread: thread info structure containing stack 208 // tied_task: the task popped off the stack 209 // ending_task: the task that is ending (should match popped task) 210 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread, 211 kmp_taskdata_t *ending_task) { 212 // GEH - need to consider what to do if tt_threads_data not allocated yet 213 kmp_thread_data_t *thread_data = 214 &thread->th.th_task_team->tt_threads_data[__kmp_tid_from_gtid(gtid)]; 215 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 216 kmp_taskdata_t *tied_task; 217 218 if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) { 219 // Don't pop anything from stack if team or team tasks are serialized 220 return; 221 } 222 223 KMP_DEBUG_ASSERT(task_stack->ts_top != NULL); 224 KMP_DEBUG_ASSERT(task_stack->ts_entries > 0); 225 226 KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid, 227 thread)); 228 229 // fix up ts_top if we need to pop from previous block 230 if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) { 231 kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top); 232 233 stack_block = stack_block->sb_prev; 234 task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE]; 235 } 236 237 // finish bookkeeping 238 task_stack->ts_top--; 239 task_stack->ts_entries--; 240 241 tied_task = *(task_stack->ts_top); 242 243 KMP_DEBUG_ASSERT(tied_task != NULL); 244 KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED); 245 KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly 246 247 KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid, 
248 tied_task)); 249 return; 250 } 251 #endif /* BUILD_TIED_TASK_STACK */ 252 253 // __kmp_push_task: Add a task to the thread's deque 254 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) { 255 kmp_info_t *thread = __kmp_threads[gtid]; 256 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 257 kmp_task_team_t *task_team = thread->th.th_task_team; 258 kmp_int32 tid = __kmp_tid_from_gtid(gtid); 259 kmp_thread_data_t *thread_data; 260 261 KA_TRACE(20, 262 ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata)); 263 264 if (taskdata->td_flags.tiedness == TASK_UNTIED) { 265 // untied task needs to increment counter so that the task structure is not 266 // freed prematurely 267 kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count); 268 KA_TRACE( 269 20, 270 ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n", 271 gtid, counter, taskdata)); 272 } 273 274 // The first check avoids building task_team thread data if serialized 275 if (taskdata->td_flags.task_serial) { 276 KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning " 277 "TASK_NOT_PUSHED for task %p\n", 278 gtid, taskdata)); 279 return TASK_NOT_PUSHED; 280 } 281 282 // Now that serialized tasks have returned, we can assume that we are not in 283 // immediate exec mode 284 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 285 if (!KMP_TASKING_ENABLED(task_team)) { 286 __kmp_enable_tasking(task_team, thread); 287 } 288 KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE); 289 KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL); 290 291 // Find tasking deque specific to encountering thread 292 thread_data = &task_team->tt.tt_threads_data[tid]; 293 294 // No lock needed since only owner can allocate 295 if (thread_data->td.td_deque == NULL) { 296 __kmp_alloc_task_deque(thread, thread_data); 297 } 298 299 // Check if deque is full 300 if (TCR_4(thread_data->td.td_deque_ntasks) >= 301 
TASK_DEQUE_SIZE(thread_data->td)) { 302 KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning " 303 "TASK_NOT_PUSHED for task %p\n", 304 gtid, taskdata)); 305 return TASK_NOT_PUSHED; 306 } 307 308 // Lock the deque for the task push operation 309 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 310 311 #if OMP_45_ENABLED 312 // Need to recheck as we can get a proxy task from a thread outside of OpenMP 313 if (TCR_4(thread_data->td.td_deque_ntasks) >= 314 TASK_DEQUE_SIZE(thread_data->td)) { 315 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 316 KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; returning " 317 "TASK_NOT_PUSHED for task %p\n", 318 gtid, taskdata)); 319 return TASK_NOT_PUSHED; 320 } 321 #else 322 // Must have room since no thread can add tasks but calling thread 323 KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) < 324 TASK_DEQUE_SIZE(thread_data->td)); 325 #endif 326 327 thread_data->td.td_deque[thread_data->td.td_deque_tail] = 328 taskdata; // Push taskdata 329 // Wrap index. 330 thread_data->td.td_deque_tail = 331 (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td); 332 TCW_4(thread_data->td.td_deque_ntasks, 333 TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count 334 335 KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: " 336 "task=%p ntasks=%d head=%u tail=%u\n", 337 gtid, taskdata, thread_data->td.td_deque_ntasks, 338 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 339 340 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 341 342 return TASK_SUCCESSFULLY_PUSHED; 343 } 344 345 // __kmp_pop_current_task_from_thread: set up current task from called thread 346 // when team ends 347 // 348 // this_thr: thread structure to set current_task in. 
349 void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) { 350 KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d " 351 "this_thread=%p, curtask=%p, " 352 "curtask_parent=%p\n", 353 0, this_thr, this_thr->th.th_current_task, 354 this_thr->th.th_current_task->td_parent)); 355 356 this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent; 357 358 KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d " 359 "this_thread=%p, curtask=%p, " 360 "curtask_parent=%p\n", 361 0, this_thr, this_thr->th.th_current_task, 362 this_thr->th.th_current_task->td_parent)); 363 } 364 365 // __kmp_push_current_task_to_thread: set up current task in called thread for a 366 // new team 367 // 368 // this_thr: thread structure to set up 369 // team: team for implicit task data 370 // tid: thread within team to set up 371 void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team, 372 int tid) { 373 // current task of the thread is a parent of the new just created implicit 374 // tasks of new team 375 KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p " 376 "curtask=%p " 377 "parent_task=%p\n", 378 tid, this_thr, this_thr->th.th_current_task, 379 team->t.t_implicit_task_taskdata[tid].td_parent)); 380 381 KMP_DEBUG_ASSERT(this_thr != NULL); 382 383 if (tid == 0) { 384 if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) { 385 team->t.t_implicit_task_taskdata[0].td_parent = 386 this_thr->th.th_current_task; 387 this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0]; 388 } 389 } else { 390 team->t.t_implicit_task_taskdata[tid].td_parent = 391 team->t.t_implicit_task_taskdata[0].td_parent; 392 this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid]; 393 } 394 395 KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p " 396 "curtask=%p " 397 "parent_task=%p\n", 398 tid, this_thr, this_thr->th.th_current_task, 399 
team->t.t_implicit_task_taskdata[tid].td_parent)); 400 } 401 402 // __kmp_task_start: bookkeeping for a task starting execution 403 // 404 // GTID: global thread id of calling thread 405 // task: task starting execution 406 // current_task: task suspending 407 static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task, 408 kmp_taskdata_t *current_task) { 409 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 410 kmp_info_t *thread = __kmp_threads[gtid]; 411 412 KA_TRACE(10, 413 ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n", 414 gtid, taskdata, current_task)); 415 416 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT); 417 418 // mark currently executing task as suspended 419 // TODO: GEH - make sure root team implicit task is initialized properly. 420 // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 ); 421 current_task->td_flags.executing = 0; 422 423 // Add task to stack if tied 424 #ifdef BUILD_TIED_TASK_STACK 425 if (taskdata->td_flags.tiedness == TASK_TIED) { 426 __kmp_push_task_stack(gtid, thread, taskdata); 427 } 428 #endif /* BUILD_TIED_TASK_STACK */ 429 430 // mark starting task as executing and as current task 431 thread->th.th_current_task = taskdata; 432 433 KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 || 434 taskdata->td_flags.tiedness == TASK_UNTIED); 435 KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 || 436 taskdata->td_flags.tiedness == TASK_UNTIED); 437 taskdata->td_flags.started = 1; 438 taskdata->td_flags.executing = 1; 439 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0); 440 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0); 441 442 // GEH TODO: shouldn't we pass some sort of location identifier here? 443 // APT: yes, we will pass location here. 
444 // need to store current thread state (in a thread or taskdata structure) 445 // before setting work_state, otherwise wrong state is set after end of task 446 447 KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata)); 448 449 return; 450 } 451 452 #if OMPT_SUPPORT 453 //------------------------------------------------------------------------------ 454 // __ompt_task_init: 455 // Initialize OMPT fields maintained by a task. This will only be called after 456 // ompt_start_tool, so we already know whether ompt is enabled or not. 457 458 static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) { 459 // The calls to __ompt_task_init already have the ompt_enabled condition. 460 task->ompt_task_info.task_data.value = 0; 461 task->ompt_task_info.frame.exit_frame = NULL; 462 task->ompt_task_info.frame.enter_frame = NULL; 463 #if OMP_40_ENABLED 464 task->ompt_task_info.ndeps = 0; 465 task->ompt_task_info.deps = NULL; 466 #endif /* OMP_40_ENABLED */ 467 } 468 469 // __ompt_task_start: 470 // Build and trigger task-begin event 471 static inline void __ompt_task_start(kmp_task_t *task, 472 kmp_taskdata_t *current_task, 473 kmp_int32 gtid) { 474 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 475 ompt_task_status_t status = ompt_task_others; 476 if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) { 477 status = ompt_task_yield; 478 __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0; 479 } 480 /* let OMPT know that we're about to run this task */ 481 if (ompt_enabled.ompt_callback_task_schedule) { 482 ompt_callbacks.ompt_callback(ompt_callback_task_schedule)( 483 &(current_task->ompt_task_info.task_data), status, 484 &(taskdata->ompt_task_info.task_data)); 485 } 486 taskdata->ompt_task_info.scheduling_parent = current_task; 487 } 488 489 // __ompt_task_finish: 490 // Build and trigger final task-schedule event 491 static inline void 492 __ompt_task_finish(kmp_task_t *task, kmp_taskdata_t *resumed_task, 493 
ompt_task_status_t status = ompt_task_complete) { 494 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 495 if (__kmp_omp_cancellation && taskdata->td_taskgroup && 496 taskdata->td_taskgroup->cancel_request == cancel_taskgroup) { 497 status = ompt_task_cancel; 498 } 499 500 /* let OMPT know that we're returning to the callee task */ 501 if (ompt_enabled.ompt_callback_task_schedule) { 502 ompt_callbacks.ompt_callback(ompt_callback_task_schedule)( 503 &(taskdata->ompt_task_info.task_data), status, 504 &((resumed_task ? resumed_task 505 : (taskdata->ompt_task_info.scheduling_parent 506 ? taskdata->ompt_task_info.scheduling_parent 507 : taskdata->td_parent)) 508 ->ompt_task_info.task_data)); 509 } 510 } 511 #endif 512 513 template <bool ompt> 514 static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid, 515 kmp_task_t *task, 516 void *frame_address, 517 void *return_address) { 518 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 519 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 520 521 KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p " 522 "current_task=%p\n", 523 gtid, loc_ref, taskdata, current_task)); 524 525 if (taskdata->td_flags.tiedness == TASK_UNTIED) { 526 // untied task needs to increment counter so that the task structure is not 527 // freed prematurely 528 kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count); 529 KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) " 530 "incremented for task %p\n", 531 gtid, counter, taskdata)); 532 } 533 534 taskdata->td_flags.task_serial = 535 1; // Execute this task immediately, not deferred. 
536 __kmp_task_start(gtid, task, current_task); 537 538 #if OMPT_SUPPORT 539 if (ompt) { 540 if (current_task->ompt_task_info.frame.enter_frame == NULL) { 541 current_task->ompt_task_info.frame.enter_frame = 542 taskdata->ompt_task_info.frame.exit_frame = frame_address; 543 } 544 if (ompt_enabled.ompt_callback_task_create) { 545 ompt_task_info_t *parent_info = &(current_task->ompt_task_info); 546 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 547 &(parent_info->task_data), &(parent_info->frame), 548 &(taskdata->ompt_task_info.task_data), 549 ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0, 550 return_address); 551 } 552 __ompt_task_start(task, current_task, gtid); 553 } 554 #endif // OMPT_SUPPORT 555 556 KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid, 557 loc_ref, taskdata)); 558 } 559 560 #if OMPT_SUPPORT 561 OMPT_NOINLINE 562 static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid, 563 kmp_task_t *task, 564 void *frame_address, 565 void *return_address) { 566 __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address, 567 return_address); 568 } 569 #endif // OMPT_SUPPORT 570 571 // __kmpc_omp_task_begin_if0: report that a given serialized task has started 572 // execution 573 // 574 // loc_ref: source location information; points to beginning of task block. 575 // gtid: global thread number. 576 // task: task thunk for the started task. 
577 void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid, 578 kmp_task_t *task) { 579 #if OMPT_SUPPORT 580 if (UNLIKELY(ompt_enabled.enabled)) { 581 OMPT_STORE_RETURN_ADDRESS(gtid); 582 __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task, 583 OMPT_GET_FRAME_ADDRESS(1), 584 OMPT_LOAD_RETURN_ADDRESS(gtid)); 585 return; 586 } 587 #endif 588 __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL); 589 } 590 591 #ifdef TASK_UNUSED 592 // __kmpc_omp_task_begin: report that a given task has started execution 593 // NEVER GENERATED BY COMPILER, DEPRECATED!!! 594 void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) { 595 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 596 597 KA_TRACE( 598 10, 599 ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n", 600 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task)); 601 602 __kmp_task_start(gtid, task, current_task); 603 604 KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid, 605 loc_ref, KMP_TASK_TO_TASKDATA(task))); 606 return; 607 } 608 #endif // TASK_UNUSED 609 610 // __kmp_free_task: free the current task space and the space for shareds 611 // 612 // gtid: Global thread ID of calling thread 613 // taskdata: task to free 614 // thread: thread data structure of caller 615 static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata, 616 kmp_info_t *thread) { 617 KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid, 618 taskdata)); 619 620 // Check to make sure all flags and counters have the correct values 621 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT); 622 KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0); 623 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1); 624 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0); 625 KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 || 626 taskdata->td_flags.task_serial == 1); 627 
KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0); 628 629 taskdata->td_flags.freed = 1; 630 ANNOTATE_HAPPENS_BEFORE(taskdata); 631 // deallocate the taskdata and shared variable blocks associated with this task 632 #if USE_FAST_MEMORY 633 __kmp_fast_free(thread, taskdata); 634 #else /* ! USE_FAST_MEMORY */ 635 __kmp_thread_free(thread, taskdata); 636 #endif 637 638 KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata)); 639 } 640 641 // __kmp_free_task_and_ancestors: free the current task and ancestors without 642 // children 643 // 644 // gtid: Global thread ID of calling thread 645 // taskdata: task to free 646 // thread: thread data structure of caller 647 static void __kmp_free_task_and_ancestors(kmp_int32 gtid, 648 kmp_taskdata_t *taskdata, 649 kmp_info_t *thread) { 650 #if OMP_45_ENABLED 651 // Proxy tasks must always be allowed to free their parents 652 // because they can be run in background even in serial mode. 653 kmp_int32 team_serial = 654 (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) && 655 !taskdata->td_flags.proxy; 656 #else 657 kmp_int32 team_serial = 658 taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser; 659 #endif 660 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT); 661 662 kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1; 663 KMP_DEBUG_ASSERT(children >= 0); 664 665 // Now, go up the ancestor tree to see if any ancestors can now be freed. 666 while (children == 0) { 667 kmp_taskdata_t *parent_taskdata = taskdata->td_parent; 668 669 KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete " 670 "and freeing itself\n", 671 gtid, taskdata)); 672 673 // --- Deallocate my ancestor task --- 674 __kmp_free_task(gtid, taskdata, thread); 675 676 taskdata = parent_taskdata; 677 678 // Stop checking ancestors at implicit task instead of walking up ancestor 679 // tree to avoid premature deallocation of ancestors. 
680 if (team_serial || taskdata->td_flags.tasktype == TASK_IMPLICIT) 681 return; 682 683 // Predecrement simulated by "- 1" calculation 684 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1; 685 KMP_DEBUG_ASSERT(children >= 0); 686 } 687 688 KA_TRACE( 689 20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; " 690 "not freeing it yet\n", 691 gtid, taskdata, children)); 692 } 693 694 // __kmp_task_finish: bookkeeping to do when a task finishes execution 695 // 696 // gtid: global thread ID for calling thread 697 // task: task to be finished 698 // resumed_task: task to be resumed. (may be NULL if task is serialized) 699 template <bool ompt> 700 static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, 701 kmp_taskdata_t *resumed_task) { 702 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 703 kmp_info_t *thread = __kmp_threads[gtid]; 704 kmp_task_team_t *task_team = 705 thread->th.th_task_team; // might be NULL for serial teams... 706 kmp_int32 children = 0; 707 708 KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming " 709 "task %p\n", 710 gtid, taskdata, resumed_task)); 711 712 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT); 713 714 // Pop task from stack if tied 715 #ifdef BUILD_TIED_TASK_STACK 716 if (taskdata->td_flags.tiedness == TASK_TIED) { 717 __kmp_pop_task_stack(gtid, thread, taskdata); 718 } 719 #endif /* BUILD_TIED_TASK_STACK */ 720 721 if (taskdata->td_flags.tiedness == TASK_UNTIED) { 722 // untied task needs to check the counter so that the task structure is not 723 // freed prematurely 724 kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1; 725 KA_TRACE( 726 20, 727 ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n", 728 gtid, counter, taskdata)); 729 if (counter > 0) { 730 // untied task is not done, to be continued possibly by other thread, do 731 // not free it now 732 if (resumed_task == NULL) { 733 
KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial); 734 resumed_task = taskdata->td_parent; // In a serialized task, the resumed 735 // task is the parent 736 } 737 thread->th.th_current_task = resumed_task; // restore current_task 738 resumed_task->td_flags.executing = 1; // resume previous task 739 KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, " 740 "resuming task %p\n", 741 gtid, taskdata, resumed_task)); 742 return; 743 } 744 } 745 #if OMPT_SUPPORT 746 if (ompt) 747 __ompt_task_finish(task, resumed_task); 748 #endif 749 750 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0); 751 taskdata->td_flags.complete = 1; // mark the task as completed 752 KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1); 753 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0); 754 755 // Only need to keep track of count if team parallel and tasking not 756 // serialized 757 if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) { 758 // Predecrement simulated by "- 1" calculation 759 children = 760 KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1; 761 KMP_DEBUG_ASSERT(children >= 0); 762 #if OMP_40_ENABLED 763 if (taskdata->td_taskgroup) 764 KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count); 765 #if OMP_45_ENABLED 766 } 767 // if we found proxy tasks there could exist a dependency chain 768 // with the proxy task as origin 769 if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) || 770 (task_team && task_team->tt.tt_found_proxy_tasks)) { 771 #endif 772 __kmp_release_deps(gtid, taskdata); 773 #endif 774 } 775 776 // td_flags.executing must be marked as 0 after __kmp_release_deps has been 777 // called. 
Othertwise, if a task is executed immediately from the release_deps 778 // code, the flag will be reset to 1 again by this same function 779 KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1); 780 taskdata->td_flags.executing = 0; // suspend the finishing task 781 782 KA_TRACE( 783 20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n", 784 gtid, taskdata, children)); 785 786 #if OMP_40_ENABLED 787 /* If the tasks' destructor thunk flag has been set, we need to invoke the 788 destructor thunk that has been generated by the compiler. The code is 789 placed here, since at this point other tasks might have been released 790 hence overlapping the destructor invokations with some other work in the 791 released tasks. The OpenMP spec is not specific on when the destructors 792 are invoked, so we should be free to choose. */ 793 if (taskdata->td_flags.destructors_thunk) { 794 kmp_routine_entry_t destr_thunk = task->data1.destructors; 795 KMP_ASSERT(destr_thunk); 796 destr_thunk(gtid, task); 797 } 798 #endif // OMP_40_ENABLED 799 800 // bookkeeping for resuming task: 801 // GEH - note tasking_ser => task_serial 802 KMP_DEBUG_ASSERT( 803 (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) == 804 taskdata->td_flags.task_serial); 805 if (taskdata->td_flags.task_serial) { 806 if (resumed_task == NULL) { 807 resumed_task = taskdata->td_parent; // In a serialized task, the resumed 808 // task is the parent 809 } 810 } else { 811 KMP_DEBUG_ASSERT(resumed_task != 812 NULL); // verify that resumed task is passed as arguemnt 813 } 814 815 // Free this task and then ancestor tasks if they have no children. 816 // Restore th_current_task first as suggested by John: 817 // johnmc: if an asynchronous inquiry peers into the runtime system 818 // it doesn't see the freed task as the current task. 
819 thread->th.th_current_task = resumed_task; 820 __kmp_free_task_and_ancestors(gtid, taskdata, thread); 821 822 // TODO: GEH - make sure root team implicit task is initialized properly. 823 // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 ); 824 resumed_task->td_flags.executing = 1; // resume previous task 825 826 KA_TRACE( 827 10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n", 828 gtid, taskdata, resumed_task)); 829 830 return; 831 } 832 833 template <bool ompt> 834 static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref, 835 kmp_int32 gtid, 836 kmp_task_t *task) { 837 KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n", 838 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task))); 839 // this routine will provide task to resume 840 __kmp_task_finish<ompt>(gtid, task, NULL); 841 842 KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n", 843 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task))); 844 845 #if OMPT_SUPPORT 846 if (ompt) { 847 omp_frame_t *ompt_frame; 848 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); 849 ompt_frame->enter_frame = NULL; 850 } 851 #endif 852 853 return; 854 } 855 856 #if OMPT_SUPPORT 857 OMPT_NOINLINE 858 void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid, 859 kmp_task_t *task) { 860 __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task); 861 } 862 #endif // OMPT_SUPPORT 863 864 // __kmpc_omp_task_complete_if0: report that a task has completed execution 865 // 866 // loc_ref: source location information; points to end of task block. 867 // gtid: global thread number. 868 // task: task thunk for the completed task. 
869 void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid, 870 kmp_task_t *task) { 871 #if OMPT_SUPPORT 872 if (UNLIKELY(ompt_enabled.enabled)) { 873 __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task); 874 return; 875 } 876 #endif 877 __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task); 878 } 879 880 #ifdef TASK_UNUSED 881 // __kmpc_omp_task_complete: report that a task has completed execution 882 // NEVER GENERATED BY COMPILER, DEPRECATED!!! 883 void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid, 884 kmp_task_t *task) { 885 KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid, 886 loc_ref, KMP_TASK_TO_TASKDATA(task))); 887 888 __kmp_task_finish<false>(gtid, task, 889 NULL); // Not sure how to find task to resume 890 891 KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid, 892 loc_ref, KMP_TASK_TO_TASKDATA(task))); 893 return; 894 } 895 #endif // TASK_UNUSED 896 897 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit 898 // task for a given thread 899 // 900 // loc_ref: reference to source location of parallel region 901 // this_thr: thread data structure corresponding to implicit task 902 // team: team for this_thr 903 // tid: thread id of given thread within team 904 // set_curr_task: TRUE if need to push current task to thread 905 // NOTE: Routine does not set up the implicit task ICVS. This is assumed to 906 // have already been done elsewhere. 907 // TODO: Get better loc_ref. Value passed in may be NULL 908 void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr, 909 kmp_team_t *team, int tid, int set_curr_task) { 910 kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid]; 911 912 KF_TRACE( 913 10, 914 ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n", 915 tid, team, task, set_curr_task ? 
"TRUE" : "FALSE")); 916 917 task->td_task_id = KMP_GEN_TASK_ID(); 918 task->td_team = team; 919 // task->td_parent = NULL; // fix for CQ230101 (broken parent task info 920 // in debugger) 921 task->td_ident = loc_ref; 922 task->td_taskwait_ident = NULL; 923 task->td_taskwait_counter = 0; 924 task->td_taskwait_thread = 0; 925 926 task->td_flags.tiedness = TASK_TIED; 927 task->td_flags.tasktype = TASK_IMPLICIT; 928 #if OMP_45_ENABLED 929 task->td_flags.proxy = TASK_FULL; 930 #endif 931 932 // All implicit tasks are executed immediately, not deferred 933 task->td_flags.task_serial = 1; 934 task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec); 935 task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0; 936 937 task->td_flags.started = 1; 938 task->td_flags.executing = 1; 939 task->td_flags.complete = 0; 940 task->td_flags.freed = 0; 941 942 #if OMP_40_ENABLED 943 task->td_depnode = NULL; 944 #endif 945 task->td_last_tied = task; 946 947 if (set_curr_task) { // only do this init first time thread is created 948 KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0); 949 // Not used: don't need to deallocate implicit task 950 KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0); 951 #if OMP_40_ENABLED 952 task->td_taskgroup = NULL; // An implicit task does not have taskgroup 953 task->td_dephash = NULL; 954 #endif 955 __kmp_push_current_task_to_thread(this_thr, team, tid); 956 } else { 957 KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0); 958 KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0); 959 } 960 961 #if OMPT_SUPPORT 962 if (UNLIKELY(ompt_enabled.enabled)) 963 __ompt_task_init(task, tid); 964 #endif 965 966 KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid, 967 team, task)); 968 } 969 970 // __kmp_finish_implicit_task: Release resources associated to implicit tasks 971 // at the end of parallel regions. Some resources are kept for reuse in the next 972 // parallel region. 
973 // 974 // thread: thread data structure corresponding to implicit task 975 void __kmp_finish_implicit_task(kmp_info_t *thread) { 976 kmp_taskdata_t *task = thread->th.th_current_task; 977 if (task->td_dephash) 978 __kmp_dephash_free_entries(thread, task->td_dephash); 979 } 980 981 // __kmp_free_implicit_task: Release resources associated to implicit tasks 982 // when these are destroyed regions 983 // 984 // thread: thread data structure corresponding to implicit task 985 void __kmp_free_implicit_task(kmp_info_t *thread) { 986 kmp_taskdata_t *task = thread->th.th_current_task; 987 if (task && task->td_dephash) { 988 __kmp_dephash_free(thread, task->td_dephash); 989 task->td_dephash = NULL; 990 } 991 } 992 993 // Round up a size to a power of two specified by val: Used to insert padding 994 // between structures co-allocated using a single malloc() call 995 static size_t __kmp_round_up_to_val(size_t size, size_t val) { 996 if (size & (val - 1)) { 997 size &= ~(val - 1); 998 if (size <= KMP_SIZE_T_MAX - val) { 999 size += val; // Round up if there is no overflow. 1000 } 1001 } 1002 return size; 1003 } // __kmp_round_up_to_va 1004 1005 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task 1006 // 1007 // loc_ref: source location information 1008 // gtid: global thread number. 1009 // flags: include tiedness & task type (explicit vs. implicit) of the ''new'' 1010 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine. 1011 // sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including 1012 // private vars accessed in task. 1013 // sizeof_shareds: Size in bytes of array of pointers to shared vars accessed 1014 // in task. 1015 // task_entry: Pointer to task code entry point generated by compiler. 1016 // returns: a pointer to the allocated kmp_task_t structure (task). 
1017 kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, 1018 kmp_tasking_flags_t *flags, 1019 size_t sizeof_kmp_task_t, size_t sizeof_shareds, 1020 kmp_routine_entry_t task_entry) { 1021 kmp_task_t *task; 1022 kmp_taskdata_t *taskdata; 1023 kmp_info_t *thread = __kmp_threads[gtid]; 1024 kmp_team_t *team = thread->th.th_team; 1025 kmp_taskdata_t *parent_task = thread->th.th_current_task; 1026 size_t shareds_offset; 1027 1028 if (!TCR_4(__kmp_init_middle)) 1029 __kmp_middle_initialize(); 1030 1031 KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) " 1032 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n", 1033 gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t, 1034 sizeof_shareds, task_entry)); 1035 1036 if (parent_task->td_flags.final) { 1037 if (flags->merged_if0) { 1038 } 1039 flags->final = 1; 1040 } 1041 if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) { 1042 // Untied task encountered causes the TSC algorithm to check entire deque of 1043 // the victim thread. If no untied task encountered, then checking the head 1044 // of the deque should be enough. 1045 KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1); 1046 } 1047 1048 #if OMP_45_ENABLED 1049 if (flags->proxy == TASK_PROXY) { 1050 flags->tiedness = TASK_UNTIED; 1051 flags->merged_if0 = 1; 1052 1053 /* are we running in a sequential parallel or tskm_immediate_exec... 
we need 1054 tasking support enabled */ 1055 if ((thread->th.th_task_team) == NULL) { 1056 /* This should only happen if the team is serialized 1057 setup a task team and propagate it to the thread */ 1058 KMP_DEBUG_ASSERT(team->t.t_serialized); 1059 KA_TRACE(30, 1060 ("T#%d creating task team in __kmp_task_alloc for proxy task\n", 1061 gtid)); 1062 __kmp_task_team_setup( 1063 thread, team, 1064 1); // 1 indicates setup the current team regardless of nthreads 1065 thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state]; 1066 } 1067 kmp_task_team_t *task_team = thread->th.th_task_team; 1068 1069 /* tasking must be enabled now as the task might not be pushed */ 1070 if (!KMP_TASKING_ENABLED(task_team)) { 1071 KA_TRACE( 1072 30, 1073 ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid)); 1074 __kmp_enable_tasking(task_team, thread); 1075 kmp_int32 tid = thread->th.th_info.ds.ds_tid; 1076 kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid]; 1077 // No lock needed since only owner can allocate 1078 if (thread_data->td.td_deque == NULL) { 1079 __kmp_alloc_task_deque(thread, thread_data); 1080 } 1081 } 1082 1083 if (task_team->tt.tt_found_proxy_tasks == FALSE) 1084 TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE); 1085 } 1086 #endif 1087 1088 // Calculate shared structure offset including padding after kmp_task_t struct 1089 // to align pointers in shared struct 1090 shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t; 1091 shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *)); 1092 1093 // Allocate a kmp_taskdata_t block and a kmp_task_t block. 
1094 KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid, 1095 shareds_offset)); 1096 KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid, 1097 sizeof_shareds)); 1098 1099 // Avoid double allocation here by combining shareds with taskdata 1100 #if USE_FAST_MEMORY 1101 taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset + 1102 sizeof_shareds); 1103 #else /* ! USE_FAST_MEMORY */ 1104 taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset + 1105 sizeof_shareds); 1106 #endif /* USE_FAST_MEMORY */ 1107 ANNOTATE_HAPPENS_AFTER(taskdata); 1108 1109 task = KMP_TASKDATA_TO_TASK(taskdata); 1110 1111 // Make sure task & taskdata are aligned appropriately 1112 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD 1113 KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0); 1114 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0); 1115 #else 1116 KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0); 1117 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0); 1118 #endif 1119 if (sizeof_shareds > 0) { 1120 // Avoid double allocation here by combining shareds with taskdata 1121 task->shareds = &((char *)taskdata)[shareds_offset]; 1122 // Make sure shareds struct is aligned to pointer size 1123 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) == 1124 0); 1125 } else { 1126 task->shareds = NULL; 1127 } 1128 task->routine = task_entry; 1129 task->part_id = 0; // AC: Always start with 0 part id 1130 1131 taskdata->td_task_id = KMP_GEN_TASK_ID(); 1132 taskdata->td_team = team; 1133 taskdata->td_alloc_thread = thread; 1134 taskdata->td_parent = parent_task; 1135 taskdata->td_level = parent_task->td_level + 1; // increment nesting level 1136 KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0); 1137 taskdata->td_ident = loc_ref; 1138 taskdata->td_taskwait_ident = NULL; 1139 taskdata->td_taskwait_counter = 0; 1140 
taskdata->td_taskwait_thread = 0; 1141 KMP_DEBUG_ASSERT(taskdata->td_parent != NULL); 1142 #if OMP_45_ENABLED 1143 // avoid copying icvs for proxy tasks 1144 if (flags->proxy == TASK_FULL) 1145 #endif 1146 copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs); 1147 1148 taskdata->td_flags.tiedness = flags->tiedness; 1149 taskdata->td_flags.final = flags->final; 1150 taskdata->td_flags.merged_if0 = flags->merged_if0; 1151 #if OMP_40_ENABLED 1152 taskdata->td_flags.destructors_thunk = flags->destructors_thunk; 1153 #endif // OMP_40_ENABLED 1154 #if OMP_45_ENABLED 1155 taskdata->td_flags.proxy = flags->proxy; 1156 taskdata->td_task_team = thread->th.th_task_team; 1157 taskdata->td_size_alloc = shareds_offset + sizeof_shareds; 1158 #endif 1159 taskdata->td_flags.tasktype = TASK_EXPLICIT; 1160 1161 // GEH - TODO: fix this to copy parent task's value of tasking_ser flag 1162 taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec); 1163 1164 // GEH - TODO: fix this to copy parent task's value of team_serial flag 1165 taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0; 1166 1167 // GEH - Note we serialize the task if the team is serialized to make sure 1168 // implicit parallel region tasks are not left until program termination to 1169 // execute. Also, it helps locality to execute immediately. 
1170 1171 taskdata->td_flags.task_serial = 1172 (parent_task->td_flags.final || taskdata->td_flags.team_serial || 1173 taskdata->td_flags.tasking_ser); 1174 1175 taskdata->td_flags.started = 0; 1176 taskdata->td_flags.executing = 0; 1177 taskdata->td_flags.complete = 0; 1178 taskdata->td_flags.freed = 0; 1179 1180 taskdata->td_flags.native = flags->native; 1181 1182 KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0); 1183 // start at one because counts current task and children 1184 KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1); 1185 #if OMP_40_ENABLED 1186 taskdata->td_taskgroup = 1187 parent_task->td_taskgroup; // task inherits taskgroup from the parent task 1188 taskdata->td_dephash = NULL; 1189 taskdata->td_depnode = NULL; 1190 #endif 1191 if (flags->tiedness == TASK_UNTIED) 1192 taskdata->td_last_tied = NULL; // will be set when the task is scheduled 1193 else 1194 taskdata->td_last_tied = taskdata; 1195 1196 #if OMPT_SUPPORT 1197 if (UNLIKELY(ompt_enabled.enabled)) 1198 __ompt_task_init(taskdata, gtid); 1199 #endif 1200 // Only need to keep track of child task counts if team parallel and tasking not 1201 // serialized or if it is a proxy task 1202 #if OMP_45_ENABLED 1203 if (flags->proxy == TASK_PROXY || 1204 !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) 1205 #else 1206 if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) 1207 #endif 1208 { 1209 KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks); 1210 #if OMP_40_ENABLED 1211 if (parent_task->td_taskgroup) 1212 KMP_ATOMIC_INC(&parent_task->td_taskgroup->count); 1213 #endif 1214 // Only need to keep track of allocated child tasks for explicit tasks since 1215 // implicit not deallocated 1216 if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) { 1217 KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks); 1218 } 1219 } 1220 1221 KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n", 1222 gtid, taskdata, 
taskdata->td_parent)); 1223 ANNOTATE_HAPPENS_BEFORE(task); 1224 1225 return task; 1226 } 1227 1228 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, 1229 kmp_int32 flags, size_t sizeof_kmp_task_t, 1230 size_t sizeof_shareds, 1231 kmp_routine_entry_t task_entry) { 1232 kmp_task_t *retval; 1233 kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags; 1234 1235 input_flags->native = FALSE; 1236 // __kmp_task_alloc() sets up all other runtime flags 1237 1238 #if OMP_45_ENABLED 1239 KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s) " 1240 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n", 1241 gtid, loc_ref, input_flags->tiedness ? "tied " : "untied", 1242 input_flags->proxy ? "proxy" : "", sizeof_kmp_task_t, 1243 sizeof_shareds, task_entry)); 1244 #else 1245 KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s) " 1246 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n", 1247 gtid, loc_ref, input_flags->tiedness ? "tied " : "untied", 1248 sizeof_kmp_task_t, sizeof_shareds, task_entry)); 1249 #endif 1250 1251 retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t, 1252 sizeof_shareds, task_entry); 1253 1254 KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval)); 1255 1256 return retval; 1257 } 1258 1259 // __kmp_invoke_task: invoke the specified task 1260 // 1261 // gtid: global thread ID of caller 1262 // task: the task to invoke 1263 // current_task: the task to resume after task invokation 1264 static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task, 1265 kmp_taskdata_t *current_task) { 1266 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 1267 kmp_uint64 cur_time; 1268 #if OMP_40_ENABLED 1269 int discard = 0 /* false */; 1270 #endif 1271 KA_TRACE( 1272 30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n", 1273 gtid, taskdata, current_task)); 1274 KMP_DEBUG_ASSERT(task); 1275 #if OMP_45_ENABLED 1276 if (taskdata->td_flags.proxy == 
TASK_PROXY && 1277 taskdata->td_flags.complete == 1) { 1278 // This is a proxy task that was already completed but it needs to run 1279 // its bottom-half finish 1280 KA_TRACE( 1281 30, 1282 ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n", 1283 gtid, taskdata)); 1284 1285 __kmp_bottom_half_finish_proxy(gtid, task); 1286 1287 KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for " 1288 "proxy task %p, resuming task %p\n", 1289 gtid, taskdata, current_task)); 1290 1291 return; 1292 } 1293 #endif 1294 1295 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1296 if (__kmp_forkjoin_frames_mode == 3) { 1297 // Get the current time stamp to measure task execution time to correct 1298 // barrier imbalance time 1299 cur_time = __itt_get_timestamp(); 1300 } 1301 #endif 1302 1303 #if OMPT_SUPPORT 1304 // For untied tasks, the first task executed only calls __kmpc_omp_task and 1305 // does not execute code. 1306 ompt_thread_info_t oldInfo; 1307 kmp_info_t *thread; 1308 if (UNLIKELY(ompt_enabled.enabled)) { 1309 // Store the threads states and restore them after the task 1310 thread = __kmp_threads[gtid]; 1311 oldInfo = thread->th.ompt_thread_info; 1312 thread->th.ompt_thread_info.wait_id = 0; 1313 thread->th.ompt_thread_info.state = (thread->th.th_team_serialized) 1314 ? omp_state_work_serial 1315 : omp_state_work_parallel; 1316 taskdata->ompt_task_info.frame.exit_frame = OMPT_GET_FRAME_ADDRESS(0); 1317 } 1318 #endif 1319 1320 #if OMP_45_ENABLED 1321 // Proxy tasks are not handled by the runtime 1322 if (taskdata->td_flags.proxy != TASK_PROXY) { 1323 #endif 1324 ANNOTATE_HAPPENS_AFTER(task); 1325 __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded 1326 #if OMP_45_ENABLED 1327 } 1328 #endif 1329 1330 #if OMP_40_ENABLED 1331 // TODO: cancel tasks if the parallel region has also been cancelled 1332 // TODO: check if this sequence can be hoisted above __kmp_task_start 1333 // if cancellation has been enabled for this run ... 
1334 if (__kmp_omp_cancellation) { 1335 kmp_info_t *this_thr = __kmp_threads[gtid]; 1336 kmp_team_t *this_team = this_thr->th.th_team; 1337 kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup; 1338 if ((taskgroup && taskgroup->cancel_request) || 1339 (this_team->t.t_cancel_request == cancel_parallel)) { 1340 #if OMPT_SUPPORT && OMPT_OPTIONAL 1341 ompt_data_t *task_data; 1342 if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) { 1343 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL); 1344 ompt_callbacks.ompt_callback(ompt_callback_cancel)( 1345 task_data, 1346 ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup 1347 : ompt_cancel_parallel) | 1348 ompt_cancel_discarded_task, 1349 NULL); 1350 } 1351 #endif 1352 KMP_COUNT_BLOCK(TASK_cancelled); 1353 // this task belongs to a task group and we need to cancel it 1354 discard = 1 /* true */; 1355 } 1356 } 1357 1358 // Invoke the task routine and pass in relevant data. 1359 // Thunks generated by gcc take a different argument list. 
1360 if (!discard) { 1361 if (taskdata->td_flags.tiedness == TASK_UNTIED) { 1362 taskdata->td_last_tied = current_task->td_last_tied; 1363 KMP_DEBUG_ASSERT(taskdata->td_last_tied); 1364 } 1365 #if KMP_STATS_ENABLED 1366 KMP_COUNT_BLOCK(TASK_executed); 1367 switch (KMP_GET_THREAD_STATE()) { 1368 case FORK_JOIN_BARRIER: 1369 KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar); 1370 break; 1371 case PLAIN_BARRIER: 1372 KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar); 1373 break; 1374 case TASKYIELD: 1375 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield); 1376 break; 1377 case TASKWAIT: 1378 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait); 1379 break; 1380 case TASKGROUP: 1381 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup); 1382 break; 1383 default: 1384 KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate); 1385 break; 1386 } 1387 #endif // KMP_STATS_ENABLED 1388 #endif // OMP_40_ENABLED 1389 1390 // OMPT task begin 1391 #if OMPT_SUPPORT 1392 if (UNLIKELY(ompt_enabled.enabled)) 1393 __ompt_task_start(task, current_task, gtid); 1394 #endif 1395 1396 #ifdef KMP_GOMP_COMPAT 1397 if (taskdata->td_flags.native) { 1398 ((void (*)(void *))(*(task->routine)))(task->shareds); 1399 } else 1400 #endif /* KMP_GOMP_COMPAT */ 1401 { 1402 (*(task->routine))(gtid, task); 1403 } 1404 KMP_POP_PARTITIONED_TIMER(); 1405 1406 #if OMP_40_ENABLED 1407 } 1408 #endif // OMP_40_ENABLED 1409 1410 1411 #if OMP_45_ENABLED 1412 // Proxy tasks are not handled by the runtime 1413 if (taskdata->td_flags.proxy != TASK_PROXY) { 1414 #endif 1415 ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent); 1416 #if OMPT_SUPPORT 1417 if (UNLIKELY(ompt_enabled.enabled)) { 1418 thread->th.ompt_thread_info = oldInfo; 1419 if (taskdata->td_flags.tiedness == TASK_TIED) { 1420 taskdata->ompt_task_info.frame.exit_frame = NULL; 1421 } 1422 __kmp_task_finish<true>(gtid, task, current_task); 1423 } else 1424 #endif 1425 __kmp_task_finish<false>(gtid, task, current_task); 1426 #if OMP_45_ENABLED 1427 } 1428 #endif 1429 1430 #if USE_ITT_BUILD && 
USE_ITT_NOTIFY 1431 // Barrier imbalance - correct arrive time after the task finished 1432 if (__kmp_forkjoin_frames_mode == 3) { 1433 kmp_info_t *this_thr = __kmp_threads[gtid]; 1434 if (this_thr->th.th_bar_arrive_time) { 1435 this_thr->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time); 1436 } 1437 } 1438 #endif 1439 KA_TRACE( 1440 30, 1441 ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n", 1442 gtid, taskdata, current_task)); 1443 return; 1444 } 1445 1446 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution 1447 // 1448 // loc_ref: location of original task pragma (ignored) 1449 // gtid: Global Thread ID of encountering thread 1450 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task'' 1451 // Returns: 1452 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1453 // be resumed later. 1454 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1455 // resumed later. 1456 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid, 1457 kmp_task_t *new_task) { 1458 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1459 1460 KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid, 1461 loc_ref, new_taskdata)); 1462 1463 #if OMPT_SUPPORT 1464 kmp_taskdata_t *parent; 1465 if (UNLIKELY(ompt_enabled.enabled)) { 1466 parent = new_taskdata->td_parent; 1467 if (ompt_enabled.ompt_callback_task_create) { 1468 ompt_data_t task_data = ompt_data_none; 1469 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 1470 parent ? &(parent->ompt_task_info.task_data) : &task_data, 1471 parent ? &(parent->ompt_task_info.frame) : NULL, 1472 &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0, 1473 OMPT_GET_RETURN_ADDRESS(0)); 1474 } 1475 } 1476 #endif 1477 1478 /* Should we execute the new task or queue it? For now, let's just always try 1479 to queue it. If the queue fills up, then we'll execute it. 
*/ 1480 1481 if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer 1482 { // Execute this task immediately 1483 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 1484 new_taskdata->td_flags.task_serial = 1; 1485 __kmp_invoke_task(gtid, new_task, current_task); 1486 } 1487 1488 KA_TRACE( 1489 10, 1490 ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: " 1491 "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n", 1492 gtid, loc_ref, new_taskdata)); 1493 1494 ANNOTATE_HAPPENS_BEFORE(new_task); 1495 #if OMPT_SUPPORT 1496 if (UNLIKELY(ompt_enabled.enabled)) { 1497 parent->ompt_task_info.frame.enter_frame = NULL; 1498 } 1499 #endif 1500 return TASK_CURRENT_NOT_QUEUED; 1501 } 1502 1503 // __kmp_omp_task: Schedule a non-thread-switchable task for execution 1504 // 1505 // gtid: Global Thread ID of encountering thread 1506 // new_task:non-thread-switchable task thunk allocated by __kmp_omp_task_alloc() 1507 // serialize_immediate: if TRUE then if the task is executed immediately its 1508 // execution will be serialized 1509 // Returns: 1510 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1511 // be resumed later. 1512 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1513 // resumed later. 1514 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task, 1515 bool serialize_immediate) { 1516 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1517 1518 /* Should we execute the new task or queue it? For now, let's just always try to 1519 queue it. If the queue fills up, then we'll execute it. 
*/ 1520 #if OMP_45_ENABLED 1521 if (new_taskdata->td_flags.proxy == TASK_PROXY || 1522 __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer 1523 #else 1524 if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer 1525 #endif 1526 { // Execute this task immediately 1527 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 1528 if (serialize_immediate) 1529 new_taskdata->td_flags.task_serial = 1; 1530 __kmp_invoke_task(gtid, new_task, current_task); 1531 } 1532 1533 ANNOTATE_HAPPENS_BEFORE(new_task); 1534 return TASK_CURRENT_NOT_QUEUED; 1535 } 1536 1537 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a 1538 // non-thread-switchable task from the parent thread only! 1539 // 1540 // loc_ref: location of original task pragma (ignored) 1541 // gtid: Global Thread ID of encountering thread 1542 // new_task: non-thread-switchable task thunk allocated by 1543 // __kmp_omp_task_alloc() 1544 // Returns: 1545 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1546 // be resumed later. 1547 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1548 // resumed later. 
1549 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid, 1550 kmp_task_t *new_task) { 1551 kmp_int32 res; 1552 KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK); 1553 1554 #if KMP_DEBUG || OMPT_SUPPORT 1555 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1556 #endif 1557 KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref, 1558 new_taskdata)); 1559 1560 #if OMPT_SUPPORT 1561 kmp_taskdata_t *parent = NULL; 1562 if (UNLIKELY(ompt_enabled.enabled)) { 1563 if (!new_taskdata->td_flags.started) { 1564 OMPT_STORE_RETURN_ADDRESS(gtid); 1565 parent = new_taskdata->td_parent; 1566 if (!parent->ompt_task_info.frame.enter_frame) { 1567 parent->ompt_task_info.frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1); 1568 } 1569 if (ompt_enabled.ompt_callback_task_create) { 1570 ompt_data_t task_data = ompt_data_none; 1571 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 1572 parent ? &(parent->ompt_task_info.task_data) : &task_data, 1573 parent ? &(parent->ompt_task_info.frame) : NULL, 1574 &(new_taskdata->ompt_task_info.task_data), 1575 ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, 1576 OMPT_LOAD_RETURN_ADDRESS(gtid)); 1577 } 1578 } else { 1579 // We are scheduling the continuation of an UNTIED task. 1580 // Scheduling back to the parent task. 
1581 __ompt_task_finish(new_task, 1582 new_taskdata->ompt_task_info.scheduling_parent, 1583 ompt_task_others); 1584 new_taskdata->ompt_task_info.frame.exit_frame = NULL; 1585 } 1586 } 1587 #endif 1588 1589 res = __kmp_omp_task(gtid, new_task, true); 1590 1591 KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning " 1592 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n", 1593 gtid, loc_ref, new_taskdata)); 1594 #if OMPT_SUPPORT 1595 if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) { 1596 parent->ompt_task_info.frame.enter_frame = NULL; 1597 } 1598 #endif 1599 return res; 1600 } 1601 1602 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule 1603 // a taskloop task with the correct OMPT return address 1604 // 1605 // loc_ref: location of original task pragma (ignored) 1606 // gtid: Global Thread ID of encountering thread 1607 // new_task: non-thread-switchable task thunk allocated by 1608 // __kmp_omp_task_alloc() 1609 // codeptr_ra: return address for OMPT callback 1610 // Returns: 1611 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1612 // be resumed later. 1613 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1614 // resumed later. 
1615 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid, 1616 kmp_task_t *new_task, void *codeptr_ra) { 1617 kmp_int32 res; 1618 KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK); 1619 1620 #if KMP_DEBUG || OMPT_SUPPORT 1621 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1622 #endif 1623 KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref, 1624 new_taskdata)); 1625 1626 #if OMPT_SUPPORT 1627 kmp_taskdata_t *parent = NULL; 1628 if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) { 1629 parent = new_taskdata->td_parent; 1630 if (!parent->ompt_task_info.frame.enter_frame) 1631 parent->ompt_task_info.frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1); 1632 if (ompt_enabled.ompt_callback_task_create) { 1633 ompt_data_t task_data = ompt_data_none; 1634 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 1635 parent ? &(parent->ompt_task_info.task_data) : &task_data, 1636 parent ? &(parent->ompt_task_info.frame) : NULL, 1637 &(new_taskdata->ompt_task_info.task_data), 1638 ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, 1639 codeptr_ra); 1640 } 1641 } 1642 #endif 1643 1644 res = __kmp_omp_task(gtid, new_task, true); 1645 1646 KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning " 1647 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n", 1648 gtid, loc_ref, new_taskdata)); 1649 #if OMPT_SUPPORT 1650 if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) { 1651 parent->ompt_task_info.frame.enter_frame = NULL; 1652 } 1653 #endif 1654 return res; 1655 } 1656 1657 template <bool ompt> 1658 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid, 1659 void *frame_address, 1660 void *return_address) { 1661 kmp_taskdata_t *taskdata; 1662 kmp_info_t *thread; 1663 int thread_finished = FALSE; 1664 KMP_SET_THREAD_STATE_BLOCK(TASKWAIT); 1665 1666 KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref)); 1667 1668 if (__kmp_tasking_mode != tskm_immediate_exec) { 1669 
thread = __kmp_threads[gtid]; 1670 taskdata = thread->th.th_current_task; 1671 1672 #if OMPT_SUPPORT && OMPT_OPTIONAL 1673 ompt_data_t *my_task_data; 1674 ompt_data_t *my_parallel_data; 1675 1676 if (ompt) { 1677 my_task_data = &(taskdata->ompt_task_info.task_data); 1678 my_parallel_data = OMPT_CUR_TEAM_DATA(thread); 1679 1680 taskdata->ompt_task_info.frame.enter_frame = frame_address; 1681 1682 if (ompt_enabled.ompt_callback_sync_region) { 1683 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 1684 ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data, 1685 my_task_data, return_address); 1686 } 1687 1688 if (ompt_enabled.ompt_callback_sync_region_wait) { 1689 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 1690 ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data, 1691 my_task_data, return_address); 1692 } 1693 } 1694 #endif // OMPT_SUPPORT && OMPT_OPTIONAL 1695 1696 // Debugger: The taskwait is active. Store location and thread encountered the 1697 // taskwait. 1698 #if USE_ITT_BUILD 1699 // Note: These values are used by ITT events as well. 
1700 #endif /* USE_ITT_BUILD */ 1701 taskdata->td_taskwait_counter += 1; 1702 taskdata->td_taskwait_ident = loc_ref; 1703 taskdata->td_taskwait_thread = gtid + 1; 1704 1705 #if USE_ITT_BUILD 1706 void *itt_sync_obj = __kmp_itt_taskwait_object(gtid); 1707 if (itt_sync_obj != NULL) 1708 __kmp_itt_taskwait_starting(gtid, itt_sync_obj); 1709 #endif /* USE_ITT_BUILD */ 1710 1711 bool must_wait = 1712 !taskdata->td_flags.team_serial && !taskdata->td_flags.final; 1713 1714 #if OMP_45_ENABLED 1715 must_wait = must_wait || (thread->th.th_task_team != NULL && 1716 thread->th.th_task_team->tt.tt_found_proxy_tasks); 1717 #endif 1718 if (must_wait) { 1719 kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *, 1720 &(taskdata->td_incomplete_child_tasks)), 1721 0U); 1722 while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) { 1723 flag.execute_tasks(thread, gtid, FALSE, 1724 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 1725 __kmp_task_stealing_constraint); 1726 } 1727 } 1728 #if USE_ITT_BUILD 1729 if (itt_sync_obj != NULL) 1730 __kmp_itt_taskwait_finished(gtid, itt_sync_obj); 1731 #endif /* USE_ITT_BUILD */ 1732 1733 // Debugger: The taskwait is completed. Location remains, but thread is 1734 // negated. 
1735 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; 1736 1737 #if OMPT_SUPPORT && OMPT_OPTIONAL 1738 if (ompt) { 1739 if (ompt_enabled.ompt_callback_sync_region_wait) { 1740 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 1741 ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data, 1742 my_task_data, return_address); 1743 } 1744 if (ompt_enabled.ompt_callback_sync_region) { 1745 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 1746 ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data, 1747 my_task_data, return_address); 1748 } 1749 taskdata->ompt_task_info.frame.enter_frame = NULL; 1750 } 1751 #endif // OMPT_SUPPORT && OMPT_OPTIONAL 1752 1753 ANNOTATE_HAPPENS_AFTER(taskdata); 1754 } 1755 1756 KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, " 1757 "returning TASK_CURRENT_NOT_QUEUED\n", 1758 gtid, taskdata)); 1759 1760 return TASK_CURRENT_NOT_QUEUED; 1761 } 1762 1763 #if OMPT_SUPPORT 1764 OMPT_NOINLINE 1765 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid, 1766 void *frame_address, 1767 void *return_address) { 1768 return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address, 1769 return_address); 1770 } 1771 #endif // OMPT_SUPPORT 1772 1773 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are 1774 // complete 1775 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) { 1776 #if OMPT_SUPPORT && OMPT_OPTIONAL 1777 if (UNLIKELY(ompt_enabled.enabled)) { 1778 OMPT_STORE_RETURN_ADDRESS(gtid); 1779 return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(1), 1780 OMPT_LOAD_RETURN_ADDRESS(gtid)); 1781 } 1782 #endif 1783 return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL); 1784 } 1785 1786 // __kmpc_omp_taskyield: switch to a different task 1787 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) { 1788 kmp_taskdata_t *taskdata; 1789 kmp_info_t *thread; 1790 int 
thread_finished = FALSE; 1791 1792 KMP_COUNT_BLOCK(OMP_TASKYIELD); 1793 KMP_SET_THREAD_STATE_BLOCK(TASKYIELD); 1794 1795 KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n", 1796 gtid, loc_ref, end_part)); 1797 1798 if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) { 1799 thread = __kmp_threads[gtid]; 1800 taskdata = thread->th.th_current_task; 1801 // Should we model this as a task wait or not? 1802 // Debugger: The taskwait is active. Store location and thread encountered the 1803 // taskwait. 1804 #if USE_ITT_BUILD 1805 // Note: These values are used by ITT events as well. 1806 #endif /* USE_ITT_BUILD */ 1807 taskdata->td_taskwait_counter += 1; 1808 taskdata->td_taskwait_ident = loc_ref; 1809 taskdata->td_taskwait_thread = gtid + 1; 1810 1811 #if USE_ITT_BUILD 1812 void *itt_sync_obj = __kmp_itt_taskwait_object(gtid); 1813 if (itt_sync_obj != NULL) 1814 __kmp_itt_taskwait_starting(gtid, itt_sync_obj); 1815 #endif /* USE_ITT_BUILD */ 1816 if (!taskdata->td_flags.team_serial) { 1817 kmp_task_team_t *task_team = thread->th.th_task_team; 1818 if (task_team != NULL) { 1819 if (KMP_TASKING_ENABLED(task_team)) { 1820 #if OMPT_SUPPORT 1821 if (UNLIKELY(ompt_enabled.enabled)) 1822 thread->th.ompt_thread_info.ompt_task_yielded = 1; 1823 #endif 1824 __kmp_execute_tasks_32( 1825 thread, gtid, NULL, FALSE, 1826 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 1827 __kmp_task_stealing_constraint); 1828 #if OMPT_SUPPORT 1829 if (UNLIKELY(ompt_enabled.enabled)) 1830 thread->th.ompt_thread_info.ompt_task_yielded = 0; 1831 #endif 1832 } 1833 } 1834 } 1835 #if USE_ITT_BUILD 1836 if (itt_sync_obj != NULL) 1837 __kmp_itt_taskwait_finished(gtid, itt_sync_obj); 1838 #endif /* USE_ITT_BUILD */ 1839 1840 // Debugger: The taskwait is completed. Location remains, but thread is 1841 // negated. 
1842 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; 1843 } 1844 1845 KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, " 1846 "returning TASK_CURRENT_NOT_QUEUED\n", 1847 gtid, taskdata)); 1848 1849 return TASK_CURRENT_NOT_QUEUED; 1850 } 1851 1852 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work 1853 #if OMP_45_ENABLED 1854 // Task Reduction implementation 1855 1856 typedef struct kmp_task_red_flags { 1857 unsigned lazy_priv : 1; // hint: (1) use lazy allocation (big objects) 1858 unsigned reserved31 : 31; 1859 } kmp_task_red_flags_t; 1860 1861 // internal structure for reduction data item related info 1862 typedef struct kmp_task_red_data { 1863 void *reduce_shar; // shared reduction item 1864 size_t reduce_size; // size of data item 1865 void *reduce_priv; // thread specific data 1866 void *reduce_pend; // end of private data for comparison op 1867 void *reduce_init; // data initialization routine 1868 void *reduce_fini; // data finalization routine 1869 void *reduce_comb; // data combiner routine 1870 kmp_task_red_flags_t flags; // flags for additional info from compiler 1871 } kmp_task_red_data_t; 1872 1873 // structure sent us by compiler - one per reduction item 1874 typedef struct kmp_task_red_input { 1875 void *reduce_shar; // shared reduction item 1876 size_t reduce_size; // size of data item 1877 void *reduce_init; // data initialization routine 1878 void *reduce_fini; // data finalization routine 1879 void *reduce_comb; // data combiner routine 1880 kmp_task_red_flags_t flags; // flags for additional info from compiler 1881 } kmp_task_red_input_t; 1882 1883 /*! 1884 @ingroup TASKING 1885 @param gtid Global thread ID 1886 @param num Number of data items to reduce 1887 @param data Array of data for reduction 1888 @return The taskgroup identifier 1889 1890 Initialize task reduction for the taskgroup. 
1891 */ 1892 void *__kmpc_task_reduction_init(int gtid, int num, void *data) { 1893 kmp_info_t *thread = __kmp_threads[gtid]; 1894 kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup; 1895 kmp_int32 nth = thread->th.th_team_nproc; 1896 kmp_task_red_input_t *input = (kmp_task_red_input_t *)data; 1897 kmp_task_red_data_t *arr; 1898 1899 // check input data just in case 1900 KMP_ASSERT(tg != NULL); 1901 KMP_ASSERT(data != NULL); 1902 KMP_ASSERT(num > 0); 1903 if (nth == 1) { 1904 KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n", 1905 gtid, tg)); 1906 return (void *)tg; 1907 } 1908 KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n", 1909 gtid, tg, num)); 1910 arr = (kmp_task_red_data_t *)__kmp_thread_malloc( 1911 thread, num * sizeof(kmp_task_red_data_t)); 1912 for (int i = 0; i < num; ++i) { 1913 void (*f_init)(void *) = (void (*)(void *))(input[i].reduce_init); 1914 size_t size = input[i].reduce_size - 1; 1915 // round the size up to cache line per thread-specific item 1916 size += CACHE_LINE - size % CACHE_LINE; 1917 KMP_ASSERT(input[i].reduce_comb != NULL); // combiner is mandatory 1918 arr[i].reduce_shar = input[i].reduce_shar; 1919 arr[i].reduce_size = size; 1920 arr[i].reduce_init = input[i].reduce_init; 1921 arr[i].reduce_fini = input[i].reduce_fini; 1922 arr[i].reduce_comb = input[i].reduce_comb; 1923 arr[i].flags = input[i].flags; 1924 if (!input[i].flags.lazy_priv) { 1925 // allocate cache-line aligned block and fill it with zeros 1926 arr[i].reduce_priv = __kmp_allocate(nth * size); 1927 arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size; 1928 if (f_init != NULL) { 1929 // initialize thread-specific items 1930 for (int j = 0; j < nth; ++j) { 1931 f_init((char *)(arr[i].reduce_priv) + j * size); 1932 } 1933 } 1934 } else { 1935 // only allocate space for pointers now, 1936 // objects will be lazily allocated/initialized once requested 1937 arr[i].reduce_priv = __kmp_allocate(nth * 
sizeof(void *)); 1938 } 1939 } 1940 tg->reduce_data = (void *)arr; 1941 tg->reduce_num_data = num; 1942 return (void *)tg; 1943 } 1944 1945 /*! 1946 @ingroup TASKING 1947 @param gtid Global thread ID 1948 @param tskgrp The taskgroup ID (optional) 1949 @param data Shared location of the item 1950 @return The pointer to per-thread data 1951 1952 Get thread-specific location of data item 1953 */ 1954 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) { 1955 kmp_info_t *thread = __kmp_threads[gtid]; 1956 kmp_int32 nth = thread->th.th_team_nproc; 1957 if (nth == 1) 1958 return data; // nothing to do 1959 1960 kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp; 1961 if (tg == NULL) 1962 tg = thread->th.th_current_task->td_taskgroup; 1963 KMP_ASSERT(tg != NULL); 1964 kmp_task_red_data_t *arr = (kmp_task_red_data_t *)(tg->reduce_data); 1965 kmp_int32 num = tg->reduce_num_data; 1966 kmp_int32 tid = thread->th.th_info.ds.ds_tid; 1967 1968 KMP_ASSERT(data != NULL); 1969 while (tg != NULL) { 1970 for (int i = 0; i < num; ++i) { 1971 if (!arr[i].flags.lazy_priv) { 1972 if (data == arr[i].reduce_shar || 1973 (data >= arr[i].reduce_priv && data < arr[i].reduce_pend)) 1974 return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size; 1975 } else { 1976 // check shared location first 1977 void **p_priv = (void **)(arr[i].reduce_priv); 1978 if (data == arr[i].reduce_shar) 1979 goto found; 1980 // check if we get some thread specific location as parameter 1981 for (int j = 0; j < nth; ++j) 1982 if (data == p_priv[j]) 1983 goto found; 1984 continue; // not found, continue search 1985 found: 1986 if (p_priv[tid] == NULL) { 1987 // allocate thread specific object lazily 1988 void (*f_init)(void *) = (void (*)(void *))(arr[i].reduce_init); 1989 p_priv[tid] = __kmp_allocate(arr[i].reduce_size); 1990 if (f_init != NULL) { 1991 f_init(p_priv[tid]); 1992 } 1993 } 1994 return p_priv[tid]; 1995 } 1996 } 1997 tg = tg->parent; 1998 arr = (kmp_task_red_data_t 
*)(tg->reduce_data); 1999 num = tg->reduce_num_data; 2000 } 2001 KMP_ASSERT2(0, "Unknown task reduction item"); 2002 return NULL; // ERROR, this line never executed 2003 } 2004 2005 // Finalize task reduction. 2006 // Called from __kmpc_end_taskgroup() 2007 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) { 2008 kmp_int32 nth = th->th.th_team_nproc; 2009 KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1 2010 kmp_task_red_data_t *arr = (kmp_task_red_data_t *)tg->reduce_data; 2011 kmp_int32 num = tg->reduce_num_data; 2012 for (int i = 0; i < num; ++i) { 2013 void *sh_data = arr[i].reduce_shar; 2014 void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini); 2015 void (*f_comb)(void *, void *) = 2016 (void (*)(void *, void *))(arr[i].reduce_comb); 2017 if (!arr[i].flags.lazy_priv) { 2018 void *pr_data = arr[i].reduce_priv; 2019 size_t size = arr[i].reduce_size; 2020 for (int j = 0; j < nth; ++j) { 2021 void *priv_data = (char *)pr_data + j * size; 2022 f_comb(sh_data, priv_data); // combine results 2023 if (f_fini) 2024 f_fini(priv_data); // finalize if needed 2025 } 2026 } else { 2027 void **pr_data = (void **)(arr[i].reduce_priv); 2028 for (int j = 0; j < nth; ++j) { 2029 if (pr_data[j] != NULL) { 2030 f_comb(sh_data, pr_data[j]); // combine results 2031 if (f_fini) 2032 f_fini(pr_data[j]); // finalize if needed 2033 __kmp_free(pr_data[j]); 2034 } 2035 } 2036 } 2037 __kmp_free(arr[i].reduce_priv); 2038 } 2039 __kmp_thread_free(th, arr); 2040 tg->reduce_data = NULL; 2041 tg->reduce_num_data = 0; 2042 } 2043 #endif 2044 2045 #if OMP_40_ENABLED 2046 // __kmpc_taskgroup: Start a new taskgroup 2047 void __kmpc_taskgroup(ident_t *loc, int gtid) { 2048 kmp_info_t *thread = __kmp_threads[gtid]; 2049 kmp_taskdata_t *taskdata = thread->th.th_current_task; 2050 kmp_taskgroup_t *tg_new = 2051 (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t)); 2052 KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, 
loc, tg_new)); 2053 KMP_ATOMIC_ST_RLX(&tg_new->count, 0); 2054 KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq); 2055 tg_new->parent = taskdata->td_taskgroup; 2056 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work 2057 #if OMP_45_ENABLED 2058 tg_new->reduce_data = NULL; 2059 tg_new->reduce_num_data = 0; 2060 #endif 2061 taskdata->td_taskgroup = tg_new; 2062 2063 #if OMPT_SUPPORT && OMPT_OPTIONAL 2064 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) { 2065 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2066 if (!codeptr) 2067 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2068 kmp_team_t *team = thread->th.th_team; 2069 ompt_data_t my_task_data = taskdata->ompt_task_info.task_data; 2070 // FIXME: I think this is wrong for lwt! 2071 ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data; 2072 2073 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 2074 ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data), 2075 &(my_task_data), codeptr); 2076 } 2077 #endif 2078 } 2079 2080 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task 2081 // and its descendants are complete 2082 void __kmpc_end_taskgroup(ident_t *loc, int gtid) { 2083 kmp_info_t *thread = __kmp_threads[gtid]; 2084 kmp_taskdata_t *taskdata = thread->th.th_current_task; 2085 kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup; 2086 int thread_finished = FALSE; 2087 2088 #if OMPT_SUPPORT && OMPT_OPTIONAL 2089 kmp_team_t *team; 2090 ompt_data_t my_task_data; 2091 ompt_data_t my_parallel_data; 2092 void *codeptr; 2093 if (UNLIKELY(ompt_enabled.enabled)) { 2094 team = thread->th.th_team; 2095 my_task_data = taskdata->ompt_task_info.task_data; 2096 // FIXME: I think this is wrong for lwt! 
2097 my_parallel_data = team->t.ompt_team_info.parallel_data; 2098 codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); 2099 if (!codeptr) 2100 codeptr = OMPT_GET_RETURN_ADDRESS(0); 2101 } 2102 #endif 2103 2104 KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc)); 2105 KMP_DEBUG_ASSERT(taskgroup != NULL); 2106 KMP_SET_THREAD_STATE_BLOCK(TASKGROUP); 2107 2108 if (__kmp_tasking_mode != tskm_immediate_exec) { 2109 // mark task as waiting not on a barrier 2110 taskdata->td_taskwait_counter += 1; 2111 taskdata->td_taskwait_ident = loc; 2112 taskdata->td_taskwait_thread = gtid + 1; 2113 #if USE_ITT_BUILD 2114 // For ITT the taskgroup wait is similar to taskwait until we need to 2115 // distinguish them 2116 void *itt_sync_obj = __kmp_itt_taskwait_object(gtid); 2117 if (itt_sync_obj != NULL) 2118 __kmp_itt_taskwait_starting(gtid, itt_sync_obj); 2119 #endif /* USE_ITT_BUILD */ 2120 2121 #if OMPT_SUPPORT && OMPT_OPTIONAL 2122 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) { 2123 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 2124 ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data), 2125 &(my_task_data), codeptr); 2126 } 2127 #endif 2128 2129 #if OMP_45_ENABLED 2130 if (!taskdata->td_flags.team_serial || 2131 (thread->th.th_task_team != NULL && 2132 thread->th.th_task_team->tt.tt_found_proxy_tasks)) 2133 #else 2134 if (!taskdata->td_flags.team_serial) 2135 #endif 2136 { 2137 kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 2138 0U); 2139 while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) { 2140 flag.execute_tasks(thread, gtid, FALSE, 2141 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 2142 __kmp_task_stealing_constraint); 2143 } 2144 } 2145 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting 2146 2147 #if OMPT_SUPPORT && OMPT_OPTIONAL 2148 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) { 2149 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 2150 
ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data), 2151 &(my_task_data), codeptr); 2152 } 2153 #endif 2154 2155 #if USE_ITT_BUILD 2156 if (itt_sync_obj != NULL) 2157 __kmp_itt_taskwait_finished(gtid, itt_sync_obj); 2158 #endif /* USE_ITT_BUILD */ 2159 } 2160 KMP_DEBUG_ASSERT(taskgroup->count == 0); 2161 2162 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work 2163 #if OMP_45_ENABLED 2164 if (taskgroup->reduce_data != NULL) // need to reduce? 2165 __kmp_task_reduction_fini(thread, taskgroup); 2166 #endif 2167 // Restore parent taskgroup for the current task 2168 taskdata->td_taskgroup = taskgroup->parent; 2169 __kmp_thread_free(thread, taskgroup); 2170 2171 KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n", 2172 gtid, taskdata)); 2173 ANNOTATE_HAPPENS_AFTER(taskdata); 2174 2175 #if OMPT_SUPPORT && OMPT_OPTIONAL 2176 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) { 2177 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 2178 ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data), 2179 &(my_task_data), codeptr); 2180 } 2181 #endif 2182 } 2183 #endif 2184 2185 // __kmp_remove_my_task: remove a task from my own deque 2186 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid, 2187 kmp_task_team_t *task_team, 2188 kmp_int32 is_constrained) { 2189 kmp_task_t *task; 2190 kmp_taskdata_t *taskdata; 2191 kmp_thread_data_t *thread_data; 2192 kmp_uint32 tail; 2193 2194 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 2195 KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data != 2196 NULL); // Caller should check this condition 2197 2198 thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)]; 2199 2200 KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n", 2201 gtid, thread_data->td.td_deque_ntasks, 2202 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2203 2204 if (TCR_4(thread_data->td.td_deque_ntasks) == 0) 
{ 2205 KA_TRACE(10, 2206 ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: " 2207 "ntasks=%d head=%u tail=%u\n", 2208 gtid, thread_data->td.td_deque_ntasks, 2209 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2210 return NULL; 2211 } 2212 2213 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 2214 2215 if (TCR_4(thread_data->td.td_deque_ntasks) == 0) { 2216 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2217 KA_TRACE(10, 2218 ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: " 2219 "ntasks=%d head=%u tail=%u\n", 2220 gtid, thread_data->td.td_deque_ntasks, 2221 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2222 return NULL; 2223 } 2224 2225 tail = (thread_data->td.td_deque_tail - 1) & 2226 TASK_DEQUE_MASK(thread_data->td); // Wrap index. 2227 taskdata = thread_data->td.td_deque[tail]; 2228 2229 if (is_constrained && (taskdata->td_flags.tiedness == TASK_TIED)) { 2230 // we need to check if the candidate obeys task scheduling constraint (TSC) 2231 // only descendant of all deferred tied tasks can be scheduled, checking 2232 // the last one is enough, as it in turn is the descendant of all others 2233 kmp_taskdata_t *current = thread->th.th_current_task->td_last_tied; 2234 KMP_DEBUG_ASSERT(current != NULL); 2235 // check if last tied task is not suspended on barrier 2236 if (current->td_flags.tasktype == TASK_EXPLICIT || 2237 current->td_taskwait_thread > 0) { // <= 0 on barrier 2238 kmp_int32 level = current->td_level; 2239 kmp_taskdata_t *parent = taskdata->td_parent; 2240 while (parent != current && parent->td_level > level) { 2241 parent = parent->td_parent; // check generation up to the level of the 2242 // current task 2243 KMP_DEBUG_ASSERT(parent != NULL); 2244 } 2245 if (parent != current) { 2246 // The TSC does not allow to steal victim task 2247 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2248 KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: " 
2249 "ntasks=%d head=%u tail=%u\n", 2250 gtid, thread_data->td.td_deque_ntasks, 2251 thread_data->td.td_deque_head, 2252 thread_data->td.td_deque_tail)); 2253 return NULL; 2254 } 2255 } 2256 } 2257 2258 thread_data->td.td_deque_tail = tail; 2259 TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1); 2260 2261 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2262 2263 KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d task %p removed: " 2264 "ntasks=%d head=%u tail=%u\n", 2265 gtid, taskdata, thread_data->td.td_deque_ntasks, 2266 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 2267 2268 task = KMP_TASKDATA_TO_TASK(taskdata); 2269 return task; 2270 } 2271 2272 // __kmp_steal_task: remove a task from another thread's deque 2273 // Assume that calling thread has already checked existence of 2274 // task_team thread_data before calling this routine. 2275 static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid, 2276 kmp_task_team_t *task_team, 2277 std::atomic<kmp_int32> *unfinished_threads, 2278 int *thread_finished, 2279 kmp_int32 is_constrained) { 2280 kmp_task_t *task; 2281 kmp_taskdata_t *taskdata; 2282 kmp_taskdata_t *current; 2283 kmp_thread_data_t *victim_td, *threads_data; 2284 kmp_int32 level, target; 2285 kmp_int32 victim_tid; 2286 2287 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 2288 2289 threads_data = task_team->tt.tt_threads_data; 2290 KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition 2291 2292 victim_tid = victim_thr->th.th_info.ds.ds_tid; 2293 victim_td = &threads_data[victim_tid]; 2294 2295 KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: " 2296 "task_team=%p ntasks=%d head=%u tail=%u\n", 2297 gtid, __kmp_gtid_from_thread(victim_thr), task_team, 2298 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head, 2299 victim_td->td.td_deque_tail)); 2300 2301 if (TCR_4(victim_td->td.td_deque_ntasks) == 0) { 2302 KA_TRACE(10, 
("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: " 2303 "task_team=%p ntasks=%d head=%u tail=%u\n", 2304 gtid, __kmp_gtid_from_thread(victim_thr), task_team, 2305 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head, 2306 victim_td->td.td_deque_tail)); 2307 return NULL; 2308 } 2309 2310 __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock); 2311 2312 int ntasks = TCR_4(victim_td->td.td_deque_ntasks); 2313 // Check again after we acquire the lock 2314 if (ntasks == 0) { 2315 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); 2316 KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: " 2317 "task_team=%p ntasks=%d head=%u tail=%u\n", 2318 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks, 2319 victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); 2320 return NULL; 2321 } 2322 2323 KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL); 2324 2325 taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head]; 2326 if (is_constrained && (taskdata->td_flags.tiedness == TASK_TIED)) { 2327 // we need to check if the candidate obeys task scheduling constraint (TSC) 2328 // only descendant of all deferred tied tasks can be scheduled, checking 2329 // the last one is enough, as it in turn is the descendant of all others 2330 current = __kmp_threads[gtid]->th.th_current_task->td_last_tied; 2331 KMP_DEBUG_ASSERT(current != NULL); 2332 // check if last tied task is not suspended on barrier 2333 if (current->td_flags.tasktype == TASK_EXPLICIT || 2334 current->td_taskwait_thread > 0) { // <= 0 on barrier 2335 level = current->td_level; 2336 kmp_taskdata_t *parent = taskdata->td_parent; 2337 while (parent != current && parent->td_level > level) { 2338 parent = parent->td_parent; // check generation up to the level of the 2339 // current task 2340 KMP_DEBUG_ASSERT(parent != NULL); 2341 } 2342 if (parent != current) { 2343 if (!task_team->tt.tt_untied_task_encountered) { 2344 // The TSC does not allow to steal victim task 
2345 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); 2346 KA_TRACE(10, 2347 ("__kmp_steal_task(exit #3): T#%d could not steal from " 2348 "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n", 2349 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks, 2350 victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); 2351 return NULL; 2352 } 2353 taskdata = NULL; // will check other tasks in victim's deque 2354 } 2355 } 2356 } 2357 if (taskdata != NULL) { 2358 // Bump head pointer and Wrap. 2359 victim_td->td.td_deque_head = 2360 (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td); 2361 } else { 2362 int i; 2363 // walk through victim's deque trying to steal any task 2364 target = victim_td->td.td_deque_head; 2365 for (i = 1; i < ntasks; ++i) { 2366 target = (target + 1) & TASK_DEQUE_MASK(victim_td->td); 2367 taskdata = victim_td->td.td_deque[target]; 2368 if (taskdata->td_flags.tiedness == TASK_TIED) { 2369 // check if the candidate obeys the TSC 2370 kmp_taskdata_t *parent = taskdata->td_parent; 2371 // check generation up to the level of the current task 2372 while (parent != current && parent->td_level > level) { 2373 parent = parent->td_parent; 2374 KMP_DEBUG_ASSERT(parent != NULL); 2375 } 2376 if (parent != current) { 2377 // The TSC does not allow to steal the candidate 2378 taskdata = NULL; 2379 continue; 2380 } else { 2381 // found victim tied task 2382 break; 2383 } 2384 } else { 2385 // found victim untied task 2386 break; 2387 } 2388 } 2389 if (taskdata == NULL) { 2390 // No appropriate candidate to steal found 2391 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); 2392 KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from " 2393 "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n", 2394 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks, 2395 victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); 2396 return NULL; 2397 } 2398 int prev = target; 2399 for (i = i + 1; i < ntasks; ++i) { 2400 // 
shift remaining tasks in the deque left by 1 2401 target = (target + 1) & TASK_DEQUE_MASK(victim_td->td); 2402 victim_td->td.td_deque[prev] = victim_td->td.td_deque[target]; 2403 prev = target; 2404 } 2405 KMP_DEBUG_ASSERT(victim_td->td.td_deque_tail == 2406 ((target + 1) & TASK_DEQUE_MASK(victim_td->td))); 2407 victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)) 2408 } 2409 if (*thread_finished) { 2410 // We need to un-mark this victim as a finished victim. This must be done 2411 // before releasing the lock, or else other threads (starting with the 2412 // master victim) might be prematurely released from the barrier!!! 2413 kmp_int32 count; 2414 2415 count = KMP_ATOMIC_INC(unfinished_threads); 2416 2417 KA_TRACE( 2418 20, 2419 ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n", 2420 gtid, count + 1, task_team)); 2421 2422 *thread_finished = FALSE; 2423 } 2424 TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1); 2425 2426 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock); 2427 2428 KMP_COUNT_BLOCK(TASK_stolen); 2429 KA_TRACE(10, 2430 ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: " 2431 "task_team=%p ntasks=%d head=%u tail=%u\n", 2432 gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team, 2433 ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail)); 2434 2435 task = KMP_TASKDATA_TO_TASK(taskdata); 2436 return task; 2437 } 2438 2439 // __kmp_execute_tasks_template: Choose and execute tasks until either the 2440 // condition is statisfied (return true) or there are none left (return false). 2441 // 2442 // final_spin is TRUE if this is the spin at the release barrier. 2443 // thread_finished indicates whether the thread is finished executing all 2444 // the tasks it has on its deque, and is at the release barrier. 2445 // spinner is the location on which to spin. 2446 // spinner == NULL means only execute a single task and return. 2447 // checker is the value to check to terminate the spin. 
2448 template <class C> 2449 static inline int __kmp_execute_tasks_template( 2450 kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin, 2451 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), 2452 kmp_int32 is_constrained) { 2453 kmp_task_team_t *task_team = thread->th.th_task_team; 2454 kmp_thread_data_t *threads_data; 2455 kmp_task_t *task; 2456 kmp_info_t *other_thread; 2457 kmp_taskdata_t *current_task = thread->th.th_current_task; 2458 std::atomic<kmp_int32> *unfinished_threads; 2459 kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0, 2460 tid = thread->th.th_info.ds.ds_tid; 2461 2462 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 2463 KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]); 2464 2465 if (task_team == NULL) 2466 return FALSE; 2467 2468 KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d " 2469 "*thread_finished=%d\n", 2470 gtid, final_spin, *thread_finished)); 2471 2472 thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 2473 threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data); 2474 KMP_DEBUG_ASSERT(threads_data != NULL); 2475 2476 nthreads = task_team->tt.tt_nproc; 2477 unfinished_threads = &(task_team->tt.tt_unfinished_threads); 2478 #if OMP_45_ENABLED 2479 KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks); 2480 #else 2481 KMP_DEBUG_ASSERT(nthreads > 1); 2482 #endif 2483 KMP_DEBUG_ASSERT(*unfinished_threads >= 0); 2484 2485 while (1) { // Outer loop keeps trying to find tasks in case of single thread 2486 // getting tasks from target constructs 2487 while (1) { // Inner loop to find a task and execute it 2488 task = NULL; 2489 if (use_own_tasks) { // check on own queue first 2490 task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained); 2491 } 2492 if ((task == NULL) && (nthreads > 1)) { // Steal a task 2493 int asleep = 1; 2494 use_own_tasks = 0; 2495 // Try to steal from the last place I stole from successfully. 
2496 if (victim_tid == -2) { // haven't stolen anything yet 2497 victim_tid = threads_data[tid].td.td_deque_last_stolen; 2498 if (victim_tid != 2499 -1) // if we have a last stolen from victim, get the thread 2500 other_thread = threads_data[victim_tid].td.td_thr; 2501 } 2502 if (victim_tid != -1) { // found last victim 2503 asleep = 0; 2504 } else if (!new_victim) { // no recent steals and we haven't already 2505 // used a new victim; select a random thread 2506 do { // Find a different thread to steal work from. 2507 // Pick a random thread. Initial plan was to cycle through all the 2508 // threads, and only return if we tried to steal from every thread, 2509 // and failed. Arch says that's not such a great idea. 2510 victim_tid = __kmp_get_random(thread) % (nthreads - 1); 2511 if (victim_tid >= tid) { 2512 ++victim_tid; // Adjusts random distribution to exclude self 2513 } 2514 // Found a potential victim 2515 other_thread = threads_data[victim_tid].td.td_thr; 2516 // There is a slight chance that __kmp_enable_tasking() did not wake 2517 // up all threads waiting at the barrier. If victim is sleeping, 2518 // then wake it up. Since we were going to pay the cache miss 2519 // penalty for referencing another thread's kmp_info_t struct 2520 // anyway, 2521 // the check shouldn't cost too much performance at this point. In 2522 // extra barrier mode, tasks do not sleep at the separate tasking 2523 // barrier, so this isn't a problem. 2524 asleep = 0; 2525 if ((__kmp_tasking_mode == tskm_task_teams) && 2526 (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) && 2527 (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) != 2528 NULL)) { 2529 asleep = 1; 2530 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread), 2531 other_thread->th.th_sleep_loc); 2532 // A sleeping thread should not have any tasks on it's queue. 
2533 // There is a slight possibility that it resumes, steals a task 2534 // from another thread, which spawns more tasks, all in the time 2535 // that it takes this thread to check => don't write an assertion 2536 // that the victim's queue is empty. Try stealing from a 2537 // different thread. 2538 } 2539 } while (asleep); 2540 } 2541 2542 if (!asleep) { 2543 // We have a victim to try to steal from 2544 task = __kmp_steal_task(other_thread, gtid, task_team, 2545 unfinished_threads, thread_finished, 2546 is_constrained); 2547 } 2548 if (task != NULL) { // set last stolen to victim 2549 if (threads_data[tid].td.td_deque_last_stolen != victim_tid) { 2550 threads_data[tid].td.td_deque_last_stolen = victim_tid; 2551 // The pre-refactored code did not try more than 1 successful new 2552 // vicitm, unless the last one generated more local tasks; 2553 // new_victim keeps track of this 2554 new_victim = 1; 2555 } 2556 } else { // No tasks found; unset last_stolen 2557 KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1); 2558 victim_tid = -2; // no successful victim found 2559 } 2560 } 2561 2562 if (task == NULL) // break out of tasking loop 2563 break; 2564 2565 // Found a task; execute it 2566 #if USE_ITT_BUILD && USE_ITT_NOTIFY 2567 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { 2568 if (itt_sync_obj == NULL) { // we are at fork barrier where we could not 2569 // get the object reliably 2570 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); 2571 } 2572 __kmp_itt_task_starting(itt_sync_obj); 2573 } 2574 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 2575 __kmp_invoke_task(gtid, task, current_task); 2576 #if USE_ITT_BUILD 2577 if (itt_sync_obj != NULL) 2578 __kmp_itt_task_finished(itt_sync_obj); 2579 #endif /* USE_ITT_BUILD */ 2580 // If this thread is only partway through the barrier and the condition is 2581 // met, then return now, so that the barrier gather/release pattern can 2582 // proceed. 
If this thread is in the last spin loop in the barrier, 2583 // waiting to be released, we know that the termination condition will not 2584 // be satisified, so don't waste any cycles checking it. 2585 if (flag == NULL || (!final_spin && flag->done_check())) { 2586 KA_TRACE( 2587 15, 2588 ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n", 2589 gtid)); 2590 return TRUE; 2591 } 2592 if (thread->th.th_task_team == NULL) { 2593 break; 2594 } 2595 // Yield before executing next task 2596 KMP_YIELD(__kmp_library == library_throughput); 2597 // If execution of a stolen task results in more tasks being placed on our 2598 // run queue, reset use_own_tasks 2599 if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) { 2600 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned " 2601 "other tasks, restart\n", 2602 gtid)); 2603 use_own_tasks = 1; 2604 new_victim = 0; 2605 } 2606 } 2607 2608 // The task source has been exhausted. If in final spin loop of barrier, check 2609 // if termination condition is satisfied. 2610 #if OMP_45_ENABLED 2611 // The work queue may be empty but there might be proxy tasks still 2612 // executing 2613 if (final_spin && 2614 KMP_ATOMIC_LD_ACQ(¤t_task->td_incomplete_child_tasks) == 0) 2615 #else 2616 if (final_spin) 2617 #endif 2618 { 2619 // First, decrement the #unfinished threads, if that has not already been 2620 // done. This decrement might be to the spin location, and result in the 2621 // termination condition being satisfied. 2622 if (!*thread_finished) { 2623 kmp_int32 count; 2624 2625 count = KMP_ATOMIC_DEC(unfinished_threads) - 1; 2626 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec " 2627 "unfinished_threads to %d task_team=%p\n", 2628 gtid, count, task_team)); 2629 *thread_finished = TRUE; 2630 } 2631 2632 // It is now unsafe to reference thread->th.th_team !!! 
      // Decrementing task_team->tt.tt_unfinished_threads can allow the master
      // thread to pass through the barrier, where it might reset each thread's
      // th.th_team field for the next parallel region. If we can steal more
      // work, we know that this has not happened yet.
      if (flag != NULL && flag->done_check()) {
        KA_TRACE(
            15,
            ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
             gtid));
        return TRUE;
      }
    }

    // If this thread's task team is NULL, master has recognized that there are
    // no more tasks; bail out
    if (thread->th.th_task_team == NULL) {
      KA_TRACE(15,
               ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
      return FALSE;
    }

#if OMP_45_ENABLED
    // We could be getting tasks from target constructs; if this is the only
    // thread, keep trying to execute tasks from own queue
    if (nthreads == 1)
      use_own_tasks = 1;
    else
#endif
    {
      KA_TRACE(15,
               ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
      return FALSE;
    }
  }
}

// __kmp_execute_tasks_32: entry point for kmp_flag_32 flags. Forwards every
// argument unchanged to __kmp_execute_tasks_template and returns its result.
int __kmp_execute_tasks_32(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

// __kmp_execute_tasks_64: entry point for kmp_flag_64 flags. Forwards every
// argument unchanged to __kmp_execute_tasks_template and returns its result.
int __kmp_execute_tasks_64(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

// __kmp_execute_tasks_oncore: entry point for kmp_flag_oncore flags. Forwards
// every argument unchanged to __kmp_execute_tasks_template and returns its
// result.
int __kmp_execute_tasks_oncore(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

// __kmp_enable_tasking: Set up the task team's threads_data array (via
// __kmp_realloc_task_threads_data) and resume threads sleeping at the
// next barrier so they can assist in executing enqueued tasks.
// Only the thread for which __kmp_realloc_task_threads_data() returns TRUE
// goes on to wake the other threads; every other caller returns early.
//
// task_team: task team being enabled; must not be NULL.
// this_thr:  calling thread; its th_team must not be NULL.
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
                                 kmp_info_t *this_thr) {
  kmp_thread_data_t *threads_data;
  int nthreads, i, is_init_thread;

  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
                __kmp_gtid_from_thread(this_thr)));

  KMP_DEBUG_ASSERT(task_team != NULL);
  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);

  nthreads = task_team->tt.tt_nproc;
  KMP_DEBUG_ASSERT(nthreads > 0);
  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);

  // Allocate or increase the size of threads_data if necessary
  is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);

  if (!is_init_thread) {
    // Some other thread already set up the array.
    KA_TRACE(
        20,
        ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
         __kmp_gtid_from_thread(this_thr)));
    return;
  }
  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
  KMP_DEBUG_ASSERT(threads_data != NULL);

  if ((__kmp_tasking_mode == tskm_task_teams) &&
      (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
    // Release any threads sleeping at the barrier, so that they can steal
    // tasks and execute them. In extra barrier mode, tasks do not sleep
    // at the separate tasking barrier, so this isn't a problem.
    for (i = 0; i < nthreads; i++) {
      volatile void *sleep_loc;
      kmp_info_t *thread = threads_data[i].td.td_thr;

      // Skip the calling thread itself.
      if (i == this_thr->th.th_info.ds.ds_tid) {
        continue;
      }
      // Since we haven't locked the thread's suspend mutex lock at this
      // point, there is a small window where a thread might be putting
      // itself to sleep, but hasn't set the th_sleep_loc field yet.
      // To work around this, __kmp_execute_tasks_template() periodically
      // checks to see if other threads are sleeping (using the same random
      // mechanism that is used for task stealing) and awakens them if they
      // are.
      if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
          NULL) {
        KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
                      __kmp_gtid_from_thread(this_thr),
                      __kmp_gtid_from_thread(thread)));
        __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
      } else {
        KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
                      __kmp_gtid_from_thread(this_thr),
                      __kmp_gtid_from_thread(thread)));
      }
    }
  }

  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
                __kmp_gtid_from_thread(this_thr)));
}

/* // TODO: Check the comment consistency
 * Utility routines for "task teams". A task team (kmp_task_team_t) is kind of
 * like a shadow of the kmp_team_t data struct, with a different lifetime.
 * After a child thread checks into a barrier and calls __kmp_release() from
 * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
 * longer assume that the kmp_team_t structure is intact (at any moment, the
 * master thread may exit the barrier code and free the team data structure,
 * and return the threads to the thread pool).
 *
 * This does not work with the tasking code, as the thread is still
 * expected to participate in the execution of any tasks that may have been
 * spawned by a member of the team, and the thread still needs access to
 * each thread in the team, so that it can steal work from it.
 *
 * Enter the existence of the kmp_task_team_t struct. It employs a reference
 * counting mechanism, and is allocated by the master thread before calling
 * __kmp_<barrier_kind>_release, and then is released by the last thread to
 * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
 * of the kmp_task_team_t structs for consecutive barriers can overlap
 * (and will, unless the master thread is the last thread to exit the barrier
 * release phase, which is not typical).
 *
 * The existence of such a struct is useful outside the context of tasking,
 * but for now, I'm trying to keep it specific to the OMP_30_ENABLED macro,
 * so that any performance differences show up when comparing the 2.5 vs. 3.0
 * libraries.
 *
 * We currently use the existence of the threads array as an indicator that
 * tasks were spawned since the last barrier. If the structure is to be
 * useful outside the context of tasking, then this will have to change, but
 * not setting the field minimizes the performance impact of tasking on
 * barriers, when no explicit tasks were spawned (pushed, actually).
 */

// Head of the singly linked free list of task_team data structures (linked
// through tt.tt_next).
static kmp_task_team_t *__kmp_free_task_teams =
    NULL; // Free list for task_team data structures
// Lock for task team data structures
kmp_bootstrap_lock_t __kmp_task_team_lock =
    KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);

// __kmp_alloc_task_deque:
// Allocates a task deque for a particular thread, and initialize the necessary
// data structures relating to the deque.
This only happens once per thread 2807 // per task team since task teams are recycled. No lock is needed during 2808 // allocation since each thread allocates its own deque. 2809 static void __kmp_alloc_task_deque(kmp_info_t *thread, 2810 kmp_thread_data_t *thread_data) { 2811 __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock); 2812 KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL); 2813 2814 // Initialize last stolen task field to "none" 2815 thread_data->td.td_deque_last_stolen = -1; 2816 2817 KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0); 2818 KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0); 2819 KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0); 2820 2821 KE_TRACE( 2822 10, 2823 ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n", 2824 __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data)); 2825 // Allocate space for task deque, and zero the deque 2826 // Cannot use __kmp_thread_calloc() because threads not around for 2827 // kmp_reap_task_team( ). 2828 thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate( 2829 INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *)); 2830 thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE; 2831 } 2832 2833 // __kmp_realloc_task_deque: 2834 // Re-allocates a task deque for a particular thread, copies the content from 2835 // the old deque and adjusts the necessary data structures relating to the 2836 // deque. 
This operation must be done with a the deque_lock being held 2837 static void __kmp_realloc_task_deque(kmp_info_t *thread, 2838 kmp_thread_data_t *thread_data) { 2839 kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td); 2840 kmp_int32 new_size = 2 * size; 2841 2842 KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to " 2843 "%d] for thread_data %p\n", 2844 __kmp_gtid_from_thread(thread), size, new_size, thread_data)); 2845 2846 kmp_taskdata_t **new_deque = 2847 (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *)); 2848 2849 int i, j; 2850 for (i = thread_data->td.td_deque_head, j = 0; j < size; 2851 i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++) 2852 new_deque[j] = thread_data->td.td_deque[i]; 2853 2854 __kmp_free(thread_data->td.td_deque); 2855 2856 thread_data->td.td_deque_head = 0; 2857 thread_data->td.td_deque_tail = size; 2858 thread_data->td.td_deque = new_deque; 2859 thread_data->td.td_deque_size = new_size; 2860 } 2861 2862 // __kmp_free_task_deque: 2863 // Deallocates a task deque for a particular thread. Happens at library 2864 // deallocation so don't need to reset all thread data fields. 
2865 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) { 2866 if (thread_data->td.td_deque != NULL) { 2867 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 2868 TCW_4(thread_data->td.td_deque_ntasks, 0); 2869 __kmp_free(thread_data->td.td_deque); 2870 thread_data->td.td_deque = NULL; 2871 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2872 } 2873 2874 #ifdef BUILD_TIED_TASK_STACK 2875 // GEH: Figure out what to do here for td_susp_tied_tasks 2876 if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) { 2877 __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data); 2878 } 2879 #endif // BUILD_TIED_TASK_STACK 2880 } 2881 2882 // __kmp_realloc_task_threads_data: 2883 // Allocates a threads_data array for a task team, either by allocating an 2884 // initial array or enlarging an existing array. Only the first thread to get 2885 // the lock allocs or enlarges the array and re-initializes the array eleemnts. 2886 // That thread returns "TRUE", the rest return "FALSE". 2887 // Assumes that the new array size is given by task_team -> tt.tt_nproc. 2888 // The current size is given by task_team -> tt.tt_max_threads. 2889 static int __kmp_realloc_task_threads_data(kmp_info_t *thread, 2890 kmp_task_team_t *task_team) { 2891 kmp_thread_data_t **threads_data_p; 2892 kmp_int32 nthreads, maxthreads; 2893 int is_init_thread = FALSE; 2894 2895 if (TCR_4(task_team->tt.tt_found_tasks)) { 2896 // Already reallocated and initialized. 2897 return FALSE; 2898 } 2899 2900 threads_data_p = &task_team->tt.tt_threads_data; 2901 nthreads = task_team->tt.tt_nproc; 2902 maxthreads = task_team->tt.tt_max_threads; 2903 2904 // All threads must lock when they encounter the first task of the implicit 2905 // task region to make sure threads_data fields are (re)initialized before 2906 // used. 
2907 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock); 2908 2909 if (!TCR_4(task_team->tt.tt_found_tasks)) { 2910 // first thread to enable tasking 2911 kmp_team_t *team = thread->th.th_team; 2912 int i; 2913 2914 is_init_thread = TRUE; 2915 if (maxthreads < nthreads) { 2916 2917 if (*threads_data_p != NULL) { 2918 kmp_thread_data_t *old_data = *threads_data_p; 2919 kmp_thread_data_t *new_data = NULL; 2920 2921 KE_TRACE( 2922 10, 2923 ("__kmp_realloc_task_threads_data: T#%d reallocating " 2924 "threads data for task_team %p, new_size = %d, old_size = %d\n", 2925 __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads)); 2926 // Reallocate threads_data to have more elements than current array 2927 // Cannot use __kmp_thread_realloc() because threads not around for 2928 // kmp_reap_task_team( ). Note all new array entries are initialized 2929 // to zero by __kmp_allocate(). 2930 new_data = (kmp_thread_data_t *)__kmp_allocate( 2931 nthreads * sizeof(kmp_thread_data_t)); 2932 // copy old data to new data 2933 KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t), 2934 (void *)old_data, maxthreads * sizeof(kmp_thread_data_t)); 2935 2936 #ifdef BUILD_TIED_TASK_STACK 2937 // GEH: Figure out if this is the right thing to do 2938 for (i = maxthreads; i < nthreads; i++) { 2939 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 2940 __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data); 2941 } 2942 #endif // BUILD_TIED_TASK_STACK 2943 // Install the new data and free the old data 2944 (*threads_data_p) = new_data; 2945 __kmp_free(old_data); 2946 } else { 2947 KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating " 2948 "threads data for task_team %p, size = %d\n", 2949 __kmp_gtid_from_thread(thread), task_team, nthreads)); 2950 // Make the initial allocate for threads_data array, and zero entries 2951 // Cannot use __kmp_thread_calloc() because threads not around for 2952 // kmp_reap_task_team( ). 
2953 ANNOTATE_IGNORE_WRITES_BEGIN(); 2954 *threads_data_p = (kmp_thread_data_t *)__kmp_allocate( 2955 nthreads * sizeof(kmp_thread_data_t)); 2956 ANNOTATE_IGNORE_WRITES_END(); 2957 #ifdef BUILD_TIED_TASK_STACK 2958 // GEH: Figure out if this is the right thing to do 2959 for (i = 0; i < nthreads; i++) { 2960 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 2961 __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data); 2962 } 2963 #endif // BUILD_TIED_TASK_STACK 2964 } 2965 task_team->tt.tt_max_threads = nthreads; 2966 } else { 2967 // If array has (more than) enough elements, go ahead and use it 2968 KMP_DEBUG_ASSERT(*threads_data_p != NULL); 2969 } 2970 2971 // initialize threads_data pointers back to thread_info structures 2972 for (i = 0; i < nthreads; i++) { 2973 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 2974 thread_data->td.td_thr = team->t.t_threads[i]; 2975 2976 if (thread_data->td.td_deque_last_stolen >= nthreads) { 2977 // The last stolen field survives across teams / barrier, and the number 2978 // of threads may have changed. It's possible (likely?) that a new 2979 // parallel region will exhibit the same behavior as previous region. 2980 thread_data->td.td_deque_last_stolen = -1; 2981 } 2982 } 2983 2984 KMP_MB(); 2985 TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE); 2986 } 2987 2988 __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock); 2989 return is_init_thread; 2990 } 2991 2992 // __kmp_free_task_threads_data: 2993 // Deallocates a threads_data array for a task team, including any attached 2994 // tasking deques. Only occurs at library shutdown. 
2995 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) { 2996 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock); 2997 if (task_team->tt.tt_threads_data != NULL) { 2998 int i; 2999 for (i = 0; i < task_team->tt.tt_max_threads; i++) { 3000 __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]); 3001 } 3002 __kmp_free(task_team->tt.tt_threads_data); 3003 task_team->tt.tt_threads_data = NULL; 3004 } 3005 __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock); 3006 } 3007 3008 // __kmp_allocate_task_team: 3009 // Allocates a task team associated with a specific team, taking it from 3010 // the global task team free list if possible. Also initializes data 3011 // structures. 3012 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread, 3013 kmp_team_t *team) { 3014 kmp_task_team_t *task_team = NULL; 3015 int nthreads; 3016 3017 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n", 3018 (thread ? __kmp_gtid_from_thread(thread) : -1), team)); 3019 3020 if (TCR_PTR(__kmp_free_task_teams) != NULL) { 3021 // Take a task team from the task team pool 3022 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 3023 if (__kmp_free_task_teams != NULL) { 3024 task_team = __kmp_free_task_teams; 3025 TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next); 3026 task_team->tt.tt_next = NULL; 3027 } 3028 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 3029 } 3030 3031 if (task_team == NULL) { 3032 KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating " 3033 "task team for team %p\n", 3034 __kmp_gtid_from_thread(thread), team)); 3035 // Allocate a new task team if one is not available. 3036 // Cannot use __kmp_thread_malloc() because threads not around for 3037 // kmp_reap_task_team( ). 
3038 task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t)); 3039 __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock); 3040 // AC: __kmp_allocate zeroes returned memory 3041 // task_team -> tt.tt_threads_data = NULL; 3042 // task_team -> tt.tt_max_threads = 0; 3043 // task_team -> tt.tt_next = NULL; 3044 } 3045 3046 TCW_4(task_team->tt.tt_found_tasks, FALSE); 3047 #if OMP_45_ENABLED 3048 TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); 3049 #endif 3050 task_team->tt.tt_nproc = nthreads = team->t.t_nproc; 3051 3052 KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads); 3053 TCW_4(task_team->tt.tt_active, TRUE); 3054 3055 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p " 3056 "unfinished_threads init'd to %d\n", 3057 (thread ? __kmp_gtid_from_thread(thread) : -1), task_team, 3058 KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads))); 3059 return task_team; 3060 } 3061 3062 // __kmp_free_task_team: 3063 // Frees the task team associated with a specific thread, and adds it 3064 // to the global task team free list. 3065 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) { 3066 KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n", 3067 thread ? __kmp_gtid_from_thread(thread) : -1, task_team)); 3068 3069 // Put task team back on free list 3070 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 3071 3072 KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL); 3073 task_team->tt.tt_next = __kmp_free_task_teams; 3074 TCW_PTR(__kmp_free_task_teams, task_team); 3075 3076 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 3077 } 3078 3079 // __kmp_reap_task_teams: 3080 // Free all the task teams on the task team free list. 3081 // Should only be done during library shutdown. 3082 // Cannot do anything that needs a thread structure or gtid since they are 3083 // already gone. 
3084 void __kmp_reap_task_teams(void) { 3085 kmp_task_team_t *task_team; 3086 3087 if (TCR_PTR(__kmp_free_task_teams) != NULL) { 3088 // Free all task_teams on the free list 3089 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 3090 while ((task_team = __kmp_free_task_teams) != NULL) { 3091 __kmp_free_task_teams = task_team->tt.tt_next; 3092 task_team->tt.tt_next = NULL; 3093 3094 // Free threads_data if necessary 3095 if (task_team->tt.tt_threads_data != NULL) { 3096 __kmp_free_task_threads_data(task_team); 3097 } 3098 __kmp_free(task_team); 3099 } 3100 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 3101 } 3102 } 3103 3104 // __kmp_wait_to_unref_task_teams: 3105 // Some threads could still be in the fork barrier release code, possibly 3106 // trying to steal tasks. Wait for each thread to unreference its task team. 3107 void __kmp_wait_to_unref_task_teams(void) { 3108 kmp_info_t *thread; 3109 kmp_uint32 spins; 3110 int done; 3111 3112 KMP_INIT_YIELD(spins); 3113 3114 for (;;) { 3115 done = TRUE; 3116 3117 // TODO: GEH - this may be is wrong because some sync would be necessary 3118 // in case threads are added to the pool during the traversal. Need to 3119 // verify that lock for thread pool is held when calling this routine. 3120 for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL; 3121 thread = thread->th.th_next_pool) { 3122 #if KMP_OS_WINDOWS 3123 DWORD exit_val; 3124 #endif 3125 if (TCR_PTR(thread->th.th_task_team) == NULL) { 3126 KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n", 3127 __kmp_gtid_from_thread(thread))); 3128 continue; 3129 } 3130 #if KMP_OS_WINDOWS 3131 // TODO: GEH - add this check for Linux* OS / OS X* as well? 
3132 if (!__kmp_is_thread_alive(thread, &exit_val)) { 3133 thread->th.th_task_team = NULL; 3134 continue; 3135 } 3136 #endif 3137 3138 done = FALSE; // Because th_task_team pointer is not NULL for this thread 3139 3140 KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to " 3141 "unreference task_team\n", 3142 __kmp_gtid_from_thread(thread))); 3143 3144 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 3145 volatile void *sleep_loc; 3146 // If the thread is sleeping, awaken it. 3147 if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) != 3148 NULL) { 3149 KA_TRACE( 3150 10, 3151 ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n", 3152 __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread))); 3153 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc); 3154 } 3155 } 3156 } 3157 if (done) { 3158 break; 3159 } 3160 3161 // If we are oversubscribed, or have waited a bit (and library mode is 3162 // throughput), yield. Pause is in the following code. 3163 KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc); 3164 KMP_YIELD_SPIN(spins); // Yields only if KMP_LIBRARY=throughput 3165 } 3166 } 3167 3168 // __kmp_task_team_setup: Create a task_team for the current team, but use 3169 // an already created, unused one if it already exists. 3170 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) { 3171 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 3172 3173 // If this task_team hasn't been created yet, allocate it. It will be used in 3174 // the region after the next. 3175 // If it exists, it is the current task team and shouldn't be touched yet as 3176 // it may still be in use. 
3177 if (team->t.t_task_team[this_thr->th.th_task_state] == NULL && 3178 (always || team->t.t_nproc > 1)) { 3179 team->t.t_task_team[this_thr->th.th_task_state] = 3180 __kmp_allocate_task_team(this_thr, team); 3181 KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p " 3182 "for team %d at parity=%d\n", 3183 __kmp_gtid_from_thread(this_thr), 3184 team->t.t_task_team[this_thr->th.th_task_state], 3185 ((team != NULL) ? team->t.t_id : -1), 3186 this_thr->th.th_task_state)); 3187 } 3188 3189 // After threads exit the release, they will call sync, and then point to this 3190 // other task_team; make sure it is allocated and properly initialized. As 3191 // threads spin in the barrier release phase, they will continue to use the 3192 // previous task_team struct(above), until they receive the signal to stop 3193 // checking for tasks (they can't safely reference the kmp_team_t struct, 3194 // which could be reallocated by the master thread). No task teams are formed 3195 // for serialized teams. 3196 if (team->t.t_nproc > 1) { 3197 int other_team = 1 - this_thr->th.th_task_state; 3198 if (team->t.t_task_team[other_team] == NULL) { // setup other team as well 3199 team->t.t_task_team[other_team] = 3200 __kmp_allocate_task_team(this_thr, team); 3201 KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new " 3202 "task_team %p for team %d at parity=%d\n", 3203 __kmp_gtid_from_thread(this_thr), 3204 team->t.t_task_team[other_team], 3205 ((team != NULL) ? 
team->t.t_id : -1), other_team)); 3206 } else { // Leave the old task team struct in place for the upcoming region; 3207 // adjust as needed 3208 kmp_task_team_t *task_team = team->t.t_task_team[other_team]; 3209 if (!task_team->tt.tt_active || 3210 team->t.t_nproc != task_team->tt.tt_nproc) { 3211 TCW_4(task_team->tt.tt_nproc, team->t.t_nproc); 3212 TCW_4(task_team->tt.tt_found_tasks, FALSE); 3213 #if OMP_45_ENABLED 3214 TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); 3215 #endif 3216 KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, 3217 team->t.t_nproc); 3218 TCW_4(task_team->tt.tt_active, TRUE); 3219 } 3220 // if team size has changed, the first thread to enable tasking will 3221 // realloc threads_data if necessary 3222 KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team " 3223 "%p for team %d at parity=%d\n", 3224 __kmp_gtid_from_thread(this_thr), 3225 team->t.t_task_team[other_team], 3226 ((team != NULL) ? team->t.t_id : -1), other_team)); 3227 } 3228 } 3229 } 3230 3231 // __kmp_task_team_sync: Propagation of task team data from team to threads 3232 // which happens just after the release phase of a team barrier. This may be 3233 // called by any thread, but only for teams with # threads > 1. 3234 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) { 3235 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 3236 3237 // Toggle the th_task_state field, to switch which task_team this thread 3238 // refers to 3239 this_thr->th.th_task_state = 1 - this_thr->th.th_task_state; 3240 // It is now safe to propagate the task team pointer from the team struct to 3241 // the current thread. 3242 TCW_PTR(this_thr->th.th_task_team, 3243 team->t.t_task_team[this_thr->th.th_task_state]); 3244 KA_TRACE(20, 3245 ("__kmp_task_team_sync: Thread T#%d task team switched to task_team " 3246 "%p from Team #%d (parity=%d)\n", 3247 __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team, 3248 ((team != NULL) ? 
team->t.t_id : -1), this_thr->th.th_task_state)); 3249 } 3250 3251 // __kmp_task_team_wait: Master thread waits for outstanding tasks after the 3252 // barrier gather phase. Only called by master thread if #threads in team > 1 or 3253 // if proxy tasks were created. 3254 // 3255 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off 3256 // by passing in 0 optionally as the last argument. When wait is zero, master 3257 // thread does not wait for unfinished_threads to reach 0. 3258 void __kmp_task_team_wait( 3259 kmp_info_t *this_thr, 3260 kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) { 3261 kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state]; 3262 3263 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 3264 KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team); 3265 3266 if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) { 3267 if (wait) { 3268 KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks " 3269 "(for unfinished_threads to reach 0) on task_team = %p\n", 3270 __kmp_gtid_from_thread(this_thr), task_team)); 3271 // Worker threads may have dropped through to release phase, but could 3272 // still be executing tasks. Wait here for tasks to complete. To avoid 3273 // memory contention, only master thread checks termination condition. 3274 kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *, 3275 &task_team->tt.tt_unfinished_threads), 3276 0U); 3277 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 3278 } 3279 // Deactivate the old task team, so that the worker threads will stop 3280 // referencing it while spinning. 
3281 KA_TRACE( 3282 20, 3283 ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: " 3284 "setting active to false, setting local and team's pointer to NULL\n", 3285 __kmp_gtid_from_thread(this_thr), task_team)); 3286 #if OMP_45_ENABLED 3287 KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 || 3288 task_team->tt.tt_found_proxy_tasks == TRUE); 3289 TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE); 3290 #else 3291 KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1); 3292 #endif 3293 KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0); 3294 TCW_SYNC_4(task_team->tt.tt_active, FALSE); 3295 KMP_MB(); 3296 3297 TCW_PTR(this_thr->th.th_task_team, NULL); 3298 } 3299 } 3300 3301 // __kmp_tasking_barrier: 3302 // This routine may only called when __kmp_tasking_mode == tskm_extra_barrier. 3303 // Internal function to execute all tasks prior to a regular barrier or a join 3304 // barrier. It is a full barrier itself, which unfortunately turns regular 3305 // barriers into double barriers and join barriers into 1 1/2 barriers. 3306 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) { 3307 std::atomic<kmp_uint32> *spin = RCAST( 3308 std::atomic<kmp_uint32> *, 3309 &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads); 3310 int flag = FALSE; 3311 KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier); 3312 3313 #if USE_ITT_BUILD 3314 KMP_FSYNC_SPIN_INIT(spin, NULL); 3315 #endif /* USE_ITT_BUILD */ 3316 kmp_flag_32 spin_flag(spin, 0U); 3317 while (!spin_flag.execute_tasks(thread, gtid, TRUE, 3318 &flag USE_ITT_BUILD_ARG(NULL), 0)) { 3319 #if USE_ITT_BUILD 3320 // TODO: What about itt_sync_obj?? 
3321 KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin)); 3322 #endif /* USE_ITT_BUILD */ 3323 3324 if (TCR_4(__kmp_global.g.g_done)) { 3325 if (__kmp_global.g.g_abort) 3326 __kmp_abort_thread(); 3327 break; 3328 } 3329 KMP_YIELD(TRUE); // GH: We always yield here 3330 } 3331 #if USE_ITT_BUILD 3332 KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin)); 3333 #endif /* USE_ITT_BUILD */ 3334 } 3335 3336 #if OMP_45_ENABLED 3337 3338 // __kmp_give_task puts a task into a given thread queue if: 3339 // - the queue for that thread was created 3340 // - there's space in that queue 3341 // Because of this, __kmp_push_task needs to check if there's space after 3342 // getting the lock 3343 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task, 3344 kmp_int32 pass) { 3345 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 3346 kmp_task_team_t *task_team = taskdata->td_task_team; 3347 3348 KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n", 3349 taskdata, tid)); 3350 3351 // If task_team is NULL something went really bad... 
3352 KMP_DEBUG_ASSERT(task_team != NULL); 3353 3354 bool result = false; 3355 kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid]; 3356 3357 if (thread_data->td.td_deque == NULL) { 3358 // There's no queue in this thread, go find another one 3359 // We're guaranteed that at least one thread has a queue 3360 KA_TRACE(30, 3361 ("__kmp_give_task: thread %d has no queue while giving task %p.\n", 3362 tid, taskdata)); 3363 return result; 3364 } 3365 3366 if (TCR_4(thread_data->td.td_deque_ntasks) >= 3367 TASK_DEQUE_SIZE(thread_data->td)) { 3368 KA_TRACE( 3369 30, 3370 ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", 3371 taskdata, tid)); 3372 3373 // if this deque is bigger than the pass ratio give a chance to another 3374 // thread 3375 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass) 3376 return result; 3377 3378 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 3379 __kmp_realloc_task_deque(thread, thread_data); 3380 3381 } else { 3382 3383 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 3384 3385 if (TCR_4(thread_data->td.td_deque_ntasks) >= 3386 TASK_DEQUE_SIZE(thread_data->td)) { 3387 KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to " 3388 "thread %d.\n", 3389 taskdata, tid)); 3390 3391 // if this deque is bigger than the pass ratio give a chance to another 3392 // thread 3393 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass) 3394 goto release_and_exit; 3395 3396 __kmp_realloc_task_deque(thread, thread_data); 3397 } 3398 } 3399 3400 // lock is held here, and there is space in the deque 3401 3402 thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata; 3403 // Wrap index. 
3404 thread_data->td.td_deque_tail = 3405 (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td); 3406 TCW_4(thread_data->td.td_deque_ntasks, 3407 TCR_4(thread_data->td.td_deque_ntasks) + 1); 3408 3409 result = true; 3410 KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n", 3411 taskdata, tid)); 3412 3413 release_and_exit: 3414 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 3415 3416 return result; 3417 } 3418 3419 /* The finish of the proxy tasks is divided in two pieces: 3420 - the top half is the one that can be done from a thread outside the team 3421 - the bottom half must be run from a them within the team 3422 3423 In order to run the bottom half the task gets queued back into one of the 3424 threads of the team. Once the td_incomplete_child_task counter of the parent 3425 is decremented the threads can leave the barriers. So, the bottom half needs 3426 to be queued before the counter is decremented. The top half is therefore 3427 divided in two parts: 3428 - things that can be run before queuing the bottom half 3429 - things that must be run after queuing the bottom half 3430 3431 This creates a second race as the bottom half can free the task before the 3432 second top half is executed. To avoid this we use the 3433 td_incomplete_child_task of the proxy task to synchronize the top and bottom 3434 half. 
*/ 3435 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) { 3436 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT); 3437 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); 3438 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0); 3439 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0); 3440 3441 taskdata->td_flags.complete = 1; // mark the task as completed 3442 3443 if (taskdata->td_taskgroup) 3444 KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count); 3445 3446 // Create an imaginary children for this task so the bottom half cannot 3447 // release the task before we have completed the second top half 3448 KMP_ATOMIC_INC(&taskdata->td_incomplete_child_tasks); 3449 } 3450 3451 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) { 3452 kmp_int32 children = 0; 3453 3454 // Predecrement simulated by "- 1" calculation 3455 children = 3456 KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1; 3457 KMP_DEBUG_ASSERT(children >= 0); 3458 3459 // Remove the imaginary children 3460 KMP_ATOMIC_DEC(&taskdata->td_incomplete_child_tasks); 3461 } 3462 3463 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) { 3464 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 3465 kmp_info_t *thread = __kmp_threads[gtid]; 3466 3467 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); 3468 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 3469 1); // top half must run before bottom half 3470 3471 // We need to wait to make sure the top half is finished 3472 // Spinning here should be ok as this should happen quickly 3473 while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) > 0) 3474 ; 3475 3476 __kmp_release_deps(gtid, taskdata); 3477 __kmp_free_task_and_ancestors(gtid, taskdata, thread); 3478 } 3479 3480 /*! 
3481 @ingroup TASKING 3482 @param gtid Global Thread ID of encountering thread 3483 @param ptask Task which execution is completed 3484 3485 Execute the completation of a proxy task from a thread of that is part of the 3486 team. Run first and bottom halves directly. 3487 */ 3488 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) { 3489 KMP_DEBUG_ASSERT(ptask != NULL); 3490 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 3491 KA_TRACE( 3492 10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n", 3493 gtid, taskdata)); 3494 3495 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); 3496 3497 __kmp_first_top_half_finish_proxy(taskdata); 3498 __kmp_second_top_half_finish_proxy(taskdata); 3499 __kmp_bottom_half_finish_proxy(gtid, ptask); 3500 3501 KA_TRACE(10, 3502 ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n", 3503 gtid, taskdata)); 3504 } 3505 3506 /*! 3507 @ingroup TASKING 3508 @param ptask Task which execution is completed 3509 3510 Execute the completation of a proxy task from a thread that could not belong to 3511 the team. 
3512 */ 3513 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) { 3514 KMP_DEBUG_ASSERT(ptask != NULL); 3515 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 3516 3517 KA_TRACE( 3518 10, 3519 ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n", 3520 taskdata)); 3521 3522 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); 3523 3524 __kmp_first_top_half_finish_proxy(taskdata); 3525 3526 // Enqueue task to complete bottom half completion from a thread within the 3527 // corresponding team 3528 kmp_team_t *team = taskdata->td_team; 3529 kmp_int32 nthreads = team->t.t_nproc; 3530 kmp_info_t *thread; 3531 3532 // This should be similar to start_k = __kmp_get_random( thread ) % nthreads 3533 // but we cannot use __kmp_get_random here 3534 kmp_int32 start_k = 0; 3535 kmp_int32 pass = 1; 3536 kmp_int32 k = start_k; 3537 3538 do { 3539 // For now we're just linearly trying to find a thread 3540 thread = team->t.t_threads[k]; 3541 k = (k + 1) % nthreads; 3542 3543 // we did a full pass through all the threads 3544 if (k == start_k) 3545 pass = pass << 1; 3546 3547 } while (!__kmp_give_task(thread, k, ptask, pass)); 3548 3549 __kmp_second_top_half_finish_proxy(taskdata); 3550 3551 KA_TRACE( 3552 10, 3553 ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n", 3554 taskdata)); 3555 } 3556 3557 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task 3558 // for taskloop 3559 // 3560 // thread: allocating thread 3561 // task_src: pointer to source task to be duplicated 3562 // returns: a pointer to the allocated kmp_task_t structure (task). 
3563 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) { 3564 kmp_task_t *task; 3565 kmp_taskdata_t *taskdata; 3566 kmp_taskdata_t *taskdata_src; 3567 kmp_taskdata_t *parent_task = thread->th.th_current_task; 3568 size_t shareds_offset; 3569 size_t task_size; 3570 3571 KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread, 3572 task_src)); 3573 taskdata_src = KMP_TASK_TO_TASKDATA(task_src); 3574 KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy == 3575 TASK_FULL); // it should not be proxy task 3576 KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT); 3577 task_size = taskdata_src->td_size_alloc; 3578 3579 // Allocate a kmp_taskdata_t block and a kmp_task_t block. 3580 KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread, 3581 task_size)); 3582 #if USE_FAST_MEMORY 3583 taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size); 3584 #else 3585 taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size); 3586 #endif /* USE_FAST_MEMORY */ 3587 KMP_MEMCPY(taskdata, taskdata_src, task_size); 3588 3589 task = KMP_TASKDATA_TO_TASK(taskdata); 3590 3591 // Initialize new task (only specific fields not affected by memcpy) 3592 taskdata->td_task_id = KMP_GEN_TASK_ID(); 3593 if (task->shareds != NULL) { // need setup shareds pointer 3594 shareds_offset = (char *)task_src->shareds - (char *)taskdata_src; 3595 task->shareds = &((char *)taskdata)[shareds_offset]; 3596 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) == 3597 0); 3598 } 3599 taskdata->td_alloc_thread = thread; 3600 taskdata->td_parent = parent_task; 3601 taskdata->td_taskgroup = 3602 parent_task 3603 ->td_taskgroup; // task inherits the taskgroup from the parent task 3604 3605 // Only need to keep track of child task counts if team parallel and tasking 3606 // not serialized 3607 if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) { 3608 
KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks); 3609 if (parent_task->td_taskgroup) 3610 KMP_ATOMIC_INC(&parent_task->td_taskgroup->count); 3611 // Only need to keep track of allocated child tasks for explicit tasks since 3612 // implicit not deallocated 3613 if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) 3614 KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks); 3615 } 3616 3617 KA_TRACE(20, 3618 ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n", 3619 thread, taskdata, taskdata->td_parent)); 3620 #if OMPT_SUPPORT 3621 if (UNLIKELY(ompt_enabled.enabled)) 3622 __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid); 3623 #endif 3624 return task; 3625 } 3626 3627 // Routine optionally generated by the compiler for setting the lastprivate flag 3628 // and calling needed constructors for private/firstprivate objects 3629 // (used to form taskloop tasks from pattern task) 3630 // Parameters: dest task, src task, lastprivate flag. 3631 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32); 3632 3633 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8); 3634 3635 // class to encapsulate manipulating loop bounds in a taskloop task. 3636 // this abstracts away the Intel vs GOMP taskloop interface for setting/getting 3637 // the loop bound variables. 
3638 class kmp_taskloop_bounds_t { 3639 kmp_task_t *task; 3640 const kmp_taskdata_t *taskdata; 3641 size_t lower_offset; 3642 size_t upper_offset; 3643 3644 public: 3645 kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub) 3646 : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)), 3647 lower_offset((char *)lb - (char *)task), 3648 upper_offset((char *)ub - (char *)task) { 3649 KMP_DEBUG_ASSERT((char *)lb > (char *)_task); 3650 KMP_DEBUG_ASSERT((char *)ub > (char *)_task); 3651 } 3652 kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds) 3653 : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)), 3654 lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {} 3655 size_t get_lower_offset() const { return lower_offset; } 3656 size_t get_upper_offset() const { return upper_offset; } 3657 kmp_uint64 get_lb() const { 3658 kmp_int64 retval; 3659 #if defined(KMP_GOMP_COMPAT) 3660 // Intel task just returns the lower bound normally 3661 if (!taskdata->td_flags.native) { 3662 retval = *(kmp_int64 *)((char *)task + lower_offset); 3663 } else { 3664 // GOMP task has to take into account the sizeof(long) 3665 if (taskdata->td_size_loop_bounds == 4) { 3666 kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds); 3667 retval = (kmp_int64)*lb; 3668 } else { 3669 kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds); 3670 retval = (kmp_int64)*lb; 3671 } 3672 } 3673 #else 3674 retval = *(kmp_int64 *)((char *)task + lower_offset); 3675 #endif // defined(KMP_GOMP_COMPAT) 3676 return retval; 3677 } 3678 kmp_uint64 get_ub() const { 3679 kmp_int64 retval; 3680 #if defined(KMP_GOMP_COMPAT) 3681 // Intel task just returns the upper bound normally 3682 if (!taskdata->td_flags.native) { 3683 retval = *(kmp_int64 *)((char *)task + upper_offset); 3684 } else { 3685 // GOMP task has to take into account the sizeof(long) 3686 if (taskdata->td_size_loop_bounds == 4) { 3687 kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1; 3688 retval = 
(kmp_int64)*ub; 3689 } else { 3690 kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1; 3691 retval = (kmp_int64)*ub; 3692 } 3693 } 3694 #else 3695 retval = *(kmp_int64 *)((char *)task + upper_offset); 3696 #endif // defined(KMP_GOMP_COMPAT) 3697 return retval; 3698 } 3699 void set_lb(kmp_uint64 lb) { 3700 #if defined(KMP_GOMP_COMPAT) 3701 // Intel task just sets the lower bound normally 3702 if (!taskdata->td_flags.native) { 3703 *(kmp_uint64 *)((char *)task + lower_offset) = lb; 3704 } else { 3705 // GOMP task has to take into account the sizeof(long) 3706 if (taskdata->td_size_loop_bounds == 4) { 3707 kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds); 3708 *lower = (kmp_uint32)lb; 3709 } else { 3710 kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds); 3711 *lower = (kmp_uint64)lb; 3712 } 3713 } 3714 #else 3715 *(kmp_uint64 *)((char *)task + lower_offset) = lb; 3716 #endif // defined(KMP_GOMP_COMPAT) 3717 } 3718 void set_ub(kmp_uint64 ub) { 3719 #if defined(KMP_GOMP_COMPAT) 3720 // Intel task just sets the upper bound normally 3721 if (!taskdata->td_flags.native) { 3722 *(kmp_uint64 *)((char *)task + upper_offset) = ub; 3723 } else { 3724 // GOMP task has to take into account the sizeof(long) 3725 if (taskdata->td_size_loop_bounds == 4) { 3726 kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1; 3727 *upper = (kmp_uint32)ub; 3728 } else { 3729 kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1; 3730 *upper = (kmp_uint64)ub; 3731 } 3732 } 3733 #else 3734 *(kmp_uint64 *)((char *)task + upper_offset) = ub; 3735 #endif // defined(KMP_GOMP_COMPAT) 3736 } 3737 }; 3738 3739 // __kmp_taskloop_linear: Start tasks of the taskloop linearly 3740 // 3741 // loc Source location information 3742 // gtid Global thread ID 3743 // task Pattern task, exposes the loop iteration range 3744 // lb Pointer to loop lower bound in task structure 3745 // ub Pointer to loop upper bound in task structure 3746 // st Loop stride 3747 // ub_glob Global upper bound (used 
for lastprivate check) 3748 // num_tasks Number of tasks to execute 3749 // grainsize Number of loop iterations per task 3750 // extras Number of chunks with grainsize+1 iterations 3751 // tc Iterations count 3752 // task_dup Tasks duplication routine 3753 // codeptr_ra Return address for OMPT events 3754 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task, 3755 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, 3756 kmp_uint64 ub_glob, kmp_uint64 num_tasks, 3757 kmp_uint64 grainsize, kmp_uint64 extras, 3758 kmp_uint64 tc, 3759 #if OMPT_SUPPORT 3760 void *codeptr_ra, 3761 #endif 3762 void *task_dup) { 3763 KMP_COUNT_BLOCK(OMP_TASKLOOP); 3764 KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling); 3765 p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; 3766 // compiler provides global bounds here 3767 kmp_taskloop_bounds_t task_bounds(task, lb, ub); 3768 kmp_uint64 lower = task_bounds.get_lb(); 3769 kmp_uint64 upper = task_bounds.get_ub(); 3770 kmp_uint64 i; 3771 kmp_info_t *thread = __kmp_threads[gtid]; 3772 kmp_taskdata_t *current_task = thread->th.th_current_task; 3773 kmp_task_t *next_task; 3774 kmp_int32 lastpriv = 0; 3775 3776 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras); 3777 KMP_DEBUG_ASSERT(num_tasks > extras); 3778 KMP_DEBUG_ASSERT(num_tasks > 0); 3779 KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, " 3780 "extras %lld, i=%lld,%lld(%d)%lld, dup %p\n", 3781 gtid, num_tasks, grainsize, extras, lower, upper, ub_glob, st, 3782 task_dup)); 3783 3784 // Launch num_tasks tasks, assign grainsize iterations each task 3785 for (i = 0; i < num_tasks; ++i) { 3786 kmp_uint64 chunk_minus_1; 3787 if (extras == 0) { 3788 chunk_minus_1 = grainsize - 1; 3789 } else { 3790 chunk_minus_1 = grainsize; 3791 --extras; // first extras iterations get bigger chunk (grainsize+1) 3792 } 3793 upper = lower + st * chunk_minus_1; 3794 if (i == num_tasks - 1) { 3795 // schedule the last task, set lastprivate flag if needed 3796 if (st == 1) { // 
most common case 3797 KMP_DEBUG_ASSERT(upper == *ub); 3798 if (upper == ub_glob) 3799 lastpriv = 1; 3800 } else if (st > 0) { // positive loop stride 3801 KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper); 3802 if ((kmp_uint64)st > ub_glob - upper) 3803 lastpriv = 1; 3804 } else { // negative loop stride 3805 KMP_DEBUG_ASSERT(upper + st < *ub); 3806 if (upper - ub_glob < (kmp_uint64)(-st)) 3807 lastpriv = 1; 3808 } 3809 } 3810 next_task = __kmp_task_dup_alloc(thread, task); // allocate new task 3811 kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task); 3812 kmp_taskloop_bounds_t next_task_bounds = 3813 kmp_taskloop_bounds_t(next_task, task_bounds); 3814 3815 // adjust task-specific bounds 3816 next_task_bounds.set_lb(lower); 3817 if (next_taskdata->td_flags.native) { 3818 next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1)); 3819 } else { 3820 next_task_bounds.set_ub(upper); 3821 } 3822 if (ptask_dup != NULL) // set lastprivate flag, construct fistprivates, etc. 3823 ptask_dup(next_task, task, lastpriv); 3824 KA_TRACE(40, 3825 ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, " 3826 "upper %lld stride %lld, (offsets %p %p)\n", 3827 gtid, i, next_task, lower, upper, st, 3828 next_task_bounds.get_lower_offset(), 3829 next_task_bounds.get_upper_offset())); 3830 #if OMPT_SUPPORT 3831 __kmp_omp_taskloop_task(NULL, gtid, next_task, 3832 codeptr_ra); // schedule new task 3833 #else 3834 __kmp_omp_task(gtid, next_task, true); // schedule new task 3835 #endif 3836 lower = upper + st; // adjust lower bound for the next iteration 3837 } 3838 // free the pattern task and exit 3839 __kmp_task_start(gtid, task, current_task); // make internal bookkeeping 3840 // do not execute the pattern task, just do internal bookkeeping 3841 __kmp_task_finish<false>(gtid, task, current_task); 3842 } 3843 3844 // Structure to keep taskloop parameters for auxiliary task 3845 // kept in the shareds of the task structure. 
3846 typedef struct __taskloop_params { 3847 kmp_task_t *task; 3848 kmp_uint64 *lb; 3849 kmp_uint64 *ub; 3850 void *task_dup; 3851 kmp_int64 st; 3852 kmp_uint64 ub_glob; 3853 kmp_uint64 num_tasks; 3854 kmp_uint64 grainsize; 3855 kmp_uint64 extras; 3856 kmp_uint64 tc; 3857 kmp_uint64 num_t_min; 3858 #if OMPT_SUPPORT 3859 void *codeptr_ra; 3860 #endif 3861 } __taskloop_params_t; 3862 3863 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *, 3864 kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64, 3865 kmp_uint64, kmp_uint64, kmp_uint64, kmp_uint64, 3866 #if OMPT_SUPPORT 3867 void *, 3868 #endif 3869 void *); 3870 3871 // Execute part of the the taskloop submitted as a task. 3872 int __kmp_taskloop_task(int gtid, void *ptask) { 3873 __taskloop_params_t *p = 3874 (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds; 3875 kmp_task_t *task = p->task; 3876 kmp_uint64 *lb = p->lb; 3877 kmp_uint64 *ub = p->ub; 3878 void *task_dup = p->task_dup; 3879 // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; 3880 kmp_int64 st = p->st; 3881 kmp_uint64 ub_glob = p->ub_glob; 3882 kmp_uint64 num_tasks = p->num_tasks; 3883 kmp_uint64 grainsize = p->grainsize; 3884 kmp_uint64 extras = p->extras; 3885 kmp_uint64 tc = p->tc; 3886 kmp_uint64 num_t_min = p->num_t_min; 3887 #if OMPT_SUPPORT 3888 void *codeptr_ra = p->codeptr_ra; 3889 #endif 3890 #if KMP_DEBUG 3891 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 3892 KMP_DEBUG_ASSERT(task != NULL); 3893 KA_TRACE(20, ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize" 3894 " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n", 3895 gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st, 3896 task_dup)); 3897 #endif 3898 KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min); 3899 if (num_tasks > num_t_min) 3900 __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks, 3901 grainsize, extras, tc, num_t_min, 3902 #if OMPT_SUPPORT 3903 codeptr_ra, 3904 #endif 3905 task_dup); 3906 else 3907 
__kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks, 3908 grainsize, extras, tc, 3909 #if OMPT_SUPPORT 3910 codeptr_ra, 3911 #endif 3912 task_dup); 3913 3914 KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid)); 3915 return 0; 3916 } 3917 3918 // Schedule part of the the taskloop as a task, 3919 // execute the rest of the the taskloop. 3920 // 3921 // loc Source location information 3922 // gtid Global thread ID 3923 // task Pattern task, exposes the loop iteration range 3924 // lb Pointer to loop lower bound in task structure 3925 // ub Pointer to loop upper bound in task structure 3926 // st Loop stride 3927 // ub_glob Global upper bound (used for lastprivate check) 3928 // num_tasks Number of tasks to execute 3929 // grainsize Number of loop iterations per task 3930 // extras Number of chunks with grainsize+1 iterations 3931 // tc Iterations count 3932 // num_t_min Threashold to launch tasks recursively 3933 // task_dup Tasks duplication routine 3934 // codeptr_ra Return address for OMPT events 3935 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task, 3936 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, 3937 kmp_uint64 ub_glob, kmp_uint64 num_tasks, 3938 kmp_uint64 grainsize, kmp_uint64 extras, 3939 kmp_uint64 tc, kmp_uint64 num_t_min, 3940 #if OMPT_SUPPORT 3941 void *codeptr_ra, 3942 #endif 3943 void *task_dup) { 3944 #if KMP_DEBUG 3945 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 3946 KMP_DEBUG_ASSERT(task != NULL); 3947 KMP_DEBUG_ASSERT(num_tasks > num_t_min); 3948 KA_TRACE(20, ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize" 3949 " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n", 3950 gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st, 3951 task_dup)); 3952 #endif 3953 p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; 3954 kmp_uint64 lower = *lb; 3955 kmp_uint64 upper = *ub; 3956 kmp_info_t *thread = __kmp_threads[gtid]; 3957 // kmp_taskdata_t *current_task = thread->th.th_current_task; 3958 
kmp_task_t *next_task; 3959 kmp_int32 lastpriv = 0; 3960 size_t lower_offset = 3961 (char *)lb - (char *)task; // remember offset of lb in the task structure 3962 size_t upper_offset = 3963 (char *)ub - (char *)task; // remember offset of ub in the task structure 3964 3965 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras); 3966 KMP_DEBUG_ASSERT(num_tasks > extras); 3967 KMP_DEBUG_ASSERT(num_tasks > 0); 3968 3969 // split the loop in two halves 3970 kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1; 3971 kmp_uint64 gr_size0 = grainsize; 3972 kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute 3973 kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task 3974 if (n_tsk0 <= extras) { 3975 gr_size0++; // integrate extras into grainsize 3976 ext0 = 0; // no extra iters in 1st half 3977 ext1 = extras - n_tsk0; // remaining extras 3978 tc0 = gr_size0 * n_tsk0; 3979 tc1 = tc - tc0; 3980 } else { // n_tsk0 > extras 3981 ext1 = 0; // no extra iters in 2nd half 3982 ext0 = extras; 3983 tc1 = grainsize * n_tsk1; 3984 tc0 = tc - tc1; 3985 } 3986 ub0 = lower + st * (tc0 - 1); 3987 lb1 = ub0 + st; 3988 3989 // create pattern task for 2nd half of the loop 3990 next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task 3991 // adjust lower bound (upper bound is not changed) for the 2nd half 3992 *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1; 3993 if (ptask_dup != NULL) // construct fistprivates, etc. 
3994 ptask_dup(next_task, task, 0); 3995 *ub = ub0; // adjust upper bound for the 1st half 3996 3997 // create auxiliary task for 2nd half of the loop 3998 kmp_task_t *new_task = 3999 __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *), 4000 sizeof(__taskloop_params_t), &__kmp_taskloop_task); 4001 __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds; 4002 p->task = next_task; 4003 p->lb = (kmp_uint64 *)((char *)next_task + lower_offset); 4004 p->ub = (kmp_uint64 *)((char *)next_task + upper_offset); 4005 p->task_dup = task_dup; 4006 p->st = st; 4007 p->ub_glob = ub_glob; 4008 p->num_tasks = n_tsk1; 4009 p->grainsize = grainsize; 4010 p->extras = ext1; 4011 p->tc = tc1; 4012 p->num_t_min = num_t_min; 4013 #if OMPT_SUPPORT 4014 p->codeptr_ra = codeptr_ra; 4015 #endif 4016 4017 #if OMPT_SUPPORT 4018 // schedule new task with correct return address for OMPT events 4019 __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra); 4020 #else 4021 __kmp_omp_task(gtid, new_task, true); // schedule new task 4022 #endif 4023 4024 // execute the 1st half of current subrange 4025 if (n_tsk0 > num_t_min) 4026 __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0, 4027 ext0, tc0, num_t_min, 4028 #if OMPT_SUPPORT 4029 codeptr_ra, 4030 #endif 4031 task_dup); 4032 else 4033 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, 4034 gr_size0, ext0, tc0, 4035 #if OMPT_SUPPORT 4036 codeptr_ra, 4037 #endif 4038 task_dup); 4039 4040 KA_TRACE(40, ("__kmpc_taskloop_recur(exit): T#%d\n", gtid)); 4041 } 4042 4043 /*! 
4044 @ingroup TASKING 4045 @param loc Source location information 4046 @param gtid Global thread ID 4047 @param task Task structure 4048 @param if_val Value of the if clause 4049 @param lb Pointer to loop lower bound in task structure 4050 @param ub Pointer to loop upper bound in task structure 4051 @param st Loop stride 4052 @param nogroup Flag, 1 if nogroup clause specified, 0 otherwise 4053 @param sched Schedule specified 0/1/2 for none/grainsize/num_tasks 4054 @param grainsize Schedule value if specified 4055 @param task_dup Tasks duplication routine 4056 4057 Execute the taskloop construct. 4058 */ 4059 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, 4060 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, 4061 int sched, kmp_uint64 grainsize, void *task_dup) { 4062 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 4063 KMP_DEBUG_ASSERT(task != NULL); 4064 4065 if (nogroup == 0) { 4066 #if OMPT_SUPPORT && OMPT_OPTIONAL 4067 OMPT_STORE_RETURN_ADDRESS(gtid); 4068 #endif 4069 __kmpc_taskgroup(loc, gtid); 4070 } 4071 4072 // ========================================================================= 4073 // calculate loop parameters 4074 kmp_taskloop_bounds_t task_bounds(task, lb, ub); 4075 kmp_uint64 tc; 4076 // compiler provides global bounds here 4077 kmp_uint64 lower = task_bounds.get_lb(); 4078 kmp_uint64 upper = task_bounds.get_ub(); 4079 kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag 4080 kmp_uint64 num_tasks = 0, extras = 0; 4081 kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks; 4082 kmp_info_t *thread = __kmp_threads[gtid]; 4083 kmp_taskdata_t *current_task = thread->th.th_current_task; 4084 4085 KA_TRACE(20, ("__kmpc_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, " 4086 "grain %llu(%d), dup %p\n", 4087 gtid, taskdata, lower, upper, st, grainsize, sched, task_dup)); 4088 4089 // compute trip count 4090 if (st == 1) { // most common case 4091 tc = upper - lower + 1; 4092 } else if (st 
< 0) { 4093 tc = (lower - upper) / (-st) + 1; 4094 } else { // st > 0 4095 tc = (upper - lower) / st + 1; 4096 } 4097 if (tc == 0) { 4098 KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d zero-trip loop\n", gtid)); 4099 // free the pattern task and exit 4100 __kmp_task_start(gtid, task, current_task); 4101 // do not execute anything for zero-trip loop 4102 __kmp_task_finish<false>(gtid, task, current_task); 4103 return; 4104 } 4105 4106 #if OMPT_SUPPORT && OMPT_OPTIONAL 4107 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); 4108 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 4109 if (ompt_enabled.ompt_callback_work) { 4110 ompt_callbacks.ompt_callback(ompt_callback_work)( 4111 ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data), 4112 &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0)); 4113 } 4114 #endif 4115 4116 if (num_tasks_min == 0) 4117 // TODO: can we choose better default heuristic? 4118 num_tasks_min = 4119 KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE); 4120 4121 // compute num_tasks/grainsize based on the input provided 4122 switch (sched) { 4123 case 0: // no schedule clause specified, we can choose the default 4124 // let's try to schedule (team_size*10) tasks 4125 grainsize = thread->th.th_team_nproc * 10; 4126 case 2: // num_tasks provided 4127 if (grainsize > tc) { 4128 num_tasks = tc; // too big num_tasks requested, adjust values 4129 grainsize = 1; 4130 extras = 0; 4131 } else { 4132 num_tasks = grainsize; 4133 grainsize = tc / num_tasks; 4134 extras = tc % num_tasks; 4135 } 4136 break; 4137 case 1: // grainsize provided 4138 if (grainsize > tc) { 4139 num_tasks = 1; // too big grainsize requested, adjust values 4140 grainsize = tc; 4141 extras = 0; 4142 } else { 4143 num_tasks = tc / grainsize; 4144 // adjust grainsize for balanced distribution of iterations 4145 grainsize = tc / num_tasks; 4146 extras = tc % num_tasks; 4147 } 4148 break; 4149 default: 4150 KMP_ASSERT2(0, "unknown scheduling 
of taskloop"); 4151 } 4152 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras); 4153 KMP_DEBUG_ASSERT(num_tasks > extras); 4154 KMP_DEBUG_ASSERT(num_tasks > 0); 4155 // ========================================================================= 4156 4157 // check if clause value first 4158 // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native) 4159 if (if_val == 0) { // if(0) specified, mark task as serial 4160 taskdata->td_flags.task_serial = 1; 4161 taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied 4162 // always start serial tasks linearly 4163 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, 4164 grainsize, extras, tc, 4165 #if OMPT_SUPPORT 4166 OMPT_GET_RETURN_ADDRESS(0), 4167 #endif 4168 task_dup); 4169 // !taskdata->td_flags.native => currently force linear spawning of tasks 4170 // for GOMP_taskloop 4171 } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) { 4172 KA_TRACE(20, ("__kmpc_taskloop: T#%d, go recursive: tc %llu, #tasks %llu" 4173 "(%lld), grain %llu, extras %llu\n", 4174 gtid, tc, num_tasks, num_tasks_min, grainsize, extras)); 4175 __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, 4176 grainsize, extras, tc, num_tasks_min, 4177 #if OMPT_SUPPORT 4178 OMPT_GET_RETURN_ADDRESS(0), 4179 #endif 4180 task_dup); 4181 } else { 4182 KA_TRACE(20, ("__kmpc_taskloop: T#%d, go linear: tc %llu, #tasks %llu" 4183 "(%lld), grain %llu, extras %llu\n", 4184 gtid, tc, num_tasks, num_tasks_min, grainsize, extras)); 4185 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, 4186 grainsize, extras, tc, 4187 #if OMPT_SUPPORT 4188 OMPT_GET_RETURN_ADDRESS(0), 4189 #endif 4190 task_dup); 4191 } 4192 4193 #if OMPT_SUPPORT && OMPT_OPTIONAL 4194 if (ompt_enabled.ompt_callback_work) { 4195 ompt_callbacks.ompt_callback(ompt_callback_work)( 4196 ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data), 4197 &(task_info->task_data), tc, 
OMPT_GET_RETURN_ADDRESS(0)); 4198 } 4199 #endif 4200 4201 if (nogroup == 0) { 4202 #if OMPT_SUPPORT && OMPT_OPTIONAL 4203 OMPT_STORE_RETURN_ADDRESS(gtid); 4204 #endif 4205 __kmpc_end_taskgroup(loc, gtid); 4206 } 4207 KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid)); 4208 } 4209 4210 #endif 4211