1 /* 2 * kmp_tasking.cpp -- OpenMP 3.0 tasking support. 3 */ 4 5 6 //===----------------------------------------------------------------------===// 7 // 8 // The LLVM Compiler Infrastructure 9 // 10 // This file is dual licensed under the MIT and the University of Illinois Open 11 // Source Licenses. See LICENSE.txt for details. 12 // 13 //===----------------------------------------------------------------------===// 14 15 16 #include "kmp.h" 17 #include "kmp_i18n.h" 18 #include "kmp_itt.h" 19 #include "kmp_stats.h" 20 #include "kmp_wait_release.h" 21 22 #if OMPT_SUPPORT 23 #include "ompt-specific.h" 24 #endif 25 26 #include "tsan_annotations.h" 27 28 /* forward declaration */ 29 static void __kmp_enable_tasking(kmp_task_team_t *task_team, 30 kmp_info_t *this_thr); 31 static void __kmp_alloc_task_deque(kmp_info_t *thread, 32 kmp_thread_data_t *thread_data); 33 static int __kmp_realloc_task_threads_data(kmp_info_t *thread, 34 kmp_task_team_t *task_team); 35 36 #ifdef OMP_45_ENABLED 37 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask); 38 #endif 39 40 #ifdef BUILD_TIED_TASK_STACK 41 42 // __kmp_trace_task_stack: print the tied tasks from the task stack in order 43 // from top do bottom 44 // 45 // gtid: global thread identifier for thread containing stack 46 // thread_data: thread data for task team thread containing stack 47 // threshold: value above which the trace statement triggers 48 // location: string identifying call site of this function (for trace) 49 static void __kmp_trace_task_stack(kmp_int32 gtid, 50 kmp_thread_data_t *thread_data, 51 int threshold, char *location) { 52 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 53 kmp_taskdata_t **stack_top = task_stack->ts_top; 54 kmp_int32 entries = task_stack->ts_entries; 55 kmp_taskdata_t *tied_task; 56 57 KA_TRACE( 58 threshold, 59 ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, " 60 "first_block = %p, stack_top = %p \n", 61 location, 
gtid, entries, task_stack->ts_first_block, stack_top)); 62 63 KMP_DEBUG_ASSERT(stack_top != NULL); 64 KMP_DEBUG_ASSERT(entries > 0); 65 66 while (entries != 0) { 67 KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]); 68 // fix up ts_top if we need to pop from previous block 69 if (entries & TASK_STACK_INDEX_MASK == 0) { 70 kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top); 71 72 stack_block = stack_block->sb_prev; 73 stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE]; 74 } 75 76 // finish bookkeeping 77 stack_top--; 78 entries--; 79 80 tied_task = *stack_top; 81 82 KMP_DEBUG_ASSERT(tied_task != NULL); 83 KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED); 84 85 KA_TRACE(threshold, 86 ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, " 87 "stack_top=%p, tied_task=%p\n", 88 location, gtid, entries, stack_top, tied_task)); 89 } 90 KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]); 91 92 KA_TRACE(threshold, 93 ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n", 94 location, gtid)); 95 } 96 97 // __kmp_init_task_stack: initialize the task stack for the first time 98 // after a thread_data structure is created. 99 // It should not be necessary to do this again (assuming the stack works). 
100 // 101 // gtid: global thread identifier of calling thread 102 // thread_data: thread data for task team thread containing stack 103 static void __kmp_init_task_stack(kmp_int32 gtid, 104 kmp_thread_data_t *thread_data) { 105 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 106 kmp_stack_block_t *first_block; 107 108 // set up the first block of the stack 109 first_block = &task_stack->ts_first_block; 110 task_stack->ts_top = (kmp_taskdata_t **)first_block; 111 memset((void *)first_block, '\0', 112 TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *)); 113 114 // initialize the stack to be empty 115 task_stack->ts_entries = TASK_STACK_EMPTY; 116 first_block->sb_next = NULL; 117 first_block->sb_prev = NULL; 118 } 119 120 // __kmp_free_task_stack: free the task stack when thread_data is destroyed. 121 // 122 // gtid: global thread identifier for calling thread 123 // thread_data: thread info for thread containing stack 124 static void __kmp_free_task_stack(kmp_int32 gtid, 125 kmp_thread_data_t *thread_data) { 126 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 127 kmp_stack_block_t *stack_block = &task_stack->ts_first_block; 128 129 KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY); 130 // free from the second block of the stack 131 while (stack_block != NULL) { 132 kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL; 133 134 stack_block->sb_next = NULL; 135 stack_block->sb_prev = NULL; 136 if (stack_block != &task_stack->ts_first_block) { 137 __kmp_thread_free(thread, 138 stack_block); // free the block, if not the first 139 } 140 stack_block = next_block; 141 } 142 // initialize the stack to be empty 143 task_stack->ts_entries = 0; 144 task_stack->ts_top = NULL; 145 } 146 147 // __kmp_push_task_stack: Push the tied task onto the task stack. 148 // Grow the stack if necessary by allocating another block. 
149 // 150 // gtid: global thread identifier for calling thread 151 // thread: thread info for thread containing stack 152 // tied_task: the task to push on the stack 153 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread, 154 kmp_taskdata_t *tied_task) { 155 // GEH - need to consider what to do if tt_threads_data not allocated yet 156 kmp_thread_data_t *thread_data = 157 &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)]; 158 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 159 160 if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) { 161 return; // Don't push anything on stack if team or team tasks are serialized 162 } 163 164 KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED); 165 KMP_DEBUG_ASSERT(task_stack->ts_top != NULL); 166 167 KA_TRACE(20, 168 ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n", 169 gtid, thread, tied_task)); 170 // Store entry 171 *(task_stack->ts_top) = tied_task; 172 173 // Do bookkeeping for next push 174 task_stack->ts_top++; 175 task_stack->ts_entries++; 176 177 if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) { 178 // Find beginning of this task block 179 kmp_stack_block_t *stack_block = 180 (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE); 181 182 // Check if we already have a block 183 if (stack_block->sb_next != 184 NULL) { // reset ts_top to beginning of next block 185 task_stack->ts_top = &stack_block->sb_next->sb_block[0]; 186 } else { // Alloc new block and link it up 187 kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc( 188 thread, sizeof(kmp_stack_block_t)); 189 190 task_stack->ts_top = &new_block->sb_block[0]; 191 stack_block->sb_next = new_block; 192 new_block->sb_prev = stack_block; 193 new_block->sb_next = NULL; 194 195 KA_TRACE( 196 30, 197 ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n", 198 gtid, tied_task, new_block)); 199 } 200 } 201 KA_TRACE(20, 
("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid, 202 tied_task)); 203 } 204 205 // __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return 206 // the task, just check to make sure it matches the ending task passed in. 207 // 208 // gtid: global thread identifier for the calling thread 209 // thread: thread info structure containing stack 210 // tied_task: the task popped off the stack 211 // ending_task: the task that is ending (should match popped task) 212 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread, 213 kmp_taskdata_t *ending_task) { 214 // GEH - need to consider what to do if tt_threads_data not allocated yet 215 kmp_thread_data_t *thread_data = 216 &thread->th.th_task_team->tt_threads_data[__kmp_tid_from_gtid(gtid)]; 217 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 218 kmp_taskdata_t *tied_task; 219 220 if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) { 221 // Don't pop anything from stack if team or team tasks are serialized 222 return; 223 } 224 225 KMP_DEBUG_ASSERT(task_stack->ts_top != NULL); 226 KMP_DEBUG_ASSERT(task_stack->ts_entries > 0); 227 228 KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid, 229 thread)); 230 231 // fix up ts_top if we need to pop from previous block 232 if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) { 233 kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top); 234 235 stack_block = stack_block->sb_prev; 236 task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE]; 237 } 238 239 // finish bookkeeping 240 task_stack->ts_top--; 241 task_stack->ts_entries--; 242 243 tied_task = *(task_stack->ts_top); 244 245 KMP_DEBUG_ASSERT(tied_task != NULL); 246 KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED); 247 KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly 248 249 KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid, 
250 tied_task)); 251 return; 252 } 253 #endif /* BUILD_TIED_TASK_STACK */ 254 255 // __kmp_push_task: Add a task to the thread's deque 256 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) { 257 kmp_info_t *thread = __kmp_threads[gtid]; 258 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 259 kmp_task_team_t *task_team = thread->th.th_task_team; 260 kmp_int32 tid = __kmp_tid_from_gtid(gtid); 261 kmp_thread_data_t *thread_data; 262 263 KA_TRACE(20, 264 ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata)); 265 266 if (taskdata->td_flags.tiedness == TASK_UNTIED) { 267 // untied task needs to increment counter so that the task structure is not 268 // freed prematurely 269 kmp_int32 counter = 1 + KMP_TEST_THEN_INC32(&taskdata->td_untied_count); 270 KA_TRACE( 271 20, 272 ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n", 273 gtid, counter, taskdata)); 274 } 275 276 // The first check avoids building task_team thread data if serialized 277 if (taskdata->td_flags.task_serial) { 278 KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning " 279 "TASK_NOT_PUSHED for task %p\n", 280 gtid, taskdata)); 281 return TASK_NOT_PUSHED; 282 } 283 284 // Now that serialized tasks have returned, we can assume that we are not in 285 // immediate exec mode 286 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 287 if (!KMP_TASKING_ENABLED(task_team)) { 288 __kmp_enable_tasking(task_team, thread); 289 } 290 KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE); 291 KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL); 292 293 // Find tasking deque specific to encountering thread 294 thread_data = &task_team->tt.tt_threads_data[tid]; 295 296 // No lock needed since only owner can allocate 297 if (thread_data->td.td_deque == NULL) { 298 __kmp_alloc_task_deque(thread, thread_data); 299 } 300 301 // Check if deque is full 302 if (TCR_4(thread_data->td.td_deque_ntasks) >= 303 
TASK_DEQUE_SIZE(thread_data->td)) { 304 KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning " 305 "TASK_NOT_PUSHED for task %p\n", 306 gtid, taskdata)); 307 return TASK_NOT_PUSHED; 308 } 309 310 // Lock the deque for the task push operation 311 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 312 313 #if OMP_45_ENABLED 314 // Need to recheck as we can get a proxy task from a thread outside of OpenMP 315 if (TCR_4(thread_data->td.td_deque_ntasks) >= 316 TASK_DEQUE_SIZE(thread_data->td)) { 317 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 318 KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; returning " 319 "TASK_NOT_PUSHED for task %p\n", 320 gtid, taskdata)); 321 return TASK_NOT_PUSHED; 322 } 323 #else 324 // Must have room since no thread can add tasks but calling thread 325 KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) < 326 TASK_DEQUE_SIZE(thread_data->td)); 327 #endif 328 329 thread_data->td.td_deque[thread_data->td.td_deque_tail] = 330 taskdata; // Push taskdata 331 // Wrap index. 332 thread_data->td.td_deque_tail = 333 (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td); 334 TCW_4(thread_data->td.td_deque_ntasks, 335 TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count 336 337 KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: " 338 "task=%p ntasks=%d head=%u tail=%u\n", 339 gtid, taskdata, thread_data->td.td_deque_ntasks, 340 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 341 342 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 343 344 return TASK_SUCCESSFULLY_PUSHED; 345 } 346 347 // __kmp_pop_current_task_from_thread: set up current task from called thread 348 // when team ends 349 // 350 // this_thr: thread structure to set current_task in. 
void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));

  // Replace the thread's current task with that task's parent.
  this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;

  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));
}

// __kmp_push_current_task_to_thread: set up current task in called thread for a
// new team
//
// this_thr: thread structure to set up
// team: team for implicit task data
// tid: thread within team to set up
void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
                                       int tid) {
  // current task of the thread is a parent of the new just created implicit
  // tasks of new team
  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));

  KMP_DEBUG_ASSERT(this_thr != NULL);

  if (tid == 0) {
    // Master thread: link the team's implicit task 0 under the thread's
    // current task (unless it already is the current task).
    if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
      team->t.t_implicit_task_taskdata[0].td_parent =
          this_thr->th.th_current_task;
      this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
    }
  } else {
    // Worker threads: share the parent recorded on implicit task 0.
    team->t.t_implicit_task_taskdata[tid].td_parent =
        team->t.t_implicit_task_taskdata[0].td_parent;
    this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
  }

  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));
}

// __kmp_task_start: bookkeeping for a task starting execution
//
// GTID: global thread id of calling thread
// task: task starting execution
// current_task: task suspending
static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
                             kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];

  KA_TRACE(10,
           ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
            gtid, taskdata, current_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  // mark currently executing task as suspended
  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
  current_task->td_flags.executing = 0;

// Add task to stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_push_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  // mark starting task as executing and as current task
  thread->th.th_current_task = taskdata;

  // An untied task may be started/suspended repeatedly, so 'started' and
  // 'executing' may already be set for it; tied tasks must be fresh here.
  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  taskdata->td_flags.started = 1;
  taskdata->td_flags.executing = 1;
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  // GEH TODO: shouldn't we pass some sort of location identifier here?
  // APT: yes, we will pass location here.
  // need to store current thread state (in a thread or taskdata structure)
  // before setting work_state, otherwise wrong state is set after end of task

  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));

#if OMPT_SUPPORT
  // Notify the tool that the task began, passing the parent's id/frame when
  // there is a parent.
  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_begin)) {
    kmp_taskdata_t *parent = taskdata->td_parent;
    ompt_callbacks.ompt_callback(ompt_event_task_begin)(
        parent ? parent->ompt_task_info.task_id : ompt_task_id_none,
        parent ? &(parent->ompt_task_info.frame) : NULL,
        taskdata->ompt_task_info.task_id, taskdata->ompt_task_info.function);
  }
#endif
#if OMP_40_ENABLED && OMPT_SUPPORT && OMPT_TRACE
  /* OMPT emit all dependences if requested by the tool */
  if (ompt_enabled && taskdata->ompt_task_info.ndeps > 0 &&
      ompt_callbacks.ompt_callback(ompt_event_task_dependences)) {
    ompt_callbacks.ompt_callback(ompt_event_task_dependences)(
        taskdata->ompt_task_info.task_id, taskdata->ompt_task_info.deps,
        taskdata->ompt_task_info.ndeps);
    /* We can now free the allocated memory for the dependencies */
    KMP_OMPT_DEPS_FREE(thread, taskdata->ompt_task_info.deps);
    taskdata->ompt_task_info.deps = NULL;
    taskdata->ompt_task_info.ndeps = 0;
  }
#endif /* OMP_40_ENABLED && OMPT_SUPPORT && OMPT_TRACE */

  return;
}

// __kmpc_omp_task_begin_if0: report that a given serialized task has started
// execution
//
// loc_ref: source location information; points to beginning of task block.
// gtid: global thread number.
// task: task thunk for the started task.
483 void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid, 484 kmp_task_t *task) { 485 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 486 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 487 488 KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p " 489 "current_task=%p\n", 490 gtid, loc_ref, taskdata, current_task)); 491 492 if (taskdata->td_flags.tiedness == TASK_UNTIED) { 493 // untied task needs to increment counter so that the task structure is not 494 // freed prematurely 495 kmp_int32 counter = 1 + KMP_TEST_THEN_INC32(&taskdata->td_untied_count); 496 KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) " 497 "incremented for task %p\n", 498 gtid, counter, taskdata)); 499 } 500 501 taskdata->td_flags.task_serial = 502 1; // Execute this task immediately, not deferred. 503 __kmp_task_start(gtid, task, current_task); 504 505 KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid, 506 loc_ref, taskdata)); 507 508 return; 509 } 510 511 #ifdef TASK_UNUSED 512 // __kmpc_omp_task_begin: report that a given task has started execution 513 // NEVER GENERATED BY COMPILER, DEPRECATED!!! 
// Thin wrapper over __kmp_task_start for the deprecated entry point.
void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(
      10,
      ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
       gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));

  __kmp_task_start(gtid, task, current_task);

  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED

// __kmp_free_task: free the current task space and the space for shareds
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
                            kmp_info_t *thread) {
  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
                taskdata));

  // Check to make sure all flags and counters have the correct values
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
  KMP_DEBUG_ASSERT(TCR_4(taskdata->td_allocated_child_tasks) == 0 ||
                   taskdata->td_flags.task_serial == 1);
  KMP_DEBUG_ASSERT(TCR_4(taskdata->td_incomplete_child_tasks) == 0);

  taskdata->td_flags.freed = 1;
  ANNOTATE_HAPPENS_BEFORE(taskdata);
// deallocate the taskdata and shared variable blocks associated with this task
#if USE_FAST_MEMORY
  __kmp_fast_free(thread, taskdata);
#else /* ! USE_FAST_MEMORY */
  __kmp_thread_free(thread, taskdata);
#endif

  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
}

// __kmp_free_task_and_ancestors: free the current task and ancestors without
// children
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
                                          kmp_taskdata_t *taskdata,
                                          kmp_info_t *thread) {
#if OMP_45_ENABLED
  // Proxy tasks must always be allowed to free their parents
  // because they can be run in background even in serial mode.
  kmp_int32 team_serial =
      (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
      !taskdata->td_flags.proxy;
#else
  kmp_int32 team_serial =
      taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser;
#endif
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  // Predecrement simulated by "- 1" calculation.
  kmp_int32 children =
      KMP_TEST_THEN_DEC32((kmp_int32 *)(&taskdata->td_allocated_child_tasks)) -
      1;
  KMP_DEBUG_ASSERT(children >= 0);

  // Now, go up the ancestor tree to see if any ancestors can now be freed.
  while (children == 0) {
    kmp_taskdata_t *parent_taskdata = taskdata->td_parent;

    KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
                  "and freeing itself\n",
                  gtid, taskdata));

    // --- Deallocate my ancestor task ---
    __kmp_free_task(gtid, taskdata, thread);

    taskdata = parent_taskdata;

    // Stop checking ancestors at implicit task instead of walking up ancestor
    // tree to avoid premature deallocation of ancestors.
    if (team_serial || taskdata->td_flags.tasktype == TASK_IMPLICIT)
      return;

    // Predecrement simulated by "- 1" calculation
    children = KMP_TEST_THEN_DEC32(
                   (kmp_int32 *)(&taskdata->td_allocated_child_tasks)) -
               1;
    KMP_DEBUG_ASSERT(children >= 0);
  }

  KA_TRACE(
      20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
           "not freeing it yet\n",
           gtid, taskdata, children));
}

// __kmp_task_finish: bookkeeping to do when a task finishes execution
//
// gtid: global thread ID for calling thread
// task: task to be finished
// resumed_task: task to be resumed. (may be NULL if task is serialized)
//
// NOTE: the brace balance in the child-count/release-deps section below
// depends on the OMP_40_ENABLED/OMP_45_ENABLED preprocessor nesting; edit
// with care.
static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *resumed_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_task_team_t *task_team =
      thread->th.th_task_team; // might be NULL for serial teams...
  kmp_int32 children = 0;

#if OMPT_SUPPORT
  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_end)) {
    // NOTE(review): 'parent' is computed but unused here — the task_end
    // callback only takes the finishing task's id.
    kmp_taskdata_t *parent = taskdata->td_parent;
    ompt_callbacks.ompt_callback(ompt_event_task_end)(
        taskdata->ompt_task_info.task_id);
  }
#endif

  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
                "task %p\n",
                gtid, taskdata, resumed_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

// Pop task from stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_pop_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
    // untied task needs to check the counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = KMP_TEST_THEN_DEC32(&taskdata->td_untied_count) - 1;
    KA_TRACE(
        20,
        ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
         gtid, counter, taskdata));
    if (counter > 0) {
      // untied task is not done, to be continued possibly by other thread, do
      // not free it now
      if (resumed_task == NULL) {
        KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
        resumed_task = taskdata->td_parent; // In a serialized task, the resumed
        // task is the parent
      }
      thread->th.th_current_task = resumed_task; // restore current_task
      resumed_task->td_flags.executing = 1; // resume previous task
      KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
                    "resuming task %p\n",
                    gtid, taskdata, resumed_task));
      return;
    }
  }

  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  taskdata->td_flags.complete = 1; // mark the task as completed
  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  // Only need to keep track of count if team parallel and tasking not
  // serialized
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
    // Predecrement simulated by "- 1" calculation
    children =
        KMP_TEST_THEN_DEC32(
            (kmp_int32 *)(&taskdata->td_parent->td_incomplete_child_tasks)) -
        1;
    KMP_DEBUG_ASSERT(children >= 0);
#if OMP_40_ENABLED
    if (taskdata->td_taskgroup)
      KMP_TEST_THEN_DEC32((kmp_int32 *)(&taskdata->td_taskgroup->count));
#if OMP_45_ENABLED
  }
  // if we found proxy tasks there could exist a dependency chain
  // with the proxy task as origin
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
      (task_team && task_team->tt.tt_found_proxy_tasks)) {
#endif
    __kmp_release_deps(gtid, taskdata);
#endif
  }

  // td_flags.executing must be marked as 0 after __kmp_release_deps has been
  // called. Otherwise, if a task is executed immediately from the release_deps
  // code, the flag will be reset to 1 again by this same function
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
  taskdata->td_flags.executing = 0; // suspend the finishing task

  KA_TRACE(
      20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
           gtid, taskdata, children));

#if OMP_40_ENABLED
  /* If the tasks' destructor thunk flag has been set, we need to invoke the
     destructor thunk that has been generated by the compiler. The code is
     placed here, since at this point other tasks might have been released
     hence overlapping the destructor invocations with some other work in the
     released tasks. The OpenMP spec is not specific on when the destructors
     are invoked, so we should be free to choose. */
  if (taskdata->td_flags.destructors_thunk) {
    kmp_routine_entry_t destr_thunk = task->data1.destructors;
    KMP_ASSERT(destr_thunk);
    destr_thunk(gtid, task);
  }
#endif // OMP_40_ENABLED

  // bookkeeping for resuming task:
  // GEH - note tasking_ser => task_serial
  KMP_DEBUG_ASSERT(
      (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
      taskdata->td_flags.task_serial);
  if (taskdata->td_flags.task_serial) {
    if (resumed_task == NULL) {
      resumed_task = taskdata->td_parent; // In a serialized task, the resumed
      // task is the parent
    } else
#if OMP_45_ENABLED
        if (!(task_team && task_team->tt.tt_found_proxy_tasks))
#endif
    {
      // verify resumed task passed in points to parent
      KMP_DEBUG_ASSERT(resumed_task == taskdata->td_parent);
    }
  } else {
    KMP_DEBUG_ASSERT(resumed_task !=
                     NULL); // verify that resumed task is passed as argument
  }

  // Free this task and then ancestor tasks if they have no children.
  // Restore th_current_task first as suggested by John:
  // johnmc: if an asynchronous inquiry peers into the runtime system
  // it doesn't see the freed task as the current task.
  thread->th.th_current_task = resumed_task;
  __kmp_free_task_and_ancestors(gtid, taskdata, thread);

  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
  resumed_task->td_flags.executing = 1; // resume previous task

  KA_TRACE(
      10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
           gtid, taskdata, resumed_task));

  return;
}

// __kmpc_omp_task_complete_if0: report that a task has completed execution
//
// loc_ref: source location information; points to end of task block.
// gtid: global thread number.
// task: task thunk for the completed task.
void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
  // this routine will provide task to resume
  __kmp_task_finish(gtid, task, NULL);

  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}

#ifdef TASK_UNUSED
// __kmpc_omp_task_complete: report that a task has completed execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
789 void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid, 790 kmp_task_t *task) { 791 KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid, 792 loc_ref, KMP_TASK_TO_TASKDATA(task))); 793 794 __kmp_task_finish(gtid, task, NULL); // Not sure how to find task to resume 795 796 KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid, 797 loc_ref, KMP_TASK_TO_TASKDATA(task))); 798 return; 799 } 800 #endif // TASK_UNUSED 801 802 #if OMPT_SUPPORT 803 // __kmp_task_init_ompt: Initialize OMPT fields maintained by a task. This will 804 // only be called after ompt_tool, so we already know whether ompt is enabled 805 // or not. 806 static inline void __kmp_task_init_ompt(kmp_taskdata_t *task, int tid, 807 void *function) { 808 if (ompt_enabled) { 809 task->ompt_task_info.task_id = __ompt_task_id_new(tid); 810 task->ompt_task_info.function = function; 811 task->ompt_task_info.frame.exit_runtime_frame = NULL; 812 task->ompt_task_info.frame.reenter_runtime_frame = NULL; 813 #if OMP_40_ENABLED 814 task->ompt_task_info.ndeps = 0; 815 task->ompt_task_info.deps = NULL; 816 #endif /* OMP_40_ENABLED */ 817 } 818 } 819 #endif 820 821 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit 822 // task for a given thread 823 // 824 // loc_ref: reference to source location of parallel region 825 // this_thr: thread data structure corresponding to implicit task 826 // team: team for this_thr 827 // tid: thread id of given thread within team 828 // set_curr_task: TRUE if need to push current task to thread 829 // NOTE: Routine does not set up the implicit task ICVS. This is assumed to 830 // have already been done elsewhere. 831 // TODO: Get better loc_ref. 
// Value passed in may be NULL
void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
                              kmp_team_t *team, int tid, int set_curr_task) {
  // The implicit task lives inside the team structure, one per team member.
  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];

  KF_TRACE(
      10,
      ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
       tid, team, task, set_curr_task ? "TRUE" : "FALSE"));

  task->td_task_id = KMP_GEN_TASK_ID();
  task->td_team = team;
  // task->td_parent = NULL; // fix for CQ230101 (broken parent task info
  // in debugger)
  task->td_ident = loc_ref;
  task->td_taskwait_ident = NULL;
  task->td_taskwait_counter = 0;
  task->td_taskwait_thread = 0;

  // Implicit tasks are always tied and are never proxy tasks.
  task->td_flags.tiedness = TASK_TIED;
  task->td_flags.tasktype = TASK_IMPLICIT;
#if OMP_45_ENABLED
  task->td_flags.proxy = TASK_FULL;
#endif

  // All implicit tasks are executed immediately, not deferred
  task->td_flags.task_serial = 1;
  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  // An implicit task is considered running from the moment it is set up.
  task->td_flags.started = 1;
  task->td_flags.executing = 1;
  task->td_flags.complete = 0;
  task->td_flags.freed = 0;

#if OMP_40_ENABLED
  task->td_depnode = NULL;
#endif

  if (set_curr_task) { // only do this init first time thread is created
    task->td_incomplete_child_tasks = 0;
    // Not used: don't need to deallocate implicit task
    task->td_allocated_child_tasks = 0;
#if OMP_40_ENABLED
    task->td_taskgroup = NULL; // An implicit task does not have taskgroup
    task->td_dephash = NULL;
#endif
    __kmp_push_current_task_to_thread(this_thr, team, tid);
  } else {
    // Re-initialization: child counters must already have drained to zero.
    KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
    KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
  }

#if OMPT_SUPPORT
  __kmp_task_init_ompt(task, tid, NULL);
#endif

  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
                team, task));
}

// __kmp_finish_implicit_task: Release resources associated to implicit tasks
// at the end of parallel regions. Some resources are kept for reuse in the next
// parallel region.
895 // 896 // thread: thread data structure corresponding to implicit task 897 void __kmp_finish_implicit_task(kmp_info_t *thread) { 898 kmp_taskdata_t *task = thread->th.th_current_task; 899 if (task->td_dephash) 900 __kmp_dephash_free_entries(thread, task->td_dephash); 901 } 902 903 // __kmp_free_implicit_task: Release resources associated to implicit tasks 904 // when these are destroyed regions 905 // 906 // thread: thread data structure corresponding to implicit task 907 void __kmp_free_implicit_task(kmp_info_t *thread) { 908 kmp_taskdata_t *task = thread->th.th_current_task; 909 if (task->td_dephash) 910 __kmp_dephash_free(thread, task->td_dephash); 911 task->td_dephash = NULL; 912 } 913 914 // Round up a size to a power of two specified by val: Used to insert padding 915 // between structures co-allocated using a single malloc() call 916 static size_t __kmp_round_up_to_val(size_t size, size_t val) { 917 if (size & (val - 1)) { 918 size &= ~(val - 1); 919 if (size <= KMP_SIZE_T_MAX - val) { 920 size += val; // Round up if there is no overflow. 921 }; // if 922 }; // if 923 return size; 924 } // __kmp_round_up_to_va 925 926 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task 927 // 928 // loc_ref: source location information 929 // gtid: global thread number. 930 // flags: include tiedness & task type (explicit vs. implicit) of the ''new'' 931 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine. 932 // sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including 933 // private vars accessed in task. 934 // sizeof_shareds: Size in bytes of array of pointers to shared vars accessed 935 // in task. 936 // task_entry: Pointer to task code entry point generated by compiler. 937 // returns: a pointer to the allocated kmp_task_t structure (task). 
kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                             kmp_tasking_flags_t *flags,
                             size_t sizeof_kmp_task_t, size_t sizeof_shareds,
                             kmp_routine_entry_t task_entry) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_team_t *team = thread->th.th_team;
  kmp_taskdata_t *parent_task = thread->th.th_current_task;
  size_t shareds_offset;

  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
                sizeof_shareds, task_entry));

  // A task nested inside a final task inherits final-ness.
  if (parent_task->td_flags.final) {
    if (flags->merged_if0) {
      // NOTE(review): intentionally empty branch — dead placeholder kept from
      // upstream; consider removing if it never grows a body.
    }
    flags->final = 1;
  }

#if OMP_45_ENABLED
  if (flags->proxy == TASK_PROXY) {
    // Proxy tasks are forced untied and merged-if0.
    flags->tiedness = TASK_UNTIED;
    flags->merged_if0 = 1;

    /* are we running in a sequential parallel or tskm_immediate_exec... we need
       tasking support enabled */
    if ((thread->th.th_task_team) == NULL) {
      /* This should only happen if the team is serialized
          setup a task team and propagate it to the thread */
      KMP_DEBUG_ASSERT(team->t.t_serialized);
      KA_TRACE(30,
               ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
                gtid));
      __kmp_task_team_setup(
          thread, team,
          1); // 1 indicates setup the current team regardless of nthreads
      thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
    }
    kmp_task_team_t *task_team = thread->th.th_task_team;

    /* tasking must be enabled now as the task might not be pushed */
    if (!KMP_TASKING_ENABLED(task_team)) {
      KA_TRACE(
          30,
          ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
      __kmp_enable_tasking(task_team, thread);
      kmp_int32 tid = thread->th.th_info.ds.ds_tid;
      kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
      // No lock needed since only owner can allocate
      if (thread_data->td.td_deque == NULL) {
        __kmp_alloc_task_deque(thread, thread_data);
      }
    }

    if (task_team->tt.tt_found_proxy_tasks == FALSE)
      TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
  }
#endif

  // Calculate shared structure offset including padding after kmp_task_t struct
  // to align pointers in shared struct
  // Single-block layout: [kmp_taskdata_t][kmp_task_t + privates][pad][shareds]
  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));

  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
                shareds_offset));
  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
                sizeof_shareds));

  // Avoid double allocation here by combining shareds with taskdata
#if USE_FAST_MEMORY
  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
                                                               sizeof_shareds);
#else /* ! USE_FAST_MEMORY */
  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
                                                               sizeof_shareds);
#endif /* USE_FAST_MEMORY */
  ANNOTATE_HAPPENS_AFTER(taskdata);

  task = KMP_TASKDATA_TO_TASK(taskdata);

// Make sure task & taskdata are aligned appropriately
#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
#else
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
#endif
  if (sizeof_shareds > 0) {
    // Avoid double allocation here by combining shareds with taskdata
    task->shareds = &((char *)taskdata)[shareds_offset];
    // Make sure shareds struct is aligned to pointer size
    KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
                     0);
  } else {
    task->shareds = NULL;
  }
  task->routine = task_entry;
  task->part_id = 0; // AC: Always start with 0 part id

  taskdata->td_task_id = KMP_GEN_TASK_ID();
  taskdata->td_team = team;
  taskdata->td_alloc_thread = thread;
  taskdata->td_parent = parent_task;
  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
  taskdata->td_untied_count = 0;
  taskdata->td_ident = loc_ref;
  taskdata->td_taskwait_ident = NULL;
  taskdata->td_taskwait_counter = 0;
  taskdata->td_taskwait_thread = 0;
  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
#if OMP_45_ENABLED
  // avoid copying icvs for proxy tasks
  if (flags->proxy == TASK_FULL)
#endif
    copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);

  taskdata->td_flags.tiedness = flags->tiedness;
  taskdata->td_flags.final = flags->final;
  taskdata->td_flags.merged_if0 = flags->merged_if0;
#if OMP_40_ENABLED
  taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
#endif // OMP_40_ENABLED
#if OMP_45_ENABLED
  taskdata->td_flags.proxy = flags->proxy;
  taskdata->td_task_team = thread->th.th_task_team;
  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
#endif
  taskdata->td_flags.tasktype = TASK_EXPLICIT;

  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);

  // GEH - TODO: fix this to copy parent task's value of team_serial flag
  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  // GEH - Note we serialize the task if the team is serialized to make sure
  // implicit parallel region tasks are not left until program termination to
  // execute. Also, it helps locality to execute immediately.

  taskdata->td_flags.task_serial =
      (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
       taskdata->td_flags.tasking_ser);

  taskdata->td_flags.started = 0;
  taskdata->td_flags.executing = 0;
  taskdata->td_flags.complete = 0;
  taskdata->td_flags.freed = 0;

  taskdata->td_flags.native = flags->native;

  taskdata->td_incomplete_child_tasks = 0;
  taskdata->td_allocated_child_tasks = 1; // start at one because counts current
  // task and children
#if OMP_40_ENABLED
  taskdata->td_taskgroup =
      parent_task->td_taskgroup; // task inherits taskgroup from the parent task
  taskdata->td_dephash = NULL;
  taskdata->td_depnode = NULL;
#endif

// Only need to keep track of child task counts if team parallel and tasking not
// serialized or if it is a proxy task
#if OMP_45_ENABLED
  if (flags->proxy == TASK_PROXY ||
      !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
#else
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
#endif
  {
    KMP_TEST_THEN_INC32((kmp_int32 *)(&parent_task->td_incomplete_child_tasks));
#if OMP_40_ENABLED
    if (parent_task->td_taskgroup)
      KMP_TEST_THEN_INC32((kmp_int32 *)(&parent_task->td_taskgroup->count));
#endif
    // Only need to keep track of allocated child tasks for explicit tasks since
    // implicit not deallocated
    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
      KMP_TEST_THEN_INC32(
          (kmp_int32 *)(&taskdata->td_parent->td_allocated_child_tasks));
    }
  }

  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
                gtid, taskdata, taskdata->td_parent));
  ANNOTATE_HAPPENS_BEFORE(task);

#if OMPT_SUPPORT
  __kmp_task_init_ompt(taskdata, gtid, (void *)task_entry);
#endif

  return task;
}

kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref,
                                  kmp_int32 gtid,
                                  kmp_int32 flags, size_t sizeof_kmp_task_t,
                                  size_t sizeof_shareds,
                                  kmp_routine_entry_t task_entry) {
  kmp_task_t *retval;
  // Reinterpret the packed int32 flags as the tasking flags bitfield.
  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;

  input_flags->native = FALSE;
  // __kmp_task_alloc() sets up all other runtime flags

#if OMP_45_ENABLED
  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
                input_flags->proxy ? "proxy" : "", sizeof_kmp_task_t,
                sizeof_shareds, task_entry));
#else
  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
                sizeof_kmp_task_t, sizeof_shareds, task_entry));
#endif

  retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
                            sizeof_shareds, task_entry);

  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));

  return retval;
}

// __kmp_invoke_task: invoke the specified task
//
// gtid: global thread ID of caller
// task: the task to invoke
// current_task: the task to resume after task invokation
static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  // cur_time is set and read only under the same forkjoin_frames_mode==3 guard
  kmp_uint64 cur_time;
#if OMP_40_ENABLED
  int discard = 0 /* false */;
#endif
  KA_TRACE(
      30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
           gtid, taskdata, current_task));
  KMP_DEBUG_ASSERT(task);
#if OMP_45_ENABLED
  if (taskdata->td_flags.proxy == TASK_PROXY &&
      taskdata->td_flags.complete == 1) {
    // This is a proxy task that was already completed but it needs to run
    // its bottom-half finish
    KA_TRACE(
        30,
        ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
         gtid, taskdata));

    __kmp_bottom_half_finish_proxy(gtid, task);

    KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
                  "proxy task %p, resuming task %p\n",
                  gtid, taskdata, current_task));

    return;
  }
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  if (__kmp_forkjoin_frames_mode == 3) {
    // Get the current time stamp to measure task execution time to correct
    // barrier imbalance time
    cur_time = __itt_get_timestamp();
  }
#endif

#if OMP_45_ENABLED
  // Proxy tasks are not handled by the runtime
  if (taskdata->td_flags.proxy != TASK_PROXY) {
#endif
    ANNOTATE_HAPPENS_AFTER(task);
    __kmp_task_start(gtid, task, current_task);
#if OMP_45_ENABLED
  }
#endif

#if OMPT_SUPPORT
  ompt_thread_info_t oldInfo;
  kmp_info_t *thread;
  if (ompt_enabled) {
    // Store the threads states and restore them after the task
    thread = __kmp_threads[gtid];
    oldInfo = thread->th.ompt_thread_info;
    thread->th.ompt_thread_info.wait_id = 0;
    thread->th.ompt_thread_info.state = ompt_state_work_parallel;
    taskdata->ompt_task_info.frame.exit_runtime_frame =
        __builtin_frame_address(0);
  }
#endif

#if OMP_40_ENABLED
  // TODO: cancel tasks if the parallel region has also been cancelled
  // TODO: check if this sequence can be hoisted above __kmp_task_start
  // if cancellation has been enabled for this run ...
  if (__kmp_omp_cancellation) {
    kmp_info_t *this_thr = __kmp_threads[gtid];
    kmp_team_t *this_team = this_thr->th.th_team;
    kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
    if ((taskgroup && taskgroup->cancel_request) ||
        (this_team->t.t_cancel_request == cancel_parallel)) {
      KMP_COUNT_BLOCK(TASK_cancelled);
      // this task belongs to a task group and we need to cancel it
      discard = 1 /* true */;
    }
  }

  // Invoke the task routine and pass in relevant data.
  // Thunks generated by gcc take a different argument list.
  // NOTE: the brace opened by this `if` is closed inside a matching
  // `#if OMP_40_ENABLED` block further down — keep the preprocessor
  // structure intact when editing.
  if (!discard) {
#if KMP_STATS_ENABLED
    KMP_COUNT_BLOCK(TASK_executed);
    // Attribute task execution time to the context it was stolen/run from.
    switch (KMP_GET_THREAD_STATE()) {
    case FORK_JOIN_BARRIER:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
      break;
    case PLAIN_BARRIER:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
      break;
    case TASKYIELD:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
      break;
    case TASKWAIT:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
      break;
    case TASKGROUP:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
      break;
    default:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
      break;
    }
#endif // KMP_STATS_ENABLED
#endif // OMP_40_ENABLED

#if OMPT_SUPPORT && OMPT_TRACE
    /* let OMPT know that we're about to run this task */
    if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_switch)) {
      ompt_callbacks.ompt_callback(ompt_event_task_switch)(
          current_task->ompt_task_info.task_id,
          taskdata->ompt_task_info.task_id);
    }
#endif

#ifdef KMP_GOMP_COMPAT
    if (taskdata->td_flags.native) {
      // GOMP-style thunk: takes only the shareds pointer, no gtid.
      ((void (*)(void *))(*(task->routine)))(task->shareds);
    } else
#endif /* KMP_GOMP_COMPAT */
    {
      (*(task->routine))(gtid, task);
    }
    KMP_POP_PARTITIONED_TIMER();

#if OMPT_SUPPORT && OMPT_TRACE
    /* let OMPT know that we're returning to the callee task */
    if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_switch)) {
      ompt_callbacks.ompt_callback(ompt_event_task_switch)(
          taskdata->ompt_task_info.task_id,
          current_task->ompt_task_info.task_id);
    }
#endif

#if OMP_40_ENABLED
  }
#endif // OMP_40_ENABLED

#if OMPT_SUPPORT
  if (ompt_enabled) {
    thread->th.ompt_thread_info = oldInfo;
    taskdata->ompt_task_info.frame.exit_runtime_frame = NULL;
  }
#endif

#if OMP_45_ENABLED
  // Proxy tasks are not handled by the runtime
  if (taskdata->td_flags.proxy != TASK_PROXY) {
#endif
    ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent);
    __kmp_task_finish(gtid, task, current_task);
#if OMP_45_ENABLED
  }
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - correct arrive time after the task finished
  if (__kmp_forkjoin_frames_mode == 3) {
    kmp_info_t *this_thr = __kmp_threads[gtid];
    if (this_thr->th.th_bar_arrive_time) {
      this_thr->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
    }
  }
#endif
  KA_TRACE(
      30,
      ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
       gtid, taskdata, current_task));
  return;
}

// __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
//
// loc_ref: location of original task pragma (ignored)
// gtid: Global Thread ID of encountering thread
// new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
// Returns:
//    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
//    be resumed later.
//    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
//    resumed later.
1355 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid, 1356 kmp_task_t *new_task) { 1357 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1358 1359 KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid, 1360 loc_ref, new_taskdata)); 1361 1362 /* Should we execute the new task or queue it? For now, let's just always try 1363 to queue it. If the queue fills up, then we'll execute it. */ 1364 1365 if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer 1366 { // Execute this task immediately 1367 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 1368 new_taskdata->td_flags.task_serial = 1; 1369 __kmp_invoke_task(gtid, new_task, current_task); 1370 } 1371 1372 KA_TRACE( 1373 10, 1374 ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: " 1375 "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n", 1376 gtid, loc_ref, new_taskdata)); 1377 1378 ANNOTATE_HAPPENS_BEFORE(new_task); 1379 return TASK_CURRENT_NOT_QUEUED; 1380 } 1381 1382 // __kmp_omp_task: Schedule a non-thread-switchable task for execution 1383 // 1384 // gtid: Global Thread ID of encountering thread 1385 // new_task:non-thread-switchable task thunk allocated by __kmp_omp_task_alloc() 1386 // serialize_immediate: if TRUE then if the task is executed immediately its 1387 // execution will be serialized 1388 // Returns: 1389 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1390 // be resumed later. 1391 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1392 // resumed later. 
kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
                         bool serialize_immediate) {
  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);

#if OMPT_SUPPORT
  if (ompt_enabled) {
    // Record the caller's frame so tools can walk past the runtime.
    new_taskdata->ompt_task_info.frame.reenter_runtime_frame =
        __builtin_frame_address(1);
  }
#endif

/* Should we execute the new task or queue it? For now, let's just always try to
   queue it. If the queue fills up, then we'll execute it. */
// Proxy tasks are never pushed here; they are completed externally and only
// their bottom half runs through __kmp_invoke_task.
#if OMP_45_ENABLED
  if (new_taskdata->td_flags.proxy == TASK_PROXY ||
      __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
#else
  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
#endif
  { // Execute this task immediately
    kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
    if (serialize_immediate)
      new_taskdata->td_flags.task_serial = 1;
    __kmp_invoke_task(gtid, new_task, current_task);
  }

#if OMPT_SUPPORT
  if (ompt_enabled) {
    new_taskdata->ompt_task_info.frame.reenter_runtime_frame = NULL;
  }
#endif

  ANNOTATE_HAPPENS_BEFORE(new_task);
  return TASK_CURRENT_NOT_QUEUED;
}

// __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
// non-thread-switchable task from the parent thread only!
//
// loc_ref: location of original task pragma (ignored)
// gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmp_omp_task_alloc()
// Returns:
//    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
//    be resumed later.
//    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
//    resumed later.
1441 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid, 1442 kmp_task_t *new_task) { 1443 kmp_int32 res; 1444 KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK); 1445 1446 #if KMP_DEBUG 1447 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1448 #endif 1449 KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref, 1450 new_taskdata)); 1451 1452 res = __kmp_omp_task(gtid, new_task, true); 1453 1454 KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning " 1455 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n", 1456 gtid, loc_ref, new_taskdata)); 1457 return res; 1458 } 1459 1460 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are 1461 // complete 1462 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) { 1463 kmp_taskdata_t *taskdata; 1464 kmp_info_t *thread; 1465 int thread_finished = FALSE; 1466 KMP_SET_THREAD_STATE_BLOCK(TASKWAIT); 1467 1468 KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref)); 1469 1470 if (__kmp_tasking_mode != tskm_immediate_exec) { 1471 thread = __kmp_threads[gtid]; 1472 taskdata = thread->th.th_current_task; 1473 #if OMPT_SUPPORT && OMPT_TRACE 1474 ompt_task_id_t my_task_id; 1475 ompt_parallel_id_t my_parallel_id; 1476 1477 if (ompt_enabled) { 1478 kmp_team_t *team = thread->th.th_team; 1479 my_task_id = taskdata->ompt_task_info.task_id; 1480 my_parallel_id = team->t.ompt_team_info.parallel_id; 1481 1482 taskdata->ompt_task_info.frame.reenter_runtime_frame = 1483 __builtin_frame_address(1); 1484 if (ompt_callbacks.ompt_callback(ompt_event_taskwait_begin)) { 1485 ompt_callbacks.ompt_callback(ompt_event_taskwait_begin)(my_parallel_id, 1486 my_task_id); 1487 } 1488 } 1489 #endif 1490 1491 // Debugger: The taskwait is active. Store location and thread encountered the 1492 // taskwait. 1493 #if USE_ITT_BUILD 1494 // Note: These values are used by ITT events as well. 
1495 #endif /* USE_ITT_BUILD */ 1496 taskdata->td_taskwait_counter += 1; 1497 taskdata->td_taskwait_ident = loc_ref; 1498 taskdata->td_taskwait_thread = gtid + 1; 1499 1500 #if USE_ITT_BUILD 1501 void *itt_sync_obj = __kmp_itt_taskwait_object(gtid); 1502 if (itt_sync_obj != NULL) 1503 __kmp_itt_taskwait_starting(gtid, itt_sync_obj); 1504 #endif /* USE_ITT_BUILD */ 1505 1506 bool must_wait = 1507 !taskdata->td_flags.team_serial && !taskdata->td_flags.final; 1508 1509 #if OMP_45_ENABLED 1510 must_wait = must_wait || (thread->th.th_task_team != NULL && 1511 thread->th.th_task_team->tt.tt_found_proxy_tasks); 1512 #endif 1513 if (must_wait) { 1514 kmp_flag_32 flag(&(taskdata->td_incomplete_child_tasks), 0U); 1515 while (TCR_4(taskdata->td_incomplete_child_tasks) != 0) { 1516 flag.execute_tasks(thread, gtid, FALSE, 1517 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 1518 __kmp_task_stealing_constraint); 1519 } 1520 } 1521 #if USE_ITT_BUILD 1522 if (itt_sync_obj != NULL) 1523 __kmp_itt_taskwait_finished(gtid, itt_sync_obj); 1524 #endif /* USE_ITT_BUILD */ 1525 1526 // Debugger: The taskwait is completed. Location remains, but thread is 1527 // negated. 
1528 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; 1529 1530 #if OMPT_SUPPORT && OMPT_TRACE 1531 if (ompt_enabled) { 1532 if (ompt_callbacks.ompt_callback(ompt_event_taskwait_end)) { 1533 ompt_callbacks.ompt_callback(ompt_event_taskwait_end)(my_parallel_id, 1534 my_task_id); 1535 } 1536 taskdata->ompt_task_info.frame.reenter_runtime_frame = NULL; 1537 } 1538 #endif 1539 ANNOTATE_HAPPENS_AFTER(taskdata); 1540 } 1541 1542 KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, " 1543 "returning TASK_CURRENT_NOT_QUEUED\n", 1544 gtid, taskdata)); 1545 1546 return TASK_CURRENT_NOT_QUEUED; 1547 } 1548 1549 // __kmpc_omp_taskyield: switch to a different task 1550 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) { 1551 kmp_taskdata_t *taskdata; 1552 kmp_info_t *thread; 1553 int thread_finished = FALSE; 1554 1555 KMP_COUNT_BLOCK(OMP_TASKYIELD); 1556 KMP_SET_THREAD_STATE_BLOCK(TASKYIELD); 1557 1558 KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n", 1559 gtid, loc_ref, end_part)); 1560 1561 if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) { 1562 thread = __kmp_threads[gtid]; 1563 taskdata = thread->th.th_current_task; 1564 // Should we model this as a task wait or not? 1565 // Debugger: The taskwait is active. Store location and thread encountered the 1566 // taskwait. 1567 #if USE_ITT_BUILD 1568 // Note: These values are used by ITT events as well. 
1569 #endif /* USE_ITT_BUILD */ 1570 taskdata->td_taskwait_counter += 1; 1571 taskdata->td_taskwait_ident = loc_ref; 1572 taskdata->td_taskwait_thread = gtid + 1; 1573 1574 #if USE_ITT_BUILD 1575 void *itt_sync_obj = __kmp_itt_taskwait_object(gtid); 1576 if (itt_sync_obj != NULL) 1577 __kmp_itt_taskwait_starting(gtid, itt_sync_obj); 1578 #endif /* USE_ITT_BUILD */ 1579 if (!taskdata->td_flags.team_serial) { 1580 kmp_task_team_t *task_team = thread->th.th_task_team; 1581 if (task_team != NULL) { 1582 if (KMP_TASKING_ENABLED(task_team)) { 1583 __kmp_execute_tasks_32( 1584 thread, gtid, NULL, FALSE, 1585 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 1586 __kmp_task_stealing_constraint); 1587 } 1588 } 1589 } 1590 #if USE_ITT_BUILD 1591 if (itt_sync_obj != NULL) 1592 __kmp_itt_taskwait_finished(gtid, itt_sync_obj); 1593 #endif /* USE_ITT_BUILD */ 1594 1595 // Debugger: The taskwait is completed. Location remains, but thread is 1596 // negated. 1597 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; 1598 } 1599 1600 KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, " 1601 "returning TASK_CURRENT_NOT_QUEUED\n", 1602 gtid, taskdata)); 1603 1604 return TASK_CURRENT_NOT_QUEUED; 1605 } 1606 1607 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work 1608 #if OMP_45_ENABLED 1609 // Task Reduction implementation 1610 1611 typedef struct kmp_task_red_flags { 1612 unsigned lazy_priv : 1; // hint: (1) use lazy allocation (big objects) 1613 unsigned reserved31 : 31; 1614 } kmp_task_red_flags_t; 1615 1616 // internal structure for reduction data item related info 1617 typedef struct kmp_task_red_data { 1618 void *reduce_shar; // shared reduction item 1619 size_t reduce_size; // size of data item 1620 void *reduce_priv; // thread specific data 1621 void *reduce_pend; // end of private data for comparison op 1622 void *reduce_init; // data initialization routine 1623 void *reduce_fini; // data finalization routine 1624 void 
      *reduce_comb; // data combiner routine
  kmp_task_red_flags_t flags; // flags for additional info from compiler
} kmp_task_red_data_t;

// structure sent us by compiler - one per reduction item
typedef struct kmp_task_red_input {
  void *reduce_shar; // shared reduction item
  size_t reduce_size; // size of data item
  void *reduce_init; // data initialization routine
  void *reduce_fini; // data finalization routine
  void *reduce_comb; // data combiner routine
  kmp_task_red_flags_t flags; // flags for additional info from compiler
} kmp_task_red_input_t;

/*!
@ingroup TASKING
@param gtid Global thread ID
@param num Number of data items to reduce
@param data Array of data for reduction
@return The taskgroup identifier

Initialize task reduction for the taskgroup.
*/
void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
  kmp_int32 nth = thread->th.th_team_nproc;
  kmp_task_red_input_t *input = (kmp_task_red_input_t *)data;
  kmp_task_red_data_t *arr;

  // check input data just in case
  KMP_ASSERT(tg != NULL);
  KMP_ASSERT(data != NULL);
  KMP_ASSERT(num > 0);
  if (nth == 1) {
    // Single-threaded team: no private copies needed, reduce in place.
    KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
                  gtid, tg));
    return (void *)tg;
  }
  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
                gtid, tg, num));
  arr = (kmp_task_red_data_t *)__kmp_thread_malloc(
      thread, num * sizeof(kmp_task_red_data_t));
  for (int i = 0; i < num; ++i) {
    void (*f_init)(void *) = (void (*)(void *))(input[i].reduce_init);
    size_t size = input[i].reduce_size - 1;
    // round the size up to cache line per thread-specific item
    size += CACHE_LINE - size % CACHE_LINE;
    KMP_ASSERT(input[i].reduce_comb != NULL); // combiner is mandatory
    arr[i].reduce_shar = input[i].reduce_shar;
    arr[i].reduce_size = size;
    arr[i].reduce_init = input[i].reduce_init;
    arr[i].reduce_fini = input[i].reduce_fini;
    arr[i].reduce_comb = input[i].reduce_comb;
    arr[i].flags = input[i].flags;
    if (!input[i].flags.lazy_priv) {
      // allocate cache-line aligned block and fill it with zeros
      arr[i].reduce_priv = __kmp_allocate(nth * size);
      arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
      if (f_init != NULL) {
        // initialize thread-specific items
        for (int j = 0; j < nth; ++j) {
          f_init((char *)(arr[i].reduce_priv) + j * size);
        }
      }
    } else {
      // only allocate space for pointers now,
      // objects will be lazily allocated/initialized once requested
      arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
    }
  }
  tg->reduce_data = (void *)arr;
  tg->reduce_num_data = num;
  return (void *)tg;
}

/*!
1701 @ingroup TASKING 1702 @param gtid Global thread ID 1703 @param tskgrp The taskgroup ID (optional) 1704 @param data Shared location of the item 1705 @return The pointer to per-thread data 1706 1707 Get thread-specific location of data item 1708 */ 1709 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) { 1710 kmp_info_t *thread = __kmp_threads[gtid]; 1711 kmp_int32 nth = thread->th.th_team_nproc; 1712 if (nth == 1) 1713 return data; // nothing to do 1714 1715 kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp; 1716 if (tg == NULL) 1717 tg = thread->th.th_current_task->td_taskgroup; 1718 KMP_ASSERT(tg != NULL); 1719 kmp_task_red_data_t *arr = (kmp_task_red_data_t *)(tg->reduce_data); 1720 kmp_int32 num = tg->reduce_num_data; 1721 kmp_int32 tid = thread->th.th_info.ds.ds_tid; 1722 1723 KMP_ASSERT(data != NULL); 1724 while (tg != NULL) { 1725 for (int i = 0; i < num; ++i) { 1726 if (!arr[i].flags.lazy_priv) { 1727 if (data == arr[i].reduce_shar || 1728 (data >= arr[i].reduce_priv && data < arr[i].reduce_pend)) 1729 return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size; 1730 } else { 1731 // check shared location first 1732 void **p_priv = (void **)(arr[i].reduce_priv); 1733 if (data == arr[i].reduce_shar) 1734 goto found; 1735 // check if we get some thread specific location as parameter 1736 for (int j = 0; j < nth; ++j) 1737 if (data == p_priv[j]) 1738 goto found; 1739 continue; // not found, continue search 1740 found: 1741 if (p_priv[tid] == NULL) { 1742 // allocate thread specific object lazily 1743 void (*f_init)(void *) = (void (*)(void *))(arr[i].reduce_init); 1744 p_priv[tid] = __kmp_allocate(arr[i].reduce_size); 1745 if (f_init != NULL) { 1746 f_init(p_priv[tid]); 1747 } 1748 } 1749 return p_priv[tid]; 1750 } 1751 } 1752 tg = tg->parent; 1753 arr = (kmp_task_red_data_t *)(tg->reduce_data); 1754 num = tg->reduce_num_data; 1755 } 1756 KMP_ASSERT2(0, "Unknown task reduction item"); 1757 return NULL; // ERROR, this line never 
executed 1758 } 1759 1760 // Finalize task reduction. 1761 // Called from __kmpc_end_taskgroup() 1762 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) { 1763 kmp_int32 nth = th->th.th_team_nproc; 1764 KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1 1765 kmp_task_red_data_t *arr = (kmp_task_red_data_t *)tg->reduce_data; 1766 kmp_int32 num = tg->reduce_num_data; 1767 for (int i = 0; i < num; ++i) { 1768 void *sh_data = arr[i].reduce_shar; 1769 void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini); 1770 void (*f_comb)(void *, void *) = 1771 (void (*)(void *, void *))(arr[i].reduce_comb); 1772 if (!arr[i].flags.lazy_priv) { 1773 void *pr_data = arr[i].reduce_priv; 1774 size_t size = arr[i].reduce_size; 1775 for (int j = 0; j < nth; ++j) { 1776 void *priv_data = (char *)pr_data + j * size; 1777 f_comb(sh_data, priv_data); // combine results 1778 if (f_fini) 1779 f_fini(priv_data); // finalize if needed 1780 } 1781 } else { 1782 void **pr_data = (void **)(arr[i].reduce_priv); 1783 for (int j = 0; j < nth; ++j) { 1784 if (pr_data[j] != NULL) { 1785 f_comb(sh_data, pr_data[j]); // combine results 1786 if (f_fini) 1787 f_fini(pr_data[j]); // finalize if needed 1788 __kmp_free(pr_data[j]); 1789 } 1790 } 1791 } 1792 __kmp_free(arr[i].reduce_priv); 1793 } 1794 __kmp_thread_free(th, arr); 1795 tg->reduce_data = NULL; 1796 tg->reduce_num_data = 0; 1797 } 1798 #endif 1799 1800 #if OMP_40_ENABLED 1801 // __kmpc_taskgroup: Start a new taskgroup 1802 void __kmpc_taskgroup(ident_t *loc, int gtid) { 1803 kmp_info_t *thread = __kmp_threads[gtid]; 1804 kmp_taskdata_t *taskdata = thread->th.th_current_task; 1805 kmp_taskgroup_t *tg_new = 1806 (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t)); 1807 KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new)); 1808 tg_new->count = 0; 1809 tg_new->cancel_request = cancel_noreq; 1810 tg_new->parent = taskdata->td_taskgroup; 1811 // TODO: change to 
// OMP_50_ENABLED, need to change build tools for this to work
#if OMP_45_ENABLED
  tg_new->reduce_data = NULL;
  tg_new->reduce_num_data = 0;
#endif
  taskdata->td_taskgroup = tg_new;
}

// __kmpc_end_taskgroup: Wait until all tasks generated by the current task
// and its descendants are complete
void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = thread->th.th_current_task;
  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
  int thread_finished = FALSE;

  KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
  KMP_DEBUG_ASSERT(taskgroup != NULL);
  KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);

  if (__kmp_tasking_mode != tskm_immediate_exec) {
#if USE_ITT_BUILD
    // For ITT the taskgroup wait is similar to taskwait until we need to
    // distinguish them
    void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
    if (itt_sync_obj != NULL)
      __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

#if OMP_45_ENABLED
    // With proxy tasks the count can still drop even for a serialized team,
    // so the wait loop must run in that case as well.
    if (!taskdata->td_flags.team_serial ||
        (thread->th.th_task_team != NULL &&
         thread->th.th_task_team->tt.tt_found_proxy_tasks))
#else
    if (!taskdata->td_flags.team_serial)
#endif
    {
      // Help execute other tasks until every task of this taskgroup
      // (and its descendants) has completed.
      kmp_flag_32 flag(&(taskgroup->count), 0U);
      while (TCR_4(taskgroup->count) != 0) {
        flag.execute_tasks(thread, gtid, FALSE,
                           &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
                           __kmp_task_stealing_constraint);
      }
    }

#if USE_ITT_BUILD
    if (itt_sync_obj != NULL)
      __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
  }
  KMP_DEBUG_ASSERT(taskgroup->count == 0);

// TODO: change to OMP_50_ENABLED, need to change build tools for this to work
#if OMP_45_ENABLED
  if (taskgroup->reduce_data != NULL) // need to reduce?
    __kmp_task_reduction_fini(thread, taskgroup);
#endif
  // Restore parent taskgroup for the current task
  taskdata->td_taskgroup = taskgroup->parent;
  __kmp_thread_free(thread, taskgroup);

  KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
                gtid, taskdata));
  ANNOTATE_HAPPENS_AFTER(taskdata);
}
#endif

// __kmp_remove_my_task: remove a task from my own deque
static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
                                        kmp_task_team_t *task_team,
                                        kmp_int32 is_constrained) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_thread_data_t *thread_data;
  kmp_uint32 tail;

  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
                   NULL); // Caller should check this condition

  thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];

  KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
                gtid, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));

  // Cheap check without taking the lock; if the deque looks empty, bail out.
  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
    KA_TRACE(10,
             ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
              "ntasks=%d head=%u tail=%u\n",
              gtid, thread_data->td.td_deque_ntasks,
              thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
    return NULL;
  }

  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);

  // Re-check under the lock; a thief may have emptied the deque meanwhile.
  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
    __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
    KA_TRACE(10,
             ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
              "ntasks=%d head=%u tail=%u\n",
              gtid, thread_data->td.td_deque_ntasks,
              thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
    return NULL;
  }

  // Pop from the tail (LIFO end) of the deque; thieves take from the head.
  tail = (thread_data->td.td_deque_tail - 1) &
         TASK_DEQUE_MASK(thread_data->td); // Wrap index.
  taskdata = thread_data->td.td_deque[tail];

  if (is_constrained && (taskdata->td_flags.tiedness == TASK_TIED)) {
    // we need to check if the candidate obeys task scheduling constraint:
    // only child of current task can be scheduled
    kmp_taskdata_t *current = thread->th.th_current_task;
    kmp_int32 level = current->td_level;
    kmp_taskdata_t *parent = taskdata->td_parent;
    while (parent != current && parent->td_level > level) {
      parent = parent->td_parent; // check generation up to the level of the
      // current task
      KMP_DEBUG_ASSERT(parent != NULL);
    }
    if (parent != current) {
      // If the tail task is not a child, then no other child can appear in the
      // deque.
      __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
      KA_TRACE(10,
               ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
                "ntasks=%d head=%u tail=%u\n",
                gtid, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
      return NULL;
    }
  }

  thread_data->td.td_deque_tail = tail;
  TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);

  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d task %p removed: "
                "ntasks=%d head=%u tail=%u\n",
                gtid, taskdata, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));

  task = KMP_TASKDATA_TO_TASK(taskdata);
  return task;
}

// __kmp_steal_task: remove a task from another thread's deque
// Assume that calling thread has already checked existence of
// task_team thread_data before calling this routine.
static kmp_task_t *
__kmp_steal_task(kmp_info_t *victim, kmp_int32 gtid, kmp_task_team_t *task_team,
                 volatile kmp_uint32 *unfinished_threads, int *thread_finished,
                 kmp_int32 is_constrained)
{
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_thread_data_t *victim_td, *threads_data;
  kmp_int32 victim_tid;

  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);

  threads_data = task_team->tt.tt_threads_data;
  KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition

  victim_tid = victim->th.th_info.ds.ds_tid;
  victim_td = &threads_data[victim_tid];

  KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
                "task_team=%p ntasks=%d "
                "head=%u tail=%u\n",
                gtid, __kmp_gtid_from_thread(victim), task_team,
                victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
                victim_td->td.td_deque_tail));

  // Cheap check without the lock: empty deque, or victim's task team has
  // already moved on (stale pointer).
  if ((TCR_4(victim_td->td.td_deque_ntasks) ==
       0) || // Caller should not check this condition
      (TCR_PTR(victim->th.th_task_team) !=
       task_team)) // GEH: why would this happen?
  {
    KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
                  "task_team=%p "
                  "ntasks=%d head=%u tail=%u\n",
                  gtid, __kmp_gtid_from_thread(victim), task_team,
                  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
                  victim_td->td.td_deque_tail));
    return NULL;
  }

  __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);

  // Check again after we acquire the lock
  if ((TCR_4(victim_td->td.td_deque_ntasks) == 0) ||
      (TCR_PTR(victim->th.th_task_team) !=
       task_team)) // GEH: why would this happen?
  {
    __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
    KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
                  "task_team=%p "
                  "ntasks=%d head=%u tail=%u\n",
                  gtid, __kmp_gtid_from_thread(victim), task_team,
                  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
                  victim_td->td.td_deque_tail));
    return NULL;
  }

  KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);

  // Steal from the head (FIFO end); the owner pops from the tail.
  taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
  if (is_constrained) {
    // we need to check if the candidate obeys task scheduling constraint:
    // only descendant of current task can be scheduled
    kmp_taskdata_t *current = __kmp_threads[gtid]->th.th_current_task;
    kmp_int32 level = current->td_level;
    kmp_taskdata_t *parent = taskdata->td_parent;
    while (parent != current && parent->td_level > level) {
      parent = parent->td_parent; // check generation up to the level of the
      // current task
      KMP_DEBUG_ASSERT(parent != NULL);
    }
    if (parent != current) {
      // If the head task is not a descendant of the current task then do not
      // steal it. No other task in victim's deque can be a descendant of the
      // current task.
      __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
      KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from "
                    "T#%d: task_team=%p "
                    "ntasks=%d head=%u tail=%u\n",
                    gtid,
                    __kmp_gtid_from_thread(threads_data[victim_tid].td.td_thr),
                    task_team, victim_td->td.td_deque_ntasks,
                    victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
      return NULL;
    }
  }
  // Bump head pointer and Wrap.
  victim_td->td.td_deque_head =
      (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
  if (*thread_finished) {
    // We need to un-mark this victim as a finished victim. This must be done
    // before releasing the lock, or else other threads (starting with the
    // master victim) might be prematurely released from the barrier!!!
    kmp_uint32 count;

    // KMP_TEST_THEN_INC32 returns the old value; new value is count + 1.
    count = KMP_TEST_THEN_INC32((kmp_int32 *)unfinished_threads);

    KA_TRACE(
        20,
        ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
         gtid, count + 1, task_team));

    *thread_finished = FALSE;
  }
  TCW_4(victim_td->td.td_deque_ntasks,
        TCR_4(victim_td->td.td_deque_ntasks) - 1);


  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);

  KMP_COUNT_BLOCK(TASK_stolen);
  KA_TRACE(
      10,
      ("__kmp_steal_task(exit #3): T#%d stole task %p from T#%d: task_team=%p "
       "ntasks=%d head=%u tail=%u\n",
       gtid, taskdata, __kmp_gtid_from_thread(victim), task_team,
       victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
       victim_td->td.td_deque_tail));

  task = KMP_TASKDATA_TO_TASK(taskdata);
  return task;
}


// __kmp_execute_tasks_template: Choose and execute tasks until either the
// condition is satisfied (return true) or there are none left (return false).
//
// final_spin is TRUE if this is the spin at the release barrier.
// thread_finished indicates whether the thread is finished executing all
// the tasks it has on its deque, and is at the release barrier.
// spinner is the location on which to spin.
// spinner == NULL means only execute a single task and return.
// checker is the value to check to terminate the spin.
template <class C>
static inline int __kmp_execute_tasks_template(
    kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  kmp_task_team_t *task_team = thread->th.th_task_team;
  kmp_thread_data_t *threads_data;
  kmp_task_t *task;
  kmp_info_t *other_thread;
  kmp_taskdata_t *current_task = thread->th.th_current_task;
  volatile kmp_uint32 *unfinished_threads;
  // victim == -2: no victim chosen yet; victim == -1: no last-stolen victim
  kmp_int32 nthreads, victim = -2, use_own_tasks = 1, new_victim = 0,
            tid = thread->th.th_info.ds.ds_tid;

  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);

  if (task_team == NULL)
    return FALSE;

  KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
                "*thread_finished=%d\n",
                gtid, final_spin, *thread_finished));

  thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
  KMP_DEBUG_ASSERT(threads_data != NULL);

  nthreads = task_team->tt.tt_nproc;
  unfinished_threads = &(task_team->tt.tt_unfinished_threads);
#if OMP_45_ENABLED
  KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks);
#else
  KMP_DEBUG_ASSERT(nthreads > 1);
#endif
  KMP_DEBUG_ASSERT((int)(TCR_4(*unfinished_threads)) >= 0);

  while (1) { // Outer loop keeps trying to find tasks in case of single thread
    // getting tasks from target constructs
    while (1) { // Inner loop to find a task and execute it
      task = NULL;
      if (use_own_tasks) { // check on own queue first
        task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
      }
      if ((task == NULL) && (nthreads > 1)) { // Steal a task
        int asleep = 1;
        use_own_tasks = 0;
        // Try to steal from the last place I stole from successfully.
        if (victim == -2) { // haven't stolen anything yet
          victim = threads_data[tid].td.td_deque_last_stolen;
          if (victim !=
              -1) // if we have a last stolen from victim, get the thread
            other_thread = threads_data[victim].td.td_thr;
        }
        if (victim != -1) { // found last victim
          asleep = 0;
        } else if (!new_victim) { // no recent steals and we haven't already
          // used a new victim; select a random thread
          do { // Find a different thread to steal work from.
            // Pick a random thread. Initial plan was to cycle through all the
            // threads, and only return if we tried to steal from every thread,
            // and failed. Arch says that's not such a great idea.
            victim = __kmp_get_random(thread) % (nthreads - 1);
            if (victim >= tid) {
              ++victim; // Adjusts random distribution to exclude self
            }
            // Found a potential victim
            other_thread = threads_data[victim].td.td_thr;
            // There is a slight chance that __kmp_enable_tasking() did not wake
            // up all threads waiting at the barrier. If victim is sleeping,
            // then wake it up. Since we were going to pay the cache miss
            // penalty for referencing another thread's kmp_info_t struct
            // anyway,
            // the check shouldn't cost too much performance at this point. In
            // extra barrier mode, tasks do not sleep at the separate tasking
            // barrier, so this isn't a problem.
            asleep = 0;
            if ((__kmp_tasking_mode == tskm_task_teams) &&
                (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
                (TCR_PTR(other_thread->th.th_sleep_loc) != NULL)) {
              asleep = 1;
              __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread),
                                        other_thread->th.th_sleep_loc);
              // A sleeping thread should not have any tasks on its queue.
              // There is a slight possibility that it resumes, steals a task
              // from another thread, which spawns more tasks, all in the time
              // that it takes this thread to check => don't write an assertion
              // that the victim's queue is empty. Try stealing from a
              // different thread.
            }
          } while (asleep);
        }

        if (!asleep) {
          // We have a victim to try to steal from
          task = __kmp_steal_task(other_thread, gtid, task_team,
                                  unfinished_threads, thread_finished,
                                  is_constrained);
        }
        if (task != NULL) { // set last stolen to victim
          if (threads_data[tid].td.td_deque_last_stolen != victim) {
            threads_data[tid].td.td_deque_last_stolen = victim;
            // The pre-refactored code did not try more than 1 successful new
            // victim, unless the last one generated more local tasks;
            // new_victim keeps track of this
            new_victim = 1;
          }
        } else { // No tasks found; unset last_stolen
          KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
          victim = -2; // no successful victim found
        }
      }

      if (task == NULL) // break out of tasking loop
        break;

// Found a task; execute it
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
        if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
          // get the object reliably
          itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
        }
        __kmp_itt_task_starting(itt_sync_obj);
      }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
      __kmp_invoke_task(gtid, task, current_task);
#if USE_ITT_BUILD
      if (itt_sync_obj != NULL)
        __kmp_itt_task_finished(itt_sync_obj);
#endif /* USE_ITT_BUILD */
      // If this thread is only partway through the barrier and the condition is
      // met, then return now, so that the barrier gather/release pattern can
      // proceed. If this thread is in the last spin loop in the barrier,
      // waiting to be released, we know that the termination condition will not
      // be satisfied, so don't waste any cycles checking it.
      if (flag == NULL || (!final_spin && flag->done_check())) {
        KA_TRACE(
            15,
            ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
             gtid));
        return TRUE;
      }
      if (thread->th.th_task_team == NULL) {
        break;
      }
      // Yield before executing next task
      KMP_YIELD(__kmp_library == library_throughput);
      // If execution of a stolen task results in more tasks being placed on our
      // run queue, reset use_own_tasks
      if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
        KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
                      "other tasks, restart\n",
                      gtid));
        use_own_tasks = 1;
        new_victim = 0;
      }
    }

// The task source has been exhausted. If in final spin loop of barrier, check
// if termination condition is satisfied.
#if OMP_45_ENABLED
    // The work queue may be empty but there might be proxy tasks still
    // executing
    if (final_spin && TCR_4(current_task->td_incomplete_child_tasks) == 0)
#else
    if (final_spin)
#endif
    {
      // First, decrement the #unfinished threads, if that has not already been
      // done. This decrement might be to the spin location, and result in the
      // termination condition being satisfied.
      if (!*thread_finished) {
        kmp_uint32 count;

        count = KMP_TEST_THEN_DEC32((kmp_int32 *)unfinished_threads) - 1;
        KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
                      "unfinished_threads to %d task_team=%p\n",
                      gtid, count, task_team));
        *thread_finished = TRUE;
      }

      // It is now unsafe to reference thread->th.th_team !!!
      // Decrementing task_team->tt.tt_unfinished_threads can allow the master
      // thread to pass through the barrier, where it might reset each thread's
      // th.th_team field for the next parallel region. If we can steal more
      // work, we know that this has not happened yet.
      if (flag != NULL && flag->done_check()) {
        KA_TRACE(
            15,
            ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
             gtid));
        return TRUE;
      }
    }

    // If this thread's task team is NULL, master has recognized that there are
    // no more tasks; bail out
    if (thread->th.th_task_team == NULL) {
      KA_TRACE(15,
               ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
      return FALSE;
    }

#if OMP_45_ENABLED
    // We could be getting tasks from target constructs; if this is the only
    // thread, keep trying to execute tasks from own queue
    if (nthreads == 1)
      use_own_tasks = 1;
    else
#endif
    {
      KA_TRACE(15,
               ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
      return FALSE;
    }
  }
}

// __kmp_execute_tasks_32/_64/_oncore: thin monomorphic wrappers that
// instantiate __kmp_execute_tasks_template for each flag type.
int __kmp_execute_tasks_32(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

int __kmp_execute_tasks_64(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

int __kmp_execute_tasks_oncore(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

// __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
// next barrier so they can assist in executing enqueued tasks.
// First thread in allocates the task team atomically.
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
                                 kmp_info_t *this_thr) {
  kmp_thread_data_t *threads_data;
  int nthreads, i, is_init_thread;

  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
                __kmp_gtid_from_thread(this_thr)));

  KMP_DEBUG_ASSERT(task_team != NULL);
  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);

  nthreads = task_team->tt.tt_nproc;
  KMP_DEBUG_ASSERT(nthreads > 0);
  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);

  // Allocate or increase the size of threads_data if necessary
  is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);

  if (!is_init_thread) {
    // Some other thread already set up the array.
    KA_TRACE(
        20,
        ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
         __kmp_gtid_from_thread(this_thr)));
    return;
  }
  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
  KMP_DEBUG_ASSERT(threads_data != NULL);

  if ((__kmp_tasking_mode == tskm_task_teams) &&
      (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
    // Release any threads sleeping at the barrier, so that they can steal
    // tasks and execute them. In extra barrier mode, tasks do not sleep
    // at the separate tasking barrier, so this isn't a problem.
    for (i = 0; i < nthreads; i++) {
      volatile void *sleep_loc;
      kmp_info_t *thread = threads_data[i].td.td_thr;

      if (i == this_thr->th.th_info.ds.ds_tid) {
        continue; // no need to wake ourselves
      }
      // Since we haven't locked the thread's suspend mutex lock at this
      // point, there is a small window where a thread might be putting
      // itself to sleep, but hasn't set the th_sleep_loc field yet.
      // To work around this, __kmp_execute_tasks_template() periodically checks
      // see if other threads are sleeping (using the same random mechanism that
      // is used for task stealing) and awakens them if they are.
      if ((sleep_loc = TCR_PTR(thread->th.th_sleep_loc)) != NULL) {
        KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
                      __kmp_gtid_from_thread(this_thr),
                      __kmp_gtid_from_thread(thread)));
        __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
      } else {
        KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
                      __kmp_gtid_from_thread(this_thr),
                      __kmp_gtid_from_thread(thread)));
      }
    }
  }

  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
                __kmp_gtid_from_thread(this_thr)));
}

/* // TODO: Check the comment consistency
 * Utility routines for "task teams". A task team (kmp_task_t) is kind of
 * like a shadow of the kmp_team_t data struct, with a different lifetime.
 * After a child thread checks into a barrier and calls __kmp_release() from
 * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
 * longer assume that the kmp_team_t structure is intact (at any moment, the
 * master thread may exit the barrier code and free the team data structure,
 * and return the threads to the thread pool).
 *
 * This does not work with the tasking code, as the thread is still
 * expected to participate in the execution of any tasks that may have been
 * spawned by a member of the team, and the thread still needs access
 * to each thread in the team, so that it can steal work from it.
 *
 * Enter the existence of the kmp_task_team_t struct. It employs a reference
 * counting mechanism, and is allocated by the master thread before calling
 * __kmp_<barrier_kind>_release, and then is released by the last thread to
 * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
 * of the kmp_task_team_t structs for consecutive barriers can overlap
 * (and will, unless the master thread is the last thread to exit the barrier
 * release phase, which is not typical).
 *
 * The existence of such a struct is useful outside the context of tasking,
 * but for now, I'm trying to keep it specific to the OMP_30_ENABLED macro,
 * so that any performance differences show up when comparing the 2.5 vs. 3.0
 * libraries.
 *
 * We currently use the existence of the threads array as an indicator that
 * tasks were spawned since the last barrier. If the structure is to be
 * useful outside the context of tasking, then this will have to change, but
 * not setting the field minimizes the performance impact of tasking on
 * barriers, when no explicit tasks were spawned (pushed, actually).
 */

static kmp_task_team_t *__kmp_free_task_teams =
    NULL; // Free list for task_team data structures
// Lock for task team data structures
static kmp_bootstrap_lock_t __kmp_task_team_lock =
    KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);

// __kmp_alloc_task_deque:
// Allocates a task deque for a particular thread, and initialize the necessary
// data structures relating to the deque.
// This only happens once per thread
// per task team since task teams are recycled. No lock is needed during
// allocation since each thread allocates its own deque.
static void __kmp_alloc_task_deque(kmp_info_t *thread,
                                   kmp_thread_data_t *thread_data) {
  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
  KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);

  // Initialize last stolen task field to "none"
  thread_data->td.td_deque_last_stolen = -1;

  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
  KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
  KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);

  KE_TRACE(
      10,
      ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
       __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
  // Allocate space for task deque, and zero the deque
  // Cannot use __kmp_thread_calloc() because threads not around for
  // kmp_reap_task_team( ).
  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
      INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
}

// __kmp_realloc_task_deque:
// Re-allocates a task deque for a particular thread, copies the content from
// the old deque and adjusts the necessary data structures relating to the
// deque. This operation must be done with the deque_lock being held
static void __kmp_realloc_task_deque(kmp_info_t *thread,
                                     kmp_thread_data_t *thread_data) {
  kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
  kmp_int32 new_size = 2 * size; // grow by doubling (size stays a power of 2)

  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
                "%d] for thread_data %p\n",
                __kmp_gtid_from_thread(thread), size, new_size, thread_data));

  kmp_taskdata_t **new_deque =
      (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));

  // Copy the entries in circular order from head onward, so the new deque
  // starts at index 0 with the same head->tail ordering.
  int i, j;
  for (i = thread_data->td.td_deque_head, j = 0; j < size;
       i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
    new_deque[j] = thread_data->td.td_deque[i];

  __kmp_free(thread_data->td.td_deque);

  thread_data->td.td_deque_head = 0;
  thread_data->td.td_deque_tail = size;
  thread_data->td.td_deque = new_deque;
  thread_data->td.td_deque_size = new_size;
}

// __kmp_free_task_deque:
// Deallocates a task deque for a particular thread. Happens at library
// deallocation so don't need to reset all thread data fields.
2509 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) { 2510 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 2511 2512 if (thread_data->td.td_deque != NULL) { 2513 TCW_4(thread_data->td.td_deque_ntasks, 0); 2514 __kmp_free(thread_data->td.td_deque); 2515 thread_data->td.td_deque = NULL; 2516 } 2517 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2518 2519 #ifdef BUILD_TIED_TASK_STACK 2520 // GEH: Figure out what to do here for td_susp_tied_tasks 2521 if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) { 2522 __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data); 2523 } 2524 #endif // BUILD_TIED_TASK_STACK 2525 } 2526 2527 // __kmp_realloc_task_threads_data: 2528 // Allocates a threads_data array for a task team, either by allocating an 2529 // initial array or enlarging an existing array. Only the first thread to get 2530 // the lock allocs or enlarges the array and re-initializes the array eleemnts. 2531 // That thread returns "TRUE", the rest return "FALSE". 2532 // Assumes that the new array size is given by task_team -> tt.tt_nproc. 2533 // The current size is given by task_team -> tt.tt_max_threads. 2534 static int __kmp_realloc_task_threads_data(kmp_info_t *thread, 2535 kmp_task_team_t *task_team) { 2536 kmp_thread_data_t **threads_data_p; 2537 kmp_int32 nthreads, maxthreads; 2538 int is_init_thread = FALSE; 2539 2540 if (TCR_4(task_team->tt.tt_found_tasks)) { 2541 // Already reallocated and initialized. 2542 return FALSE; 2543 } 2544 2545 threads_data_p = &task_team->tt.tt_threads_data; 2546 nthreads = task_team->tt.tt_nproc; 2547 maxthreads = task_team->tt.tt_max_threads; 2548 2549 // All threads must lock when they encounter the first task of the implicit 2550 // task region to make sure threads_data fields are (re)initialized before 2551 // used. 
2552 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock); 2553 2554 if (!TCR_4(task_team->tt.tt_found_tasks)) { 2555 // first thread to enable tasking 2556 kmp_team_t *team = thread->th.th_team; 2557 int i; 2558 2559 is_init_thread = TRUE; 2560 if (maxthreads < nthreads) { 2561 2562 if (*threads_data_p != NULL) { 2563 kmp_thread_data_t *old_data = *threads_data_p; 2564 kmp_thread_data_t *new_data = NULL; 2565 2566 KE_TRACE( 2567 10, 2568 ("__kmp_realloc_task_threads_data: T#%d reallocating " 2569 "threads data for task_team %p, new_size = %d, old_size = %d\n", 2570 __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads)); 2571 // Reallocate threads_data to have more elements than current array 2572 // Cannot use __kmp_thread_realloc() because threads not around for 2573 // kmp_reap_task_team( ). Note all new array entries are initialized 2574 // to zero by __kmp_allocate(). 2575 new_data = (kmp_thread_data_t *)__kmp_allocate( 2576 nthreads * sizeof(kmp_thread_data_t)); 2577 // copy old data to new data 2578 KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t), 2579 (void *)old_data, maxthreads * sizeof(kmp_taskdata_t *)); 2580 2581 #ifdef BUILD_TIED_TASK_STACK 2582 // GEH: Figure out if this is the right thing to do 2583 for (i = maxthreads; i < nthreads; i++) { 2584 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 2585 __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data); 2586 } 2587 #endif // BUILD_TIED_TASK_STACK 2588 // Install the new data and free the old data 2589 (*threads_data_p) = new_data; 2590 __kmp_free(old_data); 2591 } else { 2592 KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating " 2593 "threads data for task_team %p, size = %d\n", 2594 __kmp_gtid_from_thread(thread), task_team, nthreads)); 2595 // Make the initial allocate for threads_data array, and zero entries 2596 // Cannot use __kmp_thread_calloc() because threads not around for 2597 // kmp_reap_task_team( ). 
2598 ANNOTATE_IGNORE_WRITES_BEGIN(); 2599 *threads_data_p = (kmp_thread_data_t *)__kmp_allocate( 2600 nthreads * sizeof(kmp_thread_data_t)); 2601 ANNOTATE_IGNORE_WRITES_END(); 2602 #ifdef BUILD_TIED_TASK_STACK 2603 // GEH: Figure out if this is the right thing to do 2604 for (i = 0; i < nthreads; i++) { 2605 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 2606 __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data); 2607 } 2608 #endif // BUILD_TIED_TASK_STACK 2609 } 2610 task_team->tt.tt_max_threads = nthreads; 2611 } else { 2612 // If array has (more than) enough elements, go ahead and use it 2613 KMP_DEBUG_ASSERT(*threads_data_p != NULL); 2614 } 2615 2616 // initialize threads_data pointers back to thread_info structures 2617 for (i = 0; i < nthreads; i++) { 2618 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 2619 thread_data->td.td_thr = team->t.t_threads[i]; 2620 2621 if (thread_data->td.td_deque_last_stolen >= nthreads) { 2622 // The last stolen field survives across teams / barrier, and the number 2623 // of threads may have changed. It's possible (likely?) that a new 2624 // parallel region will exhibit the same behavior as previous region. 2625 thread_data->td.td_deque_last_stolen = -1; 2626 } 2627 } 2628 2629 KMP_MB(); 2630 TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE); 2631 } 2632 2633 __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock); 2634 return is_init_thread; 2635 } 2636 2637 // __kmp_free_task_threads_data: 2638 // Deallocates a threads_data array for a task team, including any attached 2639 // tasking deques. Only occurs at library shutdown. 
2640 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) { 2641 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock); 2642 if (task_team->tt.tt_threads_data != NULL) { 2643 int i; 2644 for (i = 0; i < task_team->tt.tt_max_threads; i++) { 2645 __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]); 2646 } 2647 __kmp_free(task_team->tt.tt_threads_data); 2648 task_team->tt.tt_threads_data = NULL; 2649 } 2650 __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock); 2651 } 2652 2653 // __kmp_allocate_task_team: 2654 // Allocates a task team associated with a specific team, taking it from 2655 // the global task team free list if possible. Also initializes data 2656 // structures. 2657 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread, 2658 kmp_team_t *team) { 2659 kmp_task_team_t *task_team = NULL; 2660 int nthreads; 2661 2662 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n", 2663 (thread ? __kmp_gtid_from_thread(thread) : -1), team)); 2664 2665 if (TCR_PTR(__kmp_free_task_teams) != NULL) { 2666 // Take a task team from the task team pool 2667 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 2668 if (__kmp_free_task_teams != NULL) { 2669 task_team = __kmp_free_task_teams; 2670 TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next); 2671 task_team->tt.tt_next = NULL; 2672 } 2673 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 2674 } 2675 2676 if (task_team == NULL) { 2677 KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating " 2678 "task team for team %p\n", 2679 __kmp_gtid_from_thread(thread), team)); 2680 // Allocate a new task team if one is not available. 2681 // Cannot use __kmp_thread_malloc() because threads not around for 2682 // kmp_reap_task_team( ). 
2683 task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t)); 2684 __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock); 2685 // AC: __kmp_allocate zeroes returned memory 2686 // task_team -> tt.tt_threads_data = NULL; 2687 // task_team -> tt.tt_max_threads = 0; 2688 // task_team -> tt.tt_next = NULL; 2689 } 2690 2691 TCW_4(task_team->tt.tt_found_tasks, FALSE); 2692 #if OMP_45_ENABLED 2693 TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); 2694 #endif 2695 task_team->tt.tt_nproc = nthreads = team->t.t_nproc; 2696 2697 TCW_4(task_team->tt.tt_unfinished_threads, nthreads); 2698 TCW_4(task_team->tt.tt_active, TRUE); 2699 2700 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p " 2701 "unfinished_threads init'd to %d\n", 2702 (thread ? __kmp_gtid_from_thread(thread) : -1), task_team, 2703 task_team->tt.tt_unfinished_threads)); 2704 return task_team; 2705 } 2706 2707 // __kmp_free_task_team: 2708 // Frees the task team associated with a specific thread, and adds it 2709 // to the global task team free list. 2710 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) { 2711 KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n", 2712 thread ? __kmp_gtid_from_thread(thread) : -1, task_team)); 2713 2714 // Put task team back on free list 2715 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 2716 2717 KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL); 2718 task_team->tt.tt_next = __kmp_free_task_teams; 2719 TCW_PTR(__kmp_free_task_teams, task_team); 2720 2721 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 2722 } 2723 2724 // __kmp_reap_task_teams: 2725 // Free all the task teams on the task team free list. 2726 // Should only be done during library shutdown. 2727 // Cannot do anything that needs a thread structure or gtid since they are 2728 // already gone. 
void __kmp_reap_task_teams(void) {
  kmp_task_team_t *task_team;

  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
    // Free all task_teams on the free list
    __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
    while ((task_team = __kmp_free_task_teams) != NULL) {
      __kmp_free_task_teams = task_team->tt.tt_next;
      task_team->tt.tt_next = NULL;

      // Free threads_data if necessary
      if (task_team->tt.tt_threads_data != NULL) {
        __kmp_free_task_threads_data(task_team);
      }
      __kmp_free(task_team);
    }
    __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
  }
}

// __kmp_wait_to_unref_task_teams:
// Some threads could still be in the fork barrier release code, possibly
// trying to steal tasks. Wait for each thread to unreference its task team.
void __kmp_wait_to_unref_task_teams(void) {
  kmp_info_t *thread;
  kmp_uint32 spins;
  int done;

  KMP_INIT_YIELD(spins);

  // Loop until every thread in the pool has dropped its th_task_team pointer.
  for (;;) {
    done = TRUE;

    // TODO: GEH - this may be wrong because some sync would be necessary
    // in case threads are added to the pool during the traversal. Need to
    // verify that lock for thread pool is held when calling this routine.
    for (thread = (kmp_info_t *)__kmp_thread_pool; thread != NULL;
         thread = thread->th.th_next_pool) {
#if KMP_OS_WINDOWS
      DWORD exit_val;
#endif
      if (TCR_PTR(thread->th.th_task_team) == NULL) {
        KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
                      __kmp_gtid_from_thread(thread)));
        continue;
      }
#if KMP_OS_WINDOWS
      // TODO: GEH - add this check for Linux* OS / OS X* as well?
      // A dead thread can never unreference its task team; clear it for it.
      if (!__kmp_is_thread_alive(thread, &exit_val)) {
        thread->th.th_task_team = NULL;
        continue;
      }
#endif

      done = FALSE; // Because th_task_team pointer is not NULL for this thread

      KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
                    "unreference task_team\n",
                    __kmp_gtid_from_thread(thread)));

      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
        volatile void *sleep_loc;
        // If the thread is sleeping, awaken it.
        if ((sleep_loc = TCR_PTR(thread->th.th_sleep_loc)) != NULL) {
          KA_TRACE(
              10,
              ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
               __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
          __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
        }
      }
    }
    if (done) {
      break;
    }

    // If we are oversubscribed, or have waited a bit (and library mode is
    // throughput), yield. Pause is in the following code.
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins); // Yields only if KMP_LIBRARY=throughput
  }
}

// __kmp_task_team_setup: Create a task_team for the current team, but use
// an already created, unused one if it already exists.
void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);

  // If this task_team hasn't been created yet, allocate it. It will be used in
  // the region after the next.
  // If it exists, it is the current task team and shouldn't be touched yet as
  // it may still be in use.
  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
      (always || team->t.t_nproc > 1)) {
    team->t.t_task_team[this_thr->th.th_task_state] =
        __kmp_allocate_task_team(this_thr, team);
    KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p "
                  "for team %d at parity=%d\n",
                  __kmp_gtid_from_thread(this_thr),
                  team->t.t_task_team[this_thr->th.th_task_state],
                  ((team != NULL) ? team->t.t_id : -1),
                  this_thr->th.th_task_state));
  }

  // After threads exit the release, they will call sync, and then point to this
  // other task_team; make sure it is allocated and properly initialized. As
  // threads spin in the barrier release phase, they will continue to use the
  // previous task_team struct(above), until they receive the signal to stop
  // checking for tasks (they can't safely reference the kmp_team_t struct,
  // which could be reallocated by the master thread). No task teams are formed
  // for serialized teams.
  if (team->t.t_nproc > 1) {
    // Task teams alternate between two parity slots; prepare the one that will
    // become current after the next barrier.
    int other_team = 1 - this_thr->th.th_task_state;
    if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
      team->t.t_task_team[other_team] =
          __kmp_allocate_task_team(this_thr, team);
      KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new "
                    "task_team %p for team %d at parity=%d\n",
                    __kmp_gtid_from_thread(this_thr),
                    team->t.t_task_team[other_team],
                    ((team != NULL) ? team->t.t_id : -1), other_team));
    } else { // Leave the old task team struct in place for the upcoming region;
      // adjust as needed
      kmp_task_team_t *task_team = team->t.t_task_team[other_team];
      if (!task_team->tt.tt_active ||
          team->t.t_nproc != task_team->tt.tt_nproc) {
        // Reactivate / resize the recycled task team for the new region.
        TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
        TCW_4(task_team->tt.tt_found_tasks, FALSE);
#if OMP_45_ENABLED
        TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
#endif
        TCW_4(task_team->tt.tt_unfinished_threads, team->t.t_nproc);
        TCW_4(task_team->tt.tt_active, TRUE);
      }
      // if team size has changed, the first thread to enable tasking will
      // realloc threads_data if necessary
      KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team "
                    "%p for team %d at parity=%d\n",
                    __kmp_gtid_from_thread(this_thr),
                    team->t.t_task_team[other_team],
                    ((team != NULL) ? team->t.t_id : -1), other_team));
    }
  }
}

// __kmp_task_team_sync: Propagation of task team data from team to threads
// which happens just after the release phase of a team barrier. This may be
// called by any thread, but only for teams with # threads > 1.
void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);

  // Toggle the th_task_state field, to switch which task_team this thread
  // refers to
  this_thr->th.th_task_state = 1 - this_thr->th.th_task_state;
  // It is now safe to propagate the task team pointer from the team struct to
  // the current thread.
  TCW_PTR(this_thr->th.th_task_team,
          team->t.t_task_team[this_thr->th.th_task_state]);
  KA_TRACE(20,
           ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
            "%p from Team #%d (parity=%d)\n",
            __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
            ((team != NULL) ?
                team->t.t_id : -1), this_thr->th.th_task_state));
}

// __kmp_task_team_wait: Master thread waits for outstanding tasks after the
// barrier gather phase. Only called by master thread if #threads in team > 1 or
// if proxy tasks were created.
//
// wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
// by passing in 0 optionally as the last argument. When wait is zero, master
// thread does not wait for unfinished_threads to reach 0.
void __kmp_task_team_wait(
    kmp_info_t *this_thr,
    kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
  kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];

  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);

  if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
    if (wait) {
      KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks "
                    "(for unfinished_threads to reach 0) on task_team = %p\n",
                    __kmp_gtid_from_thread(this_thr), task_team));
      // Worker threads may have dropped through to release phase, but could
      // still be executing tasks. Wait here for tasks to complete. To avoid
      // memory contention, only master thread checks termination condition.
      kmp_flag_32 flag(&task_team->tt.tt_unfinished_threads, 0U);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    // Deactivate the old task team, so that the worker threads will stop
    // referencing it while spinning.
    KA_TRACE(
        20,
        ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: "
         "setting active to false, setting local and team's pointer to NULL\n",
         __kmp_gtid_from_thread(this_thr), task_team));
#if OMP_45_ENABLED
    KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
                     task_team->tt.tt_found_proxy_tasks == TRUE);
    TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
#else
    KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1);
#endif
    TCW_SYNC_4(task_team->tt.tt_active, FALSE);
    KMP_MB(); // ensure the deactivation is visible before dropping the pointer

    TCW_PTR(this_thr->th.th_task_team, NULL);
  }
}

// __kmp_tasking_barrier:
// This routine may only be called when __kmp_tasking_mode == tskm_extra_barrier.
// Internal function to execute all tasks prior to a regular barrier or a join
// barrier. It is a full barrier itself, which unfortunately turns regular
// barriers into double barriers and join barriers into 1 1/2 barriers.
void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
  volatile kmp_uint32 *spin =
      &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads;
  int flag = FALSE;
  KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);

#if USE_ITT_BUILD
  KMP_FSYNC_SPIN_INIT(spin, (kmp_uint32 *)NULL);
#endif /* USE_ITT_BUILD */
  // Execute queued tasks until the unfinished-threads counter drains to zero.
  kmp_flag_32 spin_flag(spin, 0U);
  while (!spin_flag.execute_tasks(thread, gtid, TRUE,
                                  &flag USE_ITT_BUILD_ARG(NULL), 0)) {
#if USE_ITT_BUILD
    // TODO: What about itt_sync_obj??
    KMP_FSYNC_SPIN_PREPARE(spin);
#endif /* USE_ITT_BUILD */

    if (TCR_4(__kmp_global.g.g_done)) {
      if (__kmp_global.g.g_abort)
        __kmp_abort_thread();
      break;
    }
    KMP_YIELD(TRUE); // GH: We always yield here
  }
#if USE_ITT_BUILD
  KMP_FSYNC_SPIN_ACQUIRED((void *)spin);
#endif /* USE_ITT_BUILD */
}

#if OMP_45_ENABLED

// __kmp_give_task puts a task into a given thread queue if:
//  - the queue for that thread was created
//  - there's space in that queue
// Because of this, __kmp_push_task needs to check if there's space after
// getting the lock
static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
                            kmp_int32 pass) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_task_team_t *task_team = taskdata->td_task_team;

  KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
                taskdata, tid));

  // If task_team is NULL something went really bad...
  KMP_DEBUG_ASSERT(task_team != NULL);

  bool result = false;
  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];

  if (thread_data->td.td_deque == NULL) {
    // There's no queue in this thread, go find another one
    // We're guaranteed that at least one thread has a queue
    KA_TRACE(30,
             ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
              tid, taskdata));
    return result;
  }

  // First check the occupancy without the lock; 'pass' raises the size
  // threshold each time the caller has swept all threads, so small deques are
  // skipped early but eventually any deque is grown to accept the task.
  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    KA_TRACE(
        30,
        ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
         taskdata, tid));

    // if this deque is bigger than the pass ratio give a chance to another
    // thread
    if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
      return result;

    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
    __kmp_realloc_task_deque(thread, thread_data);

  } else {

    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);

    // Re-check under the lock: the deque may have filled up meanwhile.
    if (TCR_4(thread_data->td.td_deque_ntasks) >=
        TASK_DEQUE_SIZE(thread_data->td)) {
      KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
                    "thread %d.\n",
                    taskdata, tid));

      // if this deque is bigger than the pass ratio give a chance to another
      // thread
      if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
        goto release_and_exit;

      __kmp_realloc_task_deque(thread, thread_data);
    }
  }

  // lock is held here, and there is space in the deque

  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
  // Wrap index.
  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1);

  result = true;
  KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
                taskdata, tid));

release_and_exit:
  // Every path that reaches this label holds td_deque_lock.
  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  return result;
}

/* The finish of the proxy tasks is divided in two pieces:
   - the top half is the one that can be done from a thread outside the team
   - the bottom half must be run from a thread within the team

   In order to run the bottom half the task gets queued back into one of the
   threads of the team. Once the td_incomplete_child_task counter of the parent
   is decremented the threads can leave the barriers. So, the bottom half needs
   to be queued before the counter is decremented. The top half is therefore
   divided in two parts:
   - things that can be run before queuing the bottom half
   - things that must be run after queuing the bottom half

   This creates a second race as the bottom half can free the task before the
   second top half is executed. To avoid this we use the
   td_incomplete_child_task of the proxy task to synchronize the top and bottom
   half.
*/
// First part of the top half: safe to run before the bottom half is queued.
static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  taskdata->td_flags.complete = 1; // mark the task as completed

  if (taskdata->td_taskgroup)
    KMP_TEST_THEN_DEC32((kmp_int32 *)(&taskdata->td_taskgroup->count));

  // Create an imaginary children for this task so the bottom half cannot
  // release the task before we have completed the second top half
  TCI_4(taskdata->td_incomplete_child_tasks);
}

// Second part of the top half: must run only after the bottom half is queued.
static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
  kmp_int32 children = 0;

  // Predecrement simulated by "- 1" calculation
  // ('children' is only consumed by the debug assert below.)
  children =
      KMP_TEST_THEN_DEC32(
          (kmp_int32 *)(&taskdata->td_parent->td_incomplete_child_tasks)) -
      1;
  KMP_DEBUG_ASSERT(children >= 0);

  // Remove the imaginary children
  TCD_4(taskdata->td_incomplete_child_tasks);
}

// Bottom half: runs on a thread inside the team; releases dependences and
// frees the proxy task once both top halves are done.
static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
  kmp_info_t *thread = __kmp_threads[gtid];

  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
                   1); // top half must run before bottom half

  // We need to wait to make sure the top half is finished
  // Spinning here should be ok as this should happen quickly
  while (TCR_4(taskdata->td_incomplete_child_tasks) > 0)
    ;

  __kmp_release_deps(gtid, taskdata);
  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
}

/*!
@ingroup TASKING
@param gtid Global Thread ID of encountering thread
@param ptask Task whose execution is completed

Execute the completion of a proxy task from a thread that is part of the
team. Run first and bottom halves directly.
*/
void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
  KMP_DEBUG_ASSERT(ptask != NULL);
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
  KA_TRACE(
      10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
           gtid, taskdata));

  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);

  // In-team completion needs no re-queueing: run all three phases directly,
  // in order (mark complete, decrement parent counts, release deps and free).
  __kmp_first_top_half_finish_proxy(taskdata);
  __kmp_second_top_half_finish_proxy(taskdata);
  __kmp_bottom_half_finish_proxy(gtid, ptask);

  KA_TRACE(10,
           ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
            gtid, taskdata));
}

/*!
@ingroup TASKING
@param ptask Task whose execution is completed

Execute the completion of a proxy task from a thread that could not belong to
the team.
3153 */ 3154 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) { 3155 KMP_DEBUG_ASSERT(ptask != NULL); 3156 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 3157 3158 KA_TRACE( 3159 10, 3160 ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n", 3161 taskdata)); 3162 3163 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); 3164 3165 __kmp_first_top_half_finish_proxy(taskdata); 3166 3167 // Enqueue task to complete bottom half completion from a thread within the 3168 // corresponding team 3169 kmp_team_t *team = taskdata->td_team; 3170 kmp_int32 nthreads = team->t.t_nproc; 3171 kmp_info_t *thread; 3172 3173 // This should be similar to start_k = __kmp_get_random( thread ) % nthreads 3174 // but we cannot use __kmp_get_random here 3175 kmp_int32 start_k = 0; 3176 kmp_int32 pass = 1; 3177 kmp_int32 k = start_k; 3178 3179 do { 3180 // For now we're just linearly trying to find a thread 3181 thread = team->t.t_threads[k]; 3182 k = (k + 1) % nthreads; 3183 3184 // we did a full pass through all the threads 3185 if (k == start_k) 3186 pass = pass << 1; 3187 3188 } while (!__kmp_give_task(thread, k, ptask, pass)); 3189 3190 __kmp_second_top_half_finish_proxy(taskdata); 3191 3192 KA_TRACE( 3193 10, 3194 ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n", 3195 taskdata)); 3196 } 3197 3198 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task 3199 // for taskloop 3200 // 3201 // thread: allocating thread 3202 // task_src: pointer to source task to be duplicated 3203 // returns: a pointer to the allocated kmp_task_t structure (task). 
kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_taskdata_t *taskdata_src;
  kmp_taskdata_t *parent_task = thread->th.th_current_task;
  size_t shareds_offset;
  size_t task_size;

  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
                task_src));
  taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
                   TASK_FULL); // it should not be proxy task
  KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
  task_size = taskdata_src->td_size_alloc;

  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
  KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
                task_size));
#if USE_FAST_MEMORY
  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
#else
  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
#endif /* USE_FAST_MEMORY */
  // Bitwise copy of the whole source allocation (taskdata + task + privates).
  KMP_MEMCPY(taskdata, taskdata_src, task_size);

  task = KMP_TASKDATA_TO_TASK(taskdata);

  // Initialize new task (only specific fields not affected by memcpy)
  taskdata->td_task_id = KMP_GEN_TASK_ID();
  if (task->shareds != NULL) { // need setup shareds pointer
    // Shareds live inside the same allocation; recompute the pointer relative
    // to the new block using the offset observed in the source task.
    shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
    task->shareds = &((char *)taskdata)[shareds_offset];
    KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
                     0);
  }
  taskdata->td_alloc_thread = thread;
  taskdata->td_taskgroup =
      parent_task
          ->td_taskgroup; // task inherits the taskgroup from the parent task

  // Only need to keep track of child task counts if team parallel and tasking
  // not serialized
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
    KMP_TEST_THEN_INC32((kmp_int32 *)(&parent_task->td_incomplete_child_tasks));
    if (parent_task->td_taskgroup)
      KMP_TEST_THEN_INC32((kmp_int32 *)(&parent_task->td_taskgroup->count));
    // Only need to keep track of allocated child tasks for explicit tasks since
    // implicit not deallocated
    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
      KMP_TEST_THEN_INC32(
          (kmp_int32 *)(&taskdata->td_parent->td_allocated_child_tasks));
  }

  KA_TRACE(20,
           ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
            thread, taskdata, taskdata->td_parent));
#if OMPT_SUPPORT
  __kmp_task_init_ompt(taskdata, thread->th.th_info.ds.ds_gtid,
                       (void *)task->routine);
#endif
  return task;
}

// Routine optionally generated by the compiler for setting the lastprivate flag
// and calling needed constructors for private/firstprivate objects
// (used to form taskloop tasks from pattern task)
typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);

// __kmp_taskloop_linear: Start tasks of the taskloop linearly
//
// loc        Source location information
// gtid       Global thread ID
// task       Task with whole loop iteration range
// lb         Pointer to loop lower bound
// ub         Pointer to loop upper bound
// st         Loop stride
// sched      Schedule specified 0/1/2 for none/grainsize/num_tasks
// grainsize  Schedule value if specified
// task_dup   Tasks duplication routine
void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
                           kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                           int sched, kmp_uint64 grainsize, void *task_dup) {
  KMP_COUNT_BLOCK(OMP_TASKLOOP);
  KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
  kmp_uint64 tc;
  kmp_uint64 lower = *lb; // compiler provides global bounds here
  kmp_uint64 upper = *ub;
  kmp_uint64 i, num_tasks = 0, extras = 0;
  kmp_info_t *thread = __kmp_threads[gtid];
kmp_taskdata_t *current_task = thread->th.th_current_task; 3296 kmp_task_t *next_task; 3297 kmp_int32 lastpriv = 0; 3298 size_t lower_offset = 3299 (char *)lb - (char *)task; // remember offset of lb in the task structure 3300 size_t upper_offset = 3301 (char *)ub - (char *)task; // remember offset of ub in the task structure 3302 3303 // compute trip count 3304 if (st == 1) { // most common case 3305 tc = upper - lower + 1; 3306 } else if (st < 0) { 3307 tc = (lower - upper) / (-st) + 1; 3308 } else { // st > 0 3309 tc = (upper - lower) / st + 1; 3310 } 3311 if (tc == 0) { 3312 KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d zero-trip loop\n", gtid)); 3313 // free the pattern task and exit 3314 __kmp_task_start(gtid, task, current_task); 3315 // do not execute anything for zero-trip loop 3316 __kmp_task_finish(gtid, task, current_task); 3317 return; 3318 } 3319 3320 // compute num_tasks/grainsize based on the input provided 3321 switch (sched) { 3322 case 0: // no schedule clause specified, we can choose the default 3323 // let's try to schedule (team_size*10) tasks 3324 grainsize = thread->th.th_team_nproc * 10; 3325 case 2: // num_tasks provided 3326 if (grainsize > tc) { 3327 num_tasks = tc; // too big num_tasks requested, adjust values 3328 grainsize = 1; 3329 extras = 0; 3330 } else { 3331 num_tasks = grainsize; 3332 grainsize = tc / num_tasks; 3333 extras = tc % num_tasks; 3334 } 3335 break; 3336 case 1: // grainsize provided 3337 if (grainsize > tc) { 3338 num_tasks = 1; // too big grainsize requested, adjust values 3339 grainsize = tc; 3340 extras = 0; 3341 } else { 3342 num_tasks = tc / grainsize; 3343 grainsize = 3344 tc / 3345 num_tasks; // adjust grainsize for balanced distribution of iterations 3346 extras = tc % num_tasks; 3347 } 3348 break; 3349 default: 3350 KMP_ASSERT2(0, "unknown scheduling of taskloop"); 3351 } 3352 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras); 3353 KMP_DEBUG_ASSERT(num_tasks > extras); 3354 KMP_DEBUG_ASSERT(num_tasks > 0); 
3355 KA_TRACE(20, ("__kmpc_taskloop: T#%d will launch: num_tasks %lld, grainsize " 3356 "%lld, extras %lld\n", 3357 gtid, num_tasks, grainsize, extras)); 3358 3359 // Main loop, launch num_tasks tasks, assign grainsize iterations each task 3360 for (i = 0; i < num_tasks; ++i) { 3361 kmp_uint64 chunk_minus_1; 3362 if (extras == 0) { 3363 chunk_minus_1 = grainsize - 1; 3364 } else { 3365 chunk_minus_1 = grainsize; 3366 --extras; // first extras iterations get bigger chunk (grainsize+1) 3367 } 3368 upper = lower + st * chunk_minus_1; 3369 if (i == num_tasks - 1) { 3370 // schedule the last task, set lastprivate flag 3371 lastpriv = 1; 3372 #if KMP_DEBUG 3373 if (st == 1) 3374 KMP_DEBUG_ASSERT(upper == *ub); 3375 else if (st > 0) 3376 KMP_DEBUG_ASSERT(upper + st > *ub); 3377 else 3378 KMP_DEBUG_ASSERT(upper + st < *ub); 3379 #endif 3380 } 3381 next_task = __kmp_task_dup_alloc(thread, task); // allocate new task 3382 *(kmp_uint64 *)((char *)next_task + lower_offset) = 3383 lower; // adjust task-specific bounds 3384 *(kmp_uint64 *)((char *)next_task + upper_offset) = upper; 3385 if (ptask_dup != NULL) 3386 ptask_dup(next_task, task, 3387 lastpriv); // set lastprivate flag, construct fistprivates, etc. 3388 KA_TRACE(20, ("__kmpc_taskloop: T#%d schedule task %p: lower %lld, upper " 3389 "%lld (offsets %p %p)\n", 3390 gtid, next_task, lower, upper, lower_offset, upper_offset)); 3391 __kmp_omp_task(gtid, next_task, true); // schedule new task 3392 lower = upper + st; // adjust lower bound for the next iteration 3393 } 3394 // free the pattern task and exit 3395 __kmp_task_start(gtid, task, current_task); 3396 // do not execute the pattern task, just do bookkeeping 3397 __kmp_task_finish(gtid, task, current_task); 3398 } 3399 3400 /*! 
3401 @ingroup TASKING 3402 @param loc Source location information 3403 @param gtid Global thread ID 3404 @param task Task structure 3405 @param if_val Value of the if clause 3406 @param lb Pointer to loop lower bound 3407 @param ub Pointer to loop upper bound 3408 @param st Loop stride 3409 @param nogroup Flag, 1 if nogroup clause specified, 0 otherwise 3410 @param sched Schedule specified 0/1/2 for none/grainsize/num_tasks 3411 @param grainsize Schedule value if specified 3412 @param task_dup Tasks duplication routine 3413 3414 Execute the taskloop construct. 3415 */ 3416 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, 3417 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, 3418 int sched, kmp_uint64 grainsize, void *task_dup) { 3419 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 3420 KMP_DEBUG_ASSERT(task != NULL); 3421 3422 KA_TRACE(10, ("__kmpc_taskloop(enter): T#%d, pattern task %p, lb %lld ub " 3423 "%lld st %lld, grain %llu(%d)\n", 3424 gtid, taskdata, *lb, *ub, st, grainsize, sched)); 3425 3426 // check if clause value first 3427 if (if_val == 0) { // if(0) specified, mark task as serial 3428 taskdata->td_flags.task_serial = 1; 3429 taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied 3430 } 3431 if (nogroup == 0) { 3432 __kmpc_taskgroup(loc, gtid); 3433 } 3434 3435 if (1 /* AC: use some heuristic here to choose task scheduling method */) { 3436 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, sched, grainsize, 3437 task_dup); 3438 } 3439 3440 if (nogroup == 0) { 3441 __kmpc_end_taskgroup(loc, gtid); 3442 } 3443 KA_TRACE(10, ("__kmpc_taskloop(exit): T#%d\n", gtid)); 3444 } 3445 3446 #endif 3447