1 /* 2 * kmp_tasking.cpp -- OpenMP 3.0 tasking support. 3 */ 4 5 6 //===----------------------------------------------------------------------===// 7 // 8 // The LLVM Compiler Infrastructure 9 // 10 // This file is dual licensed under the MIT and the University of Illinois Open 11 // Source Licenses. See LICENSE.txt for details. 12 // 13 //===----------------------------------------------------------------------===// 14 15 16 #include "kmp.h" 17 #include "kmp_i18n.h" 18 #include "kmp_itt.h" 19 #include "kmp_stats.h" 20 #include "kmp_wait_release.h" 21 22 #if OMPT_SUPPORT 23 #include "ompt-specific.h" 24 #endif 25 26 #include "tsan_annotations.h" 27 28 /* forward declaration */ 29 static void __kmp_enable_tasking(kmp_task_team_t *task_team, 30 kmp_info_t *this_thr); 31 static void __kmp_alloc_task_deque(kmp_info_t *thread, 32 kmp_thread_data_t *thread_data); 33 static int __kmp_realloc_task_threads_data(kmp_info_t *thread, 34 kmp_task_team_t *task_team); 35 36 #ifdef OMP_45_ENABLED 37 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask); 38 #endif 39 40 #ifdef BUILD_TIED_TASK_STACK 41 42 // __kmp_trace_task_stack: print the tied tasks from the task stack in order 43 // from top do bottom 44 // 45 // gtid: global thread identifier for thread containing stack 46 // thread_data: thread data for task team thread containing stack 47 // threshold: value above which the trace statement triggers 48 // location: string identifying call site of this function (for trace) 49 static void __kmp_trace_task_stack(kmp_int32 gtid, 50 kmp_thread_data_t *thread_data, 51 int threshold, char *location) { 52 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 53 kmp_taskdata_t **stack_top = task_stack->ts_top; 54 kmp_int32 entries = task_stack->ts_entries; 55 kmp_taskdata_t *tied_task; 56 57 KA_TRACE( 58 threshold, 59 ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, " 60 "first_block = %p, stack_top = %p \n", 61 location, 
gtid, entries, task_stack->ts_first_block, stack_top)); 62 63 KMP_DEBUG_ASSERT(stack_top != NULL); 64 KMP_DEBUG_ASSERT(entries > 0); 65 66 while (entries != 0) { 67 KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]); 68 // fix up ts_top if we need to pop from previous block 69 if (entries & TASK_STACK_INDEX_MASK == 0) { 70 kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top); 71 72 stack_block = stack_block->sb_prev; 73 stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE]; 74 } 75 76 // finish bookkeeping 77 stack_top--; 78 entries--; 79 80 tied_task = *stack_top; 81 82 KMP_DEBUG_ASSERT(tied_task != NULL); 83 KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED); 84 85 KA_TRACE(threshold, 86 ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, " 87 "stack_top=%p, tied_task=%p\n", 88 location, gtid, entries, stack_top, tied_task)); 89 } 90 KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]); 91 92 KA_TRACE(threshold, 93 ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n", 94 location, gtid)); 95 } 96 97 // __kmp_init_task_stack: initialize the task stack for the first time 98 // after a thread_data structure is created. 99 // It should not be necessary to do this again (assuming the stack works). 
100 // 101 // gtid: global thread identifier of calling thread 102 // thread_data: thread data for task team thread containing stack 103 static void __kmp_init_task_stack(kmp_int32 gtid, 104 kmp_thread_data_t *thread_data) { 105 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 106 kmp_stack_block_t *first_block; 107 108 // set up the first block of the stack 109 first_block = &task_stack->ts_first_block; 110 task_stack->ts_top = (kmp_taskdata_t **)first_block; 111 memset((void *)first_block, '\0', 112 TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *)); 113 114 // initialize the stack to be empty 115 task_stack->ts_entries = TASK_STACK_EMPTY; 116 first_block->sb_next = NULL; 117 first_block->sb_prev = NULL; 118 } 119 120 // __kmp_free_task_stack: free the task stack when thread_data is destroyed. 121 // 122 // gtid: global thread identifier for calling thread 123 // thread_data: thread info for thread containing stack 124 static void __kmp_free_task_stack(kmp_int32 gtid, 125 kmp_thread_data_t *thread_data) { 126 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 127 kmp_stack_block_t *stack_block = &task_stack->ts_first_block; 128 129 KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY); 130 // free from the second block of the stack 131 while (stack_block != NULL) { 132 kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL; 133 134 stack_block->sb_next = NULL; 135 stack_block->sb_prev = NULL; 136 if (stack_block != &task_stack->ts_first_block) { 137 __kmp_thread_free(thread, 138 stack_block); // free the block, if not the first 139 } 140 stack_block = next_block; 141 } 142 // initialize the stack to be empty 143 task_stack->ts_entries = 0; 144 task_stack->ts_top = NULL; 145 } 146 147 // __kmp_push_task_stack: Push the tied task onto the task stack. 148 // Grow the stack if necessary by allocating another block. 
149 // 150 // gtid: global thread identifier for calling thread 151 // thread: thread info for thread containing stack 152 // tied_task: the task to push on the stack 153 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread, 154 kmp_taskdata_t *tied_task) { 155 // GEH - need to consider what to do if tt_threads_data not allocated yet 156 kmp_thread_data_t *thread_data = 157 &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)]; 158 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 159 160 if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) { 161 return; // Don't push anything on stack if team or team tasks are serialized 162 } 163 164 KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED); 165 KMP_DEBUG_ASSERT(task_stack->ts_top != NULL); 166 167 KA_TRACE(20, 168 ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n", 169 gtid, thread, tied_task)); 170 // Store entry 171 *(task_stack->ts_top) = tied_task; 172 173 // Do bookkeeping for next push 174 task_stack->ts_top++; 175 task_stack->ts_entries++; 176 177 if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) { 178 // Find beginning of this task block 179 kmp_stack_block_t *stack_block = 180 (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE); 181 182 // Check if we already have a block 183 if (stack_block->sb_next != 184 NULL) { // reset ts_top to beginning of next block 185 task_stack->ts_top = &stack_block->sb_next->sb_block[0]; 186 } else { // Alloc new block and link it up 187 kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc( 188 thread, sizeof(kmp_stack_block_t)); 189 190 task_stack->ts_top = &new_block->sb_block[0]; 191 stack_block->sb_next = new_block; 192 new_block->sb_prev = stack_block; 193 new_block->sb_next = NULL; 194 195 KA_TRACE( 196 30, 197 ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n", 198 gtid, tied_task, new_block)); 199 } 200 } 201 KA_TRACE(20, 
("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid, 202 tied_task)); 203 } 204 205 // __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return 206 // the task, just check to make sure it matches the ending task passed in. 207 // 208 // gtid: global thread identifier for the calling thread 209 // thread: thread info structure containing stack 210 // tied_task: the task popped off the stack 211 // ending_task: the task that is ending (should match popped task) 212 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread, 213 kmp_taskdata_t *ending_task) { 214 // GEH - need to consider what to do if tt_threads_data not allocated yet 215 kmp_thread_data_t *thread_data = 216 &thread->th.th_task_team->tt_threads_data[__kmp_tid_from_gtid(gtid)]; 217 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 218 kmp_taskdata_t *tied_task; 219 220 if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) { 221 // Don't pop anything from stack if team or team tasks are serialized 222 return; 223 } 224 225 KMP_DEBUG_ASSERT(task_stack->ts_top != NULL); 226 KMP_DEBUG_ASSERT(task_stack->ts_entries > 0); 227 228 KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid, 229 thread)); 230 231 // fix up ts_top if we need to pop from previous block 232 if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) { 233 kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top); 234 235 stack_block = stack_block->sb_prev; 236 task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE]; 237 } 238 239 // finish bookkeeping 240 task_stack->ts_top--; 241 task_stack->ts_entries--; 242 243 tied_task = *(task_stack->ts_top); 244 245 KMP_DEBUG_ASSERT(tied_task != NULL); 246 KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED); 247 KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly 248 249 KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid, 
250 tied_task)); 251 return; 252 } 253 #endif /* BUILD_TIED_TASK_STACK */ 254 255 // __kmp_push_task: Add a task to the thread's deque 256 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) { 257 kmp_info_t *thread = __kmp_threads[gtid]; 258 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 259 kmp_task_team_t *task_team = thread->th.th_task_team; 260 kmp_int32 tid = __kmp_tid_from_gtid(gtid); 261 kmp_thread_data_t *thread_data; 262 263 KA_TRACE(20, 264 ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata)); 265 266 if (taskdata->td_flags.tiedness == TASK_UNTIED) { 267 // untied task needs to increment counter so that the task structure is not 268 // freed prematurely 269 kmp_int32 counter = 1 + KMP_TEST_THEN_INC32(&taskdata->td_untied_count); 270 KA_TRACE( 271 20, 272 ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n", 273 gtid, counter, taskdata)); 274 } 275 276 // The first check avoids building task_team thread data if serialized 277 if (taskdata->td_flags.task_serial) { 278 KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning " 279 "TASK_NOT_PUSHED for task %p\n", 280 gtid, taskdata)); 281 return TASK_NOT_PUSHED; 282 } 283 284 // Now that serialized tasks have returned, we can assume that we are not in 285 // immediate exec mode 286 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 287 if (!KMP_TASKING_ENABLED(task_team)) { 288 __kmp_enable_tasking(task_team, thread); 289 } 290 KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE); 291 KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL); 292 293 // Find tasking deque specific to encountering thread 294 thread_data = &task_team->tt.tt_threads_data[tid]; 295 296 // No lock needed since only owner can allocate 297 if (thread_data->td.td_deque == NULL) { 298 __kmp_alloc_task_deque(thread, thread_data); 299 } 300 301 // Check if deque is full 302 if (TCR_4(thread_data->td.td_deque_ntasks) >= 303 
TASK_DEQUE_SIZE(thread_data->td)) { 304 KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning " 305 "TASK_NOT_PUSHED for task %p\n", 306 gtid, taskdata)); 307 return TASK_NOT_PUSHED; 308 } 309 310 // Lock the deque for the task push operation 311 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 312 313 #if OMP_45_ENABLED 314 // Need to recheck as we can get a proxy task from a thread outside of OpenMP 315 if (TCR_4(thread_data->td.td_deque_ntasks) >= 316 TASK_DEQUE_SIZE(thread_data->td)) { 317 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 318 KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; returning " 319 "TASK_NOT_PUSHED for task %p\n", 320 gtid, taskdata)); 321 return TASK_NOT_PUSHED; 322 } 323 #else 324 // Must have room since no thread can add tasks but calling thread 325 KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) < 326 TASK_DEQUE_SIZE(thread_data->td)); 327 #endif 328 329 thread_data->td.td_deque[thread_data->td.td_deque_tail] = 330 taskdata; // Push taskdata 331 // Wrap index. 332 thread_data->td.td_deque_tail = 333 (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td); 334 TCW_4(thread_data->td.td_deque_ntasks, 335 TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count 336 337 KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: " 338 "task=%p ntasks=%d head=%u tail=%u\n", 339 gtid, taskdata, thread_data->td.td_deque_ntasks, 340 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 341 342 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 343 344 return TASK_SUCCESSFULLY_PUSHED; 345 } 346 347 // __kmp_pop_current_task_from_thread: set up current task from called thread 348 // when team ends 349 // 350 // this_thr: thread structure to set current_task in. 
void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));

  // Walk one level up the task parent chain: the parent becomes current.
  this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;

  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));
}

// __kmp_push_current_task_to_thread: set up current task in called thread for a
// new team
//
// this_thr: thread structure to set up
// team: team for implicit task data
// tid: thread within team to set up
void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
                                       int tid) {
  // current task of the thread is a parent of the new just created implicit
  // tasks of new team
  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));

  KMP_DEBUG_ASSERT(this_thr != NULL);

  if (tid == 0) {
    // Master thread: link the team's implicit task 0 under the thread's
    // current task, unless that is already the current task (re-entry).
    if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
      team->t.t_implicit_task_taskdata[0].td_parent =
          this_thr->th.th_current_task;
      this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
    }
  } else {
    // Worker threads: all implicit tasks share the parent of implicit task 0.
    team->t.t_implicit_task_taskdata[tid].td_parent =
        team->t.t_implicit_task_taskdata[0].td_parent;
    this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
  }

  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));
}

// __kmp_task_start: bookkeeping for a task starting execution
//
// GTID: global thread id of calling thread
// task: task starting execution
// current_task: task suspending
static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
                             kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];

  KA_TRACE(10,
           ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
            gtid, taskdata, current_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  // mark currently executing task as suspended
  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
  current_task->td_flags.executing = 0;

// Add task to stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_push_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  // mark starting task as executing and as current task
  thread->th.th_current_task = taskdata;

  // An untied task may be (re)started more than once, so 'started'/'executing'
  // may already be set for it; tied tasks must be fresh here.
  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  taskdata->td_flags.started = 1;
  taskdata->td_flags.executing = 1;
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  // GEH TODO: shouldn't we pass some sort of location identifier here?
  // APT: yes, we will pass location here.
  // need to store current thread state (in a thread or taskdata structure)
  // before setting work_state, otherwise wrong state is set after end of task

  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));

#if OMPT_SUPPORT
  // Notify a tool (if any) that the task has begun executing.
  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_begin)) {
    kmp_taskdata_t *parent = taskdata->td_parent;
    ompt_callbacks.ompt_callback(ompt_event_task_begin)(
        parent ? parent->ompt_task_info.task_id : ompt_task_id_none,
        parent ? &(parent->ompt_task_info.frame) : NULL,
        taskdata->ompt_task_info.task_id, taskdata->ompt_task_info.function);
  }
#endif
#if OMP_40_ENABLED && OMPT_SUPPORT && OMPT_TRACE
  /* OMPT emit all dependences if requested by the tool */
  if (ompt_enabled && taskdata->ompt_task_info.ndeps > 0 &&
      ompt_callbacks.ompt_callback(ompt_event_task_dependences)) {
    ompt_callbacks.ompt_callback(ompt_event_task_dependences)(
        taskdata->ompt_task_info.task_id, taskdata->ompt_task_info.deps,
        taskdata->ompt_task_info.ndeps);
    /* We can now free the allocated memory for the dependencies */
    KMP_OMPT_DEPS_FREE(thread, taskdata->ompt_task_info.deps);
    taskdata->ompt_task_info.deps = NULL;
    taskdata->ompt_task_info.ndeps = 0;
  }
#endif /* OMP_40_ENABLED && OMPT_SUPPORT && OMPT_TRACE */

  return;
}

// __kmpc_omp_task_begin_if0: report that a given serialized task has started
// execution
//
// loc_ref: source location information; points to beginning of task block.
// gtid: global thread number.
// task: task thunk for the started task.
void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
                               kmp_task_t *task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
                "current_task=%p\n",
                gtid, loc_ref, taskdata, current_task));

  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_TEST_THEN_INC32(&taskdata->td_untied_count);
    KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
                  "incremented for task %p\n",
                  gtid, counter, taskdata));
  }

  taskdata->td_flags.task_serial =
      1; // Execute this task immediately, not deferred.
  __kmp_task_start(gtid, task, current_task);

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, taskdata));

  return;
}

#ifdef TASK_UNUSED
// __kmpc_omp_task_begin: report that a given task has started execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(
      10,
      ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
       gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));

  __kmp_task_start(gtid, task, current_task);

  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED

// __kmp_free_task: free the current task space and the space for shareds
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
                            kmp_info_t *thread) {
  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
                taskdata));

  // Check to make sure all flags and counters have the correct values
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
  KMP_DEBUG_ASSERT(TCR_4(taskdata->td_allocated_child_tasks) == 0 ||
                   taskdata->td_flags.task_serial == 1);
  KMP_DEBUG_ASSERT(TCR_4(taskdata->td_incomplete_child_tasks) == 0);

  taskdata->td_flags.freed = 1;
  ANNOTATE_HAPPENS_BEFORE(taskdata);
// deallocate the taskdata and shared variable blocks associated with this task
#if USE_FAST_MEMORY
  __kmp_fast_free(thread, taskdata);
#else /* ! USE_FAST_MEMORY */
  __kmp_thread_free(thread, taskdata);
#endif

  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
}

// __kmp_free_task_and_ancestors: free the current task and ancestors without
// children
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
                                          kmp_taskdata_t *taskdata,
                                          kmp_info_t *thread) {
#if OMP_45_ENABLED
  // Proxy tasks must always be allowed to free their parents
  // because they can be run in background even in serial mode.
  kmp_int32 team_serial =
      (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
      !taskdata->td_flags.proxy;
#else
  kmp_int32 team_serial =
      taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser;
#endif
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  // Predecrement simulated by "- 1" calculation
  kmp_int32 children = KMP_TEST_THEN_DEC32(CCAST(
                           kmp_int32 *, &taskdata->td_allocated_child_tasks)) -
                       1;
  KMP_DEBUG_ASSERT(children >= 0);

  // Now, go up the ancestor tree to see if any ancestors can now be freed.
  while (children == 0) {
    // Save the parent before freeing: taskdata is invalid after the free.
    kmp_taskdata_t *parent_taskdata = taskdata->td_parent;

    KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
                  "and freeing itself\n",
                  gtid, taskdata));

    // --- Deallocate my ancestor task ---
    __kmp_free_task(gtid, taskdata, thread);

    taskdata = parent_taskdata;

    // Stop checking ancestors at implicit task instead of walking up ancestor
    // tree to avoid premature deallocation of ancestors.
    if (team_serial || taskdata->td_flags.tasktype == TASK_IMPLICIT)
      return;

    // Predecrement simulated by "- 1" calculation
    children = KMP_TEST_THEN_DEC32(
                   CCAST(kmp_int32 *, &taskdata->td_allocated_child_tasks)) -
               1;
    KMP_DEBUG_ASSERT(children >= 0);
  }

  KA_TRACE(
      20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
           "not freeing it yet\n",
           gtid, taskdata, children));
}

// __kmp_task_finish: bookkeeping to do when a task finishes execution
//
// gtid: global thread ID for calling thread
// task: task to be finished
// resumed_task: task to be resumed. (may be NULL if task is serialized)
//
// NOTE: the braces in the child-count block below are deliberately paired
// across the OMP_40/OMP_45 preprocessor boundaries; take care when editing.
static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *resumed_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_task_team_t *task_team =
      thread->th.th_task_team; // might be NULL for serial teams...
  kmp_int32 children = 0;

#if OMPT_SUPPORT
  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_end)) {
    kmp_taskdata_t *parent = taskdata->td_parent;
    ompt_callbacks.ompt_callback(ompt_event_task_end)(
        taskdata->ompt_task_info.task_id);
  }
#endif

  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
                "task %p\n",
                gtid, taskdata, resumed_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

// Pop task from stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_pop_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
    // untied task needs to check the counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = KMP_TEST_THEN_DEC32(&taskdata->td_untied_count) - 1;
    KA_TRACE(
        20,
        ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
         gtid, counter, taskdata));
    if (counter > 0) {
      // untied task is not done, to be continued possibly by other thread, do
      // not free it now
      if (resumed_task == NULL) {
        KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
        resumed_task = taskdata->td_parent; // In a serialized task, the resumed
        // task is the parent
      }
      thread->th.th_current_task = resumed_task; // restore current_task
      resumed_task->td_flags.executing = 1; // resume previous task
      KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
                    "resuming task %p\n",
                    gtid, taskdata, resumed_task));
      return;
    }
  }

  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  taskdata->td_flags.complete = 1; // mark the task as completed
  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  // Only need to keep track of count if team parallel and tasking not
  // serialized
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
    // Predecrement simulated by "- 1" calculation
    children =
        KMP_TEST_THEN_DEC32(CCAST(
            kmp_int32 *, &taskdata->td_parent->td_incomplete_child_tasks)) -
        1;
    KMP_DEBUG_ASSERT(children >= 0);
#if OMP_40_ENABLED
    if (taskdata->td_taskgroup)
      KMP_TEST_THEN_DEC32((kmp_int32 *)(&taskdata->td_taskgroup->count));
#if OMP_45_ENABLED
  }
  // if we found proxy tasks there could exist a dependency chain
  // with the proxy task as origin
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
      (task_team && task_team->tt.tt_found_proxy_tasks)) {
#endif
    __kmp_release_deps(gtid, taskdata);
#endif
  }

  // td_flags.executing must be marked as 0 after __kmp_release_deps has been
  // called. Otherwise, if a task is executed immediately from the release_deps
  // code, the flag will be reset to 1 again by this same function
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
  taskdata->td_flags.executing = 0; // suspend the finishing task

  KA_TRACE(
      20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
           gtid, taskdata, children));

#if OMP_40_ENABLED
  /* If the tasks' destructor thunk flag has been set, we need to invoke the
     destructor thunk that has been generated by the compiler. The code is
     placed here, since at this point other tasks might have been released
     hence overlapping the destructor invocations with some other work in the
     released tasks. The OpenMP spec is not specific on when the destructors
     are invoked, so we should be free to choose. */
  if (taskdata->td_flags.destructors_thunk) {
    kmp_routine_entry_t destr_thunk = task->data1.destructors;
    KMP_ASSERT(destr_thunk);
    destr_thunk(gtid, task);
  }
#endif // OMP_40_ENABLED

  // bookkeeping for resuming task:
  // GEH - note tasking_ser => task_serial
  KMP_DEBUG_ASSERT(
      (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
      taskdata->td_flags.task_serial);
  if (taskdata->td_flags.task_serial) {
    if (resumed_task == NULL) {
      resumed_task = taskdata->td_parent; // In a serialized task, the resumed
      // task is the parent
    } else
#if OMP_45_ENABLED
        if (!(task_team && task_team->tt.tt_found_proxy_tasks))
#endif
    {
      // verify resumed task passed in points to parent
      KMP_DEBUG_ASSERT(resumed_task == taskdata->td_parent);
    }
  } else {
    KMP_DEBUG_ASSERT(resumed_task !=
                     NULL); // verify that resumed task is passed as argument
  }

  // Free this task and then ancestor tasks if they have no children.
  // Restore th_current_task first as suggested by John:
  // johnmc: if an asynchronous inquiry peers into the runtime system
  // it doesn't see the freed task as the current task.
  thread->th.th_current_task = resumed_task;
  __kmp_free_task_and_ancestors(gtid, taskdata, thread);

  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
  resumed_task->td_flags.executing = 1; // resume previous task

  KA_TRACE(
      10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
           gtid, taskdata, resumed_task));

  return;
}

// __kmpc_omp_task_complete_if0: report that a task has completed execution
//
// loc_ref: source location information; points to end of task block.
// gtid: global thread number.
// task: task thunk for the completed task.
void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
  // this routine will provide task to resume
  __kmp_task_finish(gtid, task, NULL);

  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}

#ifdef TASK_UNUSED
// __kmpc_omp_task_complete: report that a task has completed execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
                              kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));

  __kmp_task_finish(gtid, task, NULL); // Not sure how to find task to resume

  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED

#if OMPT_SUPPORT
// __kmp_task_init_ompt: Initialize OMPT fields maintained by a task. This will
// only be called after ompt_tool, so we already know whether ompt is enabled
// or not.
static inline void __kmp_task_init_ompt(kmp_taskdata_t *task, int tid,
                                        void *function) {
  if (ompt_enabled) {
    task->ompt_task_info.task_id = __ompt_task_id_new(tid);
    task->ompt_task_info.function = function;
    task->ompt_task_info.frame.exit_runtime_frame = NULL;
    task->ompt_task_info.frame.reenter_runtime_frame = NULL;
#if OMP_40_ENABLED
    task->ompt_task_info.ndeps = 0;
    task->ompt_task_info.deps = NULL;
#endif /* OMP_40_ENABLED */
  }
}
#endif

// __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
// task for a given thread
//
// loc_ref: reference to source location of parallel region
// this_thr: thread data structure corresponding to implicit task
// team: team for this_thr
// tid: thread id of given thread within team
// set_curr_task: TRUE if need to push current task to thread
// NOTE: Routine does not set up the implicit task ICVS. This is assumed to
// have already been done elsewhere.
// TODO: Get better loc_ref. Value passed in may be NULL
void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
                              kmp_team_t *team, int tid, int set_curr_task) {
  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];

  KF_TRACE(
      10,
      ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
       tid, team, task, set_curr_task ? "TRUE" : "FALSE"));

  task->td_task_id = KMP_GEN_TASK_ID();
  task->td_team = team;
  // task->td_parent = NULL; // fix for CQ230101 (broken parent task info
  // in debugger)
  task->td_ident = loc_ref;
  task->td_taskwait_ident = NULL;
  task->td_taskwait_counter = 0;
  task->td_taskwait_thread = 0;

  task->td_flags.tiedness = TASK_TIED;
  task->td_flags.tasktype = TASK_IMPLICIT;
#if OMP_45_ENABLED
  task->td_flags.proxy = TASK_FULL;
#endif

  // All implicit tasks are executed immediately, not deferred
  task->td_flags.task_serial = 1;
  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  // An implicit task is considered started and executing as soon as it is
  // initialized; it never goes through the explicit-task start path.
  task->td_flags.started = 1;
  task->td_flags.executing = 1;
  task->td_flags.complete = 0;
  task->td_flags.freed = 0;

#if OMP_40_ENABLED
  task->td_depnode = NULL;
#endif

  if (set_curr_task) { // only do this init first time thread is created
    task->td_incomplete_child_tasks = 0;
    // Not used: don't need to deallocate implicit task
    task->td_allocated_child_tasks = 0;
#if OMP_40_ENABLED
    task->td_taskgroup = NULL; // An implicit task does not have taskgroup
    task->td_dephash = NULL;
#endif
    __kmp_push_current_task_to_thread(this_thr, team, tid);
  } else {
    KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
    KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
  }

#if OMPT_SUPPORT
  __kmp_task_init_ompt(task, tid, NULL);
#endif

  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
                team, task));
}

// __kmp_finish_implicit_task: Release resources associated to implicit tasks
// at the end of parallel regions. Some resources are kept for reuse in the next
// parallel region.
895 // 896 // thread: thread data structure corresponding to implicit task 897 void __kmp_finish_implicit_task(kmp_info_t *thread) { 898 kmp_taskdata_t *task = thread->th.th_current_task; 899 if (task->td_dephash) 900 __kmp_dephash_free_entries(thread, task->td_dephash); 901 } 902 903 // __kmp_free_implicit_task: Release resources associated to implicit tasks 904 // when these are destroyed regions 905 // 906 // thread: thread data structure corresponding to implicit task 907 void __kmp_free_implicit_task(kmp_info_t *thread) { 908 kmp_taskdata_t *task = thread->th.th_current_task; 909 if (task->td_dephash) 910 __kmp_dephash_free(thread, task->td_dephash); 911 task->td_dephash = NULL; 912 } 913 914 // Round up a size to a power of two specified by val: Used to insert padding 915 // between structures co-allocated using a single malloc() call 916 static size_t __kmp_round_up_to_val(size_t size, size_t val) { 917 if (size & (val - 1)) { 918 size &= ~(val - 1); 919 if (size <= KMP_SIZE_T_MAX - val) { 920 size += val; // Round up if there is no overflow. 921 }; // if 922 }; // if 923 return size; 924 } // __kmp_round_up_to_va 925 926 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task 927 // 928 // loc_ref: source location information 929 // gtid: global thread number. 930 // flags: include tiedness & task type (explicit vs. implicit) of the ''new'' 931 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine. 932 // sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including 933 // private vars accessed in task. 934 // sizeof_shareds: Size in bytes of array of pointers to shared vars accessed 935 // in task. 936 // task_entry: Pointer to task code entry point generated by compiler. 937 // returns: a pointer to the allocated kmp_task_t structure (task). 
kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                             kmp_tasking_flags_t *flags,
                             size_t sizeof_kmp_task_t, size_t sizeof_shareds,
                             kmp_routine_entry_t task_entry) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_team_t *team = thread->th.th_team;
  kmp_taskdata_t *parent_task = thread->th.th_current_task;
  size_t shareds_offset;

  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
                sizeof_shareds, task_entry));

  // A child of a final task is itself final (OpenMP final-clause semantics).
  // NOTE(review): the empty merged_if0 branch looks like a placeholder —
  // confirm nothing was meant to happen for merged if(0) tasks here.
  if (parent_task->td_flags.final) {
    if (flags->merged_if0) {
    }
    flags->final = 1;
  }

#if OMP_45_ENABLED
  if (flags->proxy == TASK_PROXY) {
    // Proxy tasks are forced untied and merged-if0
    flags->tiedness = TASK_UNTIED;
    flags->merged_if0 = 1;

    /* are we running in a sequential parallel or tskm_immediate_exec... we need
       tasking support enabled */
    if ((thread->th.th_task_team) == NULL) {
      /* This should only happen if the team is serialized
          setup a task team and propagate it to the thread */
      KMP_DEBUG_ASSERT(team->t.t_serialized);
      KA_TRACE(30,
               ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
                gtid));
      __kmp_task_team_setup(
          thread, team,
          1); // 1 indicates setup the current team regardless of nthreads
      thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
    }
    kmp_task_team_t *task_team = thread->th.th_task_team;

    /* tasking must be enabled now as the task might not be pushed */
    if (!KMP_TASKING_ENABLED(task_team)) {
      KA_TRACE(
          30,
          ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
      __kmp_enable_tasking(task_team, thread);
      kmp_int32 tid = thread->th.th_info.ds.ds_tid;
      kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
      // No lock needed since only owner can allocate
      if (thread_data->td.td_deque == NULL) {
        __kmp_alloc_task_deque(thread, thread_data);
      }
    }

    if (task_team->tt.tt_found_proxy_tasks == FALSE)
      TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
  }
#endif

  // Calculate shared structure offset including padding after kmp_task_t struct
  // to align pointers in shared struct
  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));

  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
                shareds_offset));
  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
                sizeof_shareds));

  // Avoid double allocation here by combining shareds with taskdata
#if USE_FAST_MEMORY
  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
                                                               sizeof_shareds);
#else /* ! USE_FAST_MEMORY */
  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
                                                               sizeof_shareds);
#endif /* USE_FAST_MEMORY */
  ANNOTATE_HAPPENS_AFTER(taskdata);

  task = KMP_TASKDATA_TO_TASK(taskdata);

// Make sure task & taskdata are aligned appropriately
#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
#else
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
#endif
  if (sizeof_shareds > 0) {
    // Avoid double allocation here by combining shareds with taskdata
    task->shareds = &((char *)taskdata)[shareds_offset];
    // Make sure shareds struct is aligned to pointer size
    KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
                     0);
  } else {
    task->shareds = NULL;
  }
  task->routine = task_entry;
  task->part_id = 0; // AC: Always start with 0 part id

  taskdata->td_task_id = KMP_GEN_TASK_ID();
  taskdata->td_team = team;
  taskdata->td_alloc_thread = thread;
  taskdata->td_parent = parent_task;
  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
  taskdata->td_untied_count = 0;
  taskdata->td_ident = loc_ref;
  taskdata->td_taskwait_ident = NULL;
  taskdata->td_taskwait_counter = 0;
  taskdata->td_taskwait_thread = 0;
  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
#if OMP_45_ENABLED
  // avoid copying icvs for proxy tasks
  if (flags->proxy == TASK_FULL)
#endif
    copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);

  taskdata->td_flags.tiedness = flags->tiedness;
  taskdata->td_flags.final = flags->final;
  taskdata->td_flags.merged_if0 = flags->merged_if0;
#if OMP_40_ENABLED
  taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
#endif // OMP_40_ENABLED
#if OMP_45_ENABLED
  taskdata->td_flags.proxy = flags->proxy;
  taskdata->td_task_team = thread->th.th_task_team;
  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
#endif
  taskdata->td_flags.tasktype = TASK_EXPLICIT;

  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);

  // GEH - TODO: fix this to copy parent task's value of team_serial flag
  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  // GEH - Note we serialize the task if the team is serialized to make sure
  // implicit parallel region tasks are not left until program termination to
  // execute. Also, it helps locality to execute immediately.

  taskdata->td_flags.task_serial =
      (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
       taskdata->td_flags.tasking_ser);

  taskdata->td_flags.started = 0;
  taskdata->td_flags.executing = 0;
  taskdata->td_flags.complete = 0;
  taskdata->td_flags.freed = 0;

  taskdata->td_flags.native = flags->native;

  taskdata->td_incomplete_child_tasks = 0;
  taskdata->td_allocated_child_tasks = 1; // start at one because counts current
  // task and children
#if OMP_40_ENABLED
  taskdata->td_taskgroup =
      parent_task->td_taskgroup; // task inherits taskgroup from the parent task
  taskdata->td_dephash = NULL;
  taskdata->td_depnode = NULL;
#endif

// Only need to keep track of child task counts if team parallel and tasking not
// serialized or if it is a proxy task
#if OMP_45_ENABLED
  if (flags->proxy == TASK_PROXY ||
      !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
#else
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
#endif
  {
    // atomic increments: other threads may be completing children concurrently
    KMP_TEST_THEN_INC32(
        CCAST(kmp_int32 *, &parent_task->td_incomplete_child_tasks));
#if OMP_40_ENABLED
    if (parent_task->td_taskgroup)
      KMP_TEST_THEN_INC32((kmp_int32 *)(&parent_task->td_taskgroup->count));
#endif
    // Only need to keep track of allocated child tasks for explicit tasks since
    // implicit not deallocated
    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
      KMP_TEST_THEN_INC32(
          CCAST(kmp_int32 *, &taskdata->td_parent->td_allocated_child_tasks));
    }
  }

  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
                gtid, taskdata, taskdata->td_parent));
  ANNOTATE_HAPPENS_BEFORE(task);

#if OMPT_SUPPORT
  __kmp_task_init_ompt(taskdata, gtid, (void *)task_entry);
#endif

  return task;
}

kmp_task_t *__kmpc_omp_task_alloc(ident_t
    *loc_ref, kmp_int32 gtid,
                                  kmp_int32 flags, size_t sizeof_kmp_task_t,
                                  size_t sizeof_shareds,
                                  kmp_routine_entry_t task_entry) {
  kmp_task_t *retval;
  // reinterpret the compiler-provided kmp_int32 as the flags bit-field struct
  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;

  input_flags->native = FALSE;
// __kmp_task_alloc() sets up all other runtime flags

#if OMP_45_ENABLED
  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
                input_flags->proxy ? "proxy" : "", sizeof_kmp_task_t,
                sizeof_shareds, task_entry));
#else
  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
                sizeof_kmp_task_t, sizeof_shareds, task_entry));
#endif

  retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
                            sizeof_shareds, task_entry);

  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));

  return retval;
}

// __kmp_invoke_task: invoke the specified task
//
// gtid: global thread ID of caller
// task: the task to invoke
// current_task: the task to resume after task invokation
static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_uint64 cur_time;
#if OMP_40_ENABLED
  int discard = 0 /* false */;
#endif
  KA_TRACE(
      30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
           gtid, taskdata, current_task));
  KMP_DEBUG_ASSERT(task);
#if OMP_45_ENABLED
  if (taskdata->td_flags.proxy == TASK_PROXY &&
      taskdata->td_flags.complete == 1) {
    // This is a proxy task that was already completed but it needs to run
    // its bottom-half finish
    KA_TRACE(
        30,
        ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
         gtid, taskdata));

    __kmp_bottom_half_finish_proxy(gtid, task);

    KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
                  "proxy task %p, resuming task %p\n",
                  gtid, taskdata, current_task));

    return;
  }
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  if (__kmp_forkjoin_frames_mode == 3) {
    // Get the current time stamp to measure task execution time to correct
    // barrier imbalance time
    cur_time = __itt_get_timestamp();
  }
#endif

#if OMP_45_ENABLED
  // Proxy tasks are not handled by the runtime
  if (taskdata->td_flags.proxy != TASK_PROXY) {
#endif
    ANNOTATE_HAPPENS_AFTER(task);
    __kmp_task_start(gtid, task, current_task);
#if OMP_45_ENABLED
  }
#endif

#if OMPT_SUPPORT
  ompt_thread_info_t oldInfo;
  kmp_info_t *thread;
  if (ompt_enabled) {
    // Store the threads states and restore them after the task
    thread = __kmp_threads[gtid];
    oldInfo = thread->th.ompt_thread_info;
    thread->th.ompt_thread_info.wait_id = 0;
    thread->th.ompt_thread_info.state = ompt_state_work_parallel;
    taskdata->ompt_task_info.frame.exit_runtime_frame =
        __builtin_frame_address(0);
  }
#endif

#if OMP_40_ENABLED
  // TODO: cancel tasks if the parallel region has also been cancelled
  // TODO: check if this sequence can be hoisted above __kmp_task_start
  // if cancellation has been enabled for this run ...
  if (__kmp_omp_cancellation) {
    kmp_info_t *this_thr = __kmp_threads[gtid];
    kmp_team_t *this_team = this_thr->th.th_team;
    kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
    if ((taskgroup && taskgroup->cancel_request) ||
        (this_team->t.t_cancel_request == cancel_parallel)) {
      KMP_COUNT_BLOCK(TASK_cancelled);
      // this task belongs to a task group and we need to cancel it
      discard = 1 /* true */;
    }
  }

  // Invoke the task routine and pass in relevant data.
  // Thunks generated by gcc take a different argument list.
  // NOTE: the opening brace below is closed under a matching
  // #if OMP_40_ENABLED near the end of the function — when OMP_40 is
  // disabled the routine runs unconditionally.
  if (!discard) {
#if KMP_STATS_ENABLED
    KMP_COUNT_BLOCK(TASK_executed);
    switch (KMP_GET_THREAD_STATE()) {
    case FORK_JOIN_BARRIER:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
      break;
    case PLAIN_BARRIER:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
      break;
    case TASKYIELD:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
      break;
    case TASKWAIT:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
      break;
    case TASKGROUP:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
      break;
    default:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
      break;
    }
#endif // KMP_STATS_ENABLED
#endif // OMP_40_ENABLED

#if OMPT_SUPPORT && OMPT_TRACE
    /* let OMPT know that we're about to run this task */
    if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_switch)) {
      ompt_callbacks.ompt_callback(ompt_event_task_switch)(
          current_task->ompt_task_info.task_id,
          taskdata->ompt_task_info.task_id);
    }
#endif

#ifdef KMP_GOMP_COMPAT
    if (taskdata->td_flags.native) {
      ((void (*)(void *))(*(task->routine)))(task->shareds);
    } else
#endif /* KMP_GOMP_COMPAT */
    {
      (*(task->routine))(gtid, task);
    }
    KMP_POP_PARTITIONED_TIMER();

#if OMPT_SUPPORT && OMPT_TRACE
    /* let OMPT know that we're returning to the callee task */
    if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_switch)) {
      ompt_callbacks.ompt_callback(ompt_event_task_switch)(
          taskdata->ompt_task_info.task_id,
          current_task->ompt_task_info.task_id);
    }
#endif

#if OMP_40_ENABLED
  }
#endif // OMP_40_ENABLED

#if OMPT_SUPPORT
  if (ompt_enabled) {
    thread->th.ompt_thread_info = oldInfo;
    taskdata->ompt_task_info.frame.exit_runtime_frame = NULL;
  }
#endif

#if OMP_45_ENABLED
  // Proxy tasks are not handled by the runtime
  if (taskdata->td_flags.proxy != TASK_PROXY) {
#endif
    ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent);
    __kmp_task_finish(gtid, task, current_task);
#if OMP_45_ENABLED
  }
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - correct arrive time after the task finished
  if (__kmp_forkjoin_frames_mode == 3) {
    kmp_info_t *this_thr = __kmp_threads[gtid];
    if (this_thr->th.th_bar_arrive_time) {
      this_thr->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
    }
  }
#endif
  KA_TRACE(
      30,
      ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
       gtid, taskdata, current_task));
  return;
}

// __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
//
// loc_ref: location of original task pragma (ignored)
// gtid: Global Thread ID of encountering thread
// new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
// Returns:
//    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
//    be resumed later.
//    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
//    resumed later.
1356 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid, 1357 kmp_task_t *new_task) { 1358 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1359 1360 KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid, 1361 loc_ref, new_taskdata)); 1362 1363 /* Should we execute the new task or queue it? For now, let's just always try 1364 to queue it. If the queue fills up, then we'll execute it. */ 1365 1366 if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer 1367 { // Execute this task immediately 1368 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 1369 new_taskdata->td_flags.task_serial = 1; 1370 __kmp_invoke_task(gtid, new_task, current_task); 1371 } 1372 1373 KA_TRACE( 1374 10, 1375 ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: " 1376 "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n", 1377 gtid, loc_ref, new_taskdata)); 1378 1379 ANNOTATE_HAPPENS_BEFORE(new_task); 1380 return TASK_CURRENT_NOT_QUEUED; 1381 } 1382 1383 // __kmp_omp_task: Schedule a non-thread-switchable task for execution 1384 // 1385 // gtid: Global Thread ID of encountering thread 1386 // new_task:non-thread-switchable task thunk allocated by __kmp_omp_task_alloc() 1387 // serialize_immediate: if TRUE then if the task is executed immediately its 1388 // execution will be serialized 1389 // Returns: 1390 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1391 // be resumed later. 1392 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1393 // resumed later. 
1394 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task, 1395 bool serialize_immediate) { 1396 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1397 1398 #if OMPT_SUPPORT 1399 if (ompt_enabled) { 1400 new_taskdata->ompt_task_info.frame.reenter_runtime_frame = 1401 __builtin_frame_address(1); 1402 } 1403 #endif 1404 1405 /* Should we execute the new task or queue it? For now, let's just always try to 1406 queue it. If the queue fills up, then we'll execute it. */ 1407 #if OMP_45_ENABLED 1408 if (new_taskdata->td_flags.proxy == TASK_PROXY || 1409 __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer 1410 #else 1411 if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer 1412 #endif 1413 { // Execute this task immediately 1414 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 1415 if (serialize_immediate) 1416 new_taskdata->td_flags.task_serial = 1; 1417 __kmp_invoke_task(gtid, new_task, current_task); 1418 } 1419 1420 #if OMPT_SUPPORT 1421 if (ompt_enabled) { 1422 new_taskdata->ompt_task_info.frame.reenter_runtime_frame = NULL; 1423 } 1424 #endif 1425 1426 ANNOTATE_HAPPENS_BEFORE(new_task); 1427 return TASK_CURRENT_NOT_QUEUED; 1428 } 1429 1430 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a 1431 // non-thread-switchable task from the parent thread only! 1432 // 1433 // loc_ref: location of original task pragma (ignored) 1434 // gtid: Global Thread ID of encountering thread 1435 // new_task: non-thread-switchable task thunk allocated by 1436 // __kmp_omp_task_alloc() 1437 // Returns: 1438 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1439 // be resumed later. 1440 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1441 // resumed later. 
kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
                          kmp_task_t *new_task) {
  kmp_int32 res;
  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);

#if KMP_DEBUG
  // only needed for the KA_TRACE calls, which compile away in release builds
  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
#endif
  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
                new_taskdata));

  res = __kmp_omp_task(gtid, new_task, true);

  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
                "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
                gtid, loc_ref, new_taskdata));
  return res;
}

// __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
// complete
kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
  kmp_taskdata_t *taskdata;
  kmp_info_t *thread;
  int thread_finished = FALSE;
  KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);

  KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    thread = __kmp_threads[gtid];
    taskdata = thread->th.th_current_task;
#if OMPT_SUPPORT && OMPT_TRACE
    ompt_task_id_t my_task_id;
    ompt_parallel_id_t my_parallel_id;

    if (ompt_enabled) {
      kmp_team_t *team = thread->th.th_team;
      my_task_id = taskdata->ompt_task_info.task_id;
      my_parallel_id = team->t.ompt_team_info.parallel_id;

      taskdata->ompt_task_info.frame.reenter_runtime_frame =
          __builtin_frame_address(1);
      if (ompt_callbacks.ompt_callback(ompt_event_taskwait_begin)) {
        ompt_callbacks.ompt_callback(ompt_event_taskwait_begin)(my_parallel_id,
                                                                my_task_id);
      }
    }
#endif

// Debugger: The taskwait is active. Store location and thread encountered the
// taskwait.
#if USE_ITT_BUILD
// Note: These values are used by ITT events as well.
#endif /* USE_ITT_BUILD */
    taskdata->td_taskwait_counter += 1;
    taskdata->td_taskwait_ident = loc_ref;
    taskdata->td_taskwait_thread = gtid + 1;

#if USE_ITT_BUILD
    void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
    if (itt_sync_obj != NULL)
      __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

    // No need to wait if the team is serialized or this task is final
    // (final implies children ran immediately and serially)
    bool must_wait =
        !taskdata->td_flags.team_serial && !taskdata->td_flags.final;

#if OMP_45_ENABLED
    // proxy tasks can complete asynchronously, so wait even when serialized
    must_wait = must_wait || (thread->th.th_task_team != NULL &&
                              thread->th.th_task_team->tt.tt_found_proxy_tasks);
#endif
    if (must_wait) {
      kmp_flag_32 flag(
          RCAST(volatile kmp_uint32 *, &taskdata->td_incomplete_child_tasks),
          0U);
      // execute other tasks while waiting for the children counter to drain
      while (TCR_4(taskdata->td_incomplete_child_tasks) != 0) {
        flag.execute_tasks(thread, gtid, FALSE,
                           &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
                           __kmp_task_stealing_constraint);
      }
    }
#if USE_ITT_BUILD
    if (itt_sync_obj != NULL)
      __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

    // Debugger: The taskwait is completed. Location remains, but thread is
    // negated.
    taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;

#if OMPT_SUPPORT && OMPT_TRACE
    if (ompt_enabled) {
      if (ompt_callbacks.ompt_callback(ompt_event_taskwait_end)) {
        ompt_callbacks.ompt_callback(ompt_event_taskwait_end)(my_parallel_id,
                                                              my_task_id);
      }
      taskdata->ompt_task_info.frame.reenter_runtime_frame = NULL;
    }
#endif
    ANNOTATE_HAPPENS_AFTER(taskdata);
  }

  KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
                "returning TASK_CURRENT_NOT_QUEUED\n",
                gtid, taskdata));

  return TASK_CURRENT_NOT_QUEUED;
}

// __kmpc_omp_taskyield: switch to a different task
kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
  kmp_taskdata_t *taskdata;
  kmp_info_t *thread;
  int thread_finished = FALSE;

  KMP_COUNT_BLOCK(OMP_TASKYIELD);
  KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);

  KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
                gtid, loc_ref, end_part));

  if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
    thread = __kmp_threads[gtid];
    taskdata = thread->th.th_current_task;
// Should we model this as a task wait or not?
// Debugger: The taskwait is active. Store location and thread encountered the
// taskwait.
#if USE_ITT_BUILD
// Note: These values are used by ITT events as well.
#endif /* USE_ITT_BUILD */
    taskdata->td_taskwait_counter += 1;
    taskdata->td_taskwait_ident = loc_ref;
    taskdata->td_taskwait_thread = gtid + 1;

#if USE_ITT_BUILD
    void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
    if (itt_sync_obj != NULL)
      __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
    if (!taskdata->td_flags.team_serial) {
      kmp_task_team_t *task_team = thread->th.th_task_team;
      if (task_team != NULL) {
        if (KMP_TASKING_ENABLED(task_team)) {
          // run at most a batch of other tasks, then return to this one
          __kmp_execute_tasks_32(
              thread, gtid, NULL, FALSE,
              &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
              __kmp_task_stealing_constraint);
        }
      }
    }
#if USE_ITT_BUILD
    if (itt_sync_obj != NULL)
      __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

    // Debugger: The taskwait is completed. Location remains, but thread is
    // negated.
    taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
  }

  KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
                "returning TASK_CURRENT_NOT_QUEUED\n",
                gtid, taskdata));

  return TASK_CURRENT_NOT_QUEUED;
}

// TODO: change to OMP_50_ENABLED, need to change build tools for this to work
#if OMP_45_ENABLED
// Task Reduction implementation

typedef struct kmp_task_red_flags {
  unsigned lazy_priv : 1; // hint: (1) use lazy allocation (big objects)
  unsigned reserved31 : 31;
} kmp_task_red_flags_t;

// internal structure for reduction data item related info
typedef struct kmp_task_red_data {
  void *reduce_shar; // shared reduction item
  size_t reduce_size; // size of data item
  void *reduce_priv; // thread specific data
  void *reduce_pend; // end of private data for comparison op
  void *reduce_init; // data initialization routine
  void *reduce_fini; // data finalization routine
  void
      *reduce_comb; // data combiner routine
  kmp_task_red_flags_t flags; // flags for additional info from compiler
} kmp_task_red_data_t;

// structure sent us by compiler - one per reduction item
typedef struct kmp_task_red_input {
  void *reduce_shar; // shared reduction item
  size_t reduce_size; // size of data item
  void *reduce_init; // data initialization routine
  void *reduce_fini; // data finalization routine
  void *reduce_comb; // data combiner routine
  kmp_task_red_flags_t flags; // flags for additional info from compiler
} kmp_task_red_input_t;

/*!
@ingroup TASKING
@param gtid Global thread ID
@param num Number of data items to reduce
@param data Array of data for reduction
@return The taskgroup identifier

Initialize task reduction for the taskgroup.
*/
void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
  kmp_int32 nth = thread->th.th_team_nproc;
  kmp_task_red_input_t *input = (kmp_task_red_input_t *)data;
  kmp_task_red_data_t *arr;

  // check input data just in case
  KMP_ASSERT(tg != NULL);
  KMP_ASSERT(data != NULL);
  KMP_ASSERT(num > 0);
  if (nth == 1) {
    // single thread: no private copies needed, shared item is used directly
    KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
                  gtid, tg));
    return (void *)tg;
  }
  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
                gtid, tg, num));
  arr = (kmp_task_red_data_t *)__kmp_thread_malloc(
      thread, num * sizeof(kmp_task_red_data_t));
  for (int i = 0; i < num; ++i) {
    void (*f_init)(void *) = (void (*)(void *))(input[i].reduce_init);
    // the -1 makes the rounding below keep exact multiples of CACHE_LINE
    // unchanged instead of bumping them to the next cache line
    size_t size = input[i].reduce_size - 1;
    // round the size up to cache line per thread-specific item
    size += CACHE_LINE - size % CACHE_LINE;
    KMP_ASSERT(input[i].reduce_comb != NULL); // combiner is mandatory
    arr[i].reduce_shar = input[i].reduce_shar;
    arr[i].reduce_size = size;
    arr[i].reduce_init = input[i].reduce_init;
    arr[i].reduce_fini = input[i].reduce_fini;
    arr[i].reduce_comb = input[i].reduce_comb;
    arr[i].flags = input[i].flags;
    if (!input[i].flags.lazy_priv) {
      // allocate cache-line aligned block and fill it with zeros
      arr[i].reduce_priv = __kmp_allocate(nth * size);
      arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
      if (f_init != NULL) {
        // initialize thread-specific items
        for (int j = 0; j < nth; ++j) {
          f_init((char *)(arr[i].reduce_priv) + j * size);
        }
      }
    } else {
      // only allocate space for pointers now,
      // objects will be lazily allocated/initialized once requested
      arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
    }
  }
  tg->reduce_data = (void *)arr;
  tg->reduce_num_data = num;
  return (void *)tg;
}

/*!
1704 @ingroup TASKING 1705 @param gtid Global thread ID 1706 @param tskgrp The taskgroup ID (optional) 1707 @param data Shared location of the item 1708 @return The pointer to per-thread data 1709 1710 Get thread-specific location of data item 1711 */ 1712 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) { 1713 kmp_info_t *thread = __kmp_threads[gtid]; 1714 kmp_int32 nth = thread->th.th_team_nproc; 1715 if (nth == 1) 1716 return data; // nothing to do 1717 1718 kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp; 1719 if (tg == NULL) 1720 tg = thread->th.th_current_task->td_taskgroup; 1721 KMP_ASSERT(tg != NULL); 1722 kmp_task_red_data_t *arr = (kmp_task_red_data_t *)(tg->reduce_data); 1723 kmp_int32 num = tg->reduce_num_data; 1724 kmp_int32 tid = thread->th.th_info.ds.ds_tid; 1725 1726 KMP_ASSERT(data != NULL); 1727 while (tg != NULL) { 1728 for (int i = 0; i < num; ++i) { 1729 if (!arr[i].flags.lazy_priv) { 1730 if (data == arr[i].reduce_shar || 1731 (data >= arr[i].reduce_priv && data < arr[i].reduce_pend)) 1732 return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size; 1733 } else { 1734 // check shared location first 1735 void **p_priv = (void **)(arr[i].reduce_priv); 1736 if (data == arr[i].reduce_shar) 1737 goto found; 1738 // check if we get some thread specific location as parameter 1739 for (int j = 0; j < nth; ++j) 1740 if (data == p_priv[j]) 1741 goto found; 1742 continue; // not found, continue search 1743 found: 1744 if (p_priv[tid] == NULL) { 1745 // allocate thread specific object lazily 1746 void (*f_init)(void *) = (void (*)(void *))(arr[i].reduce_init); 1747 p_priv[tid] = __kmp_allocate(arr[i].reduce_size); 1748 if (f_init != NULL) { 1749 f_init(p_priv[tid]); 1750 } 1751 } 1752 return p_priv[tid]; 1753 } 1754 } 1755 tg = tg->parent; 1756 arr = (kmp_task_red_data_t *)(tg->reduce_data); 1757 num = tg->reduce_num_data; 1758 } 1759 KMP_ASSERT2(0, "Unknown task reduction item"); 1760 return NULL; // ERROR, this line never 
executed 1761 } 1762 1763 // Finalize task reduction. 1764 // Called from __kmpc_end_taskgroup() 1765 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) { 1766 kmp_int32 nth = th->th.th_team_nproc; 1767 KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1 1768 kmp_task_red_data_t *arr = (kmp_task_red_data_t *)tg->reduce_data; 1769 kmp_int32 num = tg->reduce_num_data; 1770 for (int i = 0; i < num; ++i) { 1771 void *sh_data = arr[i].reduce_shar; 1772 void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini); 1773 void (*f_comb)(void *, void *) = 1774 (void (*)(void *, void *))(arr[i].reduce_comb); 1775 if (!arr[i].flags.lazy_priv) { 1776 void *pr_data = arr[i].reduce_priv; 1777 size_t size = arr[i].reduce_size; 1778 for (int j = 0; j < nth; ++j) { 1779 void *priv_data = (char *)pr_data + j * size; 1780 f_comb(sh_data, priv_data); // combine results 1781 if (f_fini) 1782 f_fini(priv_data); // finalize if needed 1783 } 1784 } else { 1785 void **pr_data = (void **)(arr[i].reduce_priv); 1786 for (int j = 0; j < nth; ++j) { 1787 if (pr_data[j] != NULL) { 1788 f_comb(sh_data, pr_data[j]); // combine results 1789 if (f_fini) 1790 f_fini(pr_data[j]); // finalize if needed 1791 __kmp_free(pr_data[j]); 1792 } 1793 } 1794 } 1795 __kmp_free(arr[i].reduce_priv); 1796 } 1797 __kmp_thread_free(th, arr); 1798 tg->reduce_data = NULL; 1799 tg->reduce_num_data = 0; 1800 } 1801 #endif 1802 1803 #if OMP_40_ENABLED 1804 // __kmpc_taskgroup: Start a new taskgroup 1805 void __kmpc_taskgroup(ident_t *loc, int gtid) { 1806 kmp_info_t *thread = __kmp_threads[gtid]; 1807 kmp_taskdata_t *taskdata = thread->th.th_current_task; 1808 kmp_taskgroup_t *tg_new = 1809 (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t)); 1810 KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new)); 1811 tg_new->count = 0; 1812 tg_new->cancel_request = cancel_noreq; 1813 tg_new->parent = taskdata->td_taskgroup; 1814 // TODO: change to 
OMP_50_ENABLED, need to change build tools for this to work 1815 #if OMP_45_ENABLED 1816 tg_new->reduce_data = NULL; 1817 tg_new->reduce_num_data = 0; 1818 #endif 1819 taskdata->td_taskgroup = tg_new; 1820 } 1821 1822 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task 1823 // and its descendants are complete 1824 void __kmpc_end_taskgroup(ident_t *loc, int gtid) { 1825 kmp_info_t *thread = __kmp_threads[gtid]; 1826 kmp_taskdata_t *taskdata = thread->th.th_current_task; 1827 kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup; 1828 int thread_finished = FALSE; 1829 1830 KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc)); 1831 KMP_DEBUG_ASSERT(taskgroup != NULL); 1832 KMP_SET_THREAD_STATE_BLOCK(TASKGROUP); 1833 1834 if (__kmp_tasking_mode != tskm_immediate_exec) { 1835 #if USE_ITT_BUILD 1836 // For ITT the taskgroup wait is similar to taskwait until we need to 1837 // distinguish them 1838 void *itt_sync_obj = __kmp_itt_taskwait_object(gtid); 1839 if (itt_sync_obj != NULL) 1840 __kmp_itt_taskwait_starting(gtid, itt_sync_obj); 1841 #endif /* USE_ITT_BUILD */ 1842 1843 #if OMP_45_ENABLED 1844 if (!taskdata->td_flags.team_serial || 1845 (thread->th.th_task_team != NULL && 1846 thread->th.th_task_team->tt.tt_found_proxy_tasks)) 1847 #else 1848 if (!taskdata->td_flags.team_serial) 1849 #endif 1850 { 1851 kmp_flag_32 flag(RCAST(kmp_uint32 *, &taskgroup->count), 0U); 1852 while (TCR_4(taskgroup->count) != 0) { 1853 flag.execute_tasks(thread, gtid, FALSE, 1854 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 1855 __kmp_task_stealing_constraint); 1856 } 1857 } 1858 1859 #if USE_ITT_BUILD 1860 if (itt_sync_obj != NULL) 1861 __kmp_itt_taskwait_finished(gtid, itt_sync_obj); 1862 #endif /* USE_ITT_BUILD */ 1863 } 1864 KMP_DEBUG_ASSERT(taskgroup->count == 0); 1865 1866 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work 1867 #if OMP_45_ENABLED 1868 if (taskgroup->reduce_data != NULL) // need to reduce? 
    __kmp_task_reduction_fini(thread, taskgroup);
#endif
  // Restore parent taskgroup for the current task
  taskdata->td_taskgroup = taskgroup->parent;
  __kmp_thread_free(thread, taskgroup);

  KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
                gtid, taskdata));
  ANNOTATE_HAPPENS_AFTER(taskdata);
}
#endif

// __kmp_remove_my_task: remove a task from my own deque
// Pops from the TAIL of the calling thread's own deque (thieves take from the
// head in __kmp_steal_task).  Under the task scheduling constraint
// (is_constrained) only a tied task whose parent chain reaches the current
// task may be taken.  Returns NULL if the deque is empty or the constraint
// rejects the tail task.
static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
                                        kmp_task_team_t *task_team,
                                        kmp_int32 is_constrained) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_thread_data_t *thread_data;
  kmp_uint32 tail;

  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
                   NULL); // Caller should check this condition

  thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];

  KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
                gtid, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));

  // Cheap unlocked emptiness check before paying for the lock.
  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
    KA_TRACE(10,
             ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
              "ntasks=%d head=%u tail=%u\n",
              gtid, thread_data->td.td_deque_ntasks,
              thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
    return NULL;
  }

  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);

  // Re-check under the lock: a thief may have emptied the deque meanwhile.
  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
    __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
    KA_TRACE(10,
             ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
              "ntasks=%d head=%u tail=%u\n",
              gtid, thread_data->td.td_deque_ntasks,
              thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
    return NULL;
  }

  tail = (thread_data->td.td_deque_tail - 1) &
         TASK_DEQUE_MASK(thread_data->td); // Wrap index.
  taskdata = thread_data->td.td_deque[tail];

  if (is_constrained && (taskdata->td_flags.tiedness == TASK_TIED)) {
    // we need to check if the candidate obeys task scheduling constraint:
    // only child of current task can be scheduled
    kmp_taskdata_t *current = thread->th.th_current_task;
    kmp_int32 level = current->td_level;
    kmp_taskdata_t *parent = taskdata->td_parent;
    while (parent != current && parent->td_level > level) {
      parent = parent->td_parent; // check generation up to the level of the
      // current task
      KMP_DEBUG_ASSERT(parent != NULL);
    }
    if (parent != current) {
      // If the tail task is not a child, then no other child can appear in the
      // deque.
      __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
      // NOTE(review): this trace reuses the "exit #2" label of the earlier
      // exit; the labels are informational only but renumbering them would
      // make traces easier to follow.
      KA_TRACE(10,
               ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
                "ntasks=%d head=%u tail=%u\n",
                gtid, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
      return NULL;
    }
  }

  // Commit the pop: retreat the tail and decrement the count.
  thread_data->td.td_deque_tail = tail;
  TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);

  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d task %p removed: "
                "ntasks=%d head=%u tail=%u\n",
                gtid, taskdata, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));

  task = KMP_TASKDATA_TO_TASK(taskdata);
  return task;
}

// __kmp_steal_task: remove a task from another thread's deque
// Assume that calling thread has already checked existence of
// task_team thread_data before calling this routine.
static kmp_task_t *__kmp_steal_task(kmp_info_t *victim, kmp_int32 gtid,
                                    kmp_task_team_t *task_team,
                                    volatile kmp_int32 *unfinished_threads,
                                    int *thread_finished,
                                    kmp_int32 is_constrained) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_thread_data_t *victim_td, *threads_data;
  kmp_int32 victim_tid;

  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);

  threads_data = task_team->tt.tt_threads_data;
  KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition

  victim_tid = victim->th.th_info.ds.ds_tid;
  victim_td = &threads_data[victim_tid];

  KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
                "task_team=%p ntasks=%d "
                "head=%u tail=%u\n",
                gtid, __kmp_gtid_from_thread(victim), task_team,
                victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
                victim_td->td.td_deque_tail));

  // Cheap unlocked pre-check before paying for the victim's deque lock.
  if ((TCR_4(victim_td->td.td_deque_ntasks) ==
       0) || // Caller should not check this condition
      (TCR_PTR(victim->th.th_task_team) !=
       task_team)) // GEH: why would this happen?
  {
    KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
                  "task_team=%p "
                  "ntasks=%d head=%u tail=%u\n",
                  gtid, __kmp_gtid_from_thread(victim), task_team,
                  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
                  victim_td->td.td_deque_tail));
    return NULL;
  }

  __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);

  // Check again after we acquire the lock
  if ((TCR_4(victim_td->td.td_deque_ntasks) == 0) ||
      (TCR_PTR(victim->th.th_task_team) !=
       task_team)) // GEH: why would this happen?
  {
    __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
    KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
                  "task_team=%p "
                  "ntasks=%d head=%u tail=%u\n",
                  gtid, __kmp_gtid_from_thread(victim), task_team,
                  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
                  victim_td->td.td_deque_tail));
    return NULL;
  }

  KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);

  // Thieves take from the HEAD (oldest task); owners pop from the tail in
  // __kmp_remove_my_task().
  taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
  if (is_constrained) {
    // we need to check if the candidate obeys task scheduling constraint:
    // only descendant of current task can be scheduled
    kmp_taskdata_t *current = __kmp_threads[gtid]->th.th_current_task;
    kmp_int32 level = current->td_level;
    kmp_taskdata_t *parent = taskdata->td_parent;
    while (parent != current && parent->td_level > level) {
      parent = parent->td_parent; // check generation up to the level of the
      // current task
      KMP_DEBUG_ASSERT(parent != NULL);
    }
    if (parent != current) {
      // If the head task is not a descendant of the current task then do not
      // steal it. No other task in victim's deque can be a descendant of the
      // current task.
      __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
      // NOTE(review): this trace reuses the "exit #2" label of the previous
      // exit path; informational only.
      KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from "
                    "T#%d: task_team=%p "
                    "ntasks=%d head=%u tail=%u\n",
                    gtid,
                    __kmp_gtid_from_thread(threads_data[victim_tid].td.td_thr),
                    task_team, victim_td->td.td_deque_ntasks,
                    victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
      return NULL;
    }
  }
  // Bump head pointer and Wrap.
  victim_td->td.td_deque_head =
      (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
  if (*thread_finished) {
    // We need to un-mark this victim as a finished victim. This must be done
    // before releasing the lock, or else other threads (starting with the
    // master victim) might be prematurely released from the barrier!!!
    kmp_int32 count;

    count = KMP_TEST_THEN_INC32(CCAST(kmp_int32 *, unfinished_threads));

    KA_TRACE(
        20,
        ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
         gtid, count + 1, task_team));

    *thread_finished = FALSE;
  }
  TCW_4(victim_td->td.td_deque_ntasks,
        TCR_4(victim_td->td.td_deque_ntasks) - 1);

  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);

  KMP_COUNT_BLOCK(TASK_stolen);
  KA_TRACE(
      10,
      ("__kmp_steal_task(exit #3): T#%d stole task %p from T#%d: task_team=%p "
       "ntasks=%d head=%u tail=%u\n",
       gtid, taskdata, __kmp_gtid_from_thread(victim), task_team,
       victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
       victim_td->td.td_deque_tail));

  task = KMP_TASKDATA_TO_TASK(taskdata);
  return task;
}

// __kmp_execute_tasks_template: Choose and execute tasks until either the
// condition is satisfied (return true) or there are none left (return false).
//
// final_spin is TRUE if this is the spin at the release barrier.
// thread_finished indicates whether the thread is finished executing all
// the tasks it has on its deque, and is at the release barrier.
// spinner is the location on which to spin.
// spinner == NULL means only execute a single task and return.
// checker is the value to check to terminate the spin.
// Core task-execution loop, parameterized on the flag type C (32-bit, 64-bit
// or oncore) so the done_check() call is resolved without virtual dispatch.
// Strategy: drain the thread's own deque first, then steal -- preferring the
// last successful victim, otherwise probing random victims (waking sleepers).
template <class C>
static inline int __kmp_execute_tasks_template(
    kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  kmp_task_team_t *task_team = thread->th.th_task_team;
  kmp_thread_data_t *threads_data;
  kmp_task_t *task;
  kmp_info_t *other_thread;
  kmp_taskdata_t *current_task = thread->th.th_current_task;
  volatile kmp_int32 *unfinished_threads;
  // victim == -2 means "no victim chosen yet"; -1 means "no last victim".
  kmp_int32 nthreads, victim = -2, use_own_tasks = 1, new_victim = 0,
                      tid = thread->th.th_info.ds.ds_tid;

  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);

  if (task_team == NULL)
    return FALSE;

  KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
                "*thread_finished=%d\n",
                gtid, final_spin, *thread_finished));

  thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
  KMP_DEBUG_ASSERT(threads_data != NULL);

  nthreads = task_team->tt.tt_nproc;
  unfinished_threads = &(task_team->tt.tt_unfinished_threads);
#if OMP_45_ENABLED
  KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks);
#else
  KMP_DEBUG_ASSERT(nthreads > 1);
#endif
  KMP_DEBUG_ASSERT(TCR_4(*unfinished_threads) >= 0);

  while (1) { // Outer loop keeps trying to find tasks in case of single thread
    // getting tasks from target constructs
    while (1) { // Inner loop to find a task and execute it
      task = NULL;
      if (use_own_tasks) { // check on own queue first
        task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
      }
      if ((task == NULL) && (nthreads > 1)) { // Steal a task
        int asleep = 1;
        use_own_tasks = 0;
        // Try to steal from the last place I stole from successfully.
        if (victim == -2) { // haven't stolen anything yet
          victim = threads_data[tid].td.td_deque_last_stolen;
          if (victim !=
              -1) // if we have a last stolen from victim, get the thread
            other_thread = threads_data[victim].td.td_thr;
        }
        if (victim != -1) { // found last victim
          asleep = 0;
        } else if (!new_victim) { // no recent steals and we haven't already
          // used a new victim; select a random thread
          do { // Find a different thread to steal work from.
            // Pick a random thread. Initial plan was to cycle through all the
            // threads, and only return if we tried to steal from every thread,
            // and failed. Arch says that's not such a great idea.
            victim = __kmp_get_random(thread) % (nthreads - 1);
            if (victim >= tid) {
              ++victim; // Adjusts random distribution to exclude self
            }
            // Found a potential victim
            other_thread = threads_data[victim].td.td_thr;
            // There is a slight chance that __kmp_enable_tasking() did not wake
            // up all threads waiting at the barrier. If victim is sleeping,
            // then wake it up. Since we were going to pay the cache miss
            // penalty for referencing another thread's kmp_info_t struct
            // anyway,
            // the check shouldn't cost too much performance at this point. In
            // extra barrier mode, tasks do not sleep at the separate tasking
            // barrier, so this isn't a problem.
            asleep = 0;
            if ((__kmp_tasking_mode == tskm_task_teams) &&
                (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
                (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
                 NULL)) {
              asleep = 1;
              __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread),
                                        other_thread->th.th_sleep_loc);
              // A sleeping thread should not have any tasks on its queue.
              // There is a slight possibility that it resumes, steals a task
              // from another thread, which spawns more tasks, all in the time
              // that it takes this thread to check => don't write an assertion
              // that the victim's queue is empty. Try stealing from a
              // different thread.
            }
          } while (asleep);
        }

        if (!asleep) {
          // We have a victim to try to steal from
          task = __kmp_steal_task(other_thread, gtid, task_team,
                                  unfinished_threads, thread_finished,
                                  is_constrained);
        }
        if (task != NULL) { // set last stolen to victim
          if (threads_data[tid].td.td_deque_last_stolen != victim) {
            threads_data[tid].td.td_deque_last_stolen = victim;
            // The pre-refactored code did not try more than 1 successful new
            // victim, unless the last one generated more local tasks;
            // new_victim keeps track of this
            new_victim = 1;
          }
        } else { // No tasks found; unset last_stolen
          KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
          victim = -2; // no successful victim found
        }
      }

      if (task == NULL) // break out of tasking loop
        break;

// Found a task; execute it
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
        if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
          // get the object reliably
          itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
        }
        __kmp_itt_task_starting(itt_sync_obj);
      }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
      __kmp_invoke_task(gtid, task, current_task);
#if USE_ITT_BUILD
      if (itt_sync_obj != NULL)
        __kmp_itt_task_finished(itt_sync_obj);
#endif /* USE_ITT_BUILD */
      // If this thread is only partway through the barrier and the condition is
      // met, then return now, so that the barrier gather/release pattern can
      // proceed. If this thread is in the last spin loop in the barrier,
      // waiting to be released, we know that the termination condition will not
      // be satisfied, so don't waste any cycles checking it.
      if (flag == NULL || (!final_spin && flag->done_check())) {
        KA_TRACE(
            15,
            ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
             gtid));
        return TRUE;
      }
      if (thread->th.th_task_team == NULL) {
        break;
      }
      // Yield before executing next task
      KMP_YIELD(__kmp_library == library_throughput);
      // If execution of a stolen task results in more tasks being placed on our
      // run queue, reset use_own_tasks
      if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
        KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
                      "other tasks, restart\n",
                      gtid));
        use_own_tasks = 1;
        new_victim = 0;
      }
    }

// The task source has been exhausted. If in final spin loop of barrier, check
// if termination condition is satisfied.
#if OMP_45_ENABLED
    // The work queue may be empty but there might be proxy tasks still
    // executing
    if (final_spin && TCR_4(current_task->td_incomplete_child_tasks) == 0)
#else
    if (final_spin)
#endif
    {
      // First, decrement the #unfinished threads, if that has not already been
      // done. This decrement might be to the spin location, and result in the
      // termination condition being satisfied.
      if (!*thread_finished) {
        kmp_int32 count;

        count = KMP_TEST_THEN_DEC32(CCAST(kmp_int32 *, unfinished_threads)) - 1;
        KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
                      "unfinished_threads to %d task_team=%p\n",
                      gtid, count, task_team));
        *thread_finished = TRUE;
      }

      // It is now unsafe to reference thread->th.th_team !!!
      // Decrementing task_team->tt.tt_unfinished_threads can allow the master
      // thread to pass through the barrier, where it might reset each thread's
      // th.th_team field for the next parallel region. If we can steal more
      // work, we know that this has not happened yet.
      if (flag != NULL && flag->done_check()) {
        KA_TRACE(
            15,
            ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
             gtid));
        return TRUE;
      }
    }

    // If this thread's task team is NULL, master has recognized that there are
    // no more tasks; bail out
    if (thread->th.th_task_team == NULL) {
      KA_TRACE(15,
               ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
      return FALSE;
    }

#if OMP_45_ENABLED
    // We could be getting tasks from target constructs; if this is the only
    // thread, keep trying to execute tasks from own queue
    if (nthreads == 1)
      use_own_tasks = 1;
    else
#endif
    {
      KA_TRACE(15,
               ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
      return FALSE;
    }
  }
}

// Non-template entry points: thin wrappers instantiating the template for
// each flag type.
int __kmp_execute_tasks_32(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

int __kmp_execute_tasks_64(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

int __kmp_execute_tasks_oncore(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
    int
*thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

// __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
// next barrier so they can assist in executing enqueued tasks.
// First thread in allocates the task team atomically.
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
                                 kmp_info_t *this_thr) {
  kmp_thread_data_t *threads_data;
  int nthreads, i, is_init_thread;

  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
                __kmp_gtid_from_thread(this_thr)));

  KMP_DEBUG_ASSERT(task_team != NULL);
  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);

  nthreads = task_team->tt.tt_nproc;
  KMP_DEBUG_ASSERT(nthreads > 0);
  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);

  // Allocate or increase the size of threads_data if necessary
  is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);

  if (!is_init_thread) {
    // Some other thread already set up the array.
    KA_TRACE(
        20,
        ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
         __kmp_gtid_from_thread(this_thr)));
    return;
  }
  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
  KMP_DEBUG_ASSERT(threads_data != NULL);

  if ((__kmp_tasking_mode == tskm_task_teams) &&
      (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
    // Release any threads sleeping at the barrier, so that they can steal
    // tasks and execute them. In extra barrier mode, tasks do not sleep
    // at the separate tasking barrier, so this isn't a problem.
    for (i = 0; i < nthreads; i++) {
      volatile void *sleep_loc;
      kmp_info_t *thread = threads_data[i].td.td_thr;

      // Never try to wake ourselves.
      if (i == this_thr->th.th_info.ds.ds_tid) {
        continue;
      }
      // Since we haven't locked the thread's suspend mutex lock at this
      // point, there is a small window where a thread might be putting
      // itself to sleep, but hasn't set the th_sleep_loc field yet.
      // To work around this, __kmp_execute_tasks_template() periodically checks
      // see if other threads are sleeping (using the same random mechanism that
      // is used for task stealing) and awakens them if they are.
      if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
          NULL) {
        KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
                      __kmp_gtid_from_thread(this_thr),
                      __kmp_gtid_from_thread(thread)));
        __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
      } else {
        KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
                      __kmp_gtid_from_thread(this_thr),
                      __kmp_gtid_from_thread(thread)));
      }
    }
  }

  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
                __kmp_gtid_from_thread(this_thr)));
}

/* // TODO: Check the comment consistency
 * Utility routines for "task teams". A task team (kmp_task_t) is kind of
 * like a shadow of the kmp_team_t data struct, with a different lifetime.
 * After a child thread checks into a barrier and calls __kmp_release() from
 * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
 * longer assume that the kmp_team_t structure is intact (at any moment, the
 * master thread may exit the barrier code and free the team data structure,
 * and return the threads to the thread pool).
2419 * 2420 * This does not work with the the tasking code, as the thread is still 2421 * expected to participate in the execution of any tasks that may have been 2422 * spawned my a member of the team, and the thread still needs access to all 2423 * to each thread in the team, so that it can steal work from it. 2424 * 2425 * Enter the existence of the kmp_task_team_t struct. It employs a reference 2426 * counting mechanims, and is allocated by the master thread before calling 2427 * __kmp_<barrier_kind>_release, and then is release by the last thread to 2428 * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes 2429 * of the kmp_task_team_t structs for consecutive barriers can overlap 2430 * (and will, unless the master thread is the last thread to exit the barrier 2431 * release phase, which is not typical). 2432 * 2433 * The existence of such a struct is useful outside the context of tasking, 2434 * but for now, I'm trying to keep it specific to the OMP_30_ENABLED macro, 2435 * so that any performance differences show up when comparing the 2.5 vs. 3.0 2436 * libraries. 2437 * 2438 * We currently use the existence of the threads array as an indicator that 2439 * tasks were spawned since the last barrier. If the structure is to be 2440 * useful outside the context of tasking, then this will have to change, but 2441 * not settting the field minimizes the performance impact of tasking on 2442 * barriers, when no explicit tasks were spawned (pushed, actually). 2443 */ 2444 2445 static kmp_task_team_t *__kmp_free_task_teams = 2446 NULL; // Free list for task_team data structures 2447 // Lock for task team data structures 2448 static kmp_bootstrap_lock_t __kmp_task_team_lock = 2449 KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock); 2450 2451 // __kmp_alloc_task_deque: 2452 // Allocates a task deque for a particular thread, and initialize the necessary 2453 // data structures relating to the deque. 
// This only happens once per thread
// per task team since task teams are recycled. No lock is needed during
// allocation since each thread allocates its own deque.
static void __kmp_alloc_task_deque(kmp_info_t *thread,
                                   kmp_thread_data_t *thread_data) {
  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
  KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);

  // Initialize last stolen task field to "none"
  thread_data->td.td_deque_last_stolen = -1;

  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
  KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
  KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);

  KE_TRACE(
      10,
      ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
       __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
  // Allocate space for task deque, and zero the deque
  // Cannot use __kmp_thread_calloc() because threads not around for
  // kmp_reap_task_team( ).
  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
      INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
}

// __kmp_realloc_task_deque:
// Re-allocates a task deque for a particular thread, copies the content from
// the old deque and adjusts the necessary data structures relating to the
// deque. This operation must be done with the deque_lock being held
static void __kmp_realloc_task_deque(kmp_info_t *thread,
                                     kmp_thread_data_t *thread_data) {
  kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
  kmp_int32 new_size = 2 * size; // deque capacity always doubles

  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
                "%d] for thread_data %p\n",
                __kmp_gtid_from_thread(thread), size, new_size, thread_data));

  kmp_taskdata_t **new_deque =
      (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));

  int i, j;
  // Copy live entries starting at the old head so the new deque is packed
  // from index 0; indices wrap using the old mask.
  for (i = thread_data->td.td_deque_head, j = 0; j < size;
       i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
    new_deque[j] = thread_data->td.td_deque[i];

  __kmp_free(thread_data->td.td_deque);

  thread_data->td.td_deque_head = 0;
  thread_data->td.td_deque_tail = size;
  thread_data->td.td_deque = new_deque;
  thread_data->td.td_deque_size = new_size;
}

// __kmp_free_task_deque:
// Deallocates a task deque for a particular thread. Happens at library
// deallocation so don't need to reset all thread data fields.
2512 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) { 2513 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 2514 2515 if (thread_data->td.td_deque != NULL) { 2516 TCW_4(thread_data->td.td_deque_ntasks, 0); 2517 __kmp_free(thread_data->td.td_deque); 2518 thread_data->td.td_deque = NULL; 2519 } 2520 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2521 2522 #ifdef BUILD_TIED_TASK_STACK 2523 // GEH: Figure out what to do here for td_susp_tied_tasks 2524 if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) { 2525 __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data); 2526 } 2527 #endif // BUILD_TIED_TASK_STACK 2528 } 2529 2530 // __kmp_realloc_task_threads_data: 2531 // Allocates a threads_data array for a task team, either by allocating an 2532 // initial array or enlarging an existing array. Only the first thread to get 2533 // the lock allocs or enlarges the array and re-initializes the array eleemnts. 2534 // That thread returns "TRUE", the rest return "FALSE". 2535 // Assumes that the new array size is given by task_team -> tt.tt_nproc. 2536 // The current size is given by task_team -> tt.tt_max_threads. 2537 static int __kmp_realloc_task_threads_data(kmp_info_t *thread, 2538 kmp_task_team_t *task_team) { 2539 kmp_thread_data_t **threads_data_p; 2540 kmp_int32 nthreads, maxthreads; 2541 int is_init_thread = FALSE; 2542 2543 if (TCR_4(task_team->tt.tt_found_tasks)) { 2544 // Already reallocated and initialized. 2545 return FALSE; 2546 } 2547 2548 threads_data_p = &task_team->tt.tt_threads_data; 2549 nthreads = task_team->tt.tt_nproc; 2550 maxthreads = task_team->tt.tt_max_threads; 2551 2552 // All threads must lock when they encounter the first task of the implicit 2553 // task region to make sure threads_data fields are (re)initialized before 2554 // used. 
2555 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock); 2556 2557 if (!TCR_4(task_team->tt.tt_found_tasks)) { 2558 // first thread to enable tasking 2559 kmp_team_t *team = thread->th.th_team; 2560 int i; 2561 2562 is_init_thread = TRUE; 2563 if (maxthreads < nthreads) { 2564 2565 if (*threads_data_p != NULL) { 2566 kmp_thread_data_t *old_data = *threads_data_p; 2567 kmp_thread_data_t *new_data = NULL; 2568 2569 KE_TRACE( 2570 10, 2571 ("__kmp_realloc_task_threads_data: T#%d reallocating " 2572 "threads data for task_team %p, new_size = %d, old_size = %d\n", 2573 __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads)); 2574 // Reallocate threads_data to have more elements than current array 2575 // Cannot use __kmp_thread_realloc() because threads not around for 2576 // kmp_reap_task_team( ). Note all new array entries are initialized 2577 // to zero by __kmp_allocate(). 2578 new_data = (kmp_thread_data_t *)__kmp_allocate( 2579 nthreads * sizeof(kmp_thread_data_t)); 2580 // copy old data to new data 2581 KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t), 2582 (void *)old_data, maxthreads * sizeof(kmp_taskdata_t *)); 2583 2584 #ifdef BUILD_TIED_TASK_STACK 2585 // GEH: Figure out if this is the right thing to do 2586 for (i = maxthreads; i < nthreads; i++) { 2587 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 2588 __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data); 2589 } 2590 #endif // BUILD_TIED_TASK_STACK 2591 // Install the new data and free the old data 2592 (*threads_data_p) = new_data; 2593 __kmp_free(old_data); 2594 } else { 2595 KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating " 2596 "threads data for task_team %p, size = %d\n", 2597 __kmp_gtid_from_thread(thread), task_team, nthreads)); 2598 // Make the initial allocate for threads_data array, and zero entries 2599 // Cannot use __kmp_thread_calloc() because threads not around for 2600 // kmp_reap_task_team( ). 
2601 ANNOTATE_IGNORE_WRITES_BEGIN(); 2602 *threads_data_p = (kmp_thread_data_t *)__kmp_allocate( 2603 nthreads * sizeof(kmp_thread_data_t)); 2604 ANNOTATE_IGNORE_WRITES_END(); 2605 #ifdef BUILD_TIED_TASK_STACK 2606 // GEH: Figure out if this is the right thing to do 2607 for (i = 0; i < nthreads; i++) { 2608 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 2609 __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data); 2610 } 2611 #endif // BUILD_TIED_TASK_STACK 2612 } 2613 task_team->tt.tt_max_threads = nthreads; 2614 } else { 2615 // If array has (more than) enough elements, go ahead and use it 2616 KMP_DEBUG_ASSERT(*threads_data_p != NULL); 2617 } 2618 2619 // initialize threads_data pointers back to thread_info structures 2620 for (i = 0; i < nthreads; i++) { 2621 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 2622 thread_data->td.td_thr = team->t.t_threads[i]; 2623 2624 if (thread_data->td.td_deque_last_stolen >= nthreads) { 2625 // The last stolen field survives across teams / barrier, and the number 2626 // of threads may have changed. It's possible (likely?) that a new 2627 // parallel region will exhibit the same behavior as previous region. 2628 thread_data->td.td_deque_last_stolen = -1; 2629 } 2630 } 2631 2632 KMP_MB(); 2633 TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE); 2634 } 2635 2636 __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock); 2637 return is_init_thread; 2638 } 2639 2640 // __kmp_free_task_threads_data: 2641 // Deallocates a threads_data array for a task team, including any attached 2642 // tasking deques. Only occurs at library shutdown. 
static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
  if (task_team->tt.tt_threads_data != NULL) {
    int i;
    // Free each per-thread deque before releasing the array that holds them.
    for (i = 0; i < task_team->tt.tt_max_threads; i++) {
      __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
    }
    __kmp_free(task_team->tt.tt_threads_data);
    task_team->tt.tt_threads_data = NULL;
  }
  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
}

// __kmp_allocate_task_team:
// Allocates a task team associated with a specific team, taking it from
// the global task team free list if possible. Also initializes data
// structures.
static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
                                                 kmp_team_t *team) {
  kmp_task_team_t *task_team = NULL;
  int nthreads;

  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
                (thread ? __kmp_gtid_from_thread(thread) : -1), team));

  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
    // Take a task team from the task team pool.
    // The unlocked TCR_PTR read above is just a hint; re-check under the lock.
    __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
    if (__kmp_free_task_teams != NULL) {
      task_team = __kmp_free_task_teams;
      TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
      task_team->tt.tt_next = NULL;
    }
    __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
  }

  if (task_team == NULL) {
    KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
                  "task team for team %p\n",
                  __kmp_gtid_from_thread(thread), team));
    // Allocate a new task team if one is not available.
    // Cannot use __kmp_thread_malloc() because threads not around for
    // kmp_reap_task_team( ).
    task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
    __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
    // AC: __kmp_allocate zeroes returned memory
    // task_team -> tt.tt_threads_data = NULL;
    // task_team -> tt.tt_max_threads = 0;
    // task_team -> tt.tt_next = NULL;
  }

  // (Re)initialize the fields that must be reset for both fresh and recycled
  // task teams.
  TCW_4(task_team->tt.tt_found_tasks, FALSE);
#if OMP_45_ENABLED
  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
#endif
  task_team->tt.tt_nproc = nthreads = team->t.t_nproc;

  TCW_4(task_team->tt.tt_unfinished_threads, nthreads);
  TCW_4(task_team->tt.tt_active, TRUE);

  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
                "unfinished_threads init'd to %d\n",
                (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
                task_team->tt.tt_unfinished_threads));
  return task_team;
}

// __kmp_free_task_team:
// Frees the task team associated with a specific thread, and adds it
// to the global task team free list.
void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
  KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
                thread ? __kmp_gtid_from_thread(thread) : -1, task_team));

  // Put task team back on free list
  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);

  KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
  task_team->tt.tt_next = __kmp_free_task_teams;
  TCW_PTR(__kmp_free_task_teams, task_team);

  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
}

// __kmp_reap_task_teams:
// Free all the task teams on the task team free list.
// Should only be done during library shutdown.
// Cannot do anything that needs a thread structure or gtid since they are
// already gone.
void __kmp_reap_task_teams(void) {
  kmp_task_team_t *task_team;

  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
    // Free all task_teams on the free list
    __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
    while ((task_team = __kmp_free_task_teams) != NULL) {
      __kmp_free_task_teams = task_team->tt.tt_next;
      task_team->tt.tt_next = NULL;

      // Free threads_data if necessary
      if (task_team->tt.tt_threads_data != NULL) {
        __kmp_free_task_threads_data(task_team);
      }
      __kmp_free(task_team);
    }
    __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
  }
}

// __kmp_wait_to_unref_task_teams:
// Some threads could still be in the fork barrier release code, possibly
// trying to steal tasks. Wait for each thread to unreference its task team.
void __kmp_wait_to_unref_task_teams(void) {
  kmp_info_t *thread;
  kmp_uint32 spins;
  int done;

  KMP_INIT_YIELD(spins);

  for (;;) {
    done = TRUE;

    // TODO: GEH - this may be wrong because some sync would be necessary
    // in case threads are added to the pool during the traversal. Need to
    // verify that lock for thread pool is held when calling this routine.
    for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
         thread = thread->th.th_next_pool) {
#if KMP_OS_WINDOWS
      DWORD exit_val;
#endif
      if (TCR_PTR(thread->th.th_task_team) == NULL) {
        KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
                      __kmp_gtid_from_thread(thread)));
        continue;
      }
#if KMP_OS_WINDOWS
      // TODO: GEH - add this check for Linux* OS / OS X* as well?
      if (!__kmp_is_thread_alive(thread, &exit_val)) {
        thread->th.th_task_team = NULL;
        continue;
      }
#endif

      done = FALSE; // Because th_task_team pointer is not NULL for this thread

      KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
                    "unreference task_team\n",
                    __kmp_gtid_from_thread(thread)));

      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
        volatile void *sleep_loc;
        // If the thread is sleeping, awaken it.
        if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
            NULL) {
          KA_TRACE(
              10,
              ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
               __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
          __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
        }
      }
    }
    if (done) {
      break;
    }

    // If we are oversubscribed, or have waited a bit (and library mode is
    // throughput), yield. Pause is in the following code.
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins); // Yields only if KMP_LIBRARY=throughput
  }
}

// __kmp_task_team_setup: Create a task_team for the current team, but use
// an already created, unused one if it already exists.
void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);

  // If this task_team hasn't been created yet, allocate it. It will be used in
  // the region after the next.
  // If it exists, it is the current task team and shouldn't be touched yet as
  // it may still be in use.
  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
      (always || team->t.t_nproc > 1)) {
    team->t.t_task_team[this_thr->th.th_task_state] =
        __kmp_allocate_task_team(this_thr, team);
    KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p "
                  "for team %d at parity=%d\n",
                  __kmp_gtid_from_thread(this_thr),
                  team->t.t_task_team[this_thr->th.th_task_state],
                  ((team != NULL) ? team->t.t_id : -1),
                  this_thr->th.th_task_state));
  }

  // After threads exit the release, they will call sync, and then point to this
  // other task_team; make sure it is allocated and properly initialized. As
  // threads spin in the barrier release phase, they will continue to use the
  // previous task_team struct(above), until they receive the signal to stop
  // checking for tasks (they can't safely reference the kmp_team_t struct,
  // which could be reallocated by the master thread). No task teams are formed
  // for serialized teams.
  if (team->t.t_nproc > 1) {
    int other_team = 1 - this_thr->th.th_task_state;
    if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
      team->t.t_task_team[other_team] =
          __kmp_allocate_task_team(this_thr, team);
      KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new "
                    "task_team %p for team %d at parity=%d\n",
                    __kmp_gtid_from_thread(this_thr),
                    team->t.t_task_team[other_team],
                    ((team != NULL) ? team->t.t_id : -1), other_team));
    } else { // Leave the old task team struct in place for the upcoming region;
      // adjust as needed
      kmp_task_team_t *task_team = team->t.t_task_team[other_team];
      if (!task_team->tt.tt_active ||
          team->t.t_nproc != task_team->tt.tt_nproc) {
        TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
        TCW_4(task_team->tt.tt_found_tasks, FALSE);
#if OMP_45_ENABLED
        TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
#endif
        TCW_4(task_team->tt.tt_unfinished_threads, team->t.t_nproc);
        TCW_4(task_team->tt.tt_active, TRUE);
      }
      // if team size has changed, the first thread to enable tasking will
      // realloc threads_data if necessary
      KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team "
                    "%p for team %d at parity=%d\n",
                    __kmp_gtid_from_thread(this_thr),
                    team->t.t_task_team[other_team],
                    ((team != NULL) ? team->t.t_id : -1), other_team));
    }
  }
}

// __kmp_task_team_sync: Propagation of task team data from team to threads
// which happens just after the release phase of a team barrier. This may be
// called by any thread, but only for teams with # threads > 1.
void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);

  // Toggle the th_task_state field, to switch which task_team this thread
  // refers to
  this_thr->th.th_task_state = 1 - this_thr->th.th_task_state;
  // It is now safe to propagate the task team pointer from the team struct to
  // the current thread.
  TCW_PTR(this_thr->th.th_task_team,
          team->t.t_task_team[this_thr->th.th_task_state]);
  KA_TRACE(20,
           ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
            "%p from Team #%d (parity=%d)\n",
            __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
            ((team != NULL) ? team->t.t_id : -1), this_thr->th.th_task_state));
}

// __kmp_task_team_wait: Master thread waits for outstanding tasks after the
// barrier gather phase. Only called by master thread if #threads in team > 1 or
// if proxy tasks were created.
//
// wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
// by passing in 0 optionally as the last argument. When wait is zero, master
// thread does not wait for unfinished_threads to reach 0.
void __kmp_task_team_wait(
    kmp_info_t *this_thr,
    kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
  kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];

  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);

  if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
    if (wait) {
      KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks "
                    "(for unfinished_threads to reach 0) on task_team = %p\n",
                    __kmp_gtid_from_thread(this_thr), task_team));
      // Worker threads may have dropped through to release phase, but could
      // still be executing tasks. Wait here for tasks to complete. To avoid
      // memory contention, only master thread checks termination condition.
      kmp_flag_32 flag(
          RCAST(volatile kmp_uint32 *, &task_team->tt.tt_unfinished_threads),
          0U);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    // Deactivate the old task team, so that the worker threads will stop
    // referencing it while spinning.
    KA_TRACE(
        20,
        ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: "
         "setting active to false, setting local and team's pointer to NULL\n",
         __kmp_gtid_from_thread(this_thr), task_team));
#if OMP_45_ENABLED
    KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
                     task_team->tt.tt_found_proxy_tasks == TRUE);
    TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
#else
    KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1);
#endif
    // Order the deactivation write before clearing the local pointer.
    TCW_SYNC_4(task_team->tt.tt_active, FALSE);
    KMP_MB();

    TCW_PTR(this_thr->th.th_task_team, NULL);
  }
}

// __kmp_tasking_barrier:
// This routine may only called when __kmp_tasking_mode == tskm_extra_barrier.
// Internal function to execute all tasks prior to a regular barrier or a join
// barrier. It is a full barrier itself, which unfortunately turns regular
// barriers into double barriers and join barriers into 1 1/2 barriers.
void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
  volatile kmp_uint32 *spin = RCAST(
      volatile kmp_uint32 *,
      &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
  int flag = FALSE;
  KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);

#if USE_ITT_BUILD
  KMP_FSYNC_SPIN_INIT(spin, (kmp_uint32 *)NULL);
#endif /* USE_ITT_BUILD */
  kmp_flag_32 spin_flag(spin, 0U);
  while (!spin_flag.execute_tasks(thread, gtid, TRUE,
                                  &flag USE_ITT_BUILD_ARG(NULL), 0)) {
#if USE_ITT_BUILD
    // TODO: What about itt_sync_obj??
    KMP_FSYNC_SPIN_PREPARE(CCAST(void *, RCAST(volatile void *, spin)));
#endif /* USE_ITT_BUILD */

    if (TCR_4(__kmp_global.g.g_done)) {
      if (__kmp_global.g.g_abort)
        __kmp_abort_thread();
      break;
    }
    KMP_YIELD(TRUE); // GH: We always yield here
  }
#if USE_ITT_BUILD
  KMP_FSYNC_SPIN_ACQUIRED(CCAST(void *, RCAST(volatile void *, spin)));
#endif /* USE_ITT_BUILD */
}

#if OMP_45_ENABLED

// __kmp_give_task puts a task into a given thread queue if:
// - the queue for that thread was created
// - there's space in that queue
// Because of this, __kmp_push_task needs to check if there's space after
// getting the lock
static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
                            kmp_int32 pass) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_task_team_t *task_team = taskdata->td_task_team;

  KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
                taskdata, tid));

  // If task_team is NULL something went really bad...
  KMP_DEBUG_ASSERT(task_team != NULL);

  bool result = false;
  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];

  if (thread_data->td.td_deque == NULL) {
    // There's no queue in this thread, go find another one
    // We're guaranteed that at least one thread has a queue
    KA_TRACE(30,
             ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
              tid, taskdata));
    return result;
  }

  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    KA_TRACE(
        30,
        ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
         taskdata, tid));

    // if this deque is bigger than the pass ratio give a chance to another
    // thread
    if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
      return result;

    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
    __kmp_realloc_task_deque(thread, thread_data);

  } else {

    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);

    // Re-check fullness under the lock: another thread may have filled the
    // deque between the unlocked check above and acquiring the lock.
    if (TCR_4(thread_data->td.td_deque_ntasks) >=
        TASK_DEQUE_SIZE(thread_data->td)) {
      KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
                    "thread %d.\n",
                    taskdata, tid));

      // if this deque is bigger than the pass ratio give a chance to another
      // thread
      if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
        goto release_and_exit;

      __kmp_realloc_task_deque(thread, thread_data);
    }
  }

  // lock is held here, and there is space in the deque

  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
  // Wrap index.
  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1);

  result = true;
  KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
                taskdata, tid));

release_and_exit:
  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  return result;
}

/* The finish of the proxy tasks is divided in two pieces:
   - the top half is the one that can be done from a thread outside the team
   - the bottom half must be run from a thread within the team

   In order to run the bottom half the task gets queued back into one of the
   threads of the team. Once the td_incomplete_child_task counter of the parent
   is decremented the threads can leave the barriers. So, the bottom half needs
   to be queued before the counter is decremented. The top half is therefore
   divided in two parts:
   - things that can be run before queuing the bottom half
   - things that must be run after queuing the bottom half

   This creates a second race as the bottom half can free the task before the
   second top half is executed. To avoid this we use the
   td_incomplete_child_task of the proxy task to synchronize the top and bottom
   half.
*/
static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  taskdata->td_flags.complete = 1; // mark the task as completed

  if (taskdata->td_taskgroup)
    KMP_TEST_THEN_DEC32(&taskdata->td_taskgroup->count);

  // Create an imaginary child for this task so the bottom half cannot
  // release the task before we have completed the second top half
  TCI_4(taskdata->td_incomplete_child_tasks);
}

static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
  kmp_int32 children = 0;

  // Predecrement simulated by "- 1" calculation
  children =
      KMP_TEST_THEN_DEC32(
          CCAST(kmp_int32 *, &taskdata->td_parent->td_incomplete_child_tasks)) -
      1;
  KMP_DEBUG_ASSERT(children >= 0);

  // Remove the imaginary child added by the first top half
  TCD_4(taskdata->td_incomplete_child_tasks);
}

static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
  kmp_info_t *thread = __kmp_threads[gtid];

  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
                   1); // top half must run before bottom half

  // We need to wait to make sure the top half is finished
  // Spinning here should be ok as this should happen quickly
  while (TCR_4(taskdata->td_incomplete_child_tasks) > 0)
    ;

  __kmp_release_deps(gtid, taskdata);
  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
}

/*!
@ingroup TASKING
@param gtid Global Thread ID of encountering thread
@param ptask Task which execution is completed

Execute the completion of a proxy task from a thread that is part of the
team. Run first and bottom halves directly.
*/
void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
  KMP_DEBUG_ASSERT(ptask != NULL);
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
  KA_TRACE(
      10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
           gtid, taskdata));

  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);

  __kmp_first_top_half_finish_proxy(taskdata);
  __kmp_second_top_half_finish_proxy(taskdata);
  __kmp_bottom_half_finish_proxy(gtid, ptask);

  KA_TRACE(10,
           ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
            gtid, taskdata));
}

/*!
@ingroup TASKING
@param ptask Task which execution is completed

Execute the completion of a proxy task from a thread that could not belong to
the team.
*/
void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
  KMP_DEBUG_ASSERT(ptask != NULL);
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);

  KA_TRACE(
      10,
      ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
       taskdata));

  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);

  __kmp_first_top_half_finish_proxy(taskdata);

  // Enqueue task to complete bottom half completion from a thread within the
  // corresponding team
  kmp_team_t *team = taskdata->td_team;
  kmp_int32 nthreads = team->t.t_nproc;
  kmp_info_t *thread;

  // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
  // but we cannot use __kmp_get_random here
  kmp_int32 start_k = 0;
  kmp_int32 pass = 1;
  kmp_int32 k = start_k;

  do {
    // For now we're just linearly trying to find a thread
    // NOTE(review): 'thread' is t_threads[k] from *before* the increment,
    // while the tid passed to __kmp_give_task below is the post-increment
    // index -- the two refer to different threads; verify this pairing.
    thread = team->t.t_threads[k];
    k = (k + 1) % nthreads;

    // we did a full pass through all the threads
    if (k == start_k)
      pass = pass << 1;

  } while (!__kmp_give_task(thread, k, ptask, pass));

  __kmp_second_top_half_finish_proxy(taskdata);

  KA_TRACE(
      10,
      ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
       taskdata));
}

// __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
// for taskloop
//
// thread: allocating thread
// task_src: pointer to source task to be duplicated
// returns: a pointer to the allocated kmp_task_t structure (task).
kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_taskdata_t *taskdata_src;
  kmp_taskdata_t *parent_task = thread->th.th_current_task;
  size_t shareds_offset;
  size_t task_size;

  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
                task_src));
  taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
                   TASK_FULL); // it should not be proxy task
  KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
  task_size = taskdata_src->td_size_alloc;

  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
  KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
                task_size));
#if USE_FAST_MEMORY
  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
#else
  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
#endif /* USE_FAST_MEMORY */
  KMP_MEMCPY(taskdata, taskdata_src, task_size);

  task = KMP_TASKDATA_TO_TASK(taskdata);

  // Initialize new task (only specific fields not affected by memcpy)
  taskdata->td_task_id = KMP_GEN_TASK_ID();
  if (task->shareds != NULL) { // need setup shareds pointer
    // Shareds live inside the same allocation; recompute the pointer relative
    // to the new block using the source's offset.
    shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
    task->shareds = &((char *)taskdata)[shareds_offset];
    KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
                     0);
  }
  taskdata->td_alloc_thread = thread;
  taskdata->td_taskgroup =
      parent_task
          ->td_taskgroup; // task inherits the taskgroup from the parent task

  // Only need to keep track of child task counts if team parallel and tasking
  // not serialized
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
    KMP_TEST_THEN_INC32(
        CCAST(kmp_int32 *, &parent_task->td_incomplete_child_tasks));
    if (parent_task->td_taskgroup)
      KMP_TEST_THEN_INC32(&parent_task->td_taskgroup->count);
    // Only need to keep track of allocated child tasks for explicit tasks since
    // implicit not deallocated
    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
      KMP_TEST_THEN_INC32(
          CCAST(kmp_int32 *, &taskdata->td_parent->td_allocated_child_tasks));
  }

  KA_TRACE(20,
           ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
            thread, taskdata, taskdata->td_parent));
#if OMPT_SUPPORT
  __kmp_task_init_ompt(taskdata, thread->th.th_info.ds.ds_gtid,
                       (void *)task->routine);
#endif
  return task;
}

// Routine optionally generated by the compiler for setting the lastprivate flag
// and calling needed constructors for private/firstprivate objects
// (used to form taskloop tasks from pattern task)
typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);

// __kmp_taskloop_linear: Start tasks of the taskloop linearly
//
// loc       Source location information
// gtid      Global thread ID
// task      Task with whole loop iteration range
// lb        Pointer to loop lower bound
// ub        Pointer to loop upper bound
// st        Loop stride
// sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
// grainsize Schedule value if specified
// task_dup  Tasks duplication routine
void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
                           kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                           int sched, kmp_uint64 grainsize, void *task_dup) {
  KMP_COUNT_BLOCK(OMP_TASKLOOP);
  KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
  kmp_uint64 tc;
  kmp_uint64 lower = *lb; // compiler provides global bounds here
  kmp_uint64 upper = *ub;
  kmp_uint64 i, num_tasks = 0, extras = 0;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *current_task = thread->th.th_current_task;
  kmp_task_t *next_task;
  kmp_int32 lastpriv = 0;
  size_t lower_offset =
      (char *)lb - (char *)task; // remember offset of lb in the task structure
  size_t upper_offset =
      (char *)ub - (char *)task; // remember offset of ub in the task structure

  // compute trip count
  if (st == 1) { // most common case
    tc = upper - lower + 1;
  } else if (st < 0) {
    tc = (lower - upper) / (-st) + 1;
  } else { // st > 0
    tc = (upper - lower) / st + 1;
  }
  if (tc == 0) {
    KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d zero-trip loop\n", gtid));
    // free the pattern task and exit
    __kmp_task_start(gtid, task, current_task);
    // do not execute anything for zero-trip loop
    __kmp_task_finish(gtid, task, current_task);
    return;
  }

  // compute num_tasks/grainsize based on the input provided
  switch (sched) {
  case 0: // no schedule clause specified, we can choose the default
    // let's try to schedule (team_size*10) tasks
    grainsize = thread->th.th_team_nproc * 10;
    // intentional fall through: the default is handled like an explicit
    // num_tasks request with the value computed above
  case 2: // num_tasks provided
    if (grainsize > tc) {
      num_tasks = tc; // too big num_tasks requested, adjust values
      grainsize = 1;
      extras = 0;
    } else {
      num_tasks = grainsize;
      grainsize = tc / num_tasks;
      extras = tc % num_tasks;
    }
    break;
  case 1: // grainsize provided
    if (grainsize > tc) {
      num_tasks = 1; // too big grainsize requested, adjust values
      grainsize = tc;
      extras = 0;
    } else {
      num_tasks = tc / grainsize;
      grainsize =
          tc /
          num_tasks; // adjust grainsize for balanced distribution of iterations
      extras = tc % num_tasks;
    }
    break;
  default:
    KMP_ASSERT2(0, "unknown scheduling of taskloop");
  }
  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
  KMP_DEBUG_ASSERT(num_tasks > extras);
  KMP_DEBUG_ASSERT(num_tasks > 0);
  KA_TRACE(20, ("__kmpc_taskloop: T#%d will launch: num_tasks %lld, grainsize "
                "%lld, extras %lld\n",
                gtid, num_tasks, grainsize, extras));

  // Main loop, launch num_tasks tasks, assign grainsize iterations each task
  for (i = 0; i < num_tasks; ++i) {
    kmp_uint64 chunk_minus_1;
    if (extras == 0) {
      chunk_minus_1 = grainsize - 1;
    } else {
      chunk_minus_1 = grainsize;
      --extras; // first extras iterations get bigger chunk (grainsize+1)
    }
    upper = lower + st * chunk_minus_1;
    if (i == num_tasks - 1) {
      // schedule the last task, set lastprivate flag
      lastpriv = 1;
#if KMP_DEBUG
      if (st == 1)
        KMP_DEBUG_ASSERT(upper == *ub);
      else if (st > 0)
        KMP_DEBUG_ASSERT(upper + st > *ub);
      else
        KMP_DEBUG_ASSERT(upper + st < *ub);
#endif
    }
    next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
    *(kmp_uint64 *)((char *)next_task + lower_offset) =
        lower; // adjust task-specific bounds
    *(kmp_uint64 *)((char *)next_task + upper_offset) = upper;
    if (ptask_dup != NULL)
      ptask_dup(next_task, task,
                lastpriv); // set lastprivate flag, construct firstprivates, etc.
    KA_TRACE(20, ("__kmpc_taskloop: T#%d schedule task %p: lower %lld, upper "
                  "%lld (offsets %p %p)\n",
                  gtid, next_task, lower, upper, lower_offset, upper_offset));
    __kmp_omp_task(gtid, next_task, true); // schedule new task
    lower = upper + st; // adjust lower bound for the next iteration
  }
  // free the pattern task and exit
  __kmp_task_start(gtid, task, current_task);
  // do not execute the pattern task, just do bookkeeping
  __kmp_task_finish(gtid, task, current_task);
}

/*!
3409 @ingroup TASKING 3410 @param loc Source location information 3411 @param gtid Global thread ID 3412 @param task Task structure 3413 @param if_val Value of the if clause 3414 @param lb Pointer to loop lower bound 3415 @param ub Pointer to loop upper bound 3416 @param st Loop stride 3417 @param nogroup Flag, 1 if nogroup clause specified, 0 otherwise 3418 @param sched Schedule specified 0/1/2 for none/grainsize/num_tasks 3419 @param grainsize Schedule value if specified 3420 @param task_dup Tasks duplication routine 3421 3422 Execute the taskloop construct. 3423 */ 3424 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, 3425 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, 3426 int sched, kmp_uint64 grainsize, void *task_dup) { 3427 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 3428 KMP_DEBUG_ASSERT(task != NULL); 3429 3430 KA_TRACE(10, ("__kmpc_taskloop(enter): T#%d, pattern task %p, lb %lld ub " 3431 "%lld st %lld, grain %llu(%d)\n", 3432 gtid, taskdata, *lb, *ub, st, grainsize, sched)); 3433 3434 // check if clause value first 3435 if (if_val == 0) { // if(0) specified, mark task as serial 3436 taskdata->td_flags.task_serial = 1; 3437 taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied 3438 } 3439 if (nogroup == 0) { 3440 __kmpc_taskgroup(loc, gtid); 3441 } 3442 3443 if (1 /* AC: use some heuristic here to choose task scheduling method */) { 3444 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, sched, grainsize, 3445 task_dup); 3446 } 3447 3448 if (nogroup == 0) { 3449 __kmpc_end_taskgroup(loc, gtid); 3450 } 3451 KA_TRACE(10, ("__kmpc_taskloop(exit): T#%d\n", gtid)); 3452 } 3453 3454 #endif 3455