1 /* 2 * kmp_tasking.cpp -- OpenMP 3.0 tasking support. 3 */ 4 5 6 //===----------------------------------------------------------------------===// 7 // 8 // The LLVM Compiler Infrastructure 9 // 10 // This file is dual licensed under the MIT and the University of Illinois Open 11 // Source Licenses. See LICENSE.txt for details. 12 // 13 //===----------------------------------------------------------------------===// 14 15 16 #include "kmp.h" 17 #include "kmp_i18n.h" 18 #include "kmp_itt.h" 19 #include "kmp_stats.h" 20 #include "kmp_wait_release.h" 21 22 #if OMPT_SUPPORT 23 #include "ompt-specific.h" 24 #endif 25 26 #include "tsan_annotations.h" 27 28 /* forward declaration */ 29 static void __kmp_enable_tasking(kmp_task_team_t *task_team, 30 kmp_info_t *this_thr); 31 static void __kmp_alloc_task_deque(kmp_info_t *thread, 32 kmp_thread_data_t *thread_data); 33 static int __kmp_realloc_task_threads_data(kmp_info_t *thread, 34 kmp_task_team_t *task_team); 35 36 #ifdef OMP_45_ENABLED 37 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask); 38 #endif 39 40 #ifdef BUILD_TIED_TASK_STACK 41 42 // __kmp_trace_task_stack: print the tied tasks from the task stack in order 43 // from top do bottom 44 // 45 // gtid: global thread identifier for thread containing stack 46 // thread_data: thread data for task team thread containing stack 47 // threshold: value above which the trace statement triggers 48 // location: string identifying call site of this function (for trace) 49 static void __kmp_trace_task_stack(kmp_int32 gtid, 50 kmp_thread_data_t *thread_data, 51 int threshold, char *location) { 52 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 53 kmp_taskdata_t **stack_top = task_stack->ts_top; 54 kmp_int32 entries = task_stack->ts_entries; 55 kmp_taskdata_t *tied_task; 56 57 KA_TRACE( 58 threshold, 59 ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, " 60 "first_block = %p, stack_top = %p \n", 61 location, 
gtid, entries, task_stack->ts_first_block, stack_top)); 62 63 KMP_DEBUG_ASSERT(stack_top != NULL); 64 KMP_DEBUG_ASSERT(entries > 0); 65 66 while (entries != 0) { 67 KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]); 68 // fix up ts_top if we need to pop from previous block 69 if (entries & TASK_STACK_INDEX_MASK == 0) { 70 kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top); 71 72 stack_block = stack_block->sb_prev; 73 stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE]; 74 } 75 76 // finish bookkeeping 77 stack_top--; 78 entries--; 79 80 tied_task = *stack_top; 81 82 KMP_DEBUG_ASSERT(tied_task != NULL); 83 KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED); 84 85 KA_TRACE(threshold, 86 ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, " 87 "stack_top=%p, tied_task=%p\n", 88 location, gtid, entries, stack_top, tied_task)); 89 } 90 KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]); 91 92 KA_TRACE(threshold, 93 ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n", 94 location, gtid)); 95 } 96 97 // __kmp_init_task_stack: initialize the task stack for the first time 98 // after a thread_data structure is created. 99 // It should not be necessary to do this again (assuming the stack works). 
100 // 101 // gtid: global thread identifier of calling thread 102 // thread_data: thread data for task team thread containing stack 103 static void __kmp_init_task_stack(kmp_int32 gtid, 104 kmp_thread_data_t *thread_data) { 105 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 106 kmp_stack_block_t *first_block; 107 108 // set up the first block of the stack 109 first_block = &task_stack->ts_first_block; 110 task_stack->ts_top = (kmp_taskdata_t **)first_block; 111 memset((void *)first_block, '\0', 112 TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *)); 113 114 // initialize the stack to be empty 115 task_stack->ts_entries = TASK_STACK_EMPTY; 116 first_block->sb_next = NULL; 117 first_block->sb_prev = NULL; 118 } 119 120 // __kmp_free_task_stack: free the task stack when thread_data is destroyed. 121 // 122 // gtid: global thread identifier for calling thread 123 // thread_data: thread info for thread containing stack 124 static void __kmp_free_task_stack(kmp_int32 gtid, 125 kmp_thread_data_t *thread_data) { 126 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 127 kmp_stack_block_t *stack_block = &task_stack->ts_first_block; 128 129 KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY); 130 // free from the second block of the stack 131 while (stack_block != NULL) { 132 kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL; 133 134 stack_block->sb_next = NULL; 135 stack_block->sb_prev = NULL; 136 if (stack_block != &task_stack->ts_first_block) { 137 __kmp_thread_free(thread, 138 stack_block); // free the block, if not the first 139 } 140 stack_block = next_block; 141 } 142 // initialize the stack to be empty 143 task_stack->ts_entries = 0; 144 task_stack->ts_top = NULL; 145 } 146 147 // __kmp_push_task_stack: Push the tied task onto the task stack. 148 // Grow the stack if necessary by allocating another block. 
149 // 150 // gtid: global thread identifier for calling thread 151 // thread: thread info for thread containing stack 152 // tied_task: the task to push on the stack 153 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread, 154 kmp_taskdata_t *tied_task) { 155 // GEH - need to consider what to do if tt_threads_data not allocated yet 156 kmp_thread_data_t *thread_data = 157 &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)]; 158 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 159 160 if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) { 161 return; // Don't push anything on stack if team or team tasks are serialized 162 } 163 164 KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED); 165 KMP_DEBUG_ASSERT(task_stack->ts_top != NULL); 166 167 KA_TRACE(20, 168 ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n", 169 gtid, thread, tied_task)); 170 // Store entry 171 *(task_stack->ts_top) = tied_task; 172 173 // Do bookkeeping for next push 174 task_stack->ts_top++; 175 task_stack->ts_entries++; 176 177 if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) { 178 // Find beginning of this task block 179 kmp_stack_block_t *stack_block = 180 (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE); 181 182 // Check if we already have a block 183 if (stack_block->sb_next != 184 NULL) { // reset ts_top to beginning of next block 185 task_stack->ts_top = &stack_block->sb_next->sb_block[0]; 186 } else { // Alloc new block and link it up 187 kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc( 188 thread, sizeof(kmp_stack_block_t)); 189 190 task_stack->ts_top = &new_block->sb_block[0]; 191 stack_block->sb_next = new_block; 192 new_block->sb_prev = stack_block; 193 new_block->sb_next = NULL; 194 195 KA_TRACE( 196 30, 197 ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n", 198 gtid, tied_task, new_block)); 199 } 200 } 201 KA_TRACE(20, 
("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid, 202 tied_task)); 203 } 204 205 // __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return 206 // the task, just check to make sure it matches the ending task passed in. 207 // 208 // gtid: global thread identifier for the calling thread 209 // thread: thread info structure containing stack 210 // tied_task: the task popped off the stack 211 // ending_task: the task that is ending (should match popped task) 212 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread, 213 kmp_taskdata_t *ending_task) { 214 // GEH - need to consider what to do if tt_threads_data not allocated yet 215 kmp_thread_data_t *thread_data = 216 &thread->th.th_task_team->tt_threads_data[__kmp_tid_from_gtid(gtid)]; 217 kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; 218 kmp_taskdata_t *tied_task; 219 220 if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) { 221 // Don't pop anything from stack if team or team tasks are serialized 222 return; 223 } 224 225 KMP_DEBUG_ASSERT(task_stack->ts_top != NULL); 226 KMP_DEBUG_ASSERT(task_stack->ts_entries > 0); 227 228 KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid, 229 thread)); 230 231 // fix up ts_top if we need to pop from previous block 232 if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) { 233 kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top); 234 235 stack_block = stack_block->sb_prev; 236 task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE]; 237 } 238 239 // finish bookkeeping 240 task_stack->ts_top--; 241 task_stack->ts_entries--; 242 243 tied_task = *(task_stack->ts_top); 244 245 KMP_DEBUG_ASSERT(tied_task != NULL); 246 KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED); 247 KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly 248 249 KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid, 
250 tied_task)); 251 return; 252 } 253 #endif /* BUILD_TIED_TASK_STACK */ 254 255 // __kmp_push_task: Add a task to the thread's deque 256 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) { 257 kmp_info_t *thread = __kmp_threads[gtid]; 258 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 259 kmp_task_team_t *task_team = thread->th.th_task_team; 260 kmp_int32 tid = __kmp_tid_from_gtid(gtid); 261 kmp_thread_data_t *thread_data; 262 263 KA_TRACE(20, 264 ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata)); 265 266 if (taskdata->td_flags.tiedness == TASK_UNTIED) { 267 // untied task needs to increment counter so that the task structure is not 268 // freed prematurely 269 kmp_int32 counter = 1 + KMP_TEST_THEN_INC32(&taskdata->td_untied_count); 270 KA_TRACE( 271 20, 272 ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n", 273 gtid, counter, taskdata)); 274 } 275 276 // The first check avoids building task_team thread data if serialized 277 if (taskdata->td_flags.task_serial) { 278 KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning " 279 "TASK_NOT_PUSHED for task %p\n", 280 gtid, taskdata)); 281 return TASK_NOT_PUSHED; 282 } 283 284 // Now that serialized tasks have returned, we can assume that we are not in 285 // immediate exec mode 286 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 287 if (!KMP_TASKING_ENABLED(task_team)) { 288 __kmp_enable_tasking(task_team, thread); 289 } 290 KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE); 291 KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL); 292 293 // Find tasking deque specific to encountering thread 294 thread_data = &task_team->tt.tt_threads_data[tid]; 295 296 // No lock needed since only owner can allocate 297 if (thread_data->td.td_deque == NULL) { 298 __kmp_alloc_task_deque(thread, thread_data); 299 } 300 301 // Check if deque is full 302 if (TCR_4(thread_data->td.td_deque_ntasks) >= 303 
TASK_DEQUE_SIZE(thread_data->td)) { 304 KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning " 305 "TASK_NOT_PUSHED for task %p\n", 306 gtid, taskdata)); 307 return TASK_NOT_PUSHED; 308 } 309 310 // Lock the deque for the task push operation 311 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 312 313 #if OMP_45_ENABLED 314 // Need to recheck as we can get a proxy task from a thread outside of OpenMP 315 if (TCR_4(thread_data->td.td_deque_ntasks) >= 316 TASK_DEQUE_SIZE(thread_data->td)) { 317 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 318 KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; returning " 319 "TASK_NOT_PUSHED for task %p\n", 320 gtid, taskdata)); 321 return TASK_NOT_PUSHED; 322 } 323 #else 324 // Must have room since no thread can add tasks but calling thread 325 KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) < 326 TASK_DEQUE_SIZE(thread_data->td)); 327 #endif 328 329 thread_data->td.td_deque[thread_data->td.td_deque_tail] = 330 taskdata; // Push taskdata 331 // Wrap index. 332 thread_data->td.td_deque_tail = 333 (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td); 334 TCW_4(thread_data->td.td_deque_ntasks, 335 TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count 336 337 KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: " 338 "task=%p ntasks=%d head=%u tail=%u\n", 339 gtid, taskdata, thread_data->td.td_deque_ntasks, 340 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 341 342 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 343 344 return TASK_SUCCESSFULLY_PUSHED; 345 } 346 347 // __kmp_pop_current_task_from_thread: set up current task from called thread 348 // when team ends 349 // 350 // this_thr: thread structure to set current_task in. 
void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));

  // Restore the thread's current task to the parent of the one being popped.
  this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;

  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));
}

// __kmp_push_current_task_to_thread: set up current task in called thread for a
// new team
//
// this_thr: thread structure to set up
// team: team for implicit task data
// tid: thread within team to set up
void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
                                       int tid) {
  // current task of the thread is a parent of the new just created implicit
  // tasks of new team
  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));

  KMP_DEBUG_ASSERT(this_thr != NULL);

  if (tid == 0) {
    // Master thread: link the team's implicit task 0 under the thread's
    // current task, unless that is already the case.
    if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
      team->t.t_implicit_task_taskdata[0].td_parent =
          this_thr->th.th_current_task;
      this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
    }
  } else {
    // Worker threads: share the parent recorded on implicit task 0, so all
    // implicit tasks of the team have the same parent.
    team->t.t_implicit_task_taskdata[tid].td_parent =
        team->t.t_implicit_task_taskdata[0].td_parent;
    this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
  }

  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));
}

// __kmp_task_start: bookkeeping for a task starting execution
//
// GTID: global thread id of calling thread
// task: task starting execution
// current_task: task suspending
static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
                             kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];

  KA_TRACE(10,
           ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
            gtid, taskdata, current_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  // mark currently executing task as suspended
  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
  current_task->td_flags.executing = 0;

  // Add task to stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_push_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  // mark starting task as executing and as current task
  thread->th.th_current_task = taskdata;

  // An untied task may be (re)started several times, so 'started'/'executing'
  // may legitimately already be set for it.
  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  taskdata->td_flags.started = 1;
  taskdata->td_flags.executing = 1;
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  // GEH TODO: shouldn't we pass some sort of location identifier here?
  // APT: yes, we will pass location here.
  // need to store current thread state (in a thread or taskdata structure)
  // before setting work_state, otherwise wrong state is set after end of task

  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));

#if OMPT_SUPPORT
  // Notify the tool that the task began, passing the parent's task id/frame
  // (or sentinels when there is no parent).
  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_begin)) {
    kmp_taskdata_t *parent = taskdata->td_parent;
    ompt_callbacks.ompt_callback(ompt_event_task_begin)(
        parent ? parent->ompt_task_info.task_id : ompt_task_id_none,
        parent ? &(parent->ompt_task_info.frame) : NULL,
        taskdata->ompt_task_info.task_id, taskdata->ompt_task_info.function);
  }
#endif
#if OMP_40_ENABLED && OMPT_SUPPORT && OMPT_TRACE
  /* OMPT emit all dependences if requested by the tool */
  if (ompt_enabled && taskdata->ompt_task_info.ndeps > 0 &&
      ompt_callbacks.ompt_callback(ompt_event_task_dependences)) {
    ompt_callbacks.ompt_callback(ompt_event_task_dependences)(
        taskdata->ompt_task_info.task_id, taskdata->ompt_task_info.deps,
        taskdata->ompt_task_info.ndeps);
    /* We can now free the allocated memory for the dependencies */
    KMP_OMPT_DEPS_FREE(thread, taskdata->ompt_task_info.deps);
    taskdata->ompt_task_info.deps = NULL;
    taskdata->ompt_task_info.ndeps = 0;
  }
#endif /* OMP_40_ENABLED && OMPT_SUPPORT && OMPT_TRACE */

  return;
}

// __kmpc_omp_task_begin_if0: report that a given serialized task has started
// execution
//
// loc_ref: source location information; points to beginning of task block.
// gtid: global thread number.
// task: task thunk for the started task.
483 void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid, 484 kmp_task_t *task) { 485 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 486 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 487 488 KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p " 489 "current_task=%p\n", 490 gtid, loc_ref, taskdata, current_task)); 491 492 if (taskdata->td_flags.tiedness == TASK_UNTIED) { 493 // untied task needs to increment counter so that the task structure is not 494 // freed prematurely 495 kmp_int32 counter = 1 + KMP_TEST_THEN_INC32(&taskdata->td_untied_count); 496 KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) " 497 "incremented for task %p\n", 498 gtid, counter, taskdata)); 499 } 500 501 taskdata->td_flags.task_serial = 502 1; // Execute this task immediately, not deferred. 503 __kmp_task_start(gtid, task, current_task); 504 505 KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid, 506 loc_ref, taskdata)); 507 508 return; 509 } 510 511 #ifdef TASK_UNUSED 512 // __kmpc_omp_task_begin: report that a given task has started execution 513 // NEVER GENERATED BY COMPILER, DEPRECATED!!! 
void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(
      10,
      ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
       gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));

  __kmp_task_start(gtid, task, current_task);

  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED

// __kmp_free_task: free the current task space and the space for shareds
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
                            kmp_info_t *thread) {
  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
                taskdata));

  // Check to make sure all flags and counters have the correct values
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
  KMP_DEBUG_ASSERT(TCR_4(taskdata->td_allocated_child_tasks) == 0 ||
                   taskdata->td_flags.task_serial == 1);
  KMP_DEBUG_ASSERT(TCR_4(taskdata->td_incomplete_child_tasks) == 0);

  taskdata->td_flags.freed = 1;
  ANNOTATE_HAPPENS_BEFORE(taskdata);
// deallocate the taskdata and shared variable blocks associated with this task
#if USE_FAST_MEMORY
  __kmp_fast_free(thread, taskdata);
#else /* ! USE_FAST_MEMORY */
  __kmp_thread_free(thread, taskdata);
#endif

  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
}

// __kmp_free_task_and_ancestors: free the current task and ancestors without
// children
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
                                          kmp_taskdata_t *taskdata,
                                          kmp_info_t *thread) {
#if OMP_45_ENABLED
  // Proxy tasks must always be allowed to free their parents
  // because they can be run in background even in serial mode.
  kmp_int32 team_serial =
      (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
      !taskdata->td_flags.proxy;
#else
  kmp_int32 team_serial =
      taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser;
#endif
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  // Predecrement simulated by "- 1" calculation: 'children' is the count
  // remaining after this task gives up its own reference.
  kmp_int32 children =
      KMP_TEST_THEN_DEC32(&taskdata->td_allocated_child_tasks) - 1;
  KMP_DEBUG_ASSERT(children >= 0);

  // Now, go up the ancestor tree to see if any ancestors can now be freed.
  while (children == 0) {
    kmp_taskdata_t *parent_taskdata = taskdata->td_parent;

    KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
                  "and freeing itself\n",
                  gtid, taskdata));

    // --- Deallocate my ancestor task ---
    __kmp_free_task(gtid, taskdata, thread);

    taskdata = parent_taskdata;

    // Stop checking ancestors at implicit task instead of walking up ancestor
    // tree to avoid premature deallocation of ancestors.
    if (team_serial || taskdata->td_flags.tasktype == TASK_IMPLICIT)
      return;

    // Predecrement simulated by "- 1" calculation
    children = KMP_TEST_THEN_DEC32(&taskdata->td_allocated_child_tasks) - 1;
    KMP_DEBUG_ASSERT(children >= 0);
  }

  KA_TRACE(
      20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
           "not freeing it yet\n",
           gtid, taskdata, children));
}

// __kmp_task_finish: bookkeeping to do when a task finishes execution
//
// gtid: global thread ID for calling thread
// task: task to be finished
// resumed_task: task to be resumed. (may be NULL if task is serialized)
static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *resumed_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_task_team_t *task_team =
      thread->th.th_task_team; // might be NULL for serial teams...
  kmp_int32 children = 0;

#if OMPT_SUPPORT
  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_end)) {
    // NOTE(review): 'parent' is declared but never used in this callback
    // invocation; candidate for removal.
    kmp_taskdata_t *parent = taskdata->td_parent;
    ompt_callbacks.ompt_callback(ompt_event_task_end)(
        taskdata->ompt_task_info.task_id);
  }
#endif

  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
                "task %p\n",
                gtid, taskdata, resumed_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

// Pop task from stack if tied
#ifdef BUILD_TIED_TASK_STACK
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    __kmp_pop_task_stack(gtid, thread, taskdata);
  }
#endif /* BUILD_TIED_TASK_STACK */

  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
    // untied task needs to check the counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = KMP_TEST_THEN_DEC32(&taskdata->td_untied_count) - 1;
    KA_TRACE(
        20,
        ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
         gtid, counter, taskdata));
    if (counter > 0) {
      // untied task is not done, to be continued possibly by other thread, do
      // not free it now
      if (resumed_task == NULL) {
        KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
        resumed_task = taskdata->td_parent; // In a serialized task, the resumed
        // task is the parent
      }
      thread->th.th_current_task = resumed_task; // restore current_task
      resumed_task->td_flags.executing = 1; // resume previous task
      KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
                    "resuming task %p\n",
                    gtid, taskdata, resumed_task));
      return;
    }
  }

  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  taskdata->td_flags.complete = 1; // mark the task as completed
  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  // Only need to keep track of count if team parallel and tasking not
  // serialized
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
    // Predecrement simulated by "- 1" calculation
    children =
        KMP_TEST_THEN_DEC32(&taskdata->td_parent->td_incomplete_child_tasks) -
        1;
    KMP_DEBUG_ASSERT(children >= 0);
// NOTE: the brace structure below interleaves with the OMP_40/OMP_45
// preprocessor conditionals; the '}' closing this 'if' moves depending on
// whether OMP_45_ENABLED is defined. Do not reformat.
#if OMP_40_ENABLED
    if (taskdata->td_taskgroup)
      KMP_TEST_THEN_DEC32((kmp_int32 *)(&taskdata->td_taskgroup->count));
#if OMP_45_ENABLED
  }
  // if we found proxy tasks there could exist a dependency chain
  // with the proxy task as origin
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
      (task_team && task_team->tt.tt_found_proxy_tasks)) {
#endif
    __kmp_release_deps(gtid, taskdata);
#endif
  }

  // td_flags.executing must be marked as 0 after __kmp_release_deps has been
  // called. Othertwise, if a task is executed immediately from the release_deps
  // code, the flag will be reset to 1 again by this same function
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
  taskdata->td_flags.executing = 0; // suspend the finishing task

  KA_TRACE(
      20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
           gtid, taskdata, children));

#if OMP_40_ENABLED
  /* If the tasks' destructor thunk flag has been set, we need to invoke the
     destructor thunk that has been generated by the compiler. The code is
     placed here, since at this point other tasks might have been released
     hence overlapping the destructor invokations with some other work in the
     released tasks. The OpenMP spec is not specific on when the destructors
     are invoked, so we should be free to choose. */
  if (taskdata->td_flags.destructors_thunk) {
    kmp_routine_entry_t destr_thunk = task->data1.destructors;
    KMP_ASSERT(destr_thunk);
    destr_thunk(gtid, task);
  }
#endif // OMP_40_ENABLED

  // bookkeeping for resuming task:
  // GEH - note tasking_ser => task_serial
  KMP_DEBUG_ASSERT(
      (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
      taskdata->td_flags.task_serial);
  if (taskdata->td_flags.task_serial) {
    if (resumed_task == NULL) {
      resumed_task = taskdata->td_parent; // In a serialized task, the resumed
      // task is the parent
    } else
#if OMP_45_ENABLED
        if (!(task_team && task_team->tt.tt_found_proxy_tasks))
#endif
    {
      // verify resumed task passed in points to parent
      KMP_DEBUG_ASSERT(resumed_task == taskdata->td_parent);
    }
  } else {
    KMP_DEBUG_ASSERT(resumed_task !=
                     NULL); // verify that resumed task is passed as argument
  }

  // Free this task and then ancestor tasks if they have no children.
  // Restore th_current_task first as suggested by John:
  // johnmc: if an asynchronous inquiry peers into the runtime system
  // it doesn't see the freed task as the current task.
  thread->th.th_current_task = resumed_task;
  __kmp_free_task_and_ancestors(gtid, taskdata, thread);

  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
  resumed_task->td_flags.executing = 1; // resume previous task

  KA_TRACE(
      10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
           gtid, taskdata, resumed_task));

  return;
}

// __kmpc_omp_task_complete_if0: report that a task has completed execution
//
// loc_ref: source location information; points to end of task block.
// gtid: global thread number.
// task: task thunk for the completed task.
void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
  // this routine will provide task to resume
  __kmp_task_finish(gtid, task, NULL);

  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}

#ifdef TASK_UNUSED
// __kmpc_omp_task_complete: report that a task has completed execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
                              kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));

  __kmp_task_finish(gtid, task, NULL); // Not sure how to find task to resume

  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED

#if OMPT_SUPPORT
// __kmp_task_init_ompt: Initialize OMPT fields maintained by a task. This will
// only be called after ompt_tool, so we already know whether ompt is enabled
// or not.
static inline void __kmp_task_init_ompt(kmp_taskdata_t *task, int tid,
                                        void *function) {
  if (ompt_enabled) {
    task->ompt_task_info.task_id = __ompt_task_id_new(tid);
    task->ompt_task_info.function = function;
    task->ompt_task_info.frame.exit_runtime_frame = NULL;
    task->ompt_task_info.frame.reenter_runtime_frame = NULL;
#if OMP_40_ENABLED
    task->ompt_task_info.ndeps = 0;
    task->ompt_task_info.deps = NULL;
#endif /* OMP_40_ENABLED */
  }
}
#endif

// __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
// task for a given thread
//
// loc_ref: reference to source location of parallel region
// this_thr: thread data structure corresponding to implicit task
// team: team for this_thr
// tid: thread id of given thread within team
// set_curr_task: TRUE if need to push current task to thread
// NOTE: Routine does not set up the implicit task ICVS. This is assumed to
// have already been done elsewhere.
// TODO: Get better loc_ref. Value passed in may be NULL
void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
                              kmp_team_t *team, int tid, int set_curr_task) {
  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];

  KF_TRACE(
      10,
      ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
       tid, team, task, set_curr_task ? "TRUE" : "FALSE"));

  task->td_task_id = KMP_GEN_TASK_ID();
  task->td_team = team;
  // task->td_parent = NULL; // fix for CQ230101 (broken parent task info
  // in debugger)
  task->td_ident = loc_ref;
  task->td_taskwait_ident = NULL;
  task->td_taskwait_counter = 0;
  task->td_taskwait_thread = 0;

  // An implicit task is always tied to the thread executing it.
  task->td_flags.tiedness = TASK_TIED;
  task->td_flags.tasktype = TASK_IMPLICIT;
#if OMP_45_ENABLED
  task->td_flags.proxy = TASK_FULL;
#endif

  // All implicit tasks are executed immediately, not deferred
  task->td_flags.task_serial = 1;
  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  task->td_flags.started = 1;
  task->td_flags.executing = 1;
  task->td_flags.complete = 0;
  task->td_flags.freed = 0;

#if OMP_40_ENABLED
  task->td_depnode = NULL;
#endif

  if (set_curr_task) { // only do this init first time thread is created
    task->td_incomplete_child_tasks = 0;
    // Not used: don't need to deallocate implicit task
    task->td_allocated_child_tasks = 0;
#if OMP_40_ENABLED
    task->td_taskgroup = NULL; // An implicit task does not have taskgroup
    task->td_dephash = NULL;
#endif
    __kmp_push_current_task_to_thread(this_thr, team, tid);
  } else {
    // Reinitialization of a reused implicit task: child counters must have
    // drained back to zero at the end of the previous region.
    KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
    KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
  }

#if OMPT_SUPPORT
  __kmp_task_init_ompt(task, tid, NULL);
#endif

  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
                team, task));
}

// __kmp_finish_implicit_task: Release resources associated to implicit tasks
// at the end of parallel regions. Some resources are kept for reuse in the next
// parallel region.
891 // 892 // thread: thread data structure corresponding to implicit task 893 void __kmp_finish_implicit_task(kmp_info_t *thread) { 894 kmp_taskdata_t *task = thread->th.th_current_task; 895 if (task->td_dephash) 896 __kmp_dephash_free_entries(thread, task->td_dephash); 897 } 898 899 // __kmp_free_implicit_task: Release resources associated to implicit tasks 900 // when these are destroyed regions 901 // 902 // thread: thread data structure corresponding to implicit task 903 void __kmp_free_implicit_task(kmp_info_t *thread) { 904 kmp_taskdata_t *task = thread->th.th_current_task; 905 if (task->td_dephash) 906 __kmp_dephash_free(thread, task->td_dephash); 907 task->td_dephash = NULL; 908 } 909 910 // Round up a size to a power of two specified by val: Used to insert padding 911 // between structures co-allocated using a single malloc() call 912 static size_t __kmp_round_up_to_val(size_t size, size_t val) { 913 if (size & (val - 1)) { 914 size &= ~(val - 1); 915 if (size <= KMP_SIZE_T_MAX - val) { 916 size += val; // Round up if there is no overflow. 917 }; // if 918 }; // if 919 return size; 920 } // __kmp_round_up_to_va 921 922 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task 923 // 924 // loc_ref: source location information 925 // gtid: global thread number. 926 // flags: include tiedness & task type (explicit vs. implicit) of the ''new'' 927 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine. 928 // sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including 929 // private vars accessed in task. 930 // sizeof_shareds: Size in bytes of array of pointers to shared vars accessed 931 // in task. 932 // task_entry: Pointer to task code entry point generated by compiler. 933 // returns: a pointer to the allocated kmp_task_t structure (task). 
kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                             kmp_tasking_flags_t *flags,
                             size_t sizeof_kmp_task_t, size_t sizeof_shareds,
                             kmp_routine_entry_t task_entry) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_team_t *team = thread->th.th_team;
  kmp_taskdata_t *parent_task = thread->th.th_current_task;
  size_t shareds_offset;

  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
                sizeof_shareds, task_entry));

  // A task created inside a final task is itself final (OpenMP final clause
  // propagation).
  if (parent_task->td_flags.final) {
    if (flags->merged_if0) {
    }
    flags->final = 1;
  }

#if OMP_45_ENABLED
  if (flags->proxy == TASK_PROXY) {
    // Proxy tasks are forced untied / merged-if0 by the runtime.
    flags->tiedness = TASK_UNTIED;
    flags->merged_if0 = 1;

    /* are we running in a sequential parallel or tskm_immediate_exec... we need
       tasking support enabled */
    if ((thread->th.th_task_team) == NULL) {
      /* This should only happen if the team is serialized
          setup a task team and propagate it to the thread */
      KMP_DEBUG_ASSERT(team->t.t_serialized);
      KA_TRACE(30,
               ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
                gtid));
      __kmp_task_team_setup(
          thread, team,
          1); // 1 indicates setup the current team regardless of nthreads
      thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
    }
    kmp_task_team_t *task_team = thread->th.th_task_team;

    /* tasking must be enabled now as the task might not be pushed */
    if (!KMP_TASKING_ENABLED(task_team)) {
      KA_TRACE(
          30,
          ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
      __kmp_enable_tasking(task_team, thread);
      kmp_int32 tid = thread->th.th_info.ds.ds_tid;
      kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
      // No lock needed since only owner can allocate
      if (thread_data->td.td_deque == NULL) {
        __kmp_alloc_task_deque(thread, thread_data);
      }
    }

    if (task_team->tt.tt_found_proxy_tasks == FALSE)
      TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
  }
#endif

  // Calculate shared structure offset including padding after kmp_task_t struct
  // to align pointers in shared struct
  // (single allocation layout: [kmp_taskdata_t | kmp_task_t + privates |
  //  padding | shareds]).
  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));

  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
                shareds_offset));
  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
                sizeof_shareds));

  // Avoid double allocation here by combining shareds with taskdata
#if USE_FAST_MEMORY
  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
                                                               sizeof_shareds);
#else /* ! USE_FAST_MEMORY */
  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
                                                               sizeof_shareds);
#endif /* USE_FAST_MEMORY */
  ANNOTATE_HAPPENS_AFTER(taskdata);

  task = KMP_TASKDATA_TO_TASK(taskdata);

// Make sure task & taskdata are aligned appropriately
#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
#else
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
#endif
  if (sizeof_shareds > 0) {
    // Avoid double allocation here by combining shareds with taskdata
    task->shareds = &((char *)taskdata)[shareds_offset];
    // Make sure shareds struct is aligned to pointer size
    KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
                     0);
  } else {
    task->shareds = NULL;
  }
  task->routine = task_entry;
  task->part_id = 0; // AC: Always start with 0 part id

  taskdata->td_task_id = KMP_GEN_TASK_ID();
  taskdata->td_team = team;
  taskdata->td_alloc_thread = thread;
  taskdata->td_parent = parent_task;
  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
  taskdata->td_untied_count = 0;
  taskdata->td_ident = loc_ref;
  taskdata->td_taskwait_ident = NULL;
  taskdata->td_taskwait_counter = 0;
  taskdata->td_taskwait_thread = 0;
  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
#if OMP_45_ENABLED
  // avoid copying icvs for proxy tasks
  if (flags->proxy == TASK_FULL)
#endif
    copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);

  taskdata->td_flags.tiedness = flags->tiedness;
  taskdata->td_flags.final = flags->final;
  taskdata->td_flags.merged_if0 = flags->merged_if0;
#if OMP_40_ENABLED
  taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
#endif // OMP_40_ENABLED
#if OMP_45_ENABLED
  taskdata->td_flags.proxy = flags->proxy;
  taskdata->td_task_team = thread->th.th_task_team;
  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
#endif
  taskdata->td_flags.tasktype = TASK_EXPLICIT;

  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);

  // GEH - TODO: fix this to copy parent task's value of team_serial flag
  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  // GEH - Note we serialize the task if the team is serialized to make sure
  // implicit parallel region tasks are not left until program termination to
  // execute. Also, it helps locality to execute immediately.

  taskdata->td_flags.task_serial =
      (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
       taskdata->td_flags.tasking_ser);

  taskdata->td_flags.started = 0;
  taskdata->td_flags.executing = 0;
  taskdata->td_flags.complete = 0;
  taskdata->td_flags.freed = 0;

  taskdata->td_flags.native = flags->native;

  taskdata->td_incomplete_child_tasks = 0;
  taskdata->td_allocated_child_tasks = 1; // start at one because counts current
  // task and children
#if OMP_40_ENABLED
  taskdata->td_taskgroup =
      parent_task->td_taskgroup; // task inherits taskgroup from the parent task
  taskdata->td_dephash = NULL;
  taskdata->td_depnode = NULL;
#endif

// Only need to keep track of child task counts if team parallel and tasking not
// serialized or if it is a proxy task
#if OMP_45_ENABLED
  if (flags->proxy == TASK_PROXY ||
      !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
#else
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
#endif
  {
    // Atomic increments: the parent's counters may be decremented concurrently
    // by children finishing on other threads.
    KMP_TEST_THEN_INC32(&parent_task->td_incomplete_child_tasks);
#if OMP_40_ENABLED
    if (parent_task->td_taskgroup)
      KMP_TEST_THEN_INC32((kmp_int32 *)(&parent_task->td_taskgroup->count));
#endif
    // Only need to keep track of allocated child tasks for explicit tasks since
    // implicit not deallocated
    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
      KMP_TEST_THEN_INC32(&taskdata->td_parent->td_allocated_child_tasks);
    }
  }

  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
                gtid, taskdata, taskdata->td_parent));
  ANNOTATE_HAPPENS_BEFORE(task);

#if OMPT_SUPPORT
  __kmp_task_init_ompt(taskdata, gtid, (void *)task_entry);
#endif

  return task;
}

// __kmpc_omp_task_alloc: compiler-facing wrapper around __kmp_task_alloc.
// Reinterprets the kmp_int32 flags word as kmp_tasking_flags_t in place.
kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_int32 flags, size_t sizeof_kmp_task_t,
                                  size_t sizeof_shareds,
                                  kmp_routine_entry_t task_entry) {
  kmp_task_t *retval;
  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;

  input_flags->native = FALSE;
// __kmp_task_alloc() sets up all other runtime flags

#if OMP_45_ENABLED
  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
                input_flags->proxy ? "proxy" : "", sizeof_kmp_task_t,
                sizeof_shareds, task_entry));
#else
  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
                sizeof_kmp_task_t, sizeof_shareds, task_entry));
#endif

  retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
                            sizeof_shareds, task_entry);

  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));

  return retval;
}

// __kmp_invoke_task: invoke the specified task
//
// gtid: global thread ID of caller
// task: the task to invoke
// current_task: the task to resume after task invokation
static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_uint64 cur_time;
#if OMP_40_ENABLED
  int discard = 0 /* false */;
#endif
  KA_TRACE(
      30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
           gtid, taskdata, current_task));
  KMP_DEBUG_ASSERT(task);
#if OMP_45_ENABLED
  if (taskdata->td_flags.proxy == TASK_PROXY &&
      taskdata->td_flags.complete == 1) {
    // This is a proxy task that was already completed but it needs to run
    // its bottom-half finish
    KA_TRACE(
        30,
        ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
         gtid, taskdata));

    __kmp_bottom_half_finish_proxy(gtid, task);

    KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
                  "proxy task %p, resuming task %p\n",
                  gtid, taskdata, current_task));

    return;
  }
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  if (__kmp_forkjoin_frames_mode == 3) {
    // Get the current time stamp to measure task execution time to correct
    // barrier imbalance time
    cur_time = __itt_get_timestamp();
  }
#endif

#if OMP_45_ENABLED
  // Proxy tasks are not handled by the runtime
  if (taskdata->td_flags.proxy != TASK_PROXY) {
#endif
    ANNOTATE_HAPPENS_AFTER(task);
    __kmp_task_start(gtid, task, current_task);
#if OMP_45_ENABLED
  }
#endif

#if OMPT_SUPPORT
  ompt_thread_info_t oldInfo;
  kmp_info_t *thread;
  if (ompt_enabled) {
    // Store the threads states and restore them after the task
    thread = __kmp_threads[gtid];
    oldInfo = thread->th.ompt_thread_info;
    thread->th.ompt_thread_info.wait_id = 0;
    thread->th.ompt_thread_info.state = ompt_state_work_parallel;
    taskdata->ompt_task_info.frame.exit_runtime_frame =
        __builtin_frame_address(0);
  }
#endif

#if OMP_40_ENABLED
  // TODO: cancel tasks if the parallel region has also been cancelled
  // TODO: check if this sequence can be hoisted above __kmp_task_start
  // if cancellation has been enabled for this run ...
  if (__kmp_omp_cancellation) {
    kmp_info_t *this_thr = __kmp_threads[gtid];
    kmp_team_t *this_team = this_thr->th.th_team;
    kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
    if ((taskgroup && taskgroup->cancel_request) ||
        (this_team->t.t_cancel_request == cancel_parallel)) {
      KMP_COUNT_BLOCK(TASK_cancelled);
      // this task belongs to a task group and we need to cancel it
      discard = 1 /* true */;
    }
  }

  // Invoke the task routine and pass in relevant data.
  // Thunks generated by gcc take a different argument list.
  if (!discard) {
#if KMP_STATS_ENABLED
    KMP_COUNT_BLOCK(TASK_executed);
    // Attribute the task's execution time to the runtime context that
    // scheduled it (barrier, taskwait, taskgroup, ...).
    switch (KMP_GET_THREAD_STATE()) {
    case FORK_JOIN_BARRIER:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
      break;
    case PLAIN_BARRIER:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
      break;
    case TASKYIELD:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
      break;
    case TASKWAIT:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
      break;
    case TASKGROUP:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
      break;
    default:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
      break;
    }
#endif // KMP_STATS_ENABLED
#endif // OMP_40_ENABLED

#if OMPT_SUPPORT && OMPT_TRACE
    /* let OMPT know that we're about to run this task */
    if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_switch)) {
      ompt_callbacks.ompt_callback(ompt_event_task_switch)(
          current_task->ompt_task_info.task_id,
          taskdata->ompt_task_info.task_id);
    }
#endif

#ifdef KMP_GOMP_COMPAT
    if (taskdata->td_flags.native) {
      ((void (*)(void *))(*(task->routine)))(task->shareds);
    } else
#endif /* KMP_GOMP_COMPAT */
    {
      (*(task->routine))(gtid, task);
    }
    KMP_POP_PARTITIONED_TIMER();

#if OMPT_SUPPORT && OMPT_TRACE
    /* let OMPT know that we're returning to the callee task */
    if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_task_switch)) {
      ompt_callbacks.ompt_callback(ompt_event_task_switch)(
          taskdata->ompt_task_info.task_id,
          current_task->ompt_task_info.task_id);
    }
#endif

#if OMP_40_ENABLED
  }
#endif // OMP_40_ENABLED

#if OMPT_SUPPORT
  if (ompt_enabled) {
    thread->th.ompt_thread_info = oldInfo;
    taskdata->ompt_task_info.frame.exit_runtime_frame = NULL;
  }
#endif

#if OMP_45_ENABLED
  // Proxy tasks are not handled by the runtime
  if (taskdata->td_flags.proxy != TASK_PROXY) {
#endif
    ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent);
    __kmp_task_finish(gtid, task, current_task);
#if OMP_45_ENABLED
  }
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - correct arrive time after the task finished
  if (__kmp_forkjoin_frames_mode == 3) {
    kmp_info_t *this_thr = __kmp_threads[gtid];
    if (this_thr->th.th_bar_arrive_time) {
      this_thr->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
    }
  }
#endif
  KA_TRACE(
      30,
      ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
       gtid, taskdata, current_task));
  return;
}

// __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
//
// loc_ref: location of original task pragma (ignored)
// gtid: Global Thread ID of encountering thread
// new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
// Returns:
//    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
//    be resumed later.
//    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
//    resumed later.
1350 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid, 1351 kmp_task_t *new_task) { 1352 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1353 1354 KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid, 1355 loc_ref, new_taskdata)); 1356 1357 /* Should we execute the new task or queue it? For now, let's just always try 1358 to queue it. If the queue fills up, then we'll execute it. */ 1359 1360 if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer 1361 { // Execute this task immediately 1362 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 1363 new_taskdata->td_flags.task_serial = 1; 1364 __kmp_invoke_task(gtid, new_task, current_task); 1365 } 1366 1367 KA_TRACE( 1368 10, 1369 ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: " 1370 "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n", 1371 gtid, loc_ref, new_taskdata)); 1372 1373 ANNOTATE_HAPPENS_BEFORE(new_task); 1374 return TASK_CURRENT_NOT_QUEUED; 1375 } 1376 1377 // __kmp_omp_task: Schedule a non-thread-switchable task for execution 1378 // 1379 // gtid: Global Thread ID of encountering thread 1380 // new_task:non-thread-switchable task thunk allocated by __kmp_omp_task_alloc() 1381 // serialize_immediate: if TRUE then if the task is executed immediately its 1382 // execution will be serialized 1383 // Returns: 1384 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1385 // be resumed later. 1386 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1387 // resumed later. 
1388 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task, 1389 bool serialize_immediate) { 1390 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1391 1392 #if OMPT_SUPPORT 1393 if (ompt_enabled) { 1394 new_taskdata->ompt_task_info.frame.reenter_runtime_frame = 1395 __builtin_frame_address(1); 1396 } 1397 #endif 1398 1399 /* Should we execute the new task or queue it? For now, let's just always try to 1400 queue it. If the queue fills up, then we'll execute it. */ 1401 #if OMP_45_ENABLED 1402 if (new_taskdata->td_flags.proxy == TASK_PROXY || 1403 __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer 1404 #else 1405 if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer 1406 #endif 1407 { // Execute this task immediately 1408 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task; 1409 if (serialize_immediate) 1410 new_taskdata->td_flags.task_serial = 1; 1411 __kmp_invoke_task(gtid, new_task, current_task); 1412 } 1413 1414 #if OMPT_SUPPORT 1415 if (ompt_enabled) { 1416 new_taskdata->ompt_task_info.frame.reenter_runtime_frame = NULL; 1417 } 1418 #endif 1419 1420 ANNOTATE_HAPPENS_BEFORE(new_task); 1421 return TASK_CURRENT_NOT_QUEUED; 1422 } 1423 1424 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a 1425 // non-thread-switchable task from the parent thread only! 1426 // 1427 // loc_ref: location of original task pragma (ignored) 1428 // gtid: Global Thread ID of encountering thread 1429 // new_task: non-thread-switchable task thunk allocated by 1430 // __kmp_omp_task_alloc() 1431 // Returns: 1432 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to 1433 // be resumed later. 1434 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be 1435 // resumed later. 
1436 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid, 1437 kmp_task_t *new_task) { 1438 kmp_int32 res; 1439 KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK); 1440 1441 #if KMP_DEBUG 1442 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); 1443 #endif 1444 KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref, 1445 new_taskdata)); 1446 1447 res = __kmp_omp_task(gtid, new_task, true); 1448 1449 KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning " 1450 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n", 1451 gtid, loc_ref, new_taskdata)); 1452 return res; 1453 } 1454 1455 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are 1456 // complete 1457 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) { 1458 kmp_taskdata_t *taskdata; 1459 kmp_info_t *thread; 1460 int thread_finished = FALSE; 1461 KMP_SET_THREAD_STATE_BLOCK(TASKWAIT); 1462 1463 KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref)); 1464 1465 if (__kmp_tasking_mode != tskm_immediate_exec) { 1466 thread = __kmp_threads[gtid]; 1467 taskdata = thread->th.th_current_task; 1468 #if OMPT_SUPPORT && OMPT_TRACE 1469 ompt_task_id_t my_task_id; 1470 ompt_parallel_id_t my_parallel_id; 1471 1472 if (ompt_enabled) { 1473 kmp_team_t *team = thread->th.th_team; 1474 my_task_id = taskdata->ompt_task_info.task_id; 1475 my_parallel_id = team->t.ompt_team_info.parallel_id; 1476 1477 taskdata->ompt_task_info.frame.reenter_runtime_frame = 1478 __builtin_frame_address(1); 1479 if (ompt_callbacks.ompt_callback(ompt_event_taskwait_begin)) { 1480 ompt_callbacks.ompt_callback(ompt_event_taskwait_begin)(my_parallel_id, 1481 my_task_id); 1482 } 1483 } 1484 #endif 1485 1486 // Debugger: The taskwait is active. Store location and thread encountered the 1487 // taskwait. 1488 #if USE_ITT_BUILD 1489 // Note: These values are used by ITT events as well. 
1490 #endif /* USE_ITT_BUILD */ 1491 taskdata->td_taskwait_counter += 1; 1492 taskdata->td_taskwait_ident = loc_ref; 1493 taskdata->td_taskwait_thread = gtid + 1; 1494 1495 #if USE_ITT_BUILD 1496 void *itt_sync_obj = __kmp_itt_taskwait_object(gtid); 1497 if (itt_sync_obj != NULL) 1498 __kmp_itt_taskwait_starting(gtid, itt_sync_obj); 1499 #endif /* USE_ITT_BUILD */ 1500 1501 bool must_wait = 1502 !taskdata->td_flags.team_serial && !taskdata->td_flags.final; 1503 1504 #if OMP_45_ENABLED 1505 must_wait = must_wait || (thread->th.th_task_team != NULL && 1506 thread->th.th_task_team->tt.tt_found_proxy_tasks); 1507 #endif 1508 if (must_wait) { 1509 kmp_flag_32 flag( 1510 RCAST(volatile kmp_uint32 *, &taskdata->td_incomplete_child_tasks), 1511 0U); 1512 while (TCR_4(taskdata->td_incomplete_child_tasks) != 0) { 1513 flag.execute_tasks(thread, gtid, FALSE, 1514 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 1515 __kmp_task_stealing_constraint); 1516 } 1517 } 1518 #if USE_ITT_BUILD 1519 if (itt_sync_obj != NULL) 1520 __kmp_itt_taskwait_finished(gtid, itt_sync_obj); 1521 #endif /* USE_ITT_BUILD */ 1522 1523 // Debugger: The taskwait is completed. Location remains, but thread is 1524 // negated. 
1525 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; 1526 1527 #if OMPT_SUPPORT && OMPT_TRACE 1528 if (ompt_enabled) { 1529 if (ompt_callbacks.ompt_callback(ompt_event_taskwait_end)) { 1530 ompt_callbacks.ompt_callback(ompt_event_taskwait_end)(my_parallel_id, 1531 my_task_id); 1532 } 1533 taskdata->ompt_task_info.frame.reenter_runtime_frame = NULL; 1534 } 1535 #endif 1536 ANNOTATE_HAPPENS_AFTER(taskdata); 1537 } 1538 1539 KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, " 1540 "returning TASK_CURRENT_NOT_QUEUED\n", 1541 gtid, taskdata)); 1542 1543 return TASK_CURRENT_NOT_QUEUED; 1544 } 1545 1546 // __kmpc_omp_taskyield: switch to a different task 1547 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) { 1548 kmp_taskdata_t *taskdata; 1549 kmp_info_t *thread; 1550 int thread_finished = FALSE; 1551 1552 KMP_COUNT_BLOCK(OMP_TASKYIELD); 1553 KMP_SET_THREAD_STATE_BLOCK(TASKYIELD); 1554 1555 KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n", 1556 gtid, loc_ref, end_part)); 1557 1558 if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) { 1559 thread = __kmp_threads[gtid]; 1560 taskdata = thread->th.th_current_task; 1561 // Should we model this as a task wait or not? 1562 // Debugger: The taskwait is active. Store location and thread encountered the 1563 // taskwait. 1564 #if USE_ITT_BUILD 1565 // Note: These values are used by ITT events as well. 
1566 #endif /* USE_ITT_BUILD */ 1567 taskdata->td_taskwait_counter += 1; 1568 taskdata->td_taskwait_ident = loc_ref; 1569 taskdata->td_taskwait_thread = gtid + 1; 1570 1571 #if USE_ITT_BUILD 1572 void *itt_sync_obj = __kmp_itt_taskwait_object(gtid); 1573 if (itt_sync_obj != NULL) 1574 __kmp_itt_taskwait_starting(gtid, itt_sync_obj); 1575 #endif /* USE_ITT_BUILD */ 1576 if (!taskdata->td_flags.team_serial) { 1577 kmp_task_team_t *task_team = thread->th.th_task_team; 1578 if (task_team != NULL) { 1579 if (KMP_TASKING_ENABLED(task_team)) { 1580 __kmp_execute_tasks_32( 1581 thread, gtid, NULL, FALSE, 1582 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 1583 __kmp_task_stealing_constraint); 1584 } 1585 } 1586 } 1587 #if USE_ITT_BUILD 1588 if (itt_sync_obj != NULL) 1589 __kmp_itt_taskwait_finished(gtid, itt_sync_obj); 1590 #endif /* USE_ITT_BUILD */ 1591 1592 // Debugger: The taskwait is completed. Location remains, but thread is 1593 // negated. 1594 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; 1595 } 1596 1597 KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, " 1598 "returning TASK_CURRENT_NOT_QUEUED\n", 1599 gtid, taskdata)); 1600 1601 return TASK_CURRENT_NOT_QUEUED; 1602 } 1603 1604 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work 1605 #if OMP_45_ENABLED 1606 // Task Reduction implementation 1607 1608 typedef struct kmp_task_red_flags { 1609 unsigned lazy_priv : 1; // hint: (1) use lazy allocation (big objects) 1610 unsigned reserved31 : 31; 1611 } kmp_task_red_flags_t; 1612 1613 // internal structure for reduction data item related info 1614 typedef struct kmp_task_red_data { 1615 void *reduce_shar; // shared reduction item 1616 size_t reduce_size; // size of data item 1617 void *reduce_priv; // thread specific data 1618 void *reduce_pend; // end of private data for comparison op 1619 void *reduce_init; // data initialization routine 1620 void *reduce_fini; // data finalization routine 1621 void 
*reduce_comb; // data combiner routine 1622 kmp_task_red_flags_t flags; // flags for additional info from compiler 1623 } kmp_task_red_data_t; 1624 1625 // structure sent us by compiler - one per reduction item 1626 typedef struct kmp_task_red_input { 1627 void *reduce_shar; // shared reduction item 1628 size_t reduce_size; // size of data item 1629 void *reduce_init; // data initialization routine 1630 void *reduce_fini; // data finalization routine 1631 void *reduce_comb; // data combiner routine 1632 kmp_task_red_flags_t flags; // flags for additional info from compiler 1633 } kmp_task_red_input_t; 1634 1635 /*! 1636 @ingroup TASKING 1637 @param gtid Global thread ID 1638 @param num Number of data items to reduce 1639 @param data Array of data for reduction 1640 @return The taskgroup identifier 1641 1642 Initialize task reduction for the taskgroup. 1643 */ 1644 void *__kmpc_task_reduction_init(int gtid, int num, void *data) { 1645 kmp_info_t *thread = __kmp_threads[gtid]; 1646 kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup; 1647 kmp_int32 nth = thread->th.th_team_nproc; 1648 kmp_task_red_input_t *input = (kmp_task_red_input_t *)data; 1649 kmp_task_red_data_t *arr; 1650 1651 // check input data just in case 1652 KMP_ASSERT(tg != NULL); 1653 KMP_ASSERT(data != NULL); 1654 KMP_ASSERT(num > 0); 1655 if (nth == 1) { 1656 KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n", 1657 gtid, tg)); 1658 return (void *)tg; 1659 } 1660 KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n", 1661 gtid, tg, num)); 1662 arr = (kmp_task_red_data_t *)__kmp_thread_malloc( 1663 thread, num * sizeof(kmp_task_red_data_t)); 1664 for (int i = 0; i < num; ++i) { 1665 void (*f_init)(void *) = (void (*)(void *))(input[i].reduce_init); 1666 size_t size = input[i].reduce_size - 1; 1667 // round the size up to cache line per thread-specific item 1668 size += CACHE_LINE - size % CACHE_LINE; 1669 KMP_ASSERT(input[i].reduce_comb != NULL); 
// combiner is mandatory 1670 arr[i].reduce_shar = input[i].reduce_shar; 1671 arr[i].reduce_size = size; 1672 arr[i].reduce_init = input[i].reduce_init; 1673 arr[i].reduce_fini = input[i].reduce_fini; 1674 arr[i].reduce_comb = input[i].reduce_comb; 1675 arr[i].flags = input[i].flags; 1676 if (!input[i].flags.lazy_priv) { 1677 // allocate cache-line aligned block and fill it with zeros 1678 arr[i].reduce_priv = __kmp_allocate(nth * size); 1679 arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size; 1680 if (f_init != NULL) { 1681 // initialize thread-specific items 1682 for (int j = 0; j < nth; ++j) { 1683 f_init((char *)(arr[i].reduce_priv) + j * size); 1684 } 1685 } 1686 } else { 1687 // only allocate space for pointers now, 1688 // objects will be lazily allocated/initialized once requested 1689 arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *)); 1690 } 1691 } 1692 tg->reduce_data = (void *)arr; 1693 tg->reduce_num_data = num; 1694 return (void *)tg; 1695 } 1696 1697 /*! 
1698 @ingroup TASKING 1699 @param gtid Global thread ID 1700 @param tskgrp The taskgroup ID (optional) 1701 @param data Shared location of the item 1702 @return The pointer to per-thread data 1703 1704 Get thread-specific location of data item 1705 */ 1706 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) { 1707 kmp_info_t *thread = __kmp_threads[gtid]; 1708 kmp_int32 nth = thread->th.th_team_nproc; 1709 if (nth == 1) 1710 return data; // nothing to do 1711 1712 kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp; 1713 if (tg == NULL) 1714 tg = thread->th.th_current_task->td_taskgroup; 1715 KMP_ASSERT(tg != NULL); 1716 kmp_task_red_data_t *arr = (kmp_task_red_data_t *)(tg->reduce_data); 1717 kmp_int32 num = tg->reduce_num_data; 1718 kmp_int32 tid = thread->th.th_info.ds.ds_tid; 1719 1720 KMP_ASSERT(data != NULL); 1721 while (tg != NULL) { 1722 for (int i = 0; i < num; ++i) { 1723 if (!arr[i].flags.lazy_priv) { 1724 if (data == arr[i].reduce_shar || 1725 (data >= arr[i].reduce_priv && data < arr[i].reduce_pend)) 1726 return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size; 1727 } else { 1728 // check shared location first 1729 void **p_priv = (void **)(arr[i].reduce_priv); 1730 if (data == arr[i].reduce_shar) 1731 goto found; 1732 // check if we get some thread specific location as parameter 1733 for (int j = 0; j < nth; ++j) 1734 if (data == p_priv[j]) 1735 goto found; 1736 continue; // not found, continue search 1737 found: 1738 if (p_priv[tid] == NULL) { 1739 // allocate thread specific object lazily 1740 void (*f_init)(void *) = (void (*)(void *))(arr[i].reduce_init); 1741 p_priv[tid] = __kmp_allocate(arr[i].reduce_size); 1742 if (f_init != NULL) { 1743 f_init(p_priv[tid]); 1744 } 1745 } 1746 return p_priv[tid]; 1747 } 1748 } 1749 tg = tg->parent; 1750 arr = (kmp_task_red_data_t *)(tg->reduce_data); 1751 num = tg->reduce_num_data; 1752 } 1753 KMP_ASSERT2(0, "Unknown task reduction item"); 1754 return NULL; // ERROR, this line never 
executed 1755 } 1756 1757 // Finalize task reduction. 1758 // Called from __kmpc_end_taskgroup() 1759 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) { 1760 kmp_int32 nth = th->th.th_team_nproc; 1761 KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1 1762 kmp_task_red_data_t *arr = (kmp_task_red_data_t *)tg->reduce_data; 1763 kmp_int32 num = tg->reduce_num_data; 1764 for (int i = 0; i < num; ++i) { 1765 void *sh_data = arr[i].reduce_shar; 1766 void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini); 1767 void (*f_comb)(void *, void *) = 1768 (void (*)(void *, void *))(arr[i].reduce_comb); 1769 if (!arr[i].flags.lazy_priv) { 1770 void *pr_data = arr[i].reduce_priv; 1771 size_t size = arr[i].reduce_size; 1772 for (int j = 0; j < nth; ++j) { 1773 void *priv_data = (char *)pr_data + j * size; 1774 f_comb(sh_data, priv_data); // combine results 1775 if (f_fini) 1776 f_fini(priv_data); // finalize if needed 1777 } 1778 } else { 1779 void **pr_data = (void **)(arr[i].reduce_priv); 1780 for (int j = 0; j < nth; ++j) { 1781 if (pr_data[j] != NULL) { 1782 f_comb(sh_data, pr_data[j]); // combine results 1783 if (f_fini) 1784 f_fini(pr_data[j]); // finalize if needed 1785 __kmp_free(pr_data[j]); 1786 } 1787 } 1788 } 1789 __kmp_free(arr[i].reduce_priv); 1790 } 1791 __kmp_thread_free(th, arr); 1792 tg->reduce_data = NULL; 1793 tg->reduce_num_data = 0; 1794 } 1795 #endif 1796 1797 #if OMP_40_ENABLED 1798 // __kmpc_taskgroup: Start a new taskgroup 1799 void __kmpc_taskgroup(ident_t *loc, int gtid) { 1800 kmp_info_t *thread = __kmp_threads[gtid]; 1801 kmp_taskdata_t *taskdata = thread->th.th_current_task; 1802 kmp_taskgroup_t *tg_new = 1803 (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t)); 1804 KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new)); 1805 tg_new->count = 0; 1806 tg_new->cancel_request = cancel_noreq; 1807 tg_new->parent = taskdata->td_taskgroup; 1808 // TODO: change to 
OMP_50_ENABLED, need to change build tools for this to work 1809 #if OMP_45_ENABLED 1810 tg_new->reduce_data = NULL; 1811 tg_new->reduce_num_data = 0; 1812 #endif 1813 taskdata->td_taskgroup = tg_new; 1814 } 1815 1816 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task 1817 // and its descendants are complete 1818 void __kmpc_end_taskgroup(ident_t *loc, int gtid) { 1819 kmp_info_t *thread = __kmp_threads[gtid]; 1820 kmp_taskdata_t *taskdata = thread->th.th_current_task; 1821 kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup; 1822 int thread_finished = FALSE; 1823 1824 KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc)); 1825 KMP_DEBUG_ASSERT(taskgroup != NULL); 1826 KMP_SET_THREAD_STATE_BLOCK(TASKGROUP); 1827 1828 if (__kmp_tasking_mode != tskm_immediate_exec) { 1829 #if USE_ITT_BUILD 1830 // For ITT the taskgroup wait is similar to taskwait until we need to 1831 // distinguish them 1832 void *itt_sync_obj = __kmp_itt_taskwait_object(gtid); 1833 if (itt_sync_obj != NULL) 1834 __kmp_itt_taskwait_starting(gtid, itt_sync_obj); 1835 #endif /* USE_ITT_BUILD */ 1836 1837 #if OMP_45_ENABLED 1838 if (!taskdata->td_flags.team_serial || 1839 (thread->th.th_task_team != NULL && 1840 thread->th.th_task_team->tt.tt_found_proxy_tasks)) 1841 #else 1842 if (!taskdata->td_flags.team_serial) 1843 #endif 1844 { 1845 kmp_flag_32 flag(RCAST(kmp_uint32 *, &taskgroup->count), 0U); 1846 while (TCR_4(taskgroup->count) != 0) { 1847 flag.execute_tasks(thread, gtid, FALSE, 1848 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 1849 __kmp_task_stealing_constraint); 1850 } 1851 } 1852 1853 #if USE_ITT_BUILD 1854 if (itt_sync_obj != NULL) 1855 __kmp_itt_taskwait_finished(gtid, itt_sync_obj); 1856 #endif /* USE_ITT_BUILD */ 1857 } 1858 KMP_DEBUG_ASSERT(taskgroup->count == 0); 1859 1860 // TODO: change to OMP_50_ENABLED, need to change build tools for this to work 1861 #if OMP_45_ENABLED 1862 if (taskgroup->reduce_data != NULL) // need to reduce? 
1863 __kmp_task_reduction_fini(thread, taskgroup); 1864 #endif 1865 // Restore parent taskgroup for the current task 1866 taskdata->td_taskgroup = taskgroup->parent; 1867 __kmp_thread_free(thread, taskgroup); 1868 1869 KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n", 1870 gtid, taskdata)); 1871 ANNOTATE_HAPPENS_AFTER(taskdata); 1872 } 1873 #endif 1874 1875 // __kmp_remove_my_task: remove a task from my own deque 1876 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid, 1877 kmp_task_team_t *task_team, 1878 kmp_int32 is_constrained) { 1879 kmp_task_t *task; 1880 kmp_taskdata_t *taskdata; 1881 kmp_thread_data_t *thread_data; 1882 kmp_uint32 tail; 1883 1884 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 1885 KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data != 1886 NULL); // Caller should check this condition 1887 1888 thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)]; 1889 1890 KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n", 1891 gtid, thread_data->td.td_deque_ntasks, 1892 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 1893 1894 if (TCR_4(thread_data->td.td_deque_ntasks) == 0) { 1895 KA_TRACE(10, 1896 ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: " 1897 "ntasks=%d head=%u tail=%u\n", 1898 gtid, thread_data->td.td_deque_ntasks, 1899 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 1900 return NULL; 1901 } 1902 1903 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 1904 1905 if (TCR_4(thread_data->td.td_deque_ntasks) == 0) { 1906 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 1907 KA_TRACE(10, 1908 ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: " 1909 "ntasks=%d head=%u tail=%u\n", 1910 gtid, thread_data->td.td_deque_ntasks, 1911 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 1912 return NULL; 1913 } 1914 1915 tail = (thread_data->td.td_deque_tail - 1) & 1916 
TASK_DEQUE_MASK(thread_data->td); // Wrap index. 1917 taskdata = thread_data->td.td_deque[tail]; 1918 1919 if (is_constrained && (taskdata->td_flags.tiedness == TASK_TIED)) { 1920 // we need to check if the candidate obeys task scheduling constraint: 1921 // only child of current task can be scheduled 1922 kmp_taskdata_t *current = thread->th.th_current_task; 1923 kmp_int32 level = current->td_level; 1924 kmp_taskdata_t *parent = taskdata->td_parent; 1925 while (parent != current && parent->td_level > level) { 1926 parent = parent->td_parent; // check generation up to the level of the 1927 // current task 1928 KMP_DEBUG_ASSERT(parent != NULL); 1929 } 1930 if (parent != current) { 1931 // If the tail task is not a child, then no other child can appear in the 1932 // deque. 1933 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 1934 KA_TRACE(10, 1935 ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: " 1936 "ntasks=%d head=%u tail=%u\n", 1937 gtid, thread_data->td.td_deque_ntasks, 1938 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 1939 return NULL; 1940 } 1941 } 1942 1943 thread_data->td.td_deque_tail = tail; 1944 TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1); 1945 1946 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 1947 1948 KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d task %p removed: " 1949 "ntasks=%d head=%u tail=%u\n", 1950 gtid, taskdata, thread_data->td.td_deque_ntasks, 1951 thread_data->td.td_deque_head, thread_data->td.td_deque_tail)); 1952 1953 task = KMP_TASKDATA_TO_TASK(taskdata); 1954 return task; 1955 } 1956 1957 // __kmp_steal_task: remove a task from another thread's deque 1958 // Assume that calling thread has already checked existence of 1959 // task_team thread_data before calling this routine. 
static kmp_task_t *__kmp_steal_task(kmp_info_t *victim, kmp_int32 gtid,
                                    kmp_task_team_t *task_team,
                                    volatile kmp_int32 *unfinished_threads,
                                    int *thread_finished,
                                    kmp_int32 is_constrained) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_thread_data_t *victim_td, *threads_data;
  kmp_int32 victim_tid;

  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);

  threads_data = task_team->tt.tt_threads_data;
  KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition

  victim_tid = victim->th.th_info.ds.ds_tid;
  victim_td = &threads_data[victim_tid];

  KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
                "task_team=%p ntasks=%d "
                "head=%u tail=%u\n",
                gtid, __kmp_gtid_from_thread(victim), task_team,
                victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
                victim_td->td.td_deque_tail));

  // Cheap unlocked check first; re-checked under the deque lock below.
  if ((TCR_4(victim_td->td.td_deque_ntasks) ==
       0) || // Caller should not check this condition
      (TCR_PTR(victim->th.th_task_team) !=
       task_team)) // GEH: why would this happen?
  {
    KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
                  "task_team=%p "
                  "ntasks=%d head=%u tail=%u\n",
                  gtid, __kmp_gtid_from_thread(victim), task_team,
                  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
                  victim_td->td.td_deque_tail));
    return NULL;
  }

  __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);

  // Check again after we acquire the lock
  if ((TCR_4(victim_td->td.td_deque_ntasks) == 0) ||
      (TCR_PTR(victim->th.th_task_team) !=
       task_team)) // GEH: why would this happen?
  {
    __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
    KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
                  "task_team=%p "
                  "ntasks=%d head=%u tail=%u\n",
                  gtid, __kmp_gtid_from_thread(victim), task_team,
                  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
                  victim_td->td.td_deque_tail));
    return NULL;
  }

  KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);

  // Steal from the HEAD of the victim's deque; the owner removes from the
  // tail (see __kmp_remove_my_task), so the two ends rarely contend.
  taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
  if (is_constrained) {
    // we need to check if the candidate obeys task scheduling constraint:
    // only descendant of current task can be scheduled
    kmp_taskdata_t *current = __kmp_threads[gtid]->th.th_current_task;
    kmp_int32 level = current->td_level;
    kmp_taskdata_t *parent = taskdata->td_parent;
    while (parent != current && parent->td_level > level) {
      parent = parent->td_parent; // check generation up to the level of the
      // current task
      KMP_DEBUG_ASSERT(parent != NULL);
    }
    if (parent != current) {
      // If the head task is not a descendant of the current task then do not
      // steal it. No other task in victim's deque can be a descendant of the
      // current task.
      __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
      KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from "
                    "T#%d: task_team=%p "
                    "ntasks=%d head=%u tail=%u\n",
                    gtid,
                    __kmp_gtid_from_thread(threads_data[victim_tid].td.td_thr),
                    task_team, victim_td->td.td_deque_ntasks,
                    victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
      return NULL;
    }
  }
  // Bump head pointer and Wrap.
  victim_td->td.td_deque_head =
      (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
  if (*thread_finished) {
    // We need to un-mark this victim as a finished victim.  This must be done
    // before releasing the lock, or else other threads (starting with the
    // master victim) might be prematurely released from the barrier!!!
    kmp_int32 count;

    count = KMP_TEST_THEN_INC32(unfinished_threads);

    KA_TRACE(
        20,
        ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
         gtid, count + 1, task_team));

    *thread_finished = FALSE;
  }
  TCW_4(victim_td->td.td_deque_ntasks,
        TCR_4(victim_td->td.td_deque_ntasks) - 1);

  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);

  KMP_COUNT_BLOCK(TASK_stolen);
  KA_TRACE(
      10,
      ("__kmp_steal_task(exit #3): T#%d stole task %p from T#%d: task_team=%p "
       "ntasks=%d head=%u tail=%u\n",
       gtid, taskdata, __kmp_gtid_from_thread(victim), task_team,
       victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
       victim_td->td.td_deque_tail));

  task = KMP_TASKDATA_TO_TASK(taskdata);
  return task;
}

// __kmp_execute_tasks_template: Choose and execute tasks until either the
// condition is statisfied (return true) or there are none left (return false).
//
// final_spin is TRUE if this is the spin at the release barrier.
// thread_finished indicates whether the thread is finished executing all
// the tasks it has on its deque, and is at the release barrier.
// spinner is the location on which to spin.
// spinner == NULL means only execute a single task and return.
// checker is the value to check to terminate the spin.
// C is the flag class (kmp_flag_32, kmp_flag_64, or kmp_flag_oncore) whose
// done_check() decides when the spin terminates.
template <class C>
static inline int __kmp_execute_tasks_template(
    kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  kmp_task_team_t *task_team = thread->th.th_task_team;
  kmp_thread_data_t *threads_data;
  kmp_task_t *task;
  kmp_info_t *other_thread;
  kmp_taskdata_t *current_task = thread->th.th_current_task;
  volatile kmp_int32 *unfinished_threads;
  kmp_int32 nthreads, victim = -2, use_own_tasks = 1, new_victim = 0,
                      tid = thread->th.th_info.ds.ds_tid;

  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);

  if (task_team == NULL)
    return FALSE;

  KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
                "*thread_finished=%d\n",
                gtid, final_spin, *thread_finished));

  thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
  KMP_DEBUG_ASSERT(threads_data != NULL);

  nthreads = task_team->tt.tt_nproc;
  unfinished_threads = &(task_team->tt.tt_unfinished_threads);
#if OMP_45_ENABLED
  KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks);
#else
  KMP_DEBUG_ASSERT(nthreads > 1);
#endif
  KMP_DEBUG_ASSERT(TCR_4(*unfinished_threads) >= 0);

  while (1) { // Outer loop keeps trying to find tasks in case of single thread
    // getting tasks from target constructs
    while (1) { // Inner loop to find a task and execute it
      task = NULL;
      if (use_own_tasks) { // check on own queue first
        task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
      }
      if ((task == NULL) && (nthreads > 1)) { // Steal a task
        int asleep = 1;
        use_own_tasks = 0;
        // Try to steal from the last place I stole from successfully.
        if (victim == -2) { // haven't stolen anything yet
          victim = threads_data[tid].td.td_deque_last_stolen;
          if (victim !=
              -1) // if we have a last stolen from victim, get the thread
            other_thread = threads_data[victim].td.td_thr;
        }
        if (victim != -1) { // found last victim
          asleep = 0;
        } else if (!new_victim) { // no recent steals and we haven't already
          // used a new victim; select a random thread
          do { // Find a different thread to steal work from.
            // Pick a random thread. Initial plan was to cycle through all the
            // threads, and only return if we tried to steal from every thread,
            // and failed. Arch says that's not such a great idea.
            victim = __kmp_get_random(thread) % (nthreads - 1);
            if (victim >= tid) {
              ++victim; // Adjusts random distribution to exclude self
            }
            // Found a potential victim
            other_thread = threads_data[victim].td.td_thr;
            // There is a slight chance that __kmp_enable_tasking() did not wake
            // up all threads waiting at the barrier. If victim is sleeping,
            // then wake it up. Since we were going to pay the cache miss
            // penalty for referencing another thread's kmp_info_t struct
            // anyway,
            // the check shouldn't cost too much performance at this point. In
            // extra barrier mode, tasks do not sleep at the separate tasking
            // barrier, so this isn't a problem.
            asleep = 0;
            if ((__kmp_tasking_mode == tskm_task_teams) &&
                (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
                (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
                 NULL)) {
              asleep = 1;
              __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread),
                                        other_thread->th.th_sleep_loc);
              // A sleeping thread should not have any tasks on its queue.
              // There is a slight possibility that it resumes, steals a task
              // from another thread, which spawns more tasks, all in the time
              // that it takes this thread to check => don't write an assertion
              // that the victim's queue is empty. Try stealing from a
              // different thread.
            }
          } while (asleep);
        }

        if (!asleep) {
          // We have a victim to try to steal from
          task = __kmp_steal_task(other_thread, gtid, task_team,
                                  unfinished_threads, thread_finished,
                                  is_constrained);
        }
        if (task != NULL) { // set last stolen to victim
          if (threads_data[tid].td.td_deque_last_stolen != victim) {
            threads_data[tid].td.td_deque_last_stolen = victim;
            // The pre-refactored code did not try more than 1 successful new
            // vicitm, unless the last one generated more local tasks;
            // new_victim keeps track of this
            new_victim = 1;
          }
        } else { // No tasks found; unset last_stolen
          KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
          victim = -2; // no successful victim found
        }
      }

      if (task == NULL) // break out of tasking loop
        break;

// Found a task; execute it
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
        if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
          // get the object reliably
          itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
        }
        __kmp_itt_task_starting(itt_sync_obj);
      }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
      __kmp_invoke_task(gtid, task, current_task);
#if USE_ITT_BUILD
      if (itt_sync_obj != NULL)
        __kmp_itt_task_finished(itt_sync_obj);
#endif /* USE_ITT_BUILD */
      // If this thread is only partway through the barrier and the condition is
      // met, then return now, so that the barrier gather/release pattern can
      // proceed. If this thread is in the last spin loop in the barrier,
      // waiting to be released, we know that the termination condition will not
      // be satisified, so don't waste any cycles checking it.
      if (flag == NULL || (!final_spin && flag->done_check())) {
        KA_TRACE(
            15,
            ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
             gtid));
        return TRUE;
      }
      if (thread->th.th_task_team == NULL) {
        break;
      }
      // Yield before executing next task
      KMP_YIELD(__kmp_library == library_throughput);
      // If execution of a stolen task results in more tasks being placed on our
      // run queue, reset use_own_tasks
      if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
        KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
                      "other tasks, restart\n",
                      gtid));
        use_own_tasks = 1;
        new_victim = 0;
      }
    }

// The task source has been exhausted. If in final spin loop of barrier, check
// if termination condition is satisfied.
#if OMP_45_ENABLED
    // The work queue may be empty but there might be proxy tasks still
    // executing
    if (final_spin && TCR_4(current_task->td_incomplete_child_tasks) == 0)
#else
    if (final_spin)
#endif
    {
      // First, decrement the #unfinished threads, if that has not already been
      // done.  This decrement might be to the spin location, and result in the
      // termination condition being satisfied.
      if (!*thread_finished) {
        kmp_int32 count;

        count = KMP_TEST_THEN_DEC32(unfinished_threads) - 1;
        KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
                      "unfinished_threads to %d task_team=%p\n",
                      gtid, count, task_team));
        *thread_finished = TRUE;
      }

      // It is now unsafe to reference thread->th.th_team !!!
      // Decrementing task_team->tt.tt_unfinished_threads can allow the master
      // thread to pass through the barrier, where it might reset each thread's
      // th.th_team field for the next parallel region. If we can steal more
      // work, we know that this has not happened yet.
      if (flag != NULL && flag->done_check()) {
        KA_TRACE(
            15,
            ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
             gtid));
        return TRUE;
      }
    }

    // If this thread's task team is NULL, master has recognized that there are
    // no more tasks; bail out
    if (thread->th.th_task_team == NULL) {
      KA_TRACE(15,
               ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
      return FALSE;
    }

#if OMP_45_ENABLED
    // We could be getting tasks from target constructs; if this is the only
    // thread, keep trying to execute tasks from own queue
    if (nthreads == 1)
      use_own_tasks = 1;
    else
#endif
    {
      KA_TRACE(15,
               ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
      return FALSE;
    }
  }
}

// Non-template entry point instantiating the template for 32-bit flags.
int __kmp_execute_tasks_32(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

// Non-template entry point instantiating the template for 64-bit flags.
int __kmp_execute_tasks_64(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

// Non-template entry point instantiating the template for oncore flags.
int __kmp_execute_tasks_oncore(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  return __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
}

// __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
// next barrier so they can assist in executing enqueued tasks.
// First thread in allocates the task team atomically.
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
                                 kmp_info_t *this_thr) {
  kmp_thread_data_t *threads_data;
  int nthreads, i, is_init_thread;

  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
                __kmp_gtid_from_thread(this_thr)));

  KMP_DEBUG_ASSERT(task_team != NULL);
  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);

  nthreads = task_team->tt.tt_nproc;
  KMP_DEBUG_ASSERT(nthreads > 0);
  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);

  // Allocate or increase the size of threads_data if necessary
  is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);

  if (!is_init_thread) {
    // Some other thread already set up the array.
    KA_TRACE(
        20,
        ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
         __kmp_gtid_from_thread(this_thr)));
    return;
  }
  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
  KMP_DEBUG_ASSERT(threads_data != NULL);

  if ((__kmp_tasking_mode == tskm_task_teams) &&
      (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
    // Release any threads sleeping at the barrier, so that they can steal
    // tasks and execute them.  In extra barrier mode, tasks do not sleep
    // at the separate tasking barrier, so this isn't a problem.
    for (i = 0; i < nthreads; i++) {
      volatile void *sleep_loc;
      kmp_info_t *thread = threads_data[i].td.td_thr;

      // Skip ourselves; we are obviously awake.
      if (i == this_thr->th.th_info.ds.ds_tid) {
        continue;
      }
      // Since we haven't locked the thread's suspend mutex lock at this
      // point, there is a small window where a thread might be putting
      // itself to sleep, but hasn't set the th_sleep_loc field yet.
      // To work around this, __kmp_execute_tasks_template() periodically checks
      // see if other threads are sleeping (using the same random mechanism that
      // is used for task stealing) and awakens them if they are.
      if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
          NULL) {
        KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
                      __kmp_gtid_from_thread(this_thr),
                      __kmp_gtid_from_thread(thread)));
        __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
      } else {
        KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
                      __kmp_gtid_from_thread(this_thr),
                      __kmp_gtid_from_thread(thread)));
      }
    }
  }

  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
                __kmp_gtid_from_thread(this_thr)));
}

/* // TODO: Check the comment consistency
 * Utility routines for "task teams". A task team (kmp_task_t) is kind of
 * like a shadow of the kmp_team_t data struct, with a different lifetime.
 * After a child * thread checks into a barrier and calls __kmp_release() from
 * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
 * longer assume that the kmp_team_t structure is intact (at any moment, the
 * master thread may exit the barrier code and free the team data structure,
 * and return the threads to the thread pool).
 *
 * This does not work with the tasking code, as the thread is still
 * expected to participate in the execution of any tasks that may have been
 * spawned by a member of the team, and the thread still needs access to all
 * to each thread in the team, so that it can steal work from it.
 *
 * Enter the existence of the kmp_task_team_t struct. It employs a reference
 * counting mechanism, and is allocated by the master thread before calling
 * __kmp_<barrier_kind>_release, and then is release by the last thread to
 * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
 * of the kmp_task_team_t structs for consecutive barriers can overlap
 * (and will, unless the master thread is the last thread to exit the barrier
 * release phase, which is not typical).
 *
 * The existence of such a struct is useful outside the context of tasking,
 * but for now, I'm trying to keep it specific to the OMP_30_ENABLED macro,
 * so that any performance differences show up when comparing the 2.5 vs. 3.0
 * libraries.
 *
 * We currently use the existence of the threads array as an indicator that
 * tasks were spawned since the last barrier. If the structure is to be
 * useful outside the context of tasking, then this will have to change, but
 * not setting the field minimizes the performance impact of tasking on
 * barriers, when no explicit tasks were spawned (pushed, actually).
 */

static kmp_task_team_t *__kmp_free_task_teams =
    NULL; // Free list for task_team data structures
// Lock for task team data structures
static kmp_bootstrap_lock_t __kmp_task_team_lock =
    KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);

// __kmp_alloc_task_deque:
// Allocates a task deque for a particular thread, and initialize the necessary
// data structures relating to the deque. This only happens once per thread
// per task team since task teams are recycled. No lock is needed during
// allocation since each thread allocates its own deque.
static void __kmp_alloc_task_deque(kmp_info_t *thread,
                                   kmp_thread_data_t *thread_data) {
  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
  KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);

  // Initialize last stolen task field to "none"
  thread_data->td.td_deque_last_stolen = -1;

  // A recycled thread_data must come back with an empty deque.
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
  KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
  KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);

  KE_TRACE(
      10,
      ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
       __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
  // Allocate space for task deque, and zero the deque
  // Cannot use __kmp_thread_calloc() because threads not around for
  // kmp_reap_task_team( ).
  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
      INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
}

// __kmp_realloc_task_deque:
// Re-allocates a task deque for a particular thread, copies the content from
// the old deque and adjusts the necessary data structures relating to the
// deque.
This operation must be done with a the deque_lock being held 2478 static void __kmp_realloc_task_deque(kmp_info_t *thread, 2479 kmp_thread_data_t *thread_data) { 2480 kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td); 2481 kmp_int32 new_size = 2 * size; 2482 2483 KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to " 2484 "%d] for thread_data %p\n", 2485 __kmp_gtid_from_thread(thread), size, new_size, thread_data)); 2486 2487 kmp_taskdata_t **new_deque = 2488 (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *)); 2489 2490 int i, j; 2491 for (i = thread_data->td.td_deque_head, j = 0; j < size; 2492 i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++) 2493 new_deque[j] = thread_data->td.td_deque[i]; 2494 2495 __kmp_free(thread_data->td.td_deque); 2496 2497 thread_data->td.td_deque_head = 0; 2498 thread_data->td.td_deque_tail = size; 2499 thread_data->td.td_deque = new_deque; 2500 thread_data->td.td_deque_size = new_size; 2501 } 2502 2503 // __kmp_free_task_deque: 2504 // Deallocates a task deque for a particular thread. Happens at library 2505 // deallocation so don't need to reset all thread data fields. 
2506 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) { 2507 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 2508 2509 if (thread_data->td.td_deque != NULL) { 2510 TCW_4(thread_data->td.td_deque_ntasks, 0); 2511 __kmp_free(thread_data->td.td_deque); 2512 thread_data->td.td_deque = NULL; 2513 } 2514 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 2515 2516 #ifdef BUILD_TIED_TASK_STACK 2517 // GEH: Figure out what to do here for td_susp_tied_tasks 2518 if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) { 2519 __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data); 2520 } 2521 #endif // BUILD_TIED_TASK_STACK 2522 } 2523 2524 // __kmp_realloc_task_threads_data: 2525 // Allocates a threads_data array for a task team, either by allocating an 2526 // initial array or enlarging an existing array. Only the first thread to get 2527 // the lock allocs or enlarges the array and re-initializes the array eleemnts. 2528 // That thread returns "TRUE", the rest return "FALSE". 2529 // Assumes that the new array size is given by task_team -> tt.tt_nproc. 2530 // The current size is given by task_team -> tt.tt_max_threads. 2531 static int __kmp_realloc_task_threads_data(kmp_info_t *thread, 2532 kmp_task_team_t *task_team) { 2533 kmp_thread_data_t **threads_data_p; 2534 kmp_int32 nthreads, maxthreads; 2535 int is_init_thread = FALSE; 2536 2537 if (TCR_4(task_team->tt.tt_found_tasks)) { 2538 // Already reallocated and initialized. 2539 return FALSE; 2540 } 2541 2542 threads_data_p = &task_team->tt.tt_threads_data; 2543 nthreads = task_team->tt.tt_nproc; 2544 maxthreads = task_team->tt.tt_max_threads; 2545 2546 // All threads must lock when they encounter the first task of the implicit 2547 // task region to make sure threads_data fields are (re)initialized before 2548 // used. 
2549 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock); 2550 2551 if (!TCR_4(task_team->tt.tt_found_tasks)) { 2552 // first thread to enable tasking 2553 kmp_team_t *team = thread->th.th_team; 2554 int i; 2555 2556 is_init_thread = TRUE; 2557 if (maxthreads < nthreads) { 2558 2559 if (*threads_data_p != NULL) { 2560 kmp_thread_data_t *old_data = *threads_data_p; 2561 kmp_thread_data_t *new_data = NULL; 2562 2563 KE_TRACE( 2564 10, 2565 ("__kmp_realloc_task_threads_data: T#%d reallocating " 2566 "threads data for task_team %p, new_size = %d, old_size = %d\n", 2567 __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads)); 2568 // Reallocate threads_data to have more elements than current array 2569 // Cannot use __kmp_thread_realloc() because threads not around for 2570 // kmp_reap_task_team( ). Note all new array entries are initialized 2571 // to zero by __kmp_allocate(). 2572 new_data = (kmp_thread_data_t *)__kmp_allocate( 2573 nthreads * sizeof(kmp_thread_data_t)); 2574 // copy old data to new data 2575 KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t), 2576 (void *)old_data, maxthreads * sizeof(kmp_thread_data_t)); 2577 2578 #ifdef BUILD_TIED_TASK_STACK 2579 // GEH: Figure out if this is the right thing to do 2580 for (i = maxthreads; i < nthreads; i++) { 2581 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 2582 __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data); 2583 } 2584 #endif // BUILD_TIED_TASK_STACK 2585 // Install the new data and free the old data 2586 (*threads_data_p) = new_data; 2587 __kmp_free(old_data); 2588 } else { 2589 KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating " 2590 "threads data for task_team %p, size = %d\n", 2591 __kmp_gtid_from_thread(thread), task_team, nthreads)); 2592 // Make the initial allocate for threads_data array, and zero entries 2593 // Cannot use __kmp_thread_calloc() because threads not around for 2594 // kmp_reap_task_team( ). 
2595 ANNOTATE_IGNORE_WRITES_BEGIN(); 2596 *threads_data_p = (kmp_thread_data_t *)__kmp_allocate( 2597 nthreads * sizeof(kmp_thread_data_t)); 2598 ANNOTATE_IGNORE_WRITES_END(); 2599 #ifdef BUILD_TIED_TASK_STACK 2600 // GEH: Figure out if this is the right thing to do 2601 for (i = 0; i < nthreads; i++) { 2602 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 2603 __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data); 2604 } 2605 #endif // BUILD_TIED_TASK_STACK 2606 } 2607 task_team->tt.tt_max_threads = nthreads; 2608 } else { 2609 // If array has (more than) enough elements, go ahead and use it 2610 KMP_DEBUG_ASSERT(*threads_data_p != NULL); 2611 } 2612 2613 // initialize threads_data pointers back to thread_info structures 2614 for (i = 0; i < nthreads; i++) { 2615 kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; 2616 thread_data->td.td_thr = team->t.t_threads[i]; 2617 2618 if (thread_data->td.td_deque_last_stolen >= nthreads) { 2619 // The last stolen field survives across teams / barrier, and the number 2620 // of threads may have changed. It's possible (likely?) that a new 2621 // parallel region will exhibit the same behavior as previous region. 2622 thread_data->td.td_deque_last_stolen = -1; 2623 } 2624 } 2625 2626 KMP_MB(); 2627 TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE); 2628 } 2629 2630 __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock); 2631 return is_init_thread; 2632 } 2633 2634 // __kmp_free_task_threads_data: 2635 // Deallocates a threads_data array for a task team, including any attached 2636 // tasking deques. Only occurs at library shutdown. 
2637 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) { 2638 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock); 2639 if (task_team->tt.tt_threads_data != NULL) { 2640 int i; 2641 for (i = 0; i < task_team->tt.tt_max_threads; i++) { 2642 __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]); 2643 } 2644 __kmp_free(task_team->tt.tt_threads_data); 2645 task_team->tt.tt_threads_data = NULL; 2646 } 2647 __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock); 2648 } 2649 2650 // __kmp_allocate_task_team: 2651 // Allocates a task team associated with a specific team, taking it from 2652 // the global task team free list if possible. Also initializes data 2653 // structures. 2654 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread, 2655 kmp_team_t *team) { 2656 kmp_task_team_t *task_team = NULL; 2657 int nthreads; 2658 2659 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n", 2660 (thread ? __kmp_gtid_from_thread(thread) : -1), team)); 2661 2662 if (TCR_PTR(__kmp_free_task_teams) != NULL) { 2663 // Take a task team from the task team pool 2664 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 2665 if (__kmp_free_task_teams != NULL) { 2666 task_team = __kmp_free_task_teams; 2667 TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next); 2668 task_team->tt.tt_next = NULL; 2669 } 2670 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 2671 } 2672 2673 if (task_team == NULL) { 2674 KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating " 2675 "task team for team %p\n", 2676 __kmp_gtid_from_thread(thread), team)); 2677 // Allocate a new task team if one is not available. 2678 // Cannot use __kmp_thread_malloc() because threads not around for 2679 // kmp_reap_task_team( ). 
2680 task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t)); 2681 __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock); 2682 // AC: __kmp_allocate zeroes returned memory 2683 // task_team -> tt.tt_threads_data = NULL; 2684 // task_team -> tt.tt_max_threads = 0; 2685 // task_team -> tt.tt_next = NULL; 2686 } 2687 2688 TCW_4(task_team->tt.tt_found_tasks, FALSE); 2689 #if OMP_45_ENABLED 2690 TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); 2691 #endif 2692 task_team->tt.tt_nproc = nthreads = team->t.t_nproc; 2693 2694 TCW_4(task_team->tt.tt_unfinished_threads, nthreads); 2695 TCW_4(task_team->tt.tt_active, TRUE); 2696 2697 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p " 2698 "unfinished_threads init'd to %d\n", 2699 (thread ? __kmp_gtid_from_thread(thread) : -1), task_team, 2700 task_team->tt.tt_unfinished_threads)); 2701 return task_team; 2702 } 2703 2704 // __kmp_free_task_team: 2705 // Frees the task team associated with a specific thread, and adds it 2706 // to the global task team free list. 2707 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) { 2708 KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n", 2709 thread ? __kmp_gtid_from_thread(thread) : -1, task_team)); 2710 2711 // Put task team back on free list 2712 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 2713 2714 KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL); 2715 task_team->tt.tt_next = __kmp_free_task_teams; 2716 TCW_PTR(__kmp_free_task_teams, task_team); 2717 2718 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 2719 } 2720 2721 // __kmp_reap_task_teams: 2722 // Free all the task teams on the task team free list. 2723 // Should only be done during library shutdown. 2724 // Cannot do anything that needs a thread structure or gtid since they are 2725 // already gone. 
2726 void __kmp_reap_task_teams(void) { 2727 kmp_task_team_t *task_team; 2728 2729 if (TCR_PTR(__kmp_free_task_teams) != NULL) { 2730 // Free all task_teams on the free list 2731 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock); 2732 while ((task_team = __kmp_free_task_teams) != NULL) { 2733 __kmp_free_task_teams = task_team->tt.tt_next; 2734 task_team->tt.tt_next = NULL; 2735 2736 // Free threads_data if necessary 2737 if (task_team->tt.tt_threads_data != NULL) { 2738 __kmp_free_task_threads_data(task_team); 2739 } 2740 __kmp_free(task_team); 2741 } 2742 __kmp_release_bootstrap_lock(&__kmp_task_team_lock); 2743 } 2744 } 2745 2746 // __kmp_wait_to_unref_task_teams: 2747 // Some threads could still be in the fork barrier release code, possibly 2748 // trying to steal tasks. Wait for each thread to unreference its task team. 2749 void __kmp_wait_to_unref_task_teams(void) { 2750 kmp_info_t *thread; 2751 kmp_uint32 spins; 2752 int done; 2753 2754 KMP_INIT_YIELD(spins); 2755 2756 for (;;) { 2757 done = TRUE; 2758 2759 // TODO: GEH - this may be is wrong because some sync would be necessary 2760 // in case threads are added to the pool during the traversal. Need to 2761 // verify that lock for thread pool is held when calling this routine. 2762 for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL; 2763 thread = thread->th.th_next_pool) { 2764 #if KMP_OS_WINDOWS 2765 DWORD exit_val; 2766 #endif 2767 if (TCR_PTR(thread->th.th_task_team) == NULL) { 2768 KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n", 2769 __kmp_gtid_from_thread(thread))); 2770 continue; 2771 } 2772 #if KMP_OS_WINDOWS 2773 // TODO: GEH - add this check for Linux* OS / OS X* as well? 
2774 if (!__kmp_is_thread_alive(thread, &exit_val)) { 2775 thread->th.th_task_team = NULL; 2776 continue; 2777 } 2778 #endif 2779 2780 done = FALSE; // Because th_task_team pointer is not NULL for this thread 2781 2782 KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to " 2783 "unreference task_team\n", 2784 __kmp_gtid_from_thread(thread))); 2785 2786 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 2787 volatile void *sleep_loc; 2788 // If the thread is sleeping, awaken it. 2789 if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) != 2790 NULL) { 2791 KA_TRACE( 2792 10, 2793 ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n", 2794 __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread))); 2795 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc); 2796 } 2797 } 2798 } 2799 if (done) { 2800 break; 2801 } 2802 2803 // If we are oversubscribed, or have waited a bit (and library mode is 2804 // throughput), yield. Pause is in the following code. 2805 KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc); 2806 KMP_YIELD_SPIN(spins); // Yields only if KMP_LIBRARY=throughput 2807 } 2808 } 2809 2810 // __kmp_task_team_setup: Create a task_team for the current team, but use 2811 // an already created, unused one if it already exists. 2812 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) { 2813 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 2814 2815 // If this task_team hasn't been created yet, allocate it. It will be used in 2816 // the region after the next. 2817 // If it exists, it is the current task team and shouldn't be touched yet as 2818 // it may still be in use. 
2819 if (team->t.t_task_team[this_thr->th.th_task_state] == NULL && 2820 (always || team->t.t_nproc > 1)) { 2821 team->t.t_task_team[this_thr->th.th_task_state] = 2822 __kmp_allocate_task_team(this_thr, team); 2823 KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p " 2824 "for team %d at parity=%d\n", 2825 __kmp_gtid_from_thread(this_thr), 2826 team->t.t_task_team[this_thr->th.th_task_state], 2827 ((team != NULL) ? team->t.t_id : -1), 2828 this_thr->th.th_task_state)); 2829 } 2830 2831 // After threads exit the release, they will call sync, and then point to this 2832 // other task_team; make sure it is allocated and properly initialized. As 2833 // threads spin in the barrier release phase, they will continue to use the 2834 // previous task_team struct(above), until they receive the signal to stop 2835 // checking for tasks (they can't safely reference the kmp_team_t struct, 2836 // which could be reallocated by the master thread). No task teams are formed 2837 // for serialized teams. 2838 if (team->t.t_nproc > 1) { 2839 int other_team = 1 - this_thr->th.th_task_state; 2840 if (team->t.t_task_team[other_team] == NULL) { // setup other team as well 2841 team->t.t_task_team[other_team] = 2842 __kmp_allocate_task_team(this_thr, team); 2843 KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new " 2844 "task_team %p for team %d at parity=%d\n", 2845 __kmp_gtid_from_thread(this_thr), 2846 team->t.t_task_team[other_team], 2847 ((team != NULL) ? 
team->t.t_id : -1), other_team)); 2848 } else { // Leave the old task team struct in place for the upcoming region; 2849 // adjust as needed 2850 kmp_task_team_t *task_team = team->t.t_task_team[other_team]; 2851 if (!task_team->tt.tt_active || 2852 team->t.t_nproc != task_team->tt.tt_nproc) { 2853 TCW_4(task_team->tt.tt_nproc, team->t.t_nproc); 2854 TCW_4(task_team->tt.tt_found_tasks, FALSE); 2855 #if OMP_45_ENABLED 2856 TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE); 2857 #endif 2858 TCW_4(task_team->tt.tt_unfinished_threads, team->t.t_nproc); 2859 TCW_4(task_team->tt.tt_active, TRUE); 2860 } 2861 // if team size has changed, the first thread to enable tasking will 2862 // realloc threads_data if necessary 2863 KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team " 2864 "%p for team %d at parity=%d\n", 2865 __kmp_gtid_from_thread(this_thr), 2866 team->t.t_task_team[other_team], 2867 ((team != NULL) ? team->t.t_id : -1), other_team)); 2868 } 2869 } 2870 } 2871 2872 // __kmp_task_team_sync: Propagation of task team data from team to threads 2873 // which happens just after the release phase of a team barrier. This may be 2874 // called by any thread, but only for teams with # threads > 1. 2875 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) { 2876 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 2877 2878 // Toggle the th_task_state field, to switch which task_team this thread 2879 // refers to 2880 this_thr->th.th_task_state = 1 - this_thr->th.th_task_state; 2881 // It is now safe to propagate the task team pointer from the team struct to 2882 // the current thread. 2883 TCW_PTR(this_thr->th.th_task_team, 2884 team->t.t_task_team[this_thr->th.th_task_state]); 2885 KA_TRACE(20, 2886 ("__kmp_task_team_sync: Thread T#%d task team switched to task_team " 2887 "%p from Team #%d (parity=%d)\n", 2888 __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team, 2889 ((team != NULL) ? 
team->t.t_id : -1), this_thr->th.th_task_state)); 2890 } 2891 2892 // __kmp_task_team_wait: Master thread waits for outstanding tasks after the 2893 // barrier gather phase. Only called by master thread if #threads in team > 1 or 2894 // if proxy tasks were created. 2895 // 2896 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off 2897 // by passing in 0 optionally as the last argument. When wait is zero, master 2898 // thread does not wait for unfinished_threads to reach 0. 2899 void __kmp_task_team_wait( 2900 kmp_info_t *this_thr, 2901 kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) { 2902 kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state]; 2903 2904 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec); 2905 KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team); 2906 2907 if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) { 2908 if (wait) { 2909 KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks " 2910 "(for unfinished_threads to reach 0) on task_team = %p\n", 2911 __kmp_gtid_from_thread(this_thr), task_team)); 2912 // Worker threads may have dropped through to release phase, but could 2913 // still be executing tasks. Wait here for tasks to complete. To avoid 2914 // memory contention, only master thread checks termination condition. 2915 kmp_flag_32 flag( 2916 RCAST(volatile kmp_uint32 *, &task_team->tt.tt_unfinished_threads), 2917 0U); 2918 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); 2919 } 2920 // Deactivate the old task team, so that the worker threads will stop 2921 // referencing it while spinning. 
2922 KA_TRACE( 2923 20, 2924 ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: " 2925 "setting active to false, setting local and team's pointer to NULL\n", 2926 __kmp_gtid_from_thread(this_thr), task_team)); 2927 #if OMP_45_ENABLED 2928 KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 || 2929 task_team->tt.tt_found_proxy_tasks == TRUE); 2930 TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE); 2931 #else 2932 KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1); 2933 #endif 2934 TCW_SYNC_4(task_team->tt.tt_active, FALSE); 2935 KMP_MB(); 2936 2937 TCW_PTR(this_thr->th.th_task_team, NULL); 2938 } 2939 } 2940 2941 // __kmp_tasking_barrier: 2942 // This routine may only called when __kmp_tasking_mode == tskm_extra_barrier. 2943 // Internal function to execute all tasks prior to a regular barrier or a join 2944 // barrier. It is a full barrier itself, which unfortunately turns regular 2945 // barriers into double barriers and join barriers into 1 1/2 barriers. 2946 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) { 2947 volatile kmp_uint32 *spin = RCAST( 2948 volatile kmp_uint32 *, 2949 &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads); 2950 int flag = FALSE; 2951 KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier); 2952 2953 #if USE_ITT_BUILD 2954 KMP_FSYNC_SPIN_INIT(spin, (kmp_uint32 *)NULL); 2955 #endif /* USE_ITT_BUILD */ 2956 kmp_flag_32 spin_flag(spin, 0U); 2957 while (!spin_flag.execute_tasks(thread, gtid, TRUE, 2958 &flag USE_ITT_BUILD_ARG(NULL), 0)) { 2959 #if USE_ITT_BUILD 2960 // TODO: What about itt_sync_obj?? 
2961 KMP_FSYNC_SPIN_PREPARE(CCAST(kmp_uint32 *, spin)); 2962 #endif /* USE_ITT_BUILD */ 2963 2964 if (TCR_4(__kmp_global.g.g_done)) { 2965 if (__kmp_global.g.g_abort) 2966 __kmp_abort_thread(); 2967 break; 2968 } 2969 KMP_YIELD(TRUE); // GH: We always yield here 2970 } 2971 #if USE_ITT_BUILD 2972 KMP_FSYNC_SPIN_ACQUIRED(CCAST(kmp_uint32 *, spin)); 2973 #endif /* USE_ITT_BUILD */ 2974 } 2975 2976 #if OMP_45_ENABLED 2977 2978 // __kmp_give_task puts a task into a given thread queue if: 2979 // - the queue for that thread was created 2980 // - there's space in that queue 2981 // Because of this, __kmp_push_task needs to check if there's space after 2982 // getting the lock 2983 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task, 2984 kmp_int32 pass) { 2985 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 2986 kmp_task_team_t *task_team = taskdata->td_task_team; 2987 2988 KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n", 2989 taskdata, tid)); 2990 2991 // If task_team is NULL something went really bad... 
2992 KMP_DEBUG_ASSERT(task_team != NULL); 2993 2994 bool result = false; 2995 kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid]; 2996 2997 if (thread_data->td.td_deque == NULL) { 2998 // There's no queue in this thread, go find another one 2999 // We're guaranteed that at least one thread has a queue 3000 KA_TRACE(30, 3001 ("__kmp_give_task: thread %d has no queue while giving task %p.\n", 3002 tid, taskdata)); 3003 return result; 3004 } 3005 3006 if (TCR_4(thread_data->td.td_deque_ntasks) >= 3007 TASK_DEQUE_SIZE(thread_data->td)) { 3008 KA_TRACE( 3009 30, 3010 ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", 3011 taskdata, tid)); 3012 3013 // if this deque is bigger than the pass ratio give a chance to another 3014 // thread 3015 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass) 3016 return result; 3017 3018 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 3019 __kmp_realloc_task_deque(thread, thread_data); 3020 3021 } else { 3022 3023 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock); 3024 3025 if (TCR_4(thread_data->td.td_deque_ntasks) >= 3026 TASK_DEQUE_SIZE(thread_data->td)) { 3027 KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to " 3028 "thread %d.\n", 3029 taskdata, tid)); 3030 3031 // if this deque is bigger than the pass ratio give a chance to another 3032 // thread 3033 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass) 3034 goto release_and_exit; 3035 3036 __kmp_realloc_task_deque(thread, thread_data); 3037 } 3038 } 3039 3040 // lock is held here, and there is space in the deque 3041 3042 thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata; 3043 // Wrap index. 
3044 thread_data->td.td_deque_tail = 3045 (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td); 3046 TCW_4(thread_data->td.td_deque_ntasks, 3047 TCR_4(thread_data->td.td_deque_ntasks) + 1); 3048 3049 result = true; 3050 KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n", 3051 taskdata, tid)); 3052 3053 release_and_exit: 3054 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); 3055 3056 return result; 3057 } 3058 3059 /* The finish of the proxy tasks is divided in two pieces: 3060 - the top half is the one that can be done from a thread outside the team 3061 - the bottom half must be run from a them within the team 3062 3063 In order to run the bottom half the task gets queued back into one of the 3064 threads of the team. Once the td_incomplete_child_task counter of the parent 3065 is decremented the threads can leave the barriers. So, the bottom half needs 3066 to be queued before the counter is decremented. The top half is therefore 3067 divided in two parts: 3068 - things that can be run before queuing the bottom half 3069 - things that must be run after queuing the bottom half 3070 3071 This creates a second race as the bottom half can free the task before the 3072 second top half is executed. To avoid this we use the 3073 td_incomplete_child_task of the proxy task to synchronize the top and bottom 3074 half. 
*/ 3075 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) { 3076 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT); 3077 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); 3078 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0); 3079 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0); 3080 3081 taskdata->td_flags.complete = 1; // mark the task as completed 3082 3083 if (taskdata->td_taskgroup) 3084 KMP_TEST_THEN_DEC32(&taskdata->td_taskgroup->count); 3085 3086 // Create an imaginary children for this task so the bottom half cannot 3087 // release the task before we have completed the second top half 3088 TCI_4(taskdata->td_incomplete_child_tasks); 3089 } 3090 3091 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) { 3092 kmp_int32 children = 0; 3093 3094 // Predecrement simulated by "- 1" calculation 3095 children = 3096 KMP_TEST_THEN_DEC32(&taskdata->td_parent->td_incomplete_child_tasks) - 1; 3097 KMP_DEBUG_ASSERT(children >= 0); 3098 3099 // Remove the imaginary children 3100 TCD_4(taskdata->td_incomplete_child_tasks); 3101 } 3102 3103 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) { 3104 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 3105 kmp_info_t *thread = __kmp_threads[gtid]; 3106 3107 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); 3108 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 3109 1); // top half must run before bottom half 3110 3111 // We need to wait to make sure the top half is finished 3112 // Spinning here should be ok as this should happen quickly 3113 while (TCR_4(taskdata->td_incomplete_child_tasks) > 0) 3114 ; 3115 3116 __kmp_release_deps(gtid, taskdata); 3117 __kmp_free_task_and_ancestors(gtid, taskdata, thread); 3118 } 3119 3120 /*! 
3121 @ingroup TASKING 3122 @param gtid Global Thread ID of encountering thread 3123 @param ptask Task which execution is completed 3124 3125 Execute the completation of a proxy task from a thread of that is part of the 3126 team. Run first and bottom halves directly. 3127 */ 3128 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) { 3129 KMP_DEBUG_ASSERT(ptask != NULL); 3130 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 3131 KA_TRACE( 3132 10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n", 3133 gtid, taskdata)); 3134 3135 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); 3136 3137 __kmp_first_top_half_finish_proxy(taskdata); 3138 __kmp_second_top_half_finish_proxy(taskdata); 3139 __kmp_bottom_half_finish_proxy(gtid, ptask); 3140 3141 KA_TRACE(10, 3142 ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n", 3143 gtid, taskdata)); 3144 } 3145 3146 /*! 3147 @ingroup TASKING 3148 @param ptask Task which execution is completed 3149 3150 Execute the completation of a proxy task from a thread that could not belong to 3151 the team. 
3152 */ 3153 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) { 3154 KMP_DEBUG_ASSERT(ptask != NULL); 3155 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask); 3156 3157 KA_TRACE( 3158 10, 3159 ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n", 3160 taskdata)); 3161 3162 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY); 3163 3164 __kmp_first_top_half_finish_proxy(taskdata); 3165 3166 // Enqueue task to complete bottom half completion from a thread within the 3167 // corresponding team 3168 kmp_team_t *team = taskdata->td_team; 3169 kmp_int32 nthreads = team->t.t_nproc; 3170 kmp_info_t *thread; 3171 3172 // This should be similar to start_k = __kmp_get_random( thread ) % nthreads 3173 // but we cannot use __kmp_get_random here 3174 kmp_int32 start_k = 0; 3175 kmp_int32 pass = 1; 3176 kmp_int32 k = start_k; 3177 3178 do { 3179 // For now we're just linearly trying to find a thread 3180 thread = team->t.t_threads[k]; 3181 k = (k + 1) % nthreads; 3182 3183 // we did a full pass through all the threads 3184 if (k == start_k) 3185 pass = pass << 1; 3186 3187 } while (!__kmp_give_task(thread, k, ptask, pass)); 3188 3189 __kmp_second_top_half_finish_proxy(taskdata); 3190 3191 KA_TRACE( 3192 10, 3193 ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n", 3194 taskdata)); 3195 } 3196 3197 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task 3198 // for taskloop 3199 // 3200 // thread: allocating thread 3201 // task_src: pointer to source task to be duplicated 3202 // returns: a pointer to the allocated kmp_task_t structure (task). 
3203 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) { 3204 kmp_task_t *task; 3205 kmp_taskdata_t *taskdata; 3206 kmp_taskdata_t *taskdata_src; 3207 kmp_taskdata_t *parent_task = thread->th.th_current_task; 3208 size_t shareds_offset; 3209 size_t task_size; 3210 3211 KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread, 3212 task_src)); 3213 taskdata_src = KMP_TASK_TO_TASKDATA(task_src); 3214 KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy == 3215 TASK_FULL); // it should not be proxy task 3216 KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT); 3217 task_size = taskdata_src->td_size_alloc; 3218 3219 // Allocate a kmp_taskdata_t block and a kmp_task_t block. 3220 KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread, 3221 task_size)); 3222 #if USE_FAST_MEMORY 3223 taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size); 3224 #else 3225 taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size); 3226 #endif /* USE_FAST_MEMORY */ 3227 KMP_MEMCPY(taskdata, taskdata_src, task_size); 3228 3229 task = KMP_TASKDATA_TO_TASK(taskdata); 3230 3231 // Initialize new task (only specific fields not affected by memcpy) 3232 taskdata->td_task_id = KMP_GEN_TASK_ID(); 3233 if (task->shareds != NULL) { // need setup shareds pointer 3234 shareds_offset = (char *)task_src->shareds - (char *)taskdata_src; 3235 task->shareds = &((char *)taskdata)[shareds_offset]; 3236 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) == 3237 0); 3238 } 3239 taskdata->td_alloc_thread = thread; 3240 taskdata->td_parent = parent_task; 3241 taskdata->td_taskgroup = 3242 parent_task 3243 ->td_taskgroup; // task inherits the taskgroup from the parent task 3244 3245 // Only need to keep track of child task counts if team parallel and tasking 3246 // not serialized 3247 if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) { 3248 
KMP_TEST_THEN_INC32(&parent_task->td_incomplete_child_tasks); 3249 if (parent_task->td_taskgroup) 3250 KMP_TEST_THEN_INC32(&parent_task->td_taskgroup->count); 3251 // Only need to keep track of allocated child tasks for explicit tasks since 3252 // implicit not deallocated 3253 if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) 3254 KMP_TEST_THEN_INC32(&taskdata->td_parent->td_allocated_child_tasks); 3255 } 3256 3257 KA_TRACE(20, 3258 ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n", 3259 thread, taskdata, taskdata->td_parent)); 3260 #if OMPT_SUPPORT 3261 __kmp_task_init_ompt(taskdata, thread->th.th_info.ds.ds_gtid, 3262 (void *)task->routine); 3263 #endif 3264 return task; 3265 } 3266 3267 // Routine optionally generated by the compiler for setting the lastprivate flag 3268 // and calling needed constructors for private/firstprivate objects 3269 // (used to form taskloop tasks from pattern task) 3270 // Parameters: dest task, src task, lastprivate flag. 3271 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32); 3272 3273 // __kmp_taskloop_linear: Start tasks of the taskloop linearly 3274 // 3275 // loc Source location information 3276 // gtid Global thread ID 3277 // task Pattern task, exposes the loop iteration range 3278 // lb Pointer to loop lower bound in task structure 3279 // ub Pointer to loop upper bound in task structure 3280 // st Loop stride 3281 // ub_glob Global upper bound (used for lastprivate check) 3282 // num_tasks Number of tasks to execute 3283 // grainsize Number of loop iterations per task 3284 // extras Number of chunks with grainsize+1 iterations 3285 // tc Iterations count 3286 // task_dup Tasks duplication routine 3287 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task, 3288 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, 3289 kmp_uint64 ub_glob, kmp_uint64 num_tasks, 3290 kmp_uint64 grainsize, kmp_uint64 extras, 3291 kmp_uint64 tc, void *task_dup) { 3292 
KMP_COUNT_BLOCK(OMP_TASKLOOP); 3293 KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling); 3294 p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; 3295 kmp_uint64 lower = *lb; // compiler provides global bounds here 3296 kmp_uint64 upper = *ub; 3297 kmp_uint64 i; 3298 kmp_info_t *thread = __kmp_threads[gtid]; 3299 kmp_taskdata_t *current_task = thread->th.th_current_task; 3300 kmp_task_t *next_task; 3301 kmp_int32 lastpriv = 0; 3302 size_t lower_offset = 3303 (char *)lb - (char *)task; // remember offset of lb in the task structure 3304 size_t upper_offset = 3305 (char *)ub - (char *)task; // remember offset of ub in the task structure 3306 3307 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras); 3308 KMP_DEBUG_ASSERT(num_tasks > extras); 3309 KMP_DEBUG_ASSERT(num_tasks > 0); 3310 KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, " 3311 "extras %lld, i=%lld,%lld(%d)%lld, dup %p\n", gtid, num_tasks, 3312 grainsize, extras, lower, upper, ub_glob, st, task_dup)); 3313 3314 // Launch num_tasks tasks, assign grainsize iterations each task 3315 for (i = 0; i < num_tasks; ++i) { 3316 kmp_uint64 chunk_minus_1; 3317 if (extras == 0) { 3318 chunk_minus_1 = grainsize - 1; 3319 } else { 3320 chunk_minus_1 = grainsize; 3321 --extras; // first extras iterations get bigger chunk (grainsize+1) 3322 } 3323 upper = lower + st * chunk_minus_1; 3324 if (i == num_tasks - 1) { 3325 // schedule the last task, set lastprivate flag if needed 3326 if (st == 1) { // most common case 3327 KMP_DEBUG_ASSERT(upper == *ub); 3328 if (upper == ub_glob) 3329 lastpriv = 1; 3330 } else if (st > 0) { // positive loop stride 3331 KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper); 3332 if ((kmp_uint64)st > ub_glob - upper) 3333 lastpriv = 1; 3334 } else { // negative loop stride 3335 KMP_DEBUG_ASSERT(upper + st < *ub); 3336 if (upper - ub_glob < (kmp_uint64)(-st)) 3337 lastpriv = 1; 3338 } 3339 } 3340 next_task = __kmp_task_dup_alloc(thread, task); // allocate new task 3341 // adjust 
task-specific bounds 3342 *(kmp_uint64 *)((char *)next_task + lower_offset) = lower; 3343 *(kmp_uint64 *)((char *)next_task + upper_offset) = upper; 3344 if (ptask_dup != NULL) // set lastprivate flag, construct fistprivates, etc. 3345 ptask_dup(next_task, task, lastpriv); 3346 KA_TRACE(40, ("__kmp_taskloop_linear: T#%d; task %p: lower %lld, " 3347 "upper %lld (offsets %p %p)\n", 3348 gtid, next_task, lower, upper, lower_offset, upper_offset)); 3349 __kmp_omp_task(gtid, next_task, true); // schedule new task 3350 lower = upper + st; // adjust lower bound for the next iteration 3351 } 3352 // free the pattern task and exit 3353 __kmp_task_start(gtid, task, current_task); // make internal bookkeeping 3354 // do not execute the pattern task, just do internal bookkeeping 3355 __kmp_task_finish(gtid, task, current_task); 3356 } 3357 3358 // Structure to keep taskloop parameters for auxiliary task 3359 // kept in the shareds of the task structure. 3360 typedef struct __taskloop_params { 3361 kmp_task_t *task; 3362 kmp_uint64 *lb; 3363 kmp_uint64 *ub; 3364 void *task_dup; 3365 kmp_int64 st; 3366 kmp_uint64 ub_glob; 3367 kmp_uint64 num_tasks; 3368 kmp_uint64 grainsize; 3369 kmp_uint64 extras; 3370 kmp_uint64 tc; 3371 kmp_uint64 num_t_min; 3372 } __taskloop_params_t; 3373 3374 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *, 3375 kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64, 3376 kmp_uint64, kmp_uint64, kmp_uint64, kmp_uint64, 3377 void *); 3378 3379 // Execute part of the the taskloop submitted as a task. 
3380 int __kmp_taskloop_task(int gtid, void *ptask) { 3381 __taskloop_params_t *p = (__taskloop_params_t*)((kmp_task_t*)ptask)->shareds; 3382 kmp_task_t *task = p->task; 3383 kmp_uint64 *lb = p->lb; 3384 kmp_uint64 *ub = p->ub; 3385 void *task_dup = p->task_dup; 3386 // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; 3387 kmp_int64 st = p->st; 3388 kmp_uint64 ub_glob = p->ub_glob; 3389 kmp_uint64 num_tasks = p->num_tasks; 3390 kmp_uint64 grainsize = p->grainsize; 3391 kmp_uint64 extras = p->extras; 3392 kmp_uint64 tc = p->tc; 3393 kmp_uint64 num_t_min = p->num_t_min; 3394 #if KMP_DEBUG 3395 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 3396 KMP_DEBUG_ASSERT(task != NULL); 3397 KA_TRACE(20, ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize" 3398 " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n", gtid, taskdata, 3399 num_tasks, grainsize, extras, *lb, *ub, st, task_dup)); 3400 #endif 3401 KMP_DEBUG_ASSERT(num_tasks*2+1 > num_t_min); 3402 if (num_tasks > num_t_min) 3403 __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks, 3404 grainsize, extras, tc, num_t_min, task_dup); 3405 else 3406 __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks, 3407 grainsize, extras, tc, task_dup); 3408 3409 KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid)); 3410 return 0; 3411 } 3412 3413 // Schedule part of the the taskloop as a task, 3414 // execute the rest of the the taskloop. 
3415 // 3416 // loc Source location information 3417 // gtid Global thread ID 3418 // task Pattern task, exposes the loop iteration range 3419 // lb Pointer to loop lower bound in task structure 3420 // ub Pointer to loop upper bound in task structure 3421 // st Loop stride 3422 // ub_glob Global upper bound (used for lastprivate check) 3423 // num_tasks Number of tasks to execute 3424 // grainsize Number of loop iterations per task 3425 // extras Number of chunks with grainsize+1 iterations 3426 // tc Iterations count 3427 // num_t_min Threashold to launch tasks recursively 3428 // task_dup Tasks duplication routine 3429 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task, 3430 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, 3431 kmp_uint64 ub_glob, kmp_uint64 num_tasks, 3432 kmp_uint64 grainsize, kmp_uint64 extras, 3433 kmp_uint64 tc, kmp_uint64 num_t_min, void *task_dup) { 3434 #if KMP_DEBUG 3435 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 3436 KMP_DEBUG_ASSERT(task != NULL); 3437 KMP_DEBUG_ASSERT(num_tasks > num_t_min); 3438 KA_TRACE(20, ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize" 3439 " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n", gtid, taskdata, 3440 num_tasks, grainsize, extras, *lb, *ub, st, task_dup)); 3441 #endif 3442 p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; 3443 kmp_uint64 lower = *lb; 3444 kmp_uint64 upper = *ub; 3445 kmp_info_t *thread = __kmp_threads[gtid]; 3446 // kmp_taskdata_t *current_task = thread->th.th_current_task; 3447 kmp_task_t *next_task; 3448 kmp_int32 lastpriv = 0; 3449 size_t lower_offset = 3450 (char *)lb - (char *)task; // remember offset of lb in the task structure 3451 size_t upper_offset = 3452 (char *)ub - (char *)task; // remember offset of ub in the task structure 3453 3454 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras); 3455 KMP_DEBUG_ASSERT(num_tasks > extras); 3456 KMP_DEBUG_ASSERT(num_tasks > 0); 3457 3458 // split the loop in two halves 3459 kmp_uint64 lb1, ub0, 
tc0, tc1, ext0, ext1; 3460 kmp_uint64 gr_size0 = grainsize; 3461 kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute 3462 kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task 3463 if (n_tsk0 <= extras) { 3464 gr_size0++; // integrate extras into grainsize 3465 ext0 = 0; // no extra iters in 1st half 3466 ext1 = extras - n_tsk0; // remaining extras 3467 tc0 = gr_size0 * n_tsk0; 3468 tc1 = tc - tc0; 3469 } else { // n_tsk0 > extras 3470 ext1 = 0; // no extra iters in 2nd half 3471 ext0 = extras; 3472 tc1 = grainsize * n_tsk1; 3473 tc0 = tc - tc1; 3474 } 3475 ub0 = lower + st * (tc0 - 1); 3476 lb1 = ub0 + st; 3477 3478 // create pattern task for 2nd half of the loop 3479 next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task 3480 // adjust lower bound (upper bound is not changed) for the 2nd half 3481 *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1; 3482 if (ptask_dup != NULL) // construct fistprivates, etc. 3483 ptask_dup(next_task, task, 0); 3484 *ub = ub0; // adjust upper bound for the 1st half 3485 3486 // create auxiliary task for 2nd half of the loop 3487 kmp_task_t *new_task = 3488 __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void*), 3489 sizeof(__taskloop_params_t), &__kmp_taskloop_task); 3490 __taskloop_params_t * p = (__taskloop_params_t *)new_task->shareds; 3491 p->task = next_task; 3492 p->lb = (kmp_uint64 *)((char *)next_task + lower_offset); 3493 p->ub = (kmp_uint64 *)((char *)next_task + upper_offset); 3494 p->task_dup = task_dup; 3495 p->st = st; 3496 p->ub_glob = ub_glob; 3497 p->num_tasks = n_tsk1; 3498 p->grainsize = grainsize; 3499 p->extras = ext1; 3500 p->tc = tc1; 3501 p->num_t_min = num_t_min; 3502 __kmp_omp_task(gtid, new_task, true); // schedule new task 3503 3504 // execute the 1st half of current subrange 3505 if (n_tsk0 > num_t_min) 3506 __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, 3507 gr_size0, ext0, tc0, num_t_min, task_dup); 3508 else 3509 __kmp_taskloop_linear(loc, 
gtid, task, lb, ub, st, ub_glob, n_tsk0, 3510 gr_size0, ext0, tc0, task_dup); 3511 3512 KA_TRACE(40, ("__kmpc_taskloop_recur(exit): T#%d\n", gtid)); 3513 } 3514 3515 /*! 3516 @ingroup TASKING 3517 @param loc Source location information 3518 @param gtid Global thread ID 3519 @param task Task structure 3520 @param if_val Value of the if clause 3521 @param lb Pointer to loop lower bound in task structure 3522 @param ub Pointer to loop upper bound in task structure 3523 @param st Loop stride 3524 @param nogroup Flag, 1 if nogroup clause specified, 0 otherwise 3525 @param sched Schedule specified 0/1/2 for none/grainsize/num_tasks 3526 @param grainsize Schedule value if specified 3527 @param task_dup Tasks duplication routine 3528 3529 Execute the taskloop construct. 3530 */ 3531 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, 3532 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, 3533 int sched, kmp_uint64 grainsize, void *task_dup) { 3534 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); 3535 KMP_DEBUG_ASSERT(task != NULL); 3536 3537 KA_TRACE(20, ("__kmpc_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, " 3538 "grain %llu(%d), dup %p\n", gtid, taskdata, *lb, *ub, st, 3539 grainsize, sched, task_dup)); 3540 3541 if (nogroup == 0) 3542 __kmpc_taskgroup(loc, gtid); 3543 3544 // ========================================================================= 3545 // calculate loop parameters 3546 kmp_uint64 tc; 3547 kmp_uint64 lower = *lb; // compiler provides global bounds here 3548 kmp_uint64 upper = *ub; 3549 kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag 3550 kmp_uint64 num_tasks = 0, extras = 0; 3551 kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks; 3552 kmp_info_t *thread = __kmp_threads[gtid]; 3553 kmp_taskdata_t *current_task = thread->th.th_current_task; 3554 3555 // compute trip count 3556 if (st == 1) { // most common case 3557 tc = upper - lower + 1; 3558 } else if (st < 0) { 3559 tc = (lower - 
upper) / (-st) + 1; 3560 } else { // st > 0 3561 tc = (upper - lower) / st + 1; 3562 } 3563 if (tc == 0) { 3564 KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d zero-trip loop\n", gtid)); 3565 // free the pattern task and exit 3566 __kmp_task_start(gtid, task, current_task); 3567 // do not execute anything for zero-trip loop 3568 __kmp_task_finish(gtid, task, current_task); 3569 return; 3570 } 3571 if (num_tasks_min == 0) 3572 // TODO: can we choose better default heuristic? 3573 num_tasks_min = KMP_MIN(thread->th.th_team_nproc * 10, 3574 INITIAL_TASK_DEQUE_SIZE); 3575 3576 // compute num_tasks/grainsize based on the input provided 3577 switch (sched) { 3578 case 0: // no schedule clause specified, we can choose the default 3579 // let's try to schedule (team_size*10) tasks 3580 grainsize = thread->th.th_team_nproc * 10; 3581 case 2: // num_tasks provided 3582 if (grainsize > tc) { 3583 num_tasks = tc; // too big num_tasks requested, adjust values 3584 grainsize = 1; 3585 extras = 0; 3586 } else { 3587 num_tasks = grainsize; 3588 grainsize = tc / num_tasks; 3589 extras = tc % num_tasks; 3590 } 3591 break; 3592 case 1: // grainsize provided 3593 if (grainsize > tc) { 3594 num_tasks = 1; // too big grainsize requested, adjust values 3595 grainsize = tc; 3596 extras = 0; 3597 } else { 3598 num_tasks = tc / grainsize; 3599 // adjust grainsize for balanced distribution of iterations 3600 grainsize = tc / num_tasks; 3601 extras = tc % num_tasks; 3602 } 3603 break; 3604 default: 3605 KMP_ASSERT2(0, "unknown scheduling of taskloop"); 3606 } 3607 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras); 3608 KMP_DEBUG_ASSERT(num_tasks > extras); 3609 KMP_DEBUG_ASSERT(num_tasks > 0); 3610 // ========================================================================= 3611 3612 // check if clause value first 3613 if (if_val == 0) { // if(0) specified, mark task as serial 3614 taskdata->td_flags.task_serial = 1; 3615 taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be 
untied 3616 // always start serial tasks linearly 3617 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, 3618 grainsize, extras, tc, task_dup); 3619 } else if (num_tasks > num_tasks_min) { 3620 KA_TRACE(20, ("__kmpc_taskloop: T#%d, go recursive: tc %llu, #tasks %llu" 3621 "(%lld), grain %llu, extras %llu\n", gtid, tc, num_tasks, 3622 num_tasks_min, grainsize, extras)); 3623 __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, 3624 grainsize, extras, tc, num_tasks_min, task_dup); 3625 } else { 3626 KA_TRACE(20, ("__kmpc_taskloop: T#%d, go linear: tc %llu, #tasks %llu" 3627 "(%lld), grain %llu, extras %llu\n", gtid, tc, num_tasks, 3628 num_tasks_min, grainsize, extras)); 3629 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, 3630 grainsize, extras, tc, task_dup); 3631 } 3632 3633 if (nogroup == 0) 3634 __kmpc_end_taskgroup(loc, gtid); 3635 KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid)); 3636 } 3637 3638 #endif 3639