1 /* 2 * kmp_taskq.cpp -- TASKQ support for OpenMP. 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // The LLVM Compiler Infrastructure 8 // 9 // This file is dual licensed under the MIT and the University of Illinois Open 10 // Source Licenses. See LICENSE.txt for details. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "kmp.h" 15 #include "kmp_error.h" 16 #include "kmp_i18n.h" 17 #include "kmp_io.h" 18 19 #define MAX_MESSAGE 512 20 21 /* Taskq routines and global variables */ 22 23 #define KMP_DEBUG_REF_CTS(x) KF_TRACE(1, x); 24 25 #define THREAD_ALLOC_FOR_TASKQ 26 27 static int in_parallel_context(kmp_team_t *team) { 28 return !team->t.t_serialized; 29 } 30 31 static void __kmp_taskq_eo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 32 int gtid = *gtid_ref; 33 int tid = __kmp_tid_from_gtid(gtid); 34 kmp_uint32 my_token; 35 kmpc_task_queue_t *taskq; 36 kmp_taskq_t *tq = &__kmp_threads[gtid]->th.th_team->t.t_taskq; 37 38 if (__kmp_env_consistency_check) 39 #if KMP_USE_DYNAMIC_LOCK 40 __kmp_push_sync(gtid, ct_ordered_in_taskq, loc_ref, NULL, 0); 41 #else 42 __kmp_push_sync(gtid, ct_ordered_in_taskq, loc_ref, NULL); 43 #endif 44 45 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) { 46 KMP_MB(); /* Flush all pending memory write invalidates. */ 47 48 /* GEH - need check here under stats to make sure */ 49 /* inside task (curr_thunk[*tid_ref] != NULL) */ 50 51 my_token = tq->tq_curr_thunk[tid]->th_tasknum; 52 53 taskq = tq->tq_curr_thunk[tid]->th.th_shareds->sv_queue; 54 55 KMP_WAIT_YIELD(&taskq->tq_tasknum_serving, my_token, KMP_EQ, NULL); 56 KMP_MB(); 57 } 58 } 59 60 static void __kmp_taskq_xo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 61 int gtid = *gtid_ref; 62 int tid = __kmp_tid_from_gtid(gtid); 63 kmp_uint32 my_token; 64 kmp_taskq_t *tq = &__kmp_threads[gtid]->th.th_team->t.t_taskq; 65 66 if (__kmp_env_consistency_check) 67 __kmp_pop_sync(gtid, ct_ordered_in_taskq, loc_ref); 68 69 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) { 70 KMP_MB(); /* Flush all pending memory write invalidates. */ 71 72 /* GEH - need check here under stats to make sure */ 73 /* inside task (curr_thunk[tid] != NULL) */ 74 75 my_token = tq->tq_curr_thunk[tid]->th_tasknum; 76 77 KMP_MB(); /* Flush all pending memory write invalidates. */ 78 79 tq->tq_curr_thunk[tid]->th.th_shareds->sv_queue->tq_tasknum_serving = 80 my_token + 1; 81 82 KMP_MB(); /* Flush all pending memory write invalidates. */ 83 } 84 } 85 86 static void __kmp_taskq_check_ordered(kmp_int32 gtid, kmpc_thunk_t *thunk) { 87 kmp_uint32 my_token; 88 kmpc_task_queue_t *taskq; 89 90 /* assume we are always called from an active parallel context */ 91 92 KMP_MB(); /* Flush all pending memory write invalidates. */ 93 94 my_token = thunk->th_tasknum; 95 96 taskq = thunk->th.th_shareds->sv_queue; 97 98 if (taskq->tq_tasknum_serving <= my_token) { 99 KMP_WAIT_YIELD(&taskq->tq_tasknum_serving, my_token, KMP_GE, NULL); 100 KMP_MB(); 101 taskq->tq_tasknum_serving = my_token + 1; 102 KMP_MB(); 103 } 104 } 105 106 #ifdef KMP_DEBUG 107 108 static void __kmp_dump_TQF(kmp_int32 flags) { 109 if (flags & TQF_IS_ORDERED) 110 __kmp_printf("ORDERED "); 111 if (flags & TQF_IS_LASTPRIVATE) 112 __kmp_printf("LAST_PRIV "); 113 if (flags & TQF_IS_NOWAIT) 114 __kmp_printf("NOWAIT "); 115 if (flags & TQF_HEURISTICS) 116 __kmp_printf("HEURIST "); 117 if (flags & TQF_INTERFACE_RESERVED1) 118 __kmp_printf("RESERV1 "); 119 if (flags & TQF_INTERFACE_RESERVED2) 120 __kmp_printf("RESERV2 "); 121 if (flags & TQF_INTERFACE_RESERVED3) 122 __kmp_printf("RESERV3 "); 123 if (flags & TQF_INTERFACE_RESERVED4) 124 __kmp_printf("RESERV4 "); 125 if (flags & TQF_IS_LAST_TASK) 126 __kmp_printf("LAST_TASK "); 127 if (flags & TQF_TASKQ_TASK) 128 __kmp_printf("TASKQ_TASK "); 129 if (flags & TQF_RELEASE_WORKERS) 130 __kmp_printf("RELEASE "); 131 if (flags & TQF_ALL_TASKS_QUEUED) 132 __kmp_printf("ALL_QUEUED "); 133 if (flags & TQF_PARALLEL_CONTEXT) 134 __kmp_printf("PARALLEL "); 135 if (flags & TQF_DEALLOCATED) 136 __kmp_printf("DEALLOC "); 137 if (!(flags & (TQF_INTERNAL_FLAGS | TQF_INTERFACE_FLAGS))) 138 __kmp_printf("(NONE)"); 139 } 140 141 static void __kmp_dump_thunk(kmp_taskq_t *tq, kmpc_thunk_t *thunk, 142 kmp_int32 global_tid) { 143 int i; 144 int nproc = __kmp_threads[global_tid]->th.th_team->t.t_nproc; 145 146 __kmp_printf("\tThunk at %p on (%d): ", thunk, global_tid); 147 148 if (thunk != NULL) { 149 for (i = 0; i < nproc; i++) { 150 if (tq->tq_curr_thunk[i] == thunk) { 151 __kmp_printf("[%i] ", i); 152 } 153 } 154 __kmp_printf("th_shareds=%p, ", thunk->th.th_shareds); 155 __kmp_printf("th_task=%p, ", thunk->th_task); 156 __kmp_printf("th_encl_thunk=%p, ", thunk->th_encl_thunk); 157 __kmp_printf("th_status=%d, ", thunk->th_status); 158 __kmp_printf("th_tasknum=%u, ", thunk->th_tasknum); 159 __kmp_printf("th_flags="); 160 __kmp_dump_TQF(thunk->th_flags); 161 } 162 163 __kmp_printf("\n"); 164 } 165 166 static void __kmp_dump_thunk_stack(kmpc_thunk_t *thunk, kmp_int32 thread_num) { 167 kmpc_thunk_t *th; 168 169 __kmp_printf(" Thunk stack for T#%d: ", thread_num); 170 171 for (th = thunk; th != NULL; th = th->th_encl_thunk) 172 __kmp_printf("%p ", th); 173 174 __kmp_printf("\n"); 175 } 176 177 static void __kmp_dump_task_queue(kmp_taskq_t *tq, kmpc_task_queue_t *queue, 178 kmp_int32 global_tid) { 179 int qs, count, i; 180 kmpc_thunk_t *thunk; 181 kmpc_task_queue_t *taskq; 182 183 __kmp_printf("Task Queue at %p on (%d):\n", queue, global_tid); 184 185 if (queue != NULL) { 186 int in_parallel = queue->tq_flags & TQF_PARALLEL_CONTEXT; 187 188 if (__kmp_env_consistency_check) { 189 __kmp_printf(" tq_loc : "); 190 } 191 if (in_parallel) { 192 193 // if (queue->tq.tq_parent != 0) 194 //__kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid); 195 196 //__kmp_acquire_lock(& queue->tq_link_lck, global_tid); 197 198 // Make sure data structures are in consistent state before querying them 199 // Seems to work without this for digital/alpha, needed for IBM/RS6000 200 KMP_MB(); 201 202 __kmp_printf(" tq_parent : %p\n", queue->tq.tq_parent); 203 __kmp_printf(" tq_first_child : %p\n", queue->tq_first_child); 204 __kmp_printf(" tq_next_child : %p\n", queue->tq_next_child); 205 __kmp_printf(" tq_prev_child : %p\n", queue->tq_prev_child); 206 __kmp_printf(" tq_ref_count : %d\n", queue->tq_ref_count); 207 208 //__kmp_release_lock(& queue->tq_link_lck, global_tid); 209 210 // if (queue->tq.tq_parent != 0) 211 //__kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid); 212 213 //__kmp_acquire_lock(& queue->tq_free_thunks_lck, global_tid); 214 //__kmp_acquire_lock(& queue->tq_queue_lck, global_tid); 215 216 // Make sure data structures are in consistent state before querying them 217 // Seems to work without this for digital/alpha, needed for IBM/RS6000 218 KMP_MB(); 219 } 220 221 __kmp_printf(" tq_shareds : "); 222 for (i = 0; i < ((queue == tq->tq_root) ? queue->tq_nproc : 1); i++) 223 __kmp_printf("%p ", queue->tq_shareds[i].ai_data); 224 __kmp_printf("\n"); 225 226 if (in_parallel) { 227 __kmp_printf(" tq_tasknum_queuing : %u\n", queue->tq_tasknum_queuing); 228 __kmp_printf(" tq_tasknum_serving : %u\n", queue->tq_tasknum_serving); 229 } 230 231 __kmp_printf(" tq_queue : %p\n", queue->tq_queue); 232 __kmp_printf(" tq_thunk_space : %p\n", queue->tq_thunk_space); 233 __kmp_printf(" tq_taskq_slot : %p\n", queue->tq_taskq_slot); 234 235 __kmp_printf(" tq_free_thunks : "); 236 for (thunk = queue->tq_free_thunks; thunk != NULL; 237 thunk = thunk->th.th_next_free) 238 __kmp_printf("%p ", thunk); 239 __kmp_printf("\n"); 240 241 __kmp_printf(" tq_nslots : %d\n", queue->tq_nslots); 242 __kmp_printf(" tq_head : %d\n", queue->tq_head); 243 __kmp_printf(" tq_tail : %d\n", queue->tq_tail); 244 __kmp_printf(" tq_nfull : %d\n", queue->tq_nfull); 245 __kmp_printf(" tq_hiwat : %d\n", queue->tq_hiwat); 246 __kmp_printf(" tq_flags : "); 247 __kmp_dump_TQF(queue->tq_flags); 248 __kmp_printf("\n"); 249 250 if (in_parallel) { 251 __kmp_printf(" tq_th_thunks : "); 252 for (i = 0; i < queue->tq_nproc; i++) { 253 __kmp_printf("%d ", queue->tq_th_thunks[i].ai_data); 254 } 255 __kmp_printf("\n"); 256 } 257 258 __kmp_printf("\n"); 259 __kmp_printf(" Queue slots:\n"); 260 261 qs = queue->tq_tail; 262 for (count = 0; count < queue->tq_nfull; ++count) { 263 __kmp_printf("(%d)", qs); 264 __kmp_dump_thunk(tq, queue->tq_queue[qs].qs_thunk, global_tid); 265 qs = (qs + 1) % queue->tq_nslots; 266 } 267 268 __kmp_printf("\n"); 269 270 if (in_parallel) { 271 if (queue->tq_taskq_slot != NULL) { 272 __kmp_printf(" TaskQ slot:\n"); 273 __kmp_dump_thunk(tq, CCAST(kmpc_thunk_t *, queue->tq_taskq_slot), 274 global_tid); 275 __kmp_printf("\n"); 276 } 277 //__kmp_release_lock(& queue->tq_queue_lck, global_tid); 278 //__kmp_release_lock(& queue->tq_free_thunks_lck, global_tid); 279 } 280 } 281 282 __kmp_printf(" Taskq freelist: "); 283 284 //__kmp_acquire_lock( & tq->tq_freelist_lck, global_tid ); 285 286 // Make sure data structures are in consistent state before querying them 287 // Seems to work without this call for digital/alpha, needed for IBM/RS6000 288 KMP_MB(); 289 290 for (taskq = tq->tq_freelist; taskq != NULL; taskq = taskq->tq.tq_next_free) 291 __kmp_printf("%p ", taskq); 292 293 //__kmp_release_lock( & tq->tq_freelist_lck, global_tid ); 294 295 __kmp_printf("\n\n"); 296 } 297 298 static void __kmp_aux_dump_task_queue_tree(kmp_taskq_t *tq, 299 kmpc_task_queue_t *curr_queue, 300 kmp_int32 level, 301 kmp_int32 global_tid) { 302 int i, count, qs; 303 int nproc = __kmp_threads[global_tid]->th.th_team->t.t_nproc; 304 kmpc_task_queue_t *queue = curr_queue; 305 306 if (curr_queue == NULL) 307 return; 308 309 __kmp_printf(" "); 310 311 for (i = 0; i < level; i++) 312 __kmp_printf(" "); 313 314 __kmp_printf("%p", curr_queue); 315 316 for (i = 0; i < nproc; i++) { 317 if (tq->tq_curr_thunk[i] && 318 tq->tq_curr_thunk[i]->th.th_shareds->sv_queue == curr_queue) { 319 __kmp_printf(" [%i]", i); 320 } 321 } 322 323 __kmp_printf(":"); 324 325 //__kmp_acquire_lock(& curr_queue->tq_queue_lck, global_tid); 326 327 // Make sure data structures are in consistent state before querying them 328 // Seems to work without this call for digital/alpha, needed for IBM/RS6000 329 KMP_MB(); 330 331 qs = curr_queue->tq_tail; 332 333 for (count = 0; count < curr_queue->tq_nfull; ++count) { 334 __kmp_printf("%p ", curr_queue->tq_queue[qs].qs_thunk); 335 qs = (qs + 1) % curr_queue->tq_nslots; 336 } 337 338 //__kmp_release_lock(& curr_queue->tq_queue_lck, global_tid); 339 340 __kmp_printf("\n"); 341 342 if (curr_queue->tq_first_child) { 343 //__kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid); 344 345 // Make sure data structures are in consistent state before querying them 346 // Seems to work without this call for digital/alpha, needed for IBM/RS6000 347 KMP_MB(); 348 349 if (curr_queue->tq_first_child) { 350 for (queue = CCAST(kmpc_task_queue_t *, curr_queue->tq_first_child); 351 queue != NULL; queue = queue->tq_next_child) { 352 __kmp_aux_dump_task_queue_tree(tq, queue, level + 1, global_tid); 353 } 354 } 355 356 //__kmp_release_lock(& curr_queue->tq_link_lck, global_tid); 357 } 358 } 359 360 static void __kmp_dump_task_queue_tree(kmp_taskq_t *tq, 361 kmpc_task_queue_t *tqroot, 362 kmp_int32 global_tid) { 363 __kmp_printf("TaskQ Tree at root %p on (%d):\n", tqroot, global_tid); 364 365 __kmp_aux_dump_task_queue_tree(tq, tqroot, 0, global_tid); 366 367 __kmp_printf("\n"); 368 } 369 #endif 370 371 /* New taskq storage routines that try to minimize overhead of mallocs but 372 still provide cache line alignment. */ 373 static void *__kmp_taskq_allocate(size_t size, kmp_int32 global_tid) { 374 void *addr, *orig_addr; 375 size_t bytes; 376 377 KB_TRACE(5, ("__kmp_taskq_allocate: called size=%d, gtid=%d\n", (int)size, 378 global_tid)); 379 380 bytes = sizeof(void *) + CACHE_LINE + size; 381 382 #ifdef THREAD_ALLOC_FOR_TASKQ 383 orig_addr = 384 (void *)__kmp_thread_malloc(__kmp_thread_from_gtid(global_tid), bytes); 385 #else 386 KE_TRACE(10, ("%%%%%% MALLOC( %d )\n", bytes)); 387 orig_addr = (void *)KMP_INTERNAL_MALLOC(bytes); 388 #endif /* THREAD_ALLOC_FOR_TASKQ */ 389 390 if (orig_addr == 0) 391 KMP_FATAL(OutOfHeapMemory); 392 393 addr = orig_addr; 394 395 if (((kmp_uintptr_t)addr & (CACHE_LINE - 1)) != 0) { 396 KB_TRACE(50, ("__kmp_taskq_allocate: adjust for cache alignment\n")); 397 addr = (void *)(((kmp_uintptr_t)addr + CACHE_LINE) & ~(CACHE_LINE - 1)); 398 } 399 400 (*(void **)addr) = orig_addr; 401 402 KB_TRACE(10, 403 ("__kmp_taskq_allocate: allocate: %p, use: %p - %p, size: %d, " 404 "gtid: %d\n", 405 orig_addr, ((void **)addr) + 1, 406 ((char *)(((void **)addr) + 1)) + size - 1, (int)size, global_tid)); 407 408 return (((void **)addr) + 1); 409 } 410 411 static void __kmpc_taskq_free(void *p, kmp_int32 global_tid) { 412 KB_TRACE(5, ("__kmpc_taskq_free: called addr=%p, gtid=%d\n", p, global_tid)); 413 414 KB_TRACE(10, ("__kmpc_taskq_free: freeing: %p, gtid: %d\n", 415 (*(((void **)p) - 1)), global_tid)); 416 417 #ifdef THREAD_ALLOC_FOR_TASKQ 418 __kmp_thread_free(__kmp_thread_from_gtid(global_tid), *(((void **)p) - 1)); 419 #else 420 KMP_INTERNAL_FREE(*(((void **)p) - 1)); 421 #endif /* THREAD_ALLOC_FOR_TASKQ */ 422 } 423 424 /* Keep freed kmpc_task_queue_t on an internal freelist and recycle since 425 they're of constant size. */ 426 427 static kmpc_task_queue_t * 428 __kmp_alloc_taskq(kmp_taskq_t *tq, int in_parallel, kmp_int32 nslots, 429 kmp_int32 nthunks, kmp_int32 nshareds, kmp_int32 nproc, 430 size_t sizeof_thunk, size_t sizeof_shareds, 431 kmpc_thunk_t **new_taskq_thunk, kmp_int32 global_tid) { 432 kmp_int32 i; 433 size_t bytes; 434 kmpc_task_queue_t *new_queue; 435 kmpc_aligned_shared_vars_t *shared_var_array; 436 char *shared_var_storage; 437 char *pt; /* for doing byte-adjusted address computations */ 438 439 __kmp_acquire_lock(&tq->tq_freelist_lck, global_tid); 440 441 // Make sure data structures are in consistent state before querying them 442 // Seems to work without this call for digital/alpha, needed for IBM/RS6000 443 KMP_MB(); 444 445 if (tq->tq_freelist) { 446 new_queue = tq->tq_freelist; 447 tq->tq_freelist = tq->tq_freelist->tq.tq_next_free; 448 449 KMP_DEBUG_ASSERT(new_queue->tq_flags & TQF_DEALLOCATED); 450 451 new_queue->tq_flags = 0; 452 453 __kmp_release_lock(&tq->tq_freelist_lck, global_tid); 454 } else { 455 __kmp_release_lock(&tq->tq_freelist_lck, global_tid); 456 457 new_queue = (kmpc_task_queue_t *)__kmp_taskq_allocate( 458 sizeof(kmpc_task_queue_t), global_tid); 459 new_queue->tq_flags = 0; 460 } 461 462 /* space in the task queue for queue slots (allocate as one big chunk */ 463 /* of storage including new_taskq_task space) */ 464 465 sizeof_thunk += 466 (CACHE_LINE - (sizeof_thunk % CACHE_LINE)); /* pad to cache line size */ 467 pt = (char *)__kmp_taskq_allocate(nthunks * sizeof_thunk, global_tid); 468 new_queue->tq_thunk_space = (kmpc_thunk_t *)pt; 469 *new_taskq_thunk = (kmpc_thunk_t *)(pt + (nthunks - 1) * sizeof_thunk); 470 471 /* chain the allocated thunks into a freelist for this queue */ 472 473 new_queue->tq_free_thunks = (kmpc_thunk_t *)pt; 474 475 for (i = 0; i < (nthunks - 2); i++) { 476 ((kmpc_thunk_t *)(pt + i * sizeof_thunk))->th.th_next_free = 477 (kmpc_thunk_t *)(pt + (i + 1) * sizeof_thunk); 478 #ifdef KMP_DEBUG 479 ((kmpc_thunk_t *)(pt + i * sizeof_thunk))->th_flags = TQF_DEALLOCATED; 480 #endif 481 } 482 483 ((kmpc_thunk_t *)(pt + (nthunks - 2) * sizeof_thunk))->th.th_next_free = NULL; 484 #ifdef KMP_DEBUG 485 ((kmpc_thunk_t *)(pt + (nthunks - 2) * sizeof_thunk))->th_flags = 486 TQF_DEALLOCATED; 487 #endif 488 489 /* initialize the locks */ 490 491 if (in_parallel) { 492 __kmp_init_lock(&new_queue->tq_link_lck); 493 __kmp_init_lock(&new_queue->tq_free_thunks_lck); 494 __kmp_init_lock(&new_queue->tq_queue_lck); 495 } 496 497 /* now allocate the slots */ 498 499 bytes = nslots * sizeof(kmpc_aligned_queue_slot_t); 500 new_queue->tq_queue = 501 (kmpc_aligned_queue_slot_t *)__kmp_taskq_allocate(bytes, global_tid); 502 503 /* space for array of pointers to shared variable structures */ 504 sizeof_shareds += sizeof(kmpc_task_queue_t *); 505 sizeof_shareds += 506 (CACHE_LINE - (sizeof_shareds % CACHE_LINE)); /* pad to cache line size */ 507 508 bytes = nshareds * sizeof(kmpc_aligned_shared_vars_t); 509 shared_var_array = 510 (kmpc_aligned_shared_vars_t *)__kmp_taskq_allocate(bytes, global_tid); 511 512 bytes = nshareds * sizeof_shareds; 513 shared_var_storage = (char *)__kmp_taskq_allocate(bytes, global_tid); 514 515 for (i = 0; i < nshareds; i++) { 516 shared_var_array[i].ai_data = 517 (kmpc_shared_vars_t *)(shared_var_storage + i * sizeof_shareds); 518 shared_var_array[i].ai_data->sv_queue = new_queue; 519 } 520 new_queue->tq_shareds = shared_var_array; 521 522 /* array for number of outstanding thunks per thread */ 523 524 if (in_parallel) { 525 bytes = nproc * sizeof(kmpc_aligned_int32_t); 526 new_queue->tq_th_thunks = 527 (kmpc_aligned_int32_t *)__kmp_taskq_allocate(bytes, global_tid); 528 new_queue->tq_nproc = nproc; 529 530 for (i = 0; i < nproc; i++) 531 new_queue->tq_th_thunks[i].ai_data = 0; 532 } 533 534 return new_queue; 535 } 536 537 static void __kmp_free_taskq(kmp_taskq_t *tq, kmpc_task_queue_t *p, 538 int in_parallel, kmp_int32 global_tid) { 539 __kmpc_taskq_free(p->tq_thunk_space, global_tid); 540 __kmpc_taskq_free(p->tq_queue, global_tid); 541 542 /* free shared var structure storage */ 543 __kmpc_taskq_free(CCAST(kmpc_shared_vars_t *, p->tq_shareds[0].ai_data), 544 global_tid); 545 /* free array of pointers to shared vars storage */ 546 __kmpc_taskq_free(p->tq_shareds, global_tid); 547 548 #ifdef KMP_DEBUG 549 p->tq_first_child = NULL; 550 p->tq_next_child = NULL; 551 p->tq_prev_child = NULL; 552 p->tq_ref_count = -10; 553 p->tq_shareds = NULL; 554 p->tq_tasknum_queuing = 0; 555 p->tq_tasknum_serving = 0; 556 p->tq_queue = NULL; 557 p->tq_thunk_space = NULL; 558 p->tq_taskq_slot = NULL; 559 p->tq_free_thunks = NULL; 560 p->tq_nslots = 0; 561 p->tq_head = 0; 562 p->tq_tail = 0; 563 p->tq_nfull = 0; 564 p->tq_hiwat = 0; 565 566 if (in_parallel) { 567 int i; 568 569 for (i = 0; i < p->tq_nproc; i++) 570 p->tq_th_thunks[i].ai_data = 0; 571 } 572 if (__kmp_env_consistency_check) 573 p->tq_loc = NULL; 574 KMP_DEBUG_ASSERT(p->tq_flags & TQF_DEALLOCATED); 575 p->tq_flags = TQF_DEALLOCATED; 576 #endif /* KMP_DEBUG */ 577 578 if (in_parallel) { 579 __kmpc_taskq_free(p->tq_th_thunks, global_tid); 580 __kmp_destroy_lock(&p->tq_link_lck); 581 __kmp_destroy_lock(&p->tq_queue_lck); 582 __kmp_destroy_lock(&p->tq_free_thunks_lck); 583 } 584 #ifdef KMP_DEBUG 585 p->tq_th_thunks = NULL; 586 #endif /* KMP_DEBUG */ 587 588 // Make sure data structures are in consistent state before querying them 589 // Seems to work without this call for digital/alpha, needed for IBM/RS6000 590 KMP_MB(); 591 592 __kmp_acquire_lock(&tq->tq_freelist_lck, global_tid); 593 p->tq.tq_next_free = tq->tq_freelist; 594 595 tq->tq_freelist = p; 596 __kmp_release_lock(&tq->tq_freelist_lck, global_tid); 597 } 598 599 /* Once a group of thunks has been allocated for use in a particular queue, 600 these are managed via a per-queue freelist. 601 We force a check that there's always a thunk free if we need one. */ 602 603 static kmpc_thunk_t *__kmp_alloc_thunk(kmpc_task_queue_t *queue, 604 int in_parallel, kmp_int32 global_tid) { 605 kmpc_thunk_t *fl; 606 607 if (in_parallel) { 608 __kmp_acquire_lock(&queue->tq_free_thunks_lck, global_tid); 609 // Make sure data structures are in consistent state before querying them 610 // Seems to work without this call for digital/alpha, needed for IBM/RS6000 611 KMP_MB(); 612 } 613 614 fl = queue->tq_free_thunks; 615 616 KMP_DEBUG_ASSERT(fl != NULL); 617 618 queue->tq_free_thunks = fl->th.th_next_free; 619 fl->th_flags = 0; 620 621 if (in_parallel) 622 __kmp_release_lock(&queue->tq_free_thunks_lck, global_tid); 623 624 return fl; 625 } 626 627 static void __kmp_free_thunk(kmpc_task_queue_t *queue, kmpc_thunk_t *p, 628 int in_parallel, kmp_int32 global_tid) { 629 #ifdef KMP_DEBUG 630 p->th_task = 0; 631 p->th_encl_thunk = 0; 632 p->th_status = 0; 633 p->th_tasknum = 0; 634 /* Also could zero pointers to private vars */ 635 #endif 636 637 if (in_parallel) { 638 __kmp_acquire_lock(&queue->tq_free_thunks_lck, global_tid); 639 // Make sure data structures are in consistent state before querying them 640 // Seems to work without this call for digital/alpha, needed for IBM/RS6000 641 KMP_MB(); 642 } 643 644 p->th.th_next_free = queue->tq_free_thunks; 645 queue->tq_free_thunks = p; 646 647 #ifdef KMP_DEBUG 648 p->th_flags = TQF_DEALLOCATED; 649 #endif 650 651 if (in_parallel) 652 __kmp_release_lock(&queue->tq_free_thunks_lck, global_tid); 653 } 654 655 /* returns nonzero if the queue just became full after the enqueue */ 656 static kmp_int32 __kmp_enqueue_task(kmp_taskq_t *tq, kmp_int32 global_tid, 657 kmpc_task_queue_t *queue, 658 kmpc_thunk_t *thunk, int in_parallel) { 659 kmp_int32 ret; 660 661 /* dkp: can we get around the lock in the TQF_RELEASE_WORKERS case (only the 662 * master is executing then) */ 663 if (in_parallel) { 664 __kmp_acquire_lock(&queue->tq_queue_lck, global_tid); 665 // Make sure data structures are in consistent state before querying them 666 // Seems to work without this call for digital/alpha, needed for IBM/RS6000 667 KMP_MB(); 668 } 669 670 KMP_DEBUG_ASSERT(queue->tq_nfull < queue->tq_nslots); // check queue not full 671 672 queue->tq_queue[(queue->tq_head)++].qs_thunk = thunk; 673 674 if (queue->tq_head >= queue->tq_nslots) 675 queue->tq_head = 0; 676 677 (queue->tq_nfull)++; 678 679 KMP_MB(); /* to assure that nfull is seen to increase before 680 TQF_ALL_TASKS_QUEUED is set */ 681 682 ret = (in_parallel) ? (queue->tq_nfull == queue->tq_nslots) : FALSE; 683 684 if (in_parallel) { 685 /* don't need to wait until workers are released before unlocking */ 686 __kmp_release_lock(&queue->tq_queue_lck, global_tid); 687 688 if (tq->tq_global_flags & TQF_RELEASE_WORKERS) { 689 // If just creating the root queue, the worker threads are waiting at a 690 // join barrier until now, when there's something in the queue for them to 691 // do; release them now to do work. This should only be done when this is 692 // the first task enqueued, so reset the flag here also. 693 tq->tq_global_flags &= ~TQF_RELEASE_WORKERS; /* no lock needed, workers 694 are still in spin mode */ 695 // avoid releasing barrier twice if taskq_task switches threads 696 KMP_MB(); 697 698 __kmpc_end_barrier_master(NULL, global_tid); 699 } 700 } 701 702 return ret; 703 } 704 705 static kmpc_thunk_t *__kmp_dequeue_task(kmp_int32 global_tid, 706 kmpc_task_queue_t *queue, 707 int in_parallel) { 708 kmpc_thunk_t *pt; 709 int tid = __kmp_tid_from_gtid(global_tid); 710 711 KMP_DEBUG_ASSERT(queue->tq_nfull > 0); /* check queue not empty */ 712 713 if (queue->tq.tq_parent != NULL && in_parallel) { 714 int ct; 715 __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid); 716 ct = ++(queue->tq_ref_count); 717 __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid); 718 KMP_DEBUG_REF_CTS( 719 ("line %d gtid %d: Q %p inc %d\n", __LINE__, global_tid, queue, ct)); 720 } 721 722 pt = queue->tq_queue[(queue->tq_tail)++].qs_thunk; 723 724 if (queue->tq_tail >= queue->tq_nslots) 725 queue->tq_tail = 0; 726 727 if (in_parallel) { 728 queue->tq_th_thunks[tid].ai_data++; 729 730 KMP_MB(); /* necessary so ai_data increment is propagated to other threads 731 immediately (digital) */ 732 733 KF_TRACE(200, ("__kmp_dequeue_task: T#%d(:%d) now has %d outstanding " 734 "thunks from queue %p\n", 735 global_tid, tid, queue->tq_th_thunks[tid].ai_data, queue)); 736 } 737 738 (queue->tq_nfull)--; 739 740 #ifdef KMP_DEBUG 741 KMP_MB(); 742 743 /* necessary so (queue->tq_nfull > 0) above succeeds after tq_nfull is 744 * decremented */ 745 746 KMP_DEBUG_ASSERT(queue->tq_nfull >= 0); 747 748 if (in_parallel) { 749 KMP_DEBUG_ASSERT(queue->tq_th_thunks[tid].ai_data <= 750 __KMP_TASKQ_THUNKS_PER_TH); 751 } 752 #endif 753 754 return pt; 755 } 756 757 /* Find the next (non-null) task to dequeue and return it. 758 * This is never called unless in_parallel=TRUE 759 * 760 * Here are the rules for deciding which queue to take the task from: 761 * 1. Walk up the task queue tree from the current queue's parent and look 762 * on the way up (for loop, below). 763 * 2. Do a depth-first search back down the tree from the root and 764 * look (find_task_in_descendant_queue()). 765 * 766 * Here are the rules for deciding which task to take from a queue 767 * (__kmp_find_task_in_queue ()): 768 * 1. Never take the last task from a queue if TQF_IS_LASTPRIVATE; this task 769 * must be staged to make sure we execute the last one with 770 * TQF_IS_LAST_TASK at the end of task queue execution. 771 * 2. If the queue length is below some high water mark and the taskq task 772 * is enqueued, prefer running the taskq task. 773 * 3. Otherwise, take a (normal) task from the queue. 774 * 775 * If we do all this and return pt == NULL at the bottom of this routine, 776 * this means there are no more tasks to execute (except possibly for 777 * TQF_IS_LASTPRIVATE). 778 */ 779 780 static kmpc_thunk_t *__kmp_find_task_in_queue(kmp_int32 global_tid, 781 kmpc_task_queue_t *queue) { 782 kmpc_thunk_t *pt = NULL; 783 int tid = __kmp_tid_from_gtid(global_tid); 784 785 /* To prevent deadlock from tq_queue_lck if queue already deallocated */ 786 if (!(queue->tq_flags & TQF_DEALLOCATED)) { 787 788 __kmp_acquire_lock(&queue->tq_queue_lck, global_tid); 789 790 /* Check again to avoid race in __kmpc_end_taskq() */ 791 if (!(queue->tq_flags & TQF_DEALLOCATED)) { 792 // Make sure data structures are in consistent state before querying them 793 // Seems to work without this for digital/alpha, needed for IBM/RS6000 794 KMP_MB(); 795 796 if ((queue->tq_taskq_slot != NULL) && 797 (queue->tq_nfull <= queue->tq_hiwat)) { 798 /* if there's enough room in the queue and the dispatcher */ 799 /* (taskq task) is available, schedule more tasks */ 800 pt = CCAST(kmpc_thunk_t *, queue->tq_taskq_slot); 801 queue->tq_taskq_slot = NULL; 802 } else if (queue->tq_nfull == 0 || 803 queue->tq_th_thunks[tid].ai_data >= 804 __KMP_TASKQ_THUNKS_PER_TH) { 805 /* do nothing if no thunks available or this thread can't */ 806 /* run any because it already is executing too many */ 807 pt = NULL; 808 } else if (queue->tq_nfull > 1) { 809 /* always safe to schedule a task even if TQF_IS_LASTPRIVATE */ 810 811 pt = __kmp_dequeue_task(global_tid, queue, TRUE); 812 } else if (!(queue->tq_flags & TQF_IS_LASTPRIVATE)) { 813 // one thing in queue, always safe to schedule if !TQF_IS_LASTPRIVATE 814 pt = __kmp_dequeue_task(global_tid, queue, TRUE); 815 } else if (queue->tq_flags & TQF_IS_LAST_TASK) { 816 /* TQF_IS_LASTPRIVATE, one thing in queue, kmpc_end_taskq_task() */ 817 /* has been run so this is last task, run with TQF_IS_LAST_TASK so */ 818 /* instrumentation does copy-out. */ 819 pt = __kmp_dequeue_task(global_tid, queue, TRUE); 820 pt->th_flags |= 821 TQF_IS_LAST_TASK; /* don't need test_then_or since already locked */ 822 } 823 } 824 825 /* GEH - What happens here if is lastprivate, but not last task? */ 826 __kmp_release_lock(&queue->tq_queue_lck, global_tid); 827 } 828 829 return pt; 830 } 831 832 /* Walk a tree of queues starting at queue's first child and return a non-NULL 833 thunk if one can be scheduled. Must only be called when in_parallel=TRUE */ 834 835 static kmpc_thunk_t * 836 __kmp_find_task_in_descendant_queue(kmp_int32 global_tid, 837 kmpc_task_queue_t *curr_queue) { 838 kmpc_thunk_t *pt = NULL; 839 kmpc_task_queue_t *queue = curr_queue; 840 841 if (curr_queue->tq_first_child != NULL) { 842 __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid); 843 // Make sure data structures are in consistent state before querying them 844 // Seems to work without this call for digital/alpha, needed for IBM/RS6000 845 KMP_MB(); 846 847 queue = CCAST(kmpc_task_queue_t *, curr_queue->tq_first_child); 848 if (queue == NULL) { 849 __kmp_release_lock(&curr_queue->tq_link_lck, global_tid); 850 return NULL; 851 } 852 853 while (queue != NULL) { 854 int ct; 855 kmpc_task_queue_t *next; 856 857 ct = ++(queue->tq_ref_count); 858 __kmp_release_lock(&curr_queue->tq_link_lck, global_tid); 859 KMP_DEBUG_REF_CTS( 860 ("line %d gtid %d: Q %p inc %d\n", __LINE__, global_tid, queue, ct)); 861 862 pt = __kmp_find_task_in_queue(global_tid, queue); 863 864 if (pt != NULL) { 865 int ct; 866 867 __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid); 868 // Make sure data structures in consistent state before querying them 869 // Seems to work without this for digital/alpha, needed for IBM/RS6000 870 KMP_MB(); 871 872 ct = --(queue->tq_ref_count); 873 KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n", __LINE__, 874 global_tid, queue, ct)); 875 KMP_DEBUG_ASSERT(queue->tq_ref_count >= 0); 876 877 __kmp_release_lock(&curr_queue->tq_link_lck, global_tid); 878 879 return pt; 880 } 881 882 /* although reference count stays active during descendant walk, shouldn't 883 matter since if children still exist, reference counts aren't being 884 monitored anyway */ 885 886 pt = __kmp_find_task_in_descendant_queue(global_tid, queue); 887 888 if (pt != NULL) { 889 int ct; 890 891 __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid); 892 // Make sure data structures in consistent state before querying them 893 // Seems to work without this for digital/alpha, needed for IBM/RS6000 894 KMP_MB(); 895 896 ct = --(queue->tq_ref_count); 897 KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n", __LINE__, 898 global_tid, queue, ct)); 899 KMP_DEBUG_ASSERT(ct >= 0); 900 901 __kmp_release_lock(&curr_queue->tq_link_lck, global_tid); 902 903 return pt; 904 } 905 906 __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid); 907 // Make sure data structures in consistent state before querying them 908 // Seems to work without this for digital/alpha, needed for IBM/RS6000 909 KMP_MB(); 910 911 next = queue->tq_next_child; 912 913 ct = --(queue->tq_ref_count); 914 KMP_DEBUG_REF_CTS( 915 ("line %d gtid %d: Q %p dec %d\n", __LINE__, global_tid, queue, ct)); 916 KMP_DEBUG_ASSERT(ct >= 0); 917 918 queue = next; 919 } 920 921 __kmp_release_lock(&curr_queue->tq_link_lck, global_tid); 922 } 923 924 return pt; 925 } 926 927 /* Walk up the taskq tree looking for a task to execute. If we get to the root, 928 search the tree for a descendent queue task. Must only be called when 929 in_parallel=TRUE */ 930 static kmpc_thunk_t * 931 __kmp_find_task_in_ancestor_queue(kmp_taskq_t *tq, kmp_int32 global_tid, 932 kmpc_task_queue_t *curr_queue) { 933 kmpc_task_queue_t *queue; 934 kmpc_thunk_t *pt; 935 936 pt = NULL; 937 938 if (curr_queue->tq.tq_parent != NULL) { 939 queue = curr_queue->tq.tq_parent; 940 941 while (queue != NULL) { 942 if (queue->tq.tq_parent != NULL) { 943 int ct; 944 __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid); 945 // Make sure data structures in consistent state before querying them 946 // Seems to work without this for digital/alpha, needed for IBM/RS6000 947 KMP_MB(); 948 949 ct = ++(queue->tq_ref_count); 950 __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid); 951 KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p inc %d\n", __LINE__, 952 global_tid, queue, ct)); 953 } 954 955 pt = __kmp_find_task_in_queue(global_tid, queue); 956 if (pt != NULL) { 957 if (queue->tq.tq_parent != NULL) { 958 int ct; 959 __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid); 960 // Make sure data structures in consistent state before querying them 961 // Seems to work without this for digital/alpha, needed for IBM/RS6000 962 KMP_MB(); 963 964 ct = --(queue->tq_ref_count); 965 KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n", __LINE__, 966 global_tid, queue, ct)); 967 KMP_DEBUG_ASSERT(ct >= 0); 968 969 __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid); 970 } 971 972 return pt; 973 } 974 975 if (queue->tq.tq_parent != NULL) { 976 int ct; 977 __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid); 978 // Make sure data structures in consistent state before querying them 979 // Seems to work without this for digital/alpha, needed for IBM/RS6000 980 KMP_MB(); 981 982 ct = --(queue->tq_ref_count); 983 KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n", __LINE__, 984 global_tid, queue, ct)); 985 KMP_DEBUG_ASSERT(ct >= 0); 986 } 987 queue = queue->tq.tq_parent; 988 989 if (queue != NULL) 990 __kmp_release_lock(&queue->tq_link_lck, global_tid); 991 } 992 } 993 994 pt = __kmp_find_task_in_descendant_queue(global_tid, tq->tq_root); 995 996 return pt; 997 } 998 999 static int __kmp_taskq_tasks_finished(kmpc_task_queue_t *queue) { 1000 int i; 1001 1002 /* KMP_MB(); */ /* is this really necessary? */ 1003 1004 for (i = 0; i < queue->tq_nproc; i++) { 1005 if (queue->tq_th_thunks[i].ai_data != 0) 1006 return FALSE; 1007 } 1008 1009 return TRUE; 1010 } 1011 1012 static int __kmp_taskq_has_any_children(kmpc_task_queue_t *queue) { 1013 return (queue->tq_first_child != NULL); 1014 } 1015 1016 static void __kmp_remove_queue_from_tree(kmp_taskq_t *tq, kmp_int32 global_tid, 1017 kmpc_task_queue_t *queue, 1018 int in_parallel) { 1019 #ifdef KMP_DEBUG 1020 kmp_int32 i; 1021 kmpc_thunk_t *thunk; 1022 #endif 1023 1024 KF_TRACE(50, 1025 ("Before Deletion of TaskQ at %p on (%d):\n", queue, global_tid)); 1026 KF_DUMP(50, __kmp_dump_task_queue(tq, queue, global_tid)); 1027 1028 /* sub-queue in a recursion, not the root task queue */ 1029 KMP_DEBUG_ASSERT(queue->tq.tq_parent != NULL); 1030 1031 if (in_parallel) { 1032 __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid); 1033 // Make sure data structures are in consistent state before querying them 1034 // Seems to work without this call for digital/alpha, needed for IBM/RS6000 1035 KMP_MB(); 1036 } 1037 1038 KMP_DEBUG_ASSERT(queue->tq_first_child == NULL); 1039 1040 /* unlink queue from its siblings if any at this level */ 1041 if (queue->tq_prev_child != NULL) 1042 queue->tq_prev_child->tq_next_child = queue->tq_next_child; 1043 if (queue->tq_next_child != NULL) 1044 queue->tq_next_child->tq_prev_child = queue->tq_prev_child; 1045 if (queue->tq.tq_parent->tq_first_child == queue) 1046 queue->tq.tq_parent->tq_first_child = queue->tq_next_child; 1047 1048 queue->tq_prev_child = NULL; 1049 queue->tq_next_child = NULL; 1050 1051 if (in_parallel) { 1052 KMP_DEBUG_REF_CTS( 1053 ("line %d gtid %d: Q %p waiting for ref_count of %d to reach 1\n", 1054 __LINE__, global_tid, queue, queue->tq_ref_count)); 1055 1056 /* wait until all other threads have stopped accessing this queue */ 1057 while (queue->tq_ref_count > 1) { 1058 __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid); 1059 1060 KMP_WAIT_YIELD((volatile kmp_uint32 *)&queue->tq_ref_count, 1, KMP_LE, 1061 NULL); 1062 1063 __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid); 1064 // Make sure data structures are in consistent state before querying them 1065 // Seems to work without this for digital/alpha, needed for IBM/RS6000 1066 KMP_MB(); 1067 } 1068 1069 __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid); 1070 } 1071 1072 KMP_DEBUG_REF_CTS( 1073 ("line %d gtid %d: Q %p freeing queue\n", __LINE__, global_tid, queue)); 1074 1075 #ifdef KMP_DEBUG 1076 KMP_DEBUG_ASSERT(queue->tq_flags & TQF_ALL_TASKS_QUEUED); 1077 KMP_DEBUG_ASSERT(queue->tq_nfull == 0); 1078 1079 for (i = 0; i < queue->tq_nproc; i++) { 1080 KMP_DEBUG_ASSERT(queue->tq_th_thunks[i].ai_data == 0); 1081 } 1082 1083 i = 0; 1084 for (thunk = queue->tq_free_thunks; thunk != NULL; 1085 thunk = thunk->th.th_next_free) 1086 ++i; 1087 1088 KMP_ASSERT(i == 1089 queue->tq_nslots + (queue->tq_nproc * __KMP_TASKQ_THUNKS_PER_TH)); 1090 #endif 1091 1092 /* release storage for queue entry */ 1093 __kmp_free_taskq(tq, queue, TRUE, global_tid); 1094 1095 KF_TRACE(50, ("After Deletion of TaskQ at %p on (%d):\n", queue, global_tid)); 1096 KF_DUMP(50, __kmp_dump_task_queue_tree(tq, tq->tq_root, global_tid)); 1097 } 1098 1099 /* Starting from indicated queue, proceed downward through tree and remove all 1100 taskqs which are finished, but only go down to taskqs which have the "nowait" 1101 clause present. Assume this is only called when in_parallel=TRUE. */ 1102 1103 static void __kmp_find_and_remove_finished_child_taskq( 1104 kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *curr_queue) { 1105 kmpc_task_queue_t *queue = curr_queue; 1106 1107 if (curr_queue->tq_first_child != NULL) { 1108 __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid); 1109 // Make sure data structures are in consistent state before querying them 1110 // Seems to work without this call for digital/alpha, needed for IBM/RS6000 1111 KMP_MB(); 1112 1113 queue = CCAST(kmpc_task_queue_t *, curr_queue->tq_first_child); 1114 if (queue != NULL) { 1115 __kmp_release_lock(&curr_queue->tq_link_lck, global_tid); 1116 return; 1117 } 1118 1119 while (queue != NULL) { 1120 kmpc_task_queue_t *next; 1121 int ct = ++(queue->tq_ref_count); 1122 KMP_DEBUG_REF_CTS( 1123 ("line %d gtid %d: Q %p inc %d\n", __LINE__, global_tid, queue, ct)); 1124 1125 /* although reference count stays active during descendant walk, */ 1126 /* shouldn't matter since if children still exist, reference */ 1127 /* counts aren't being monitored anyway */ 1128 1129 if (queue->tq_flags & TQF_IS_NOWAIT) { 1130 __kmp_find_and_remove_finished_child_taskq(tq, global_tid, queue); 1131 1132 if ((queue->tq_flags & TQF_ALL_TASKS_QUEUED) && 1133 (queue->tq_nfull == 0) && __kmp_taskq_tasks_finished(queue) && 1134 !__kmp_taskq_has_any_children(queue)) { 1135 1136 /* Only remove this if we have not already marked it for deallocation. 1137 This should prevent multiple threads from trying to free this. */ 1138 1139 if (__kmp_test_lock(&queue->tq_queue_lck, global_tid)) { 1140 if (!(queue->tq_flags & TQF_DEALLOCATED)) { 1141 queue->tq_flags |= TQF_DEALLOCATED; 1142 __kmp_release_lock(&queue->tq_queue_lck, global_tid); 1143 1144 __kmp_remove_queue_from_tree(tq, global_tid, queue, TRUE); 1145 1146 /* Can't do any more here since can't be sure where sibling queue 1147 * is so just exit this level */ 1148 return; 1149 } else { 1150 __kmp_release_lock(&queue->tq_queue_lck, global_tid); 1151 } 1152 } 1153 /* otherwise, just fall through and decrement reference count */ 1154 } 1155 } 1156 1157 __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid); 1158 // Make sure data structures are in consistent state before querying them 1159 // Seems to work without this for digital/alpha, needed for IBM/RS6000 1160 KMP_MB(); 1161 1162 next = queue->tq_next_child; 1163 1164 ct = --(queue->tq_ref_count); 1165 KMP_DEBUG_REF_CTS( 1166 ("line %d gtid %d: Q %p dec %d\n", __LINE__, global_tid, queue, ct)); 1167 KMP_DEBUG_ASSERT(ct >= 0); 1168 1169 queue = next; 1170 } 1171 1172 __kmp_release_lock(&curr_queue->tq_link_lck, global_tid); 1173 } 1174 } 1175 1176 /* Starting from indicated queue, proceed downward through tree and remove all 1177 taskq's assuming all are finished and assuming NO other threads are executing 1178 at this point. */ 1179 static void __kmp_remove_all_child_taskq(kmp_taskq_t *tq, kmp_int32 global_tid, 1180 kmpc_task_queue_t *queue) { 1181 kmpc_task_queue_t *next_child; 1182 1183 queue = CCAST(kmpc_task_queue_t *, queue->tq_first_child); 1184 1185 while (queue != NULL) { 1186 __kmp_remove_all_child_taskq(tq, global_tid, queue); 1187 1188 next_child = queue->tq_next_child; 1189 queue->tq_flags |= TQF_DEALLOCATED; 1190 __kmp_remove_queue_from_tree(tq, global_tid, queue, FALSE); 1191 queue = next_child; 1192 } 1193 } 1194 1195 static void __kmp_execute_task_from_queue(kmp_taskq_t *tq, ident_t *loc, 1196 kmp_int32 global_tid, 1197 kmpc_thunk_t *thunk, 1198 int in_parallel) { 1199 kmpc_task_queue_t *queue = thunk->th.th_shareds->sv_queue; 1200 kmp_int32 tid = __kmp_tid_from_gtid(global_tid); 1201 1202 KF_TRACE(100, ("After dequeueing this Task on (%d):\n", global_tid)); 1203 KF_DUMP(100, __kmp_dump_thunk(tq, thunk, global_tid)); 1204 KF_TRACE(100, ("Task Queue: %p looks like this (%d):\n", queue, global_tid)); 1205 KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid)); 1206 1207 /* For the taskq task, the curr_thunk pushes and pop pairs are set up as 1208 * follows: 1209 * 1210 * happens exactly once: 1211 * 1) __kmpc_taskq : push (if returning thunk only) 1212 * 4) __kmpc_end_taskq_task : pop 1213 * 1214 * optionally happens *each* time taskq task is dequeued/enqueued: 1215 * 2) __kmpc_taskq_task : pop 1216 * 3) __kmp_execute_task_from_queue : push 1217 * 1218 * execution ordering: 1,(2,3)*,4 1219 */ 1220 1221 if (!(thunk->th_flags & TQF_TASKQ_TASK)) { 1222 kmp_int32 index = (queue == tq->tq_root) ? tid : 0; 1223 thunk->th.th_shareds = 1224 CCAST(kmpc_shared_vars_t *, queue->tq_shareds[index].ai_data); 1225 1226 if (__kmp_env_consistency_check) { 1227 __kmp_push_workshare(global_tid, 1228 (queue->tq_flags & TQF_IS_ORDERED) ? ct_task_ordered 1229 : ct_task, 1230 queue->tq_loc); 1231 } 1232 } else { 1233 if (__kmp_env_consistency_check) 1234 __kmp_push_workshare(global_tid, ct_taskq, queue->tq_loc); 1235 } 1236 1237 if (in_parallel) { 1238 thunk->th_encl_thunk = tq->tq_curr_thunk[tid]; 1239 tq->tq_curr_thunk[tid] = thunk; 1240 1241 KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid)); 1242 } 1243 1244 KF_TRACE(50, ("Begin Executing Thunk %p from queue %p on (%d)\n", thunk, 1245 queue, global_tid)); 1246 thunk->th_task(global_tid, thunk); 1247 KF_TRACE(50, ("End Executing Thunk %p from queue %p on (%d)\n", thunk, queue, 1248 global_tid)); 1249 1250 if (!(thunk->th_flags & TQF_TASKQ_TASK)) { 1251 if (__kmp_env_consistency_check) 1252 __kmp_pop_workshare(global_tid, 1253 (queue->tq_flags & TQF_IS_ORDERED) ? ct_task_ordered 1254 : ct_task, 1255 queue->tq_loc); 1256 1257 if (in_parallel) { 1258 tq->tq_curr_thunk[tid] = thunk->th_encl_thunk; 1259 thunk->th_encl_thunk = NULL; 1260 KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid)); 1261 } 1262 1263 if ((thunk->th_flags & TQF_IS_ORDERED) && in_parallel) { 1264 __kmp_taskq_check_ordered(global_tid, thunk); 1265 } 1266 1267 __kmp_free_thunk(queue, thunk, in_parallel, global_tid); 1268 1269 KF_TRACE(100, ("T#%d After freeing thunk: %p, TaskQ looks like this:\n", 1270 global_tid, thunk)); 1271 KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid)); 1272 1273 if (in_parallel) { 1274 KMP_MB(); /* needed so thunk put on free list before outstanding thunk 1275 count is decremented */ 1276 1277 KMP_DEBUG_ASSERT(queue->tq_th_thunks[tid].ai_data >= 1); 1278 1279 KF_TRACE( 1280 200, 1281 ("__kmp_execute_task_from_queue: T#%d has %d thunks in queue %p\n", 1282 global_tid, queue->tq_th_thunks[tid].ai_data - 1, queue)); 1283 1284 queue->tq_th_thunks[tid].ai_data--; 1285 1286 /* KMP_MB(); */ /* is MB really necessary ? */ 1287 } 1288 1289 if (queue->tq.tq_parent != NULL && in_parallel) { 1290 int ct; 1291 __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid); 1292 ct = --(queue->tq_ref_count); 1293 __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid); 1294 KMP_DEBUG_REF_CTS( 1295 ("line %d gtid %d: Q %p dec %d\n", __LINE__, global_tid, queue, ct)); 1296 KMP_DEBUG_ASSERT(ct >= 0); 1297 } 1298 } 1299 } 1300 1301 /* starts a taskq; creates and returns a thunk for the taskq_task */ 1302 /* also, returns pointer to shared vars for this thread in "shareds" arg */ 1303 kmpc_thunk_t *__kmpc_taskq(ident_t *loc, kmp_int32 global_tid, 1304 kmpc_task_t taskq_task, size_t sizeof_thunk, 1305 size_t sizeof_shareds, kmp_int32 flags, 1306 kmpc_shared_vars_t **shareds) { 1307 int in_parallel; 1308 kmp_int32 nslots, nthunks, nshareds, nproc; 1309 kmpc_task_queue_t *new_queue, *curr_queue; 1310 kmpc_thunk_t *new_taskq_thunk; 1311 kmp_info_t *th; 1312 kmp_team_t *team; 1313 kmp_taskq_t *tq; 1314 kmp_int32 tid; 1315 1316 KE_TRACE(10, ("__kmpc_taskq called (%d)\n", global_tid)); 1317 1318 th = __kmp_threads[global_tid]; 1319 team = th->th.th_team; 1320 tq = &team->t.t_taskq; 1321 nproc = team->t.t_nproc; 1322 tid = __kmp_tid_from_gtid(global_tid); 1323 1324 /* find out whether this is a parallel taskq or serialized one. */ 1325 in_parallel = in_parallel_context(team); 1326 1327 if (!tq->tq_root) { 1328 if (in_parallel) { 1329 /* Vector ORDERED SECTION to taskq version */ 1330 th->th.th_dispatch->th_deo_fcn = __kmp_taskq_eo; 1331 1332 /* Vector ORDERED SECTION to taskq version */ 1333 th->th.th_dispatch->th_dxo_fcn = __kmp_taskq_xo; 1334 } 1335 1336 if (in_parallel) { 1337 // This shouldn't be a barrier region boundary, it will confuse the user. 1338 /* Need the boundary to be at the end taskq instead. */ 1339 if (__kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL)) { 1340 /* Creating the active root queue, and we are not the master thread. */ 1341 /* The master thread below created the queue and tasks have been */ 1342 /* enqueued, and the master thread released this barrier. This */ 1343 /* worker thread can now proceed and execute tasks. See also the */ 1344 /* TQF_RELEASE_WORKERS which is used to handle this case. */ 1345 *shareds = 1346 CCAST(kmpc_shared_vars_t *, tq->tq_root->tq_shareds[tid].ai_data); 1347 KE_TRACE(10, ("__kmpc_taskq return (%d)\n", global_tid)); 1348 1349 return NULL; 1350 } 1351 } 1352 1353 /* master thread only executes this code */ 1354 if (tq->tq_curr_thunk_capacity < nproc) { 1355 if (tq->tq_curr_thunk) 1356 __kmp_free(tq->tq_curr_thunk); 1357 else { 1358 /* only need to do this once at outer level, i.e. when tq_curr_thunk is 1359 * still NULL */ 1360 __kmp_init_lock(&tq->tq_freelist_lck); 1361 } 1362 1363 tq->tq_curr_thunk = 1364 (kmpc_thunk_t **)__kmp_allocate(nproc * sizeof(kmpc_thunk_t *)); 1365 tq->tq_curr_thunk_capacity = nproc; 1366 } 1367 1368 if (in_parallel) 1369 tq->tq_global_flags = TQF_RELEASE_WORKERS; 1370 } 1371 1372 /* dkp: in future, if flags & TQF_HEURISTICS, will choose nslots based */ 1373 /* on some heuristics (e.g., depth of queue nesting?). */ 1374 nslots = (in_parallel) ? (2 * nproc) : 1; 1375 1376 /* There must be nproc * __KMP_TASKQ_THUNKS_PER_TH extra slots for pending */ 1377 /* jobs being executed by other threads, and one extra for taskq slot */ 1378 nthunks = (in_parallel) ? (nslots + (nproc * __KMP_TASKQ_THUNKS_PER_TH) + 1) 1379 : nslots + 2; 1380 1381 /* Only the root taskq gets a per-thread array of shareds. */ 1382 /* The rest of the taskq's only get one copy of the shared vars. */ 1383 nshareds = (!tq->tq_root && in_parallel) ? nproc : 1; 1384 1385 /* create overall queue data structure and its components that require 1386 * allocation */ 1387 new_queue = __kmp_alloc_taskq(tq, in_parallel, nslots, nthunks, nshareds, 1388 nproc, sizeof_thunk, sizeof_shareds, 1389 &new_taskq_thunk, global_tid); 1390 1391 /* rest of new_queue initializations */ 1392 new_queue->tq_flags = flags & TQF_INTERFACE_FLAGS; 1393 1394 if (in_parallel) { 1395 new_queue->tq_tasknum_queuing = 0; 1396 new_queue->tq_tasknum_serving = 0; 1397 new_queue->tq_flags |= TQF_PARALLEL_CONTEXT; 1398 } 1399 1400 new_queue->tq_taskq_slot = NULL; 1401 new_queue->tq_nslots = nslots; 1402 new_queue->tq_hiwat = HIGH_WATER_MARK(nslots); 1403 new_queue->tq_nfull = 0; 1404 new_queue->tq_head = 0; 1405 new_queue->tq_tail = 0; 1406 new_queue->tq_loc = loc; 1407 1408 if ((new_queue->tq_flags & TQF_IS_ORDERED) && in_parallel) { 1409 /* prepare to serve the first-queued task's ORDERED directive */ 1410 new_queue->tq_tasknum_serving = 1; 1411 1412 /* Vector ORDERED SECTION to taskq version */ 1413 th->th.th_dispatch->th_deo_fcn = __kmp_taskq_eo; 1414 1415 /* Vector ORDERED SECTION to taskq version */ 1416 th->th.th_dispatch->th_dxo_fcn = __kmp_taskq_xo; 1417 } 1418 1419 /* create a new thunk for the taskq_task in the new_queue */ 1420 *shareds = CCAST(kmpc_shared_vars_t *, new_queue->tq_shareds[0].ai_data); 1421 1422 new_taskq_thunk->th.th_shareds = *shareds; 1423 new_taskq_thunk->th_task = taskq_task; 1424 new_taskq_thunk->th_flags = new_queue->tq_flags | TQF_TASKQ_TASK; 1425 new_taskq_thunk->th_status = 0; 1426 1427 KMP_DEBUG_ASSERT(new_taskq_thunk->th_flags & TQF_TASKQ_TASK); 1428 1429 // Make sure these inits complete before threads start using this queue 1430 /* KMP_MB(); */ // (necessary?) 1431 1432 /* insert the new task queue into the tree, but only after all fields 1433 * initialized */ 1434 1435 if (in_parallel) { 1436 if (!tq->tq_root) { 1437 new_queue->tq.tq_parent = NULL; 1438 new_queue->tq_first_child = NULL; 1439 new_queue->tq_next_child = NULL; 1440 new_queue->tq_prev_child = NULL; 1441 new_queue->tq_ref_count = 1; 1442 tq->tq_root = new_queue; 1443 } else { 1444 curr_queue = tq->tq_curr_thunk[tid]->th.th_shareds->sv_queue; 1445 new_queue->tq.tq_parent = curr_queue; 1446 new_queue->tq_first_child = NULL; 1447 new_queue->tq_prev_child = NULL; 1448 new_queue->tq_ref_count = 1449 1; /* for this the thread that built the queue */ 1450 1451 KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p alloc %d\n", __LINE__, 1452 global_tid, new_queue, new_queue->tq_ref_count)); 1453 1454 __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid); 1455 1456 // Make sure data structures are in consistent state before querying them 1457 // Seems to work without this for digital/alpha, needed for IBM/RS6000 1458 KMP_MB(); 1459 1460 new_queue->tq_next_child = 1461 CCAST(struct kmpc_task_queue_t *, curr_queue->tq_first_child); 1462 1463 if (curr_queue->tq_first_child != NULL) 1464 curr_queue->tq_first_child->tq_prev_child = new_queue; 1465 1466 curr_queue->tq_first_child = new_queue; 1467 1468 __kmp_release_lock(&curr_queue->tq_link_lck, global_tid); 1469 } 1470 1471 /* set up thunk stack only after code that determines curr_queue above */ 1472 new_taskq_thunk->th_encl_thunk = tq->tq_curr_thunk[tid]; 1473 tq->tq_curr_thunk[tid] = new_taskq_thunk; 1474 1475 KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid)); 1476 } else { 1477 new_taskq_thunk->th_encl_thunk = 0; 1478 new_queue->tq.tq_parent = NULL; 1479 new_queue->tq_first_child = NULL; 1480 new_queue->tq_next_child = NULL; 1481 new_queue->tq_prev_child = NULL; 1482 new_queue->tq_ref_count = 1; 1483 } 1484 1485 #ifdef KMP_DEBUG 1486 KF_TRACE(150, ("Creating TaskQ Task on (%d):\n", global_tid)); 1487 KF_DUMP(150, __kmp_dump_thunk(tq, new_taskq_thunk, global_tid)); 1488 1489 if (in_parallel) { 1490 KF_TRACE(25, 1491 ("After TaskQ at %p Creation on (%d):\n", new_queue, global_tid)); 1492 } else { 1493 KF_TRACE(25, ("After Serial TaskQ at %p Creation on (%d):\n", new_queue, 1494 global_tid)); 1495 } 1496 1497 KF_DUMP(25, __kmp_dump_task_queue(tq, new_queue, global_tid)); 1498 1499 if (in_parallel) { 1500 KF_DUMP(50, __kmp_dump_task_queue_tree(tq, tq->tq_root, global_tid)); 1501 } 1502 #endif /* KMP_DEBUG */ 1503 1504 if (__kmp_env_consistency_check) 1505 __kmp_push_workshare(global_tid, ct_taskq, new_queue->tq_loc); 1506 1507 KE_TRACE(10, ("__kmpc_taskq return (%d)\n", global_tid)); 1508 1509 return new_taskq_thunk; 1510 } 1511 1512 /* ends a taskq; last thread out destroys the queue */ 1513 1514 void __kmpc_end_taskq(ident_t *loc, kmp_int32 global_tid, 1515 kmpc_thunk_t *taskq_thunk) { 1516 #ifdef KMP_DEBUG 1517 kmp_int32 i; 1518 #endif 1519 kmp_taskq_t *tq; 1520 int in_parallel; 1521 kmp_info_t *th; 1522 kmp_int32 is_outermost; 1523 kmpc_task_queue_t *queue; 1524 kmpc_thunk_t *thunk; 1525 int nproc; 1526 1527 KE_TRACE(10, ("__kmpc_end_taskq called (%d)\n", global_tid)); 1528 1529 tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq; 1530 nproc = __kmp_threads[global_tid]->th.th_team->t.t_nproc; 1531 1532 /* For the outermost taskq only, all but one thread will have taskq_thunk == 1533 * NULL */ 1534 queue = (taskq_thunk == NULL) ? tq->tq_root 1535 : taskq_thunk->th.th_shareds->sv_queue; 1536 1537 KE_TRACE(50, ("__kmpc_end_taskq queue=%p (%d) \n", queue, global_tid)); 1538 is_outermost = (queue == tq->tq_root); 1539 in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT); 1540 1541 if (in_parallel) { 1542 kmp_uint32 spins; 1543 1544 /* this is just a safeguard to release the waiting threads if */ 1545 /* the outermost taskq never queues a task */ 1546 1547 if (is_outermost && (KMP_MASTER_GTID(global_tid))) { 1548 if (tq->tq_global_flags & TQF_RELEASE_WORKERS) { 1549 /* no lock needed, workers are still in spin mode */ 1550 tq->tq_global_flags &= ~TQF_RELEASE_WORKERS; 1551 1552 __kmp_end_split_barrier(bs_plain_barrier, global_tid); 1553 } 1554 } 1555 1556 /* keep dequeueing work until all tasks are queued and dequeued */ 1557 1558 do { 1559 /* wait until something is available to dequeue */ 1560 KMP_INIT_YIELD(spins); 1561 1562 while ((queue->tq_nfull == 0) && (queue->tq_taskq_slot == NULL) && 1563 (!__kmp_taskq_has_any_children(queue)) && 1564 (!(queue->tq_flags & TQF_ALL_TASKS_QUEUED))) { 1565 KMP_YIELD_WHEN(TRUE, spins); 1566 } 1567 1568 /* check to see if we can execute tasks in the queue */ 1569 while (((queue->tq_nfull != 0) || (queue->tq_taskq_slot != NULL)) && 1570 (thunk = __kmp_find_task_in_queue(global_tid, queue)) != NULL) { 1571 KF_TRACE(50, ("Found thunk: %p in primary queue %p (%d)\n", thunk, 1572 queue, global_tid)); 1573 __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel); 1574 } 1575 1576 /* see if work found can be found in a descendant queue */ 1577 if ((__kmp_taskq_has_any_children(queue)) && 1578 (thunk = __kmp_find_task_in_descendant_queue(global_tid, queue)) != 1579 NULL) { 1580 1581 KF_TRACE(50, 1582 ("Stole thunk: %p in descendant queue: %p while waiting in " 1583 "queue: %p (%d)\n", 1584 thunk, thunk->th.th_shareds->sv_queue, queue, global_tid)); 1585 1586 __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel); 1587 } 1588 1589 } while ((!(queue->tq_flags & TQF_ALL_TASKS_QUEUED)) || 1590 (queue->tq_nfull != 0)); 1591 1592 KF_TRACE(50, ("All tasks queued and dequeued in queue: %p (%d)\n", queue, 1593 global_tid)); 1594 1595 /* wait while all tasks are not finished and more work found 1596 in descendant queues */ 1597 1598 while ((!__kmp_taskq_tasks_finished(queue)) && 1599 (thunk = __kmp_find_task_in_descendant_queue(global_tid, queue)) != 1600 NULL) { 1601 1602 KF_TRACE(50, ("Stole thunk: %p in descendant queue: %p while waiting in " 1603 "queue: %p (%d)\n", 1604 thunk, thunk->th.th_shareds->sv_queue, queue, global_tid)); 1605 1606 __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel); 1607 } 1608 1609 KF_TRACE(50, ("No work found in descendent queues or all work finished in " 1610 "queue: %p (%d)\n", 1611 queue, global_tid)); 1612 1613 if (!is_outermost) { 1614 /* need to return if NOWAIT present and not outermost taskq */ 1615 1616 if (queue->tq_flags & TQF_IS_NOWAIT) { 1617 __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid); 1618 queue->tq_ref_count--; 1619 KMP_DEBUG_ASSERT(queue->tq_ref_count >= 0); 1620 __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid); 1621 1622 KE_TRACE( 1623 10, ("__kmpc_end_taskq return for nowait case (%d)\n", global_tid)); 1624 1625 return; 1626 } 1627 1628 __kmp_find_and_remove_finished_child_taskq(tq, global_tid, queue); 1629 1630 /* WAIT until all tasks are finished and no child queues exist before 1631 * proceeding */ 1632 KMP_INIT_YIELD(spins); 1633 1634 while (!__kmp_taskq_tasks_finished(queue) || 1635 __kmp_taskq_has_any_children(queue)) { 1636 thunk = __kmp_find_task_in_ancestor_queue(tq, global_tid, queue); 1637 1638 if (thunk != NULL) { 1639 KF_TRACE(50, 1640 ("Stole thunk: %p in ancestor queue: %p while waiting in " 1641 "queue: %p (%d)\n", 1642 thunk, thunk->th.th_shareds->sv_queue, queue, global_tid)); 1643 __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, 1644 in_parallel); 1645 } 1646 1647 KMP_YIELD_WHEN(thunk == NULL, spins); 1648 1649 __kmp_find_and_remove_finished_child_taskq(tq, global_tid, queue); 1650 } 1651 1652 __kmp_acquire_lock(&queue->tq_queue_lck, global_tid); 1653 if (!(queue->tq_flags & TQF_DEALLOCATED)) { 1654 queue->tq_flags |= TQF_DEALLOCATED; 1655 } 1656 __kmp_release_lock(&queue->tq_queue_lck, global_tid); 1657 1658 /* only the allocating thread can deallocate the queue */ 1659 if (taskq_thunk != NULL) { 1660 __kmp_remove_queue_from_tree(tq, global_tid, queue, TRUE); 1661 } 1662 1663 KE_TRACE( 1664 10, 1665 ("__kmpc_end_taskq return for non_outermost queue, wait case (%d)\n", 1666 global_tid)); 1667 1668 return; 1669 } 1670 1671 // Outermost Queue: steal work from descendants until all tasks are finished 1672 1673 KMP_INIT_YIELD(spins); 1674 1675 while (!__kmp_taskq_tasks_finished(queue)) { 1676 thunk = __kmp_find_task_in_descendant_queue(global_tid, queue); 1677 1678 if (thunk != NULL) { 1679 KF_TRACE(50, 1680 ("Stole thunk: %p in descendant queue: %p while waiting in " 1681 "queue: %p (%d)\n", 1682 thunk, thunk->th.th_shareds->sv_queue, queue, global_tid)); 1683 1684 __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel); 1685 } 1686 1687 KMP_YIELD_WHEN(thunk == NULL, spins); 1688 } 1689 1690 /* Need this barrier to prevent destruction of queue before threads have all 1691 * executed above code */ 1692 /* This may need to be done earlier when NOWAIT is implemented for the 1693 * outermost level */ 1694 1695 if (!__kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL)) { 1696 /* the queue->tq_flags & TQF_IS_NOWAIT case is not yet handled here; */ 1697 /* for right now, everybody waits, and the master thread destroys the */ 1698 /* remaining queues. */ 1699 1700 __kmp_remove_all_child_taskq(tq, global_tid, queue); 1701 1702 /* Now destroy the root queue */ 1703 KF_TRACE(100, ("T#%d Before Deletion of top-level TaskQ at %p:\n", 1704 global_tid, queue)); 1705 KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid)); 1706 1707 #ifdef KMP_DEBUG 1708 /* the root queue entry */ 1709 KMP_DEBUG_ASSERT((queue->tq.tq_parent == NULL) && 1710 (queue->tq_next_child == NULL)); 1711 1712 /* children must all be gone by now because of barrier above */ 1713 KMP_DEBUG_ASSERT(queue->tq_first_child == NULL); 1714 1715 for (i = 0; i < nproc; i++) { 1716 KMP_DEBUG_ASSERT(queue->tq_th_thunks[i].ai_data == 0); 1717 } 1718 1719 for (i = 0, thunk = queue->tq_free_thunks; thunk != NULL; 1720 i++, thunk = thunk->th.th_next_free) 1721 ; 1722 1723 KMP_DEBUG_ASSERT(i == 1724 queue->tq_nslots + (nproc * __KMP_TASKQ_THUNKS_PER_TH)); 1725 1726 for (i = 0; i < nproc; i++) { 1727 KMP_DEBUG_ASSERT(!tq->tq_curr_thunk[i]); 1728 } 1729 #endif 1730 /* unlink the root queue entry */ 1731 tq->tq_root = NULL; 1732 1733 /* release storage for root queue entry */ 1734 KF_TRACE(50, ("After Deletion of top-level TaskQ at %p on (%d):\n", queue, 1735 global_tid)); 1736 1737 queue->tq_flags |= TQF_DEALLOCATED; 1738 __kmp_free_taskq(tq, queue, in_parallel, global_tid); 1739 1740 KF_DUMP(50, __kmp_dump_task_queue_tree(tq, tq->tq_root, global_tid)); 1741 1742 /* release the workers now that the data structures are up to date */ 1743 __kmp_end_split_barrier(bs_plain_barrier, global_tid); 1744 } 1745 1746 th = __kmp_threads[global_tid]; 1747 1748 /* Reset ORDERED SECTION to parallel version */ 1749 th->th.th_dispatch->th_deo_fcn = 0; 1750 1751 /* Reset ORDERED SECTION to parallel version */ 1752 th->th.th_dispatch->th_dxo_fcn = 0; 1753 } else { 1754 /* in serial execution context, dequeue the last task */ 1755 /* and execute it, if there were any tasks encountered */ 1756 1757 if (queue->tq_nfull > 0) { 1758 KMP_DEBUG_ASSERT(queue->tq_nfull == 1); 1759 1760 thunk = __kmp_dequeue_task(global_tid, queue, in_parallel); 1761 1762 if (queue->tq_flags & TQF_IS_LAST_TASK) { 1763 /* TQF_IS_LASTPRIVATE, one thing in queue, __kmpc_end_taskq_task() */ 1764 /* has been run so this is last task, run with TQF_IS_LAST_TASK so */ 1765 /* instrumentation does copy-out. */ 1766 1767 /* no need for test_then_or call since already locked */ 1768 thunk->th_flags |= TQF_IS_LAST_TASK; 1769 } 1770 1771 KF_TRACE(50, ("T#%d found thunk: %p in serial queue: %p\n", global_tid, 1772 thunk, queue)); 1773 1774 __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel); 1775 } 1776 1777 // destroy the unattached serial queue now that there is no more work to do 1778 KF_TRACE(100, ("Before Deletion of Serialized TaskQ at %p on (%d):\n", 1779 queue, global_tid)); 1780 KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid)); 1781 1782 #ifdef KMP_DEBUG 1783 i = 0; 1784 for (thunk = queue->tq_free_thunks; thunk != NULL; 1785 thunk = thunk->th.th_next_free) 1786 ++i; 1787 KMP_DEBUG_ASSERT(i == queue->tq_nslots + 1); 1788 #endif 1789 /* release storage for unattached serial queue */ 1790 KF_TRACE(50, 1791 ("Serialized TaskQ at %p deleted on (%d).\n", queue, global_tid)); 1792 1793 queue->tq_flags |= TQF_DEALLOCATED; 1794 __kmp_free_taskq(tq, queue, in_parallel, global_tid); 1795 } 1796 1797 KE_TRACE(10, ("__kmpc_end_taskq return (%d)\n", global_tid)); 1798 } 1799 1800 /* Enqueues a task for thunk previously created by __kmpc_task_buffer. */ 1801 /* Returns nonzero if just filled up queue */ 1802 1803 kmp_int32 __kmpc_task(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk) { 1804 kmp_int32 ret; 1805 kmpc_task_queue_t *queue; 1806 int in_parallel; 1807 kmp_taskq_t *tq; 1808 1809 KE_TRACE(10, ("__kmpc_task called (%d)\n", global_tid)); 1810 1811 KMP_DEBUG_ASSERT(!(thunk->th_flags & 1812 TQF_TASKQ_TASK)); /* thunk->th_task is a regular task */ 1813 1814 tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq; 1815 queue = thunk->th.th_shareds->sv_queue; 1816 in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT); 1817 1818 if (in_parallel && (thunk->th_flags & TQF_IS_ORDERED)) 1819 thunk->th_tasknum = ++queue->tq_tasknum_queuing; 1820 1821 /* For serial execution dequeue the preceding task and execute it, if one 1822 * exists */ 1823 /* This cannot be the last task. That one is handled in __kmpc_end_taskq */ 1824 1825 if (!in_parallel && queue->tq_nfull > 0) { 1826 kmpc_thunk_t *prev_thunk; 1827 1828 KMP_DEBUG_ASSERT(queue->tq_nfull == 1); 1829 1830 prev_thunk = __kmp_dequeue_task(global_tid, queue, in_parallel); 1831 1832 KF_TRACE(50, ("T#%d found thunk: %p in serial queue: %p\n", global_tid, 1833 prev_thunk, queue)); 1834 1835 __kmp_execute_task_from_queue(tq, loc, global_tid, prev_thunk, in_parallel); 1836 } 1837 1838 /* The instrumentation sequence is: __kmpc_task_buffer(), initialize private 1839 variables, __kmpc_task(). The __kmpc_task_buffer routine checks that the 1840 task queue is not full and allocates a thunk (which is then passed to 1841 __kmpc_task()). So, the enqueue below should never fail due to a full 1842 queue. */ 1843 1844 KF_TRACE(100, ("After enqueueing this Task on (%d):\n", global_tid)); 1845 KF_DUMP(100, __kmp_dump_thunk(tq, thunk, global_tid)); 1846 1847 ret = __kmp_enqueue_task(tq, global_tid, queue, thunk, in_parallel); 1848 1849 KF_TRACE(100, ("Task Queue looks like this on (%d):\n", global_tid)); 1850 KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid)); 1851 1852 KE_TRACE(10, ("__kmpc_task return (%d)\n", global_tid)); 1853 1854 return ret; 1855 } 1856 1857 /* enqueues a taskq_task for thunk previously created by __kmpc_taskq */ 1858 /* this should never be called unless in a parallel context */ 1859 1860 void __kmpc_taskq_task(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk, 1861 kmp_int32 status) { 1862 kmpc_task_queue_t *queue; 1863 kmp_taskq_t *tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq; 1864 int tid = __kmp_tid_from_gtid(global_tid); 1865 1866 KE_TRACE(10, ("__kmpc_taskq_task called (%d)\n", global_tid)); 1867 KF_TRACE(100, ("TaskQ Task argument thunk on (%d):\n", global_tid)); 1868 KF_DUMP(100, __kmp_dump_thunk(tq, thunk, global_tid)); 1869 1870 queue = thunk->th.th_shareds->sv_queue; 1871 1872 if (__kmp_env_consistency_check) 1873 __kmp_pop_workshare(global_tid, ct_taskq, loc); 1874 1875 /* thunk->th_task is the taskq_task */ 1876 KMP_DEBUG_ASSERT(thunk->th_flags & TQF_TASKQ_TASK); 1877 1878 /* not supposed to call __kmpc_taskq_task if it's already enqueued */ 1879 KMP_DEBUG_ASSERT(queue->tq_taskq_slot == NULL); 1880 1881 /* dequeue taskq thunk from curr_thunk stack */ 1882 tq->tq_curr_thunk[tid] = thunk->th_encl_thunk; 1883 thunk->th_encl_thunk = NULL; 1884 1885 KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid)); 1886 1887 thunk->th_status = status; 1888 1889 // Flush thunk->th_status before taskq_task enqueued to avoid race condition 1890 KMP_MB(); 1891 1892 /* enqueue taskq_task in thunk into special slot in queue */ 1893 /* GEH - probably don't need to lock taskq slot since only one */ 1894 /* thread enqueues & already a lock set at dequeue point */ 1895 1896 queue->tq_taskq_slot = thunk; 1897 1898 KE_TRACE(10, ("__kmpc_taskq_task return (%d)\n", global_tid)); 1899 } 1900 1901 /* ends a taskq_task; done generating tasks */ 1902 1903 void __kmpc_end_taskq_task(ident_t *loc, kmp_int32 global_tid, 1904 kmpc_thunk_t *thunk) { 1905 kmp_taskq_t *tq; 1906 kmpc_task_queue_t *queue; 1907 int in_parallel; 1908 int tid; 1909 1910 KE_TRACE(10, ("__kmpc_end_taskq_task called (%d)\n", global_tid)); 1911 1912 tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq; 1913 queue = thunk->th.th_shareds->sv_queue; 1914 in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT); 1915 tid = __kmp_tid_from_gtid(global_tid); 1916 1917 if (__kmp_env_consistency_check) 1918 __kmp_pop_workshare(global_tid, ct_taskq, loc); 1919 1920 if (in_parallel) { 1921 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 1922 KMP_TEST_THEN_OR32(RCAST(volatile kmp_uint32 *, &queue->tq_flags), 1923 TQF_ALL_TASKS_QUEUED); 1924 #else 1925 { 1926 __kmp_acquire_lock(&queue->tq_queue_lck, global_tid); 1927 1928 // Make sure data structures are in consistent state before querying them 1929 // Seems to work without this for digital/alpha, needed for IBM/RS6000 1930 KMP_MB(); 1931 1932 queue->tq_flags |= TQF_ALL_TASKS_QUEUED; 1933 __kmp_release_lock(&queue->tq_queue_lck, global_tid); 1934 } 1935 #endif 1936 } 1937 1938 if (thunk->th_flags & TQF_IS_LASTPRIVATE) { 1939 /* Normally, __kmp_find_task_in_queue() refuses to schedule the last task in 1940 the queue if TQF_IS_LASTPRIVATE so we can positively identify that last 1941 task and run it with its TQF_IS_LAST_TASK bit turned on in th_flags. 1942 When __kmpc_end_taskq_task() is called we are done generating all the 1943 tasks, so we know the last one in the queue is the lastprivate task. 1944 Mark the queue as having gotten to this state via tq_flags & 1945 TQF_IS_LAST_TASK; when that task actually executes mark it via th_flags & 1946 TQF_IS_LAST_TASK (this th_flags bit signals the instrumented code to do 1947 copy-outs after execution). */ 1948 if (!in_parallel) { 1949 /* No synchronization needed for serial context */ 1950 queue->tq_flags |= TQF_IS_LAST_TASK; 1951 } else { 1952 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 1953 KMP_TEST_THEN_OR32(RCAST(volatile kmp_uint32 *, &queue->tq_flags), 1954 TQF_IS_LAST_TASK); 1955 #else 1956 { 1957 __kmp_acquire_lock(&queue->tq_queue_lck, global_tid); 1958 1959 // Make sure data structures in consistent state before querying them 1960 // Seems to work without this for digital/alpha, needed for IBM/RS6000 1961 KMP_MB(); 1962 1963 queue->tq_flags |= TQF_IS_LAST_TASK; 1964 __kmp_release_lock(&queue->tq_queue_lck, global_tid); 1965 } 1966 #endif 1967 /* to prevent race condition where last task is dequeued but */ 1968 /* flag isn't visible yet (not sure about this) */ 1969 KMP_MB(); 1970 } 1971 } 1972 1973 /* dequeue taskq thunk from curr_thunk stack */ 1974 if (in_parallel) { 1975 tq->tq_curr_thunk[tid] = thunk->th_encl_thunk; 1976 thunk->th_encl_thunk = NULL; 1977 1978 KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid)); 1979 } 1980 1981 KE_TRACE(10, ("__kmpc_end_taskq_task return (%d)\n", global_tid)); 1982 } 1983 1984 /* returns thunk for a regular task based on taskq_thunk */ 1985 /* (__kmpc_taskq_task does the analogous thing for a TQF_TASKQ_TASK) */ 1986 1987 kmpc_thunk_t *__kmpc_task_buffer(ident_t *loc, kmp_int32 global_tid, 1988 kmpc_thunk_t *taskq_thunk, kmpc_task_t task) { 1989 kmp_taskq_t *tq; 1990 kmpc_task_queue_t *queue; 1991 kmpc_thunk_t *new_thunk; 1992 int in_parallel; 1993 1994 KE_TRACE(10, ("__kmpc_task_buffer called (%d)\n", global_tid)); 1995 1996 KMP_DEBUG_ASSERT( 1997 taskq_thunk->th_flags & 1998 TQF_TASKQ_TASK); /* taskq_thunk->th_task is the taskq_task */ 1999 2000 tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq; 2001 queue = taskq_thunk->th.th_shareds->sv_queue; 2002 in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT); 2003 2004 /* The instrumentation sequence is: __kmpc_task_buffer(), initialize private 2005 variables, __kmpc_task(). The __kmpc_task_buffer routine checks that the 2006 task queue is not full and allocates a thunk (which is then passed to 2007 __kmpc_task()). So, we can pre-allocate a thunk here assuming it will be 2008 the next to be enqueued in __kmpc_task(). */ 2009 2010 new_thunk = __kmp_alloc_thunk(queue, in_parallel, global_tid); 2011 new_thunk->th.th_shareds = 2012 CCAST(kmpc_shared_vars_t *, queue->tq_shareds[0].ai_data); 2013 new_thunk->th_encl_thunk = NULL; 2014 new_thunk->th_task = task; 2015 2016 /* GEH - shouldn't need to lock the read of tq_flags here */ 2017 new_thunk->th_flags = queue->tq_flags & TQF_INTERFACE_FLAGS; 2018 2019 new_thunk->th_status = 0; 2020 2021 KMP_DEBUG_ASSERT(!(new_thunk->th_flags & TQF_TASKQ_TASK)); 2022 2023 KF_TRACE(100, ("Creating Regular Task on (%d):\n", global_tid)); 2024 KF_DUMP(100, __kmp_dump_thunk(tq, new_thunk, global_tid)); 2025 2026 KE_TRACE(10, ("__kmpc_task_buffer return (%d)\n", global_tid)); 2027 2028 return new_thunk; 2029 } 2030