//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <list>
#include <string>
#include <vector>

#include "omptargetplugin.h"

#ifndef TARGET_NAME
#define TARGET_NAME CUDA
#endif

// Stringify the (possibly macro-valued) TARGET_NAME so it can be embedded in
// the debug-print prefix below.
#define GETNAME2(name) #name
#define GETNAME(name) GETNAME2(name)
// Debug print helper; DEBUGP is provided by the common plugin infrastructure
// (pulled in via elf_common.c below).
#define DP(...) DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__)

#include "../../common/elf_common.c"

// Utility for retrieving and printing CUDA error string.
// Compiles to a no-op unless CUDA_ERROR_REPORT is defined.
#ifdef CUDA_ERROR_REPORT
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr;                                                        \
    cuGetErrorString(err, &errStr);                                            \
    DP("CUDA error is: %s\n", errStr);                                         \
  } while (0)
#else
#define CUDA_ERR_STRING(err)                                                   \
  {}
#endif

/// Keep entries table per device.
/// Table points into Entries, so Entries must not be resized while Table is
/// in use (it is rebuilt by getOffloadEntriesTable after each load).
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

/// Execution modes a target kernel may request via its companion
/// "<name>_exec_mode" global in the image.
enum ExecutionModeType {
  SPMD,    // constructors, destructors,
           // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};

/// Use a single entity to encode a kernel and a set of flags
struct KernelTy {
  CUfunction Func;

  // execution mode of kernel
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

/// List that contains all the kernels.
74 /// FIXME: we may need this to be per device and per library. 75 std::list<KernelTy> KernelsList; 76 77 /// Class containing all the device information. 78 class RTLDeviceInfoTy { 79 std::vector<FuncOrGblEntryTy> FuncGblEntries; 80 81 public: 82 int NumberOfDevices; 83 std::vector<CUmodule> Modules; 84 std::vector<CUcontext> Contexts; 85 86 // Device properties 87 std::vector<int> ThreadsPerBlock; 88 std::vector<int> BlocksPerGrid; 89 std::vector<int> WarpSize; 90 91 // OpenMP properties 92 std::vector<int> NumTeams; 93 std::vector<int> NumThreads; 94 95 // OpenMP Environment properties 96 int EnvNumTeams; 97 int EnvTeamLimit; 98 99 //static int EnvNumThreads; 100 static const int HardTeamLimit = 1<<16; // 64k 101 static const int HardThreadLimit = 1024; 102 static const int DefaultNumTeams = 128; 103 static const int DefaultNumThreads = 128; 104 105 // Record entry point associated with device 106 void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) { 107 assert(device_id < (int32_t)FuncGblEntries.size() && 108 "Unexpected device id!"); 109 FuncOrGblEntryTy &E = FuncGblEntries[device_id]; 110 111 E.Entries.push_back(entry); 112 } 113 114 // Return true if the entry is associated with device 115 bool findOffloadEntry(int32_t device_id, void *addr) { 116 assert(device_id < (int32_t)FuncGblEntries.size() && 117 "Unexpected device id!"); 118 FuncOrGblEntryTy &E = FuncGblEntries[device_id]; 119 120 for (auto &it : E.Entries) { 121 if (it.addr == addr) 122 return true; 123 } 124 125 return false; 126 } 127 128 // Return the pointer to the target entries table 129 __tgt_target_table *getOffloadEntriesTable(int32_t device_id) { 130 assert(device_id < (int32_t)FuncGblEntries.size() && 131 "Unexpected device id!"); 132 FuncOrGblEntryTy &E = FuncGblEntries[device_id]; 133 134 int32_t size = E.Entries.size(); 135 136 // Table is empty 137 if (!size) 138 return 0; 139 140 __tgt_offload_entry *begin = &E.Entries[0]; 141 __tgt_offload_entry *end = 
&E.Entries[size - 1]; 142 143 // Update table info according to the entries and return the pointer 144 E.Table.EntriesBegin = begin; 145 E.Table.EntriesEnd = ++end; 146 147 return &E.Table; 148 } 149 150 // Clear entries table for a device 151 void clearOffloadEntriesTable(int32_t device_id) { 152 assert(device_id < (int32_t)FuncGblEntries.size() && 153 "Unexpected device id!"); 154 FuncOrGblEntryTy &E = FuncGblEntries[device_id]; 155 E.Entries.clear(); 156 E.Table.EntriesBegin = E.Table.EntriesEnd = 0; 157 } 158 159 RTLDeviceInfoTy() { 160 DP("Start initializing CUDA\n"); 161 162 CUresult err = cuInit(0); 163 if (err != CUDA_SUCCESS) { 164 DP("Error when initializing CUDA\n"); 165 CUDA_ERR_STRING(err); 166 return; 167 } 168 169 NumberOfDevices = 0; 170 171 err = cuDeviceGetCount(&NumberOfDevices); 172 if (err != CUDA_SUCCESS) { 173 DP("Error when getting CUDA device count\n"); 174 CUDA_ERR_STRING(err); 175 return; 176 } 177 178 if (NumberOfDevices == 0) { 179 DP("There are no devices supporting CUDA.\n"); 180 return; 181 } 182 183 FuncGblEntries.resize(NumberOfDevices); 184 Contexts.resize(NumberOfDevices); 185 ThreadsPerBlock.resize(NumberOfDevices); 186 BlocksPerGrid.resize(NumberOfDevices); 187 WarpSize.resize(NumberOfDevices); 188 NumTeams.resize(NumberOfDevices); 189 NumThreads.resize(NumberOfDevices); 190 191 // Get environment variables regarding teams 192 char *envStr = getenv("OMP_TEAM_LIMIT"); 193 if (envStr) { 194 // OMP_TEAM_LIMIT has been set 195 EnvTeamLimit = std::stoi(envStr); 196 DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit); 197 } else { 198 EnvTeamLimit = -1; 199 } 200 envStr = getenv("OMP_NUM_TEAMS"); 201 if (envStr) { 202 // OMP_NUM_TEAMS has been set 203 EnvNumTeams = std::stoi(envStr); 204 DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); 205 } else { 206 EnvNumTeams = -1; 207 } 208 } 209 210 ~RTLDeviceInfoTy() { 211 // Close modules 212 for (auto &module : Modules) 213 if (module) { 214 CUresult err = cuModuleUnload(module); 215 if (err != 
CUDA_SUCCESS) { 216 DP("Error when unloading CUDA module\n"); 217 CUDA_ERR_STRING(err); 218 } 219 } 220 221 // Destroy contexts 222 for (auto &ctx : Contexts) 223 if (ctx) { 224 CUresult err = cuCtxDestroy(ctx); 225 if (err != CUDA_SUCCESS) { 226 DP("Error when destroying CUDA context\n"); 227 CUDA_ERR_STRING(err); 228 } 229 } 230 } 231 }; 232 233 static RTLDeviceInfoTy DeviceInfo; 234 235 #ifdef __cplusplus 236 extern "C" { 237 #endif 238 239 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { 240 return elf_check_machine(image, 190); // EM_CUDA = 190. 241 } 242 243 int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; } 244 245 int32_t __tgt_rtl_init_device(int32_t device_id) { 246 247 CUdevice cuDevice; 248 DP("Getting device %d\n", device_id); 249 CUresult err = cuDeviceGet(&cuDevice, device_id); 250 if (err != CUDA_SUCCESS) { 251 DP("Error when getting CUDA device with id = %d\n", device_id); 252 CUDA_ERR_STRING(err); 253 return OFFLOAD_FAIL; 254 } 255 256 // Create the context and save it to use whenever this device is selected. 257 err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC, 258 cuDevice); 259 if (err != CUDA_SUCCESS) { 260 DP("Error when creating a CUDA context\n"); 261 CUDA_ERR_STRING(err); 262 return OFFLOAD_FAIL; 263 } 264 265 // scan properties to determine number of threads/block and blocks/grid. 
266 struct cudaDeviceProp Properties; 267 cudaError_t error = cudaGetDeviceProperties(&Properties, device_id); 268 if (error != cudaSuccess) { 269 DP("Error getting device Properties, use defaults\n"); 270 DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams; 271 DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads; 272 DeviceInfo.WarpSize[device_id] = 32; 273 } else { 274 // Get blocks per grid 275 if (Properties.maxGridSize[0] <= RTLDeviceInfoTy::HardTeamLimit) { 276 DeviceInfo.BlocksPerGrid[device_id] = Properties.maxGridSize[0]; 277 DP("Using %d CUDA blocks per grid\n", Properties.maxGridSize[0]); 278 } else { 279 DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit; 280 DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping " 281 "at the hard limit\n", Properties.maxGridSize[0], 282 RTLDeviceInfoTy::HardTeamLimit); 283 } 284 285 // Get threads per block, exploit threads only along x axis 286 if (Properties.maxThreadsDim[0] <= RTLDeviceInfoTy::HardThreadLimit) { 287 DeviceInfo.ThreadsPerBlock[device_id] = Properties.maxThreadsDim[0]; 288 DP("Using %d CUDA threads per block\n", Properties.maxThreadsDim[0]); 289 if (Properties.maxThreadsDim[0] < Properties.maxThreadsPerBlock) { 290 DP("(fewer than max per block along all xyz dims %d)\n", 291 Properties.maxThreadsPerBlock); 292 } 293 } else { 294 DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit; 295 DP("Max CUDA threads per block %d exceeds the hard thread limit %d, " 296 "capping at the hard limit\n", Properties.maxThreadsDim[0], 297 RTLDeviceInfoTy::HardThreadLimit); 298 } 299 300 // Get warp size 301 DeviceInfo.WarpSize[device_id] = Properties.warpSize; 302 } 303 304 // Adjust teams to the env variables 305 if (DeviceInfo.EnvTeamLimit > 0 && 306 DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) { 307 DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit; 308 DP("Capping max CUDA blocks per grid 
to OMP_TEAM_LIMIT=%d\n", 309 DeviceInfo.EnvTeamLimit); 310 } 311 312 DP("Max number of CUDA blocks %d, threads %d & warp size %d\n", 313 DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id], 314 DeviceInfo.WarpSize[device_id]); 315 316 // Set default number of teams 317 if (DeviceInfo.EnvNumTeams > 0) { 318 DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams; 319 DP("Default number of teams set according to environment %d\n", 320 DeviceInfo.EnvNumTeams); 321 } else { 322 DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams; 323 DP("Default number of teams set according to library's default %d\n", 324 RTLDeviceInfoTy::DefaultNumTeams); 325 } 326 if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) { 327 DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id]; 328 DP("Default number of teams exceeds device limit, capping at %d\n", 329 DeviceInfo.BlocksPerGrid[device_id]); 330 } 331 332 // Set default number of threads 333 DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads; 334 DP("Default number of threads set according to library's default %d\n", 335 RTLDeviceInfoTy::DefaultNumThreads); 336 if (DeviceInfo.NumThreads[device_id] > 337 DeviceInfo.ThreadsPerBlock[device_id]) { 338 DeviceInfo.NumTeams[device_id] = DeviceInfo.ThreadsPerBlock[device_id]; 339 DP("Default number of threads exceeds device limit, capping at %d\n", 340 DeviceInfo.ThreadsPerBlock[device_id]); 341 } 342 343 return OFFLOAD_SUCCESS; 344 } 345 346 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, 347 __tgt_device_image *image) { 348 349 // Set the context we are using. 350 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 351 if (err != CUDA_SUCCESS) { 352 DP("Error when setting a CUDA context for device %d\n", device_id); 353 CUDA_ERR_STRING(err); 354 return NULL; 355 } 356 357 // Clear the offload table as we are going to create a new one. 
358 DeviceInfo.clearOffloadEntriesTable(device_id); 359 360 // Create the module and extract the function pointers. 361 362 CUmodule cumod; 363 DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart)); 364 err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL); 365 if (err != CUDA_SUCCESS) { 366 DP("Error when loading CUDA module\n"); 367 CUDA_ERR_STRING(err); 368 return NULL; 369 } 370 371 DP("CUDA module successfully loaded!\n"); 372 DeviceInfo.Modules.push_back(cumod); 373 374 // Find the symbols in the module by name. 375 __tgt_offload_entry *HostBegin = image->EntriesBegin; 376 __tgt_offload_entry *HostEnd = image->EntriesEnd; 377 378 for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) { 379 380 if (!e->addr) { 381 // We return NULL when something like this happens, the host should have 382 // always something in the address to uniquely identify the target region. 383 DP("Invalid binary: host entry '<null>' (size = %zd)...\n", e->size); 384 385 return NULL; 386 } 387 388 if (e->size) { 389 __tgt_offload_entry entry = *e; 390 391 CUdeviceptr cuptr; 392 size_t cusize; 393 err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name); 394 395 if (err != CUDA_SUCCESS) { 396 DP("Loading global '%s' (Failed)\n", e->name); 397 CUDA_ERR_STRING(err); 398 return NULL; 399 } 400 401 if (cusize != e->size) { 402 DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name, 403 cusize, e->size); 404 CUDA_ERR_STRING(err); 405 return NULL; 406 } 407 408 DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", 409 DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr)); 410 entry.addr = (void *)cuptr; 411 412 DeviceInfo.addOffloadEntry(device_id, entry); 413 414 continue; 415 } 416 417 CUfunction fun; 418 err = cuModuleGetFunction(&fun, cumod, e->name); 419 420 if (err != CUDA_SUCCESS) { 421 DP("Loading '%s' (Failed)\n", e->name); 422 CUDA_ERR_STRING(err); 423 return NULL; 424 } 425 426 DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", 427 
DPxPTR(e - HostBegin), e->name, DPxPTR(fun)); 428 429 // default value GENERIC (in case symbol is missing from cubin file) 430 int8_t ExecModeVal = ExecutionModeType::GENERIC; 431 std::string ExecModeNameStr (e->name); 432 ExecModeNameStr += "_exec_mode"; 433 const char *ExecModeName = ExecModeNameStr.c_str(); 434 435 CUdeviceptr ExecModePtr; 436 size_t cusize; 437 err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName); 438 if (err == CUDA_SUCCESS) { 439 if ((size_t)cusize != sizeof(int8_t)) { 440 DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", 441 ExecModeName, cusize, sizeof(int8_t)); 442 CUDA_ERR_STRING(err); 443 return NULL; 444 } 445 446 err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize); 447 if (err != CUDA_SUCCESS) { 448 DP("Error when copying data from device to host. Pointers: " 449 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", 450 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize); 451 CUDA_ERR_STRING(err); 452 return NULL; 453 } 454 455 if (ExecModeVal < 0 || ExecModeVal > 1) { 456 DP("Error wrong exec_mode value specified in cubin file: %d\n", 457 ExecModeVal); 458 return NULL; 459 } 460 } else { 461 DP("Loading global exec_mode '%s' - symbol missing, using default value " 462 "GENERIC (1)\n", ExecModeName); 463 CUDA_ERR_STRING(err); 464 } 465 466 KernelsList.push_back(KernelTy(fun, ExecModeVal)); 467 468 __tgt_offload_entry entry = *e; 469 entry.addr = (void *)&KernelsList.back(); 470 DeviceInfo.addOffloadEntry(device_id, entry); 471 } 472 473 return DeviceInfo.getOffloadEntriesTable(device_id); 474 } 475 476 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) { 477 if (size == 0) { 478 return NULL; 479 } 480 481 // Set the context we are using. 
482 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 483 if (err != CUDA_SUCCESS) { 484 DP("Error while trying to set CUDA current context\n"); 485 CUDA_ERR_STRING(err); 486 return NULL; 487 } 488 489 CUdeviceptr ptr; 490 err = cuMemAlloc(&ptr, size); 491 if (err != CUDA_SUCCESS) { 492 DP("Error while trying to allocate %d\n", err); 493 CUDA_ERR_STRING(err); 494 return NULL; 495 } 496 497 void *vptr = (void *)ptr; 498 return vptr; 499 } 500 501 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, 502 int64_t size) { 503 // Set the context we are using. 504 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 505 if (err != CUDA_SUCCESS) { 506 DP("Error when setting CUDA context\n"); 507 CUDA_ERR_STRING(err); 508 return OFFLOAD_FAIL; 509 } 510 511 err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size); 512 if (err != CUDA_SUCCESS) { 513 DP("Error when copying data from host to device. Pointers: host = " DPxMOD 514 ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr), 515 DPxPTR(tgt_ptr), size); 516 CUDA_ERR_STRING(err); 517 return OFFLOAD_FAIL; 518 } 519 return OFFLOAD_SUCCESS; 520 } 521 522 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, 523 int64_t size) { 524 // Set the context we are using. 525 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 526 if (err != CUDA_SUCCESS) { 527 DP("Error when setting CUDA context\n"); 528 CUDA_ERR_STRING(err); 529 return OFFLOAD_FAIL; 530 } 531 532 err = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size); 533 if (err != CUDA_SUCCESS) { 534 DP("Error when copying data from device to host. Pointers: host = " DPxMOD 535 ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr), 536 DPxPTR(tgt_ptr), size); 537 CUDA_ERR_STRING(err); 538 return OFFLOAD_FAIL; 539 } 540 return OFFLOAD_SUCCESS; 541 } 542 543 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { 544 // Set the context we are using. 
545 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 546 if (err != CUDA_SUCCESS) { 547 DP("Error when setting CUDA context\n"); 548 CUDA_ERR_STRING(err); 549 return OFFLOAD_FAIL; 550 } 551 552 err = cuMemFree((CUdeviceptr)tgt_ptr); 553 if (err != CUDA_SUCCESS) { 554 DP("Error when freeing CUDA memory\n"); 555 CUDA_ERR_STRING(err); 556 return OFFLOAD_FAIL; 557 } 558 return OFFLOAD_SUCCESS; 559 } 560 561 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, 562 void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, 563 int32_t thread_limit, uint64_t loop_tripcount) { 564 // Set the context we are using. 565 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 566 if (err != CUDA_SUCCESS) { 567 DP("Error when setting CUDA context\n"); 568 CUDA_ERR_STRING(err); 569 return OFFLOAD_FAIL; 570 } 571 572 // All args are references. 573 std::vector<void *> args(arg_num); 574 std::vector<void *> ptrs(arg_num); 575 576 for (int32_t i = 0; i < arg_num; ++i) { 577 ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]); 578 args[i] = &ptrs[i]; 579 } 580 581 KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr; 582 583 int cudaThreadsPerBlock; 584 585 if (thread_limit > 0) { 586 cudaThreadsPerBlock = thread_limit; 587 DP("Setting CUDA threads per block to requested %d\n", thread_limit); 588 // Add master warp if necessary 589 if (KernelInfo->ExecutionMode == GENERIC) { 590 cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id]; 591 DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]); 592 } 593 } else { 594 cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id]; 595 DP("Setting CUDA threads per block to default %d\n", 596 DeviceInfo.NumThreads[device_id]); 597 } 598 599 if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) { 600 cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id]; 601 DP("Threads per block capped at device limit %d\n", 602 
DeviceInfo.ThreadsPerBlock[device_id]); 603 } 604 605 int kernel_limit; 606 err = cuFuncGetAttribute(&kernel_limit, 607 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func); 608 if (err == CUDA_SUCCESS) { 609 if (kernel_limit < cudaThreadsPerBlock) { 610 cudaThreadsPerBlock = kernel_limit; 611 DP("Threads per block capped at kernel limit %d\n", kernel_limit); 612 } 613 } 614 615 int cudaBlocksPerGrid; 616 if (team_num <= 0) { 617 if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) { 618 if (KernelInfo->ExecutionMode == SPMD) { 619 // We have a combined construct, i.e. `target teams distribute parallel 620 // for [simd]`. We launch so many teams so that each thread will 621 // execute one iteration of the loop. 622 // round up to the nearest integer 623 cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1; 624 } else { 625 // If we reach this point, then we have a non-combined construct, i.e. 626 // `teams distribute` with a nested `parallel for` and each team is 627 // assigned one iteration of the `distribute` loop. E.g.: 628 // 629 // #pragma omp target teams distribute 630 // for(...loop_tripcount...) { 631 // #pragma omp parallel for 632 // for(...) {} 633 // } 634 // 635 // Threads within a team will execute the iterations of the `parallel` 636 // loop. 
637 cudaBlocksPerGrid = loop_tripcount; 638 } 639 DP("Using %d teams due to loop trip count %" PRIu64 " and number of " 640 "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount, 641 cudaThreadsPerBlock); 642 } else { 643 cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id]; 644 DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]); 645 } 646 } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) { 647 cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id]; 648 DP("Capping number of teams to team limit %d\n", 649 DeviceInfo.BlocksPerGrid[device_id]); 650 } else { 651 cudaBlocksPerGrid = team_num; 652 DP("Using requested number of teams %d\n", team_num); 653 } 654 655 // Run on the device. 656 DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid, 657 cudaThreadsPerBlock); 658 659 err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1, 660 cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, 0, &args[0], 0); 661 if (err != CUDA_SUCCESS) { 662 DP("Device kernel launch failed!\n"); 663 CUDA_ERR_STRING(err); 664 assert(err == CUDA_SUCCESS && "Unable to launch target execution!"); 665 return OFFLOAD_FAIL; 666 } 667 668 DP("Launch of entry point at " DPxMOD " successful!\n", 669 DPxPTR(tgt_entry_ptr)); 670 671 cudaError_t sync_error = cudaDeviceSynchronize(); 672 if (sync_error != cudaSuccess) { 673 DP("Kernel execution error at " DPxMOD ", %s.\n", DPxPTR(tgt_entry_ptr), 674 cudaGetErrorString(sync_error)); 675 return OFFLOAD_FAIL; 676 } else { 677 DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr)); 678 } 679 680 return OFFLOAD_SUCCESS; 681 } 682 683 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, 684 void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) { 685 // use one team and the default number of threads. 
686 const int32_t team_num = 1; 687 const int32_t thread_limit = 0; 688 return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, 689 tgt_offsets, arg_num, team_num, thread_limit, 0); 690 } 691 692 #ifdef __cplusplus 693 } 694 #endif 695