//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <string>
#include <vector>

#include "omptargetplugin.h"

#ifndef TARGET_NAME
#define TARGET_NAME CUDA
#endif

#ifdef OMPTARGET_DEBUG
static int DebugLevel = 0;

#define GETNAME2(name) #name
#define GETNAME(name) GETNAME2(name)
#define DP(...)                                                                \
  do {                                                                         \
    if (DebugLevel > 0) {                                                      \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__);              \
    }                                                                          \
  } while (false)
#else // OMPTARGET_DEBUG
#define DP(...) {}
#endif // OMPTARGET_DEBUG

#include "../../common/elf_common.c"

// Utility for retrieving and printing CUDA error string.
#ifdef CUDA_ERROR_REPORT
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr;                                                        \
    cuGetErrorString(err, &errStr);                                            \
    DP("CUDA error is: %s\n", errStr);                                         \
  } while (0)
#else
#define CUDA_ERR_STRING(err)                                                   \
  {}
#endif

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

enum ExecutionModeType {
  SPMD,    // constructors, destructors,
           // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // Execution mode of the kernel:
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

/// List that contains all the kernels.
/// FIXME: we may need this to be per device and per library.
std::list<KernelTy> KernelsList;

/// Class containing all the device information.
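/// Per-device state (offload entry tables, CUDA contexts and modules, and the
/// thread/block limits used to configure kernel launches) is kept in vectors
/// indexed by the device id that libomptarget passes to this plugin.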
class RTLDeviceInfoTy {
  std::vector<FuncOrGblEntryTy> FuncGblEntries;

public:
  int NumberOfDevices;
  std::vector<CUmodule> Modules;
  std::vector<CUcontext> Contexts;

  // Device properties
  std::vector<int> ThreadsPerBlock;
  std::vector<int> BlocksPerGrid;
  std::vector<int> WarpSize;

  // OpenMP properties
  std::vector<int> NumTeams;
  std::vector<int> NumThreads;

  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;

  //static int EnvNumThreads;
  static const int HardTeamLimit = 1 << 16; // 64k
  static const int HardThreadLimit = 1024;
  static const int DefaultNumTeams = 128;
  static const int DefaultNumThreads = 128;

  // Record an entry point associated with a device.
  void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id];

    E.Entries.push_back(entry);
  }

  // Return true if the entry is associated with the device.
  bool findOffloadEntry(int32_t device_id, void *addr) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id];

    for (auto &it : E.Entries) {
      if (it.addr == addr)
        return true;
    }

    return false;
  }

  // Return the pointer to the target entries table.
  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id];

    int32_t size = E.Entries.size();

    // Table is empty.
    if (!size)
      return 0;

    __tgt_offload_entry *begin = &E.Entries[0];
    __tgt_offload_entry *end = &E.Entries[size - 1];

    // Update table info according to the entries and return the pointer.
    E.Table.EntriesBegin = begin;
    E.Table.EntriesEnd = ++end;

    return &E.Table;
  }

  // Clear the entries table for a device.
  void clearOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id];
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
  }

  RTLDeviceInfoTy() {
#ifdef OMPTARGET_DEBUG
    if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
      DebugLevel = std::stoi(envStr);
    }
#endif // OMPTARGET_DEBUG

    // Report zero devices if any of the initialization steps below fails.
    NumberOfDevices = 0;

    DP("Start initializing CUDA\n");

    CUresult err = cuInit(0);
    if (err != CUDA_SUCCESS) {
      DP("Error when initializing CUDA\n");
      CUDA_ERR_STRING(err);
      return;
    }

    err = cuDeviceGetCount(&NumberOfDevices);
    if (err != CUDA_SUCCESS) {
      DP("Error when getting CUDA device count\n");
      CUDA_ERR_STRING(err);
      return;
    }

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    FuncGblEntries.resize(NumberOfDevices);
    Contexts.resize(NumberOfDevices);
    ThreadsPerBlock.resize(NumberOfDevices);
    BlocksPerGrid.resize(NumberOfDevices);
    WarpSize.resize(NumberOfDevices);
    NumTeams.resize(NumberOfDevices);
    NumThreads.resize(NumberOfDevices);

    // Get environment variables regarding teams.
    char *envStr = getenv("OMP_TEAM_LIMIT");
    if (envStr) {
      // OMP_TEAM_LIMIT has been set.
      EnvTeamLimit = std::stoi(envStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    } else {
      EnvTeamLimit = -1;
    }
    envStr = getenv("OMP_NUM_TEAMS");
    if (envStr) {
      // OMP_NUM_TEAMS has been set.
      EnvNumTeams = std::stoi(envStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    } else {
      EnvNumTeams = -1;
    }
  }

  ~RTLDeviceInfoTy() {
    // Close modules.
    for (auto &module : Modules)
      if (module) {
        CUresult err = cuModuleUnload(module);
        if (err != CUDA_SUCCESS) {
          DP("Error when unloading CUDA module\n");
          CUDA_ERR_STRING(err);
        }
      }

    // Destroy contexts.
    for (auto &ctx : Contexts)
      if (ctx) {
        CUresult err = cuCtxDestroy(ctx);
        if (err != CUDA_SUCCESS) {
          DP("Error when destroying CUDA context\n");
          CUDA_ERR_STRING(err);
        }
      }
  }
};

static RTLDeviceInfoTy DeviceInfo;

#ifdef __cplusplus
extern "C" {
#endif

int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, 190); // EM_CUDA = 190.
}

int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }

int32_t __tgt_rtl_init_device(int32_t device_id) {

  CUdevice cuDevice;
  DP("Getting device %d\n", device_id);
  CUresult err = cuDeviceGet(&cuDevice, device_id);
  if (err != CUDA_SUCCESS) {
    DP("Error when getting CUDA device with id = %d\n", device_id);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // Create the context and save it to use whenever this device is selected.
  err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
                    cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error when creating a CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // Scan properties to determine the number of threads/block and blocks/grid.
  CUdevprop Properties;
  err = cuDeviceGetProperties(&Properties, cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error getting device Properties, use defaults\n");
    DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
    DeviceInfo.WarpSize[device_id] = 32;
  } else {
    // Get blocks per grid.
    if (Properties.maxGridSize[0] <= RTLDeviceInfoTy::HardTeamLimit) {
      DeviceInfo.BlocksPerGrid[device_id] = Properties.maxGridSize[0];
      DP("Using %d CUDA blocks per grid\n", Properties.maxGridSize[0]);
    } else {
      DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
      DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
         "at the hard limit\n", Properties.maxGridSize[0],
         RTLDeviceInfoTy::HardTeamLimit);
    }

    // Get threads per block; exploit threads only along the x axis.
    if (Properties.maxThreadsDim[0] <= RTLDeviceInfoTy::HardThreadLimit) {
      DeviceInfo.ThreadsPerBlock[device_id] = Properties.maxThreadsDim[0];
      DP("Using %d CUDA threads per block\n", Properties.maxThreadsDim[0]);
      if (Properties.maxThreadsDim[0] < Properties.maxThreadsPerBlock) {
        DP("(fewer than max per block along all xyz dims %d)\n",
           Properties.maxThreadsPerBlock);
      }
    } else {
      DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
      DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
         "capping at the hard limit\n", Properties.maxThreadsDim[0],
         RTLDeviceInfoTy::HardThreadLimit);
    }

    // According to the documentation, SIMDWidth is "Warp size in threads".
    DeviceInfo.WarpSize[device_id] = Properties.SIMDWidth;
  }

  // Adjust teams to the env variables.
  if (DeviceInfo.EnvTeamLimit > 0 &&
      DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) {
    DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit;
    DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
       DeviceInfo.EnvTeamLimit);
  }

  DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
     DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id],
     DeviceInfo.WarpSize[device_id]);

  // Set the default number of teams.
  if (DeviceInfo.EnvNumTeams > 0) {
    DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
    DP("Default number of teams set according to environment %d\n",
       DeviceInfo.EnvNumTeams);
  } else {
    DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
    DP("Default number of teams set according to library's default %d\n",
       RTLDeviceInfoTy::DefaultNumTeams);
  }
  if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) {
    DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id];
    DP("Default number of teams exceeds device limit, capping at %d\n",
       DeviceInfo.BlocksPerGrid[device_id]);
  }

  // Set the default number of threads.
  DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
  DP("Default number of threads set according to library's default %d\n",
     RTLDeviceInfoTy::DefaultNumThreads);
  if (DeviceInfo.NumThreads[device_id] >
      DeviceInfo.ThreadsPerBlock[device_id]) {
    DeviceInfo.NumThreads[device_id] = DeviceInfo.ThreadsPerBlock[device_id];
    DP("Default number of threads exceeds device limit, capping at %d\n",
       DeviceInfo.ThreadsPerBlock[device_id]);
  }

  return OFFLOAD_SUCCESS;
}

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
                                          __tgt_device_image *image) {

  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting a CUDA context for device %d\n", device_id);
    CUDA_ERR_STRING(err);
    return NULL;
  }

  // Clear the offload table as we are going to create a new one.
  DeviceInfo.clearOffloadEntriesTable(device_id);

  // Create the module and extract the function pointers.

  CUmodule cumod;
  DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart));
  err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL);
  if (err != CUDA_SUCCESS) {
    DP("Error when loading CUDA module\n");
    CUDA_ERR_STRING(err);
    return NULL;
  }

  DP("CUDA module successfully loaded!\n");
  DeviceInfo.Modules.push_back(cumod);

  // Find the symbols in the module by name.
  __tgt_offload_entry *HostBegin = image->EntriesBegin;
  __tgt_offload_entry *HostEnd = image->EntriesEnd;

  for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {

    if (!e->addr) {
      // We return NULL when something like this happens; the host should
      // always have something in the address to uniquely identify the
      // target region.
399 DP("Invalid binary: host entry '<null>' (size = %zd)...\n", e->size); 400 401 return NULL; 402 } 403 404 if (e->size) { 405 __tgt_offload_entry entry = *e; 406 407 CUdeviceptr cuptr; 408 size_t cusize; 409 err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name); 410 411 if (err != CUDA_SUCCESS) { 412 DP("Loading global '%s' (Failed)\n", e->name); 413 CUDA_ERR_STRING(err); 414 return NULL; 415 } 416 417 if (cusize != e->size) { 418 DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name, 419 cusize, e->size); 420 CUDA_ERR_STRING(err); 421 return NULL; 422 } 423 424 DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", 425 DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr)); 426 entry.addr = (void *)cuptr; 427 428 DeviceInfo.addOffloadEntry(device_id, entry); 429 430 continue; 431 } 432 433 CUfunction fun; 434 err = cuModuleGetFunction(&fun, cumod, e->name); 435 436 if (err != CUDA_SUCCESS) { 437 DP("Loading '%s' (Failed)\n", e->name); 438 CUDA_ERR_STRING(err); 439 return NULL; 440 } 441 442 DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", 443 DPxPTR(e - HostBegin), e->name, DPxPTR(fun)); 444 445 // default value GENERIC (in case symbol is missing from cubin file) 446 int8_t ExecModeVal = ExecutionModeType::GENERIC; 447 std::string ExecModeNameStr (e->name); 448 ExecModeNameStr += "_exec_mode"; 449 const char *ExecModeName = ExecModeNameStr.c_str(); 450 451 CUdeviceptr ExecModePtr; 452 size_t cusize; 453 err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName); 454 if (err == CUDA_SUCCESS) { 455 if ((size_t)cusize != sizeof(int8_t)) { 456 DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", 457 ExecModeName, cusize, sizeof(int8_t)); 458 CUDA_ERR_STRING(err); 459 return NULL; 460 } 461 462 err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize); 463 if (err != CUDA_SUCCESS) { 464 DP("Error when copying data from device to host. Pointers: " 465 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", 466 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize); 467 CUDA_ERR_STRING(err); 468 return NULL; 469 } 470 471 if (ExecModeVal < 0 || ExecModeVal > 1) { 472 DP("Error wrong exec_mode value specified in cubin file: %d\n", 473 ExecModeVal); 474 return NULL; 475 } 476 } else { 477 DP("Loading global exec_mode '%s' - symbol missing, using default value " 478 "GENERIC (1)\n", ExecModeName); 479 CUDA_ERR_STRING(err); 480 } 481 482 KernelsList.push_back(KernelTy(fun, ExecModeVal)); 483 484 __tgt_offload_entry entry = *e; 485 entry.addr = (void *)&KernelsList.back(); 486 DeviceInfo.addOffloadEntry(device_id, entry); 487 } 488 489 return DeviceInfo.getOffloadEntriesTable(device_id); 490 } 491 492 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) { 493 if (size == 0) { 494 return NULL; 495 } 496 497 // Set the context we are using. 498 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 499 if (err != CUDA_SUCCESS) { 500 DP("Error while trying to set CUDA current context\n"); 501 CUDA_ERR_STRING(err); 502 return NULL; 503 } 504 505 CUdeviceptr ptr; 506 err = cuMemAlloc(&ptr, size); 507 if (err != CUDA_SUCCESS) { 508 DP("Error while trying to allocate %d\n", err); 509 CUDA_ERR_STRING(err); 510 return NULL; 511 } 512 513 void *vptr = (void *)ptr; 514 return vptr; 515 } 516 517 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, 518 int64_t size) { 519 // Set the context we are using. 
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error when copying data from host to device. Pointers: host = " DPxMOD
       ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
       DPxPTR(tgt_ptr), size);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
                                int64_t size) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error when copying data from device to host. Pointers: host = " DPxMOD
       ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
       DPxPTR(tgt_ptr), size);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemFree((CUdeviceptr)tgt_ptr);
  if (err != CUDA_SUCCESS) {
    DP("Error when freeing CUDA memory\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // All args are references.
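  // cuLaunchKernel expects an array with one element per kernel parameter,
  // each element pointing at the value to be passed. ptrs[i] holds the i-th
  // device pointer adjusted by its offset, and args[i] holds the address of
  // ptrs[i].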
  std::vector<void *> args(arg_num);
  std::vector<void *> ptrs(arg_num);

  for (int32_t i = 0; i < arg_num; ++i) {
    ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
    args[i] = &ptrs[i];
  }

  KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;

  int cudaThreadsPerBlock;

  if (thread_limit > 0) {
    cudaThreadsPerBlock = thread_limit;
    DP("Setting CUDA threads per block to requested %d\n", thread_limit);
    // Add the master warp if necessary.
    if (KernelInfo->ExecutionMode == GENERIC) {
      cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
      DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
    }
  } else {
    cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
    DP("Setting CUDA threads per block to default %d\n",
       DeviceInfo.NumThreads[device_id]);
  }

  if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
    cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
    DP("Threads per block capped at device limit %d\n",
       DeviceInfo.ThreadsPerBlock[device_id]);
  }

  int kernel_limit;
  err = cuFuncGetAttribute(&kernel_limit,
                           CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                           KernelInfo->Func);
  if (err == CUDA_SUCCESS) {
    if (kernel_limit < cudaThreadsPerBlock) {
      cudaThreadsPerBlock = kernel_limit;
      DP("Threads per block capped at kernel limit %d\n", kernel_limit);
    }
  }

  int cudaBlocksPerGrid;
  if (team_num <= 0) {
    if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
      if (KernelInfo->ExecutionMode == SPMD) {
        // We have a combined construct, i.e. `target teams distribute parallel
        // for [simd]`. We launch as many teams as needed so that each thread
        // executes one iteration of the loop; round up to the nearest integer.
        cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
      } else {
        // If we reach this point, then we have a non-combined construct, i.e.
        // `teams distribute` with a nested `parallel for`, and each team is
        // assigned one iteration of the `distribute` loop. E.g.:
        //
        // #pragma omp target teams distribute
        // for(...loop_tripcount...) {
        //   #pragma omp parallel for
        //   for(...) {}
        // }
        //
        // Threads within a team will execute the iterations of the `parallel`
        // loop.
        cudaBlocksPerGrid = loop_tripcount;
      }
      DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
         "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,
         cudaThreadsPerBlock);
    } else {
      cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id];
      DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]);
    }
  } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) {
    cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id];
    DP("Capping number of teams to team limit %d\n",
       DeviceInfo.BlocksPerGrid[device_id]);
  } else {
    cudaBlocksPerGrid = team_num;
    DP("Using requested number of teams %d\n", team_num);
  }

  // Run on the device.
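  // Launch a one-dimensional grid of cudaBlocksPerGrid blocks with
  // cudaThreadsPerBlock threads each, with no dynamic shared memory, on the
  // default stream, then synchronize on the context to catch execution errors.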
672 DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid, 673 cudaThreadsPerBlock); 674 675 err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1, 676 cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, 0, &args[0], 0); 677 if (err != CUDA_SUCCESS) { 678 DP("Device kernel launch failed!\n"); 679 CUDA_ERR_STRING(err); 680 return OFFLOAD_FAIL; 681 } 682 683 DP("Launch of entry point at " DPxMOD " successful!\n", 684 DPxPTR(tgt_entry_ptr)); 685 686 CUresult sync_err = cuCtxSynchronize(); 687 if (sync_err != CUDA_SUCCESS) { 688 DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr)); 689 CUDA_ERR_STRING(sync_err); 690 return OFFLOAD_FAIL; 691 } else { 692 DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr)); 693 } 694 695 return OFFLOAD_SUCCESS; 696 } 697 698 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, 699 void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) { 700 // use one team and the default number of threads. 701 const int32_t team_num = 1; 702 const int32_t thread_limit = 0; 703 return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, 704 tgt_offsets, arg_num, team_num, thread_limit, 0); 705 } 706 707 #ifdef __cplusplus 708 } 709 #endif 710