1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is dual licensed under the MIT and the University of Illinois Open 6 // Source Licenses. See LICENSE.txt for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // RTL for CUDA machine 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include <cassert> 15 #include <cstddef> 16 #include <cuda.h> 17 #include <list> 18 #include <string> 19 #include <vector> 20 21 #include "omptargetplugin.h" 22 23 #ifndef TARGET_NAME 24 #define TARGET_NAME CUDA 25 #endif 26 27 #ifdef OMPTARGET_DEBUG 28 static int DebugLevel = 0; 29 30 #define GETNAME2(name) #name 31 #define GETNAME(name) GETNAME2(name) 32 #define DP(...) \ 33 do { \ 34 if (DebugLevel > 0) { \ 35 DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \ 36 } \ 37 } while (false) 38 #else // OMPTARGET_DEBUG 39 #define DP(...) {} 40 #endif // OMPTARGET_DEBUG 41 42 #include "../../common/elf_common.c" 43 44 // Utility for retrieving and printing CUDA error string. 45 #ifdef CUDA_ERROR_REPORT 46 #define CUDA_ERR_STRING(err) \ 47 do { \ 48 const char *errStr; \ 49 cuGetErrorString(err, &errStr); \ 50 DP("CUDA error is: %s\n", errStr); \ 51 } while (0) 52 #else 53 #define CUDA_ERR_STRING(err) \ 54 {} 55 #endif 56 57 /// Keep entries table per device. 58 struct FuncOrGblEntryTy { 59 __tgt_target_table Table; 60 std::vector<__tgt_offload_entry> Entries; 61 }; 62 63 enum ExecutionModeType { 64 SPMD, // constructors, destructors, 65 // combined constructs (`teams distribute parallel for [simd]`) 66 GENERIC, // everything else 67 NONE 68 }; 69 70 /// Use a single entity to encode a kernel and a set of flags 71 struct KernelTy { 72 CUfunction Func; 73 74 // execution mode of kernel 75 // 0 - SPMD mode (without master warp) 76 // 1 - Generic mode (with master warp) 77 int8_t ExecutionMode; 78 79 KernelTy(CUfunction _Func, int8_t _ExecutionMode) 80 : Func(_Func), ExecutionMode(_ExecutionMode) {} 81 }; 82 83 /// Device envrionment data 84 /// Manually sync with the deviceRTL side for now, move to a dedicated header file later. 85 struct omptarget_device_environmentTy { 86 int32_t debug_level; 87 }; 88 89 /// List that contains all the kernels. 90 /// FIXME: we may need this to be per device and per library. 91 std::list<KernelTy> KernelsList; 92 93 /// Class containing all the device information. 94 class RTLDeviceInfoTy { 95 std::vector<FuncOrGblEntryTy> FuncGblEntries; 96 97 public: 98 int NumberOfDevices; 99 std::vector<CUmodule> Modules; 100 std::vector<CUcontext> Contexts; 101 102 // Device properties 103 std::vector<int> ThreadsPerBlock; 104 std::vector<int> BlocksPerGrid; 105 std::vector<int> WarpSize; 106 107 // OpenMP properties 108 std::vector<int> NumTeams; 109 std::vector<int> NumThreads; 110 111 // OpenMP Environment properties 112 int EnvNumTeams; 113 int EnvTeamLimit; 114 115 //static int EnvNumThreads; 116 static const int HardTeamLimit = 1<<16; // 64k 117 static const int HardThreadLimit = 1024; 118 static const int DefaultNumTeams = 128; 119 static const int DefaultNumThreads = 128; 120 121 // Record entry point associated with device 122 void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) { 123 assert(device_id < (int32_t)FuncGblEntries.size() && 124 "Unexpected device id!"); 125 FuncOrGblEntryTy &E = FuncGblEntries[device_id]; 126 127 E.Entries.push_back(entry); 128 } 129 130 // Return true if the entry is associated with device 131 bool findOffloadEntry(int32_t device_id, void *addr) { 132 assert(device_id < (int32_t)FuncGblEntries.size() && 133 "Unexpected device id!"); 134 FuncOrGblEntryTy &E = FuncGblEntries[device_id]; 135 136 for (auto &it : E.Entries) { 137 if (it.addr == addr) 138 return true; 139 } 140 141 return false; 142 } 143 144 // Return the pointer to the target entries table 145 __tgt_target_table *getOffloadEntriesTable(int32_t device_id) { 146 assert(device_id < (int32_t)FuncGblEntries.size() && 147 "Unexpected device id!"); 148 FuncOrGblEntryTy &E = FuncGblEntries[device_id]; 149 150 int32_t size = E.Entries.size(); 151 152 // Table is empty 153 if (!size) 154 return 0; 155 156 __tgt_offload_entry *begin = &E.Entries[0]; 157 __tgt_offload_entry *end = &E.Entries[size - 1]; 158 159 // Update table info according to the entries and return the pointer 160 E.Table.EntriesBegin = begin; 161 E.Table.EntriesEnd = ++end; 162 163 return &E.Table; 164 } 165 166 // Clear entries table for a device 167 void clearOffloadEntriesTable(int32_t device_id) { 168 assert(device_id < (int32_t)FuncGblEntries.size() && 169 "Unexpected device id!"); 170 FuncOrGblEntryTy &E = FuncGblEntries[device_id]; 171 E.Entries.clear(); 172 E.Table.EntriesBegin = E.Table.EntriesEnd = 0; 173 } 174 175 RTLDeviceInfoTy() { 176 #ifdef OMPTARGET_DEBUG 177 if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) { 178 DebugLevel = std::stoi(envStr); 179 } 180 #endif // OMPTARGET_DEBUG 181 182 DP("Start initializing CUDA\n"); 183 184 CUresult err = cuInit(0); 185 if (err != CUDA_SUCCESS) { 186 DP("Error when initializing CUDA\n"); 187 CUDA_ERR_STRING(err); 188 return; 189 } 190 191 NumberOfDevices = 0; 192 193 err = cuDeviceGetCount(&NumberOfDevices); 194 if (err != CUDA_SUCCESS) { 195 DP("Error when getting CUDA device count\n"); 196 CUDA_ERR_STRING(err); 197 return; 198 } 199 200 if (NumberOfDevices == 0) { 201 DP("There are no devices supporting CUDA.\n"); 202 return; 203 } 204 205 FuncGblEntries.resize(NumberOfDevices); 206 Contexts.resize(NumberOfDevices); 207 ThreadsPerBlock.resize(NumberOfDevices); 208 BlocksPerGrid.resize(NumberOfDevices); 209 WarpSize.resize(NumberOfDevices); 210 NumTeams.resize(NumberOfDevices); 211 NumThreads.resize(NumberOfDevices); 212 213 // Get environment variables regarding teams 214 char *envStr = getenv("OMP_TEAM_LIMIT"); 215 if (envStr) { 216 // OMP_TEAM_LIMIT has been set 217 EnvTeamLimit = std::stoi(envStr); 218 DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit); 219 } else { 220 EnvTeamLimit = -1; 221 } 222 envStr = getenv("OMP_NUM_TEAMS"); 223 if (envStr) { 224 // OMP_NUM_TEAMS has been set 225 EnvNumTeams = std::stoi(envStr); 226 DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); 227 } else { 228 EnvNumTeams = -1; 229 } 230 } 231 232 ~RTLDeviceInfoTy() { 233 // Close modules 234 for (auto &module : Modules) 235 if (module) { 236 CUresult err = cuModuleUnload(module); 237 if (err != CUDA_SUCCESS) { 238 DP("Error when unloading CUDA module\n"); 239 CUDA_ERR_STRING(err); 240 } 241 } 242 243 // Destroy contexts 244 for (auto &ctx : Contexts) 245 if (ctx) { 246 CUresult err = cuCtxDestroy(ctx); 247 if (err != CUDA_SUCCESS) { 248 DP("Error when destroying CUDA context\n"); 249 CUDA_ERR_STRING(err); 250 } 251 } 252 } 253 }; 254 255 static RTLDeviceInfoTy DeviceInfo; 256 257 #ifdef __cplusplus 258 extern "C" { 259 #endif 260 261 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { 262 return elf_check_machine(image, 190); // EM_CUDA = 190. 263 } 264 265 int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; } 266 267 int32_t __tgt_rtl_init_device(int32_t device_id) { 268 269 CUdevice cuDevice; 270 DP("Getting device %d\n", device_id); 271 CUresult err = cuDeviceGet(&cuDevice, device_id); 272 if (err != CUDA_SUCCESS) { 273 DP("Error when getting CUDA device with id = %d\n", device_id); 274 CUDA_ERR_STRING(err); 275 return OFFLOAD_FAIL; 276 } 277 278 // Create the context and save it to use whenever this device is selected. 279 err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC, 280 cuDevice); 281 if (err != CUDA_SUCCESS) { 282 DP("Error when creating a CUDA context\n"); 283 CUDA_ERR_STRING(err); 284 return OFFLOAD_FAIL; 285 } 286 287 // scan properties to determine number of threads/block and blocks/grid. 288 CUdevprop Properties; 289 err = cuDeviceGetProperties(&Properties, cuDevice); 290 if (err != CUDA_SUCCESS) { 291 DP("Error getting device Properties, use defaults\n"); 292 DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams; 293 DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads; 294 DeviceInfo.WarpSize[device_id] = 32; 295 } else { 296 // Get blocks per grid 297 if (Properties.maxGridSize[0] <= RTLDeviceInfoTy::HardTeamLimit) { 298 DeviceInfo.BlocksPerGrid[device_id] = Properties.maxGridSize[0]; 299 DP("Using %d CUDA blocks per grid\n", Properties.maxGridSize[0]); 300 } else { 301 DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit; 302 DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping " 303 "at the hard limit\n", Properties.maxGridSize[0], 304 RTLDeviceInfoTy::HardTeamLimit); 305 } 306 307 // Get threads per block, exploit threads only along x axis 308 if (Properties.maxThreadsDim[0] <= RTLDeviceInfoTy::HardThreadLimit) { 309 DeviceInfo.ThreadsPerBlock[device_id] = Properties.maxThreadsDim[0]; 310 DP("Using %d CUDA threads per block\n", Properties.maxThreadsDim[0]); 311 if (Properties.maxThreadsDim[0] < Properties.maxThreadsPerBlock) { 312 DP("(fewer than max per block along all xyz dims %d)\n", 313 Properties.maxThreadsPerBlock); 314 } 315 } else { 316 DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit; 317 DP("Max CUDA threads per block %d exceeds the hard thread limit %d, " 318 "capping at the hard limit\n", Properties.maxThreadsDim[0], 319 RTLDeviceInfoTy::HardThreadLimit); 320 } 321 322 // According to the documentation, SIMDWidth is "Warp size in threads". 323 DeviceInfo.WarpSize[device_id] = Properties.SIMDWidth; 324 } 325 326 // Adjust teams to the env variables 327 if (DeviceInfo.EnvTeamLimit > 0 && 328 DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) { 329 DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit; 330 DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n", 331 DeviceInfo.EnvTeamLimit); 332 } 333 334 DP("Max number of CUDA blocks %d, threads %d & warp size %d\n", 335 DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id], 336 DeviceInfo.WarpSize[device_id]); 337 338 // Set default number of teams 339 if (DeviceInfo.EnvNumTeams > 0) { 340 DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams; 341 DP("Default number of teams set according to environment %d\n", 342 DeviceInfo.EnvNumTeams); 343 } else { 344 DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams; 345 DP("Default number of teams set according to library's default %d\n", 346 RTLDeviceInfoTy::DefaultNumTeams); 347 } 348 if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) { 349 DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id]; 350 DP("Default number of teams exceeds device limit, capping at %d\n", 351 DeviceInfo.BlocksPerGrid[device_id]); 352 } 353 354 // Set default number of threads 355 DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads; 356 DP("Default number of threads set according to library's default %d\n", 357 RTLDeviceInfoTy::DefaultNumThreads); 358 if (DeviceInfo.NumThreads[device_id] > 359 DeviceInfo.ThreadsPerBlock[device_id]) { 360 DeviceInfo.NumTeams[device_id] = DeviceInfo.ThreadsPerBlock[device_id]; 361 DP("Default number of threads exceeds device limit, capping at %d\n", 362 DeviceInfo.ThreadsPerBlock[device_id]); 363 } 364 365 return OFFLOAD_SUCCESS; 366 } 367 368 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, 369 __tgt_device_image *image) { 370 371 // Set the context we are using. 372 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 373 if (err != CUDA_SUCCESS) { 374 DP("Error when setting a CUDA context for device %d\n", device_id); 375 CUDA_ERR_STRING(err); 376 return NULL; 377 } 378 379 // Clear the offload table as we are going to create a new one. 380 DeviceInfo.clearOffloadEntriesTable(device_id); 381 382 // Create the module and extract the function pointers. 383 384 CUmodule cumod; 385 DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart)); 386 err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL); 387 if (err != CUDA_SUCCESS) { 388 DP("Error when loading CUDA module\n"); 389 CUDA_ERR_STRING(err); 390 return NULL; 391 } 392 393 DP("CUDA module successfully loaded!\n"); 394 DeviceInfo.Modules.push_back(cumod); 395 396 // Find the symbols in the module by name. 397 __tgt_offload_entry *HostBegin = image->EntriesBegin; 398 __tgt_offload_entry *HostEnd = image->EntriesEnd; 399 400 for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) { 401 402 if (!e->addr) { 403 // We return NULL when something like this happens, the host should have 404 // always something in the address to uniquely identify the target region. 405 DP("Invalid binary: host entry '<null>' (size = %zd)...\n", e->size); 406 407 return NULL; 408 } 409 410 if (e->size) { 411 __tgt_offload_entry entry = *e; 412 413 CUdeviceptr cuptr; 414 size_t cusize; 415 err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name); 416 417 if (err != CUDA_SUCCESS) { 418 DP("Loading global '%s' (Failed)\n", e->name); 419 CUDA_ERR_STRING(err); 420 return NULL; 421 } 422 423 if (cusize != e->size) { 424 DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name, 425 cusize, e->size); 426 CUDA_ERR_STRING(err); 427 return NULL; 428 } 429 430 DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", 431 DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr)); 432 entry.addr = (void *)cuptr; 433 434 DeviceInfo.addOffloadEntry(device_id, entry); 435 436 continue; 437 } 438 439 CUfunction fun; 440 err = cuModuleGetFunction(&fun, cumod, e->name); 441 442 if (err != CUDA_SUCCESS) { 443 DP("Loading '%s' (Failed)\n", e->name); 444 CUDA_ERR_STRING(err); 445 return NULL; 446 } 447 448 DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", 449 DPxPTR(e - HostBegin), e->name, DPxPTR(fun)); 450 451 // default value GENERIC (in case symbol is missing from cubin file) 452 int8_t ExecModeVal = ExecutionModeType::GENERIC; 453 std::string ExecModeNameStr (e->name); 454 ExecModeNameStr += "_exec_mode"; 455 const char *ExecModeName = ExecModeNameStr.c_str(); 456 457 CUdeviceptr ExecModePtr; 458 size_t cusize; 459 err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName); 460 if (err == CUDA_SUCCESS) { 461 if ((size_t)cusize != sizeof(int8_t)) { 462 DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", 463 ExecModeName, cusize, sizeof(int8_t)); 464 CUDA_ERR_STRING(err); 465 return NULL; 466 } 467 468 err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize); 469 if (err != CUDA_SUCCESS) { 470 DP("Error when copying data from device to host. Pointers: " 471 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", 472 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize); 473 CUDA_ERR_STRING(err); 474 return NULL; 475 } 476 477 if (ExecModeVal < 0 || ExecModeVal > 1) { 478 DP("Error wrong exec_mode value specified in cubin file: %d\n", 479 ExecModeVal); 480 return NULL; 481 } 482 } else { 483 DP("Loading global exec_mode '%s' - symbol missing, using default value " 484 "GENERIC (1)\n", ExecModeName); 485 CUDA_ERR_STRING(err); 486 } 487 488 KernelsList.push_back(KernelTy(fun, ExecModeVal)); 489 490 __tgt_offload_entry entry = *e; 491 entry.addr = (void *)&KernelsList.back(); 492 DeviceInfo.addOffloadEntry(device_id, entry); 493 } 494 495 // send device environment data to the device 496 { 497 omptarget_device_environmentTy device_env; 498 499 device_env.debug_level = 0; 500 501 #ifdef OMPTARGET_DEBUG 502 if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) { 503 device_env.debug_level = std::stoi(envStr); 504 } 505 #endif 506 507 const char * device_env_Name="omptarget_device_environment"; 508 CUdeviceptr device_env_Ptr; 509 size_t cusize; 510 511 err = cuModuleGetGlobal(&device_env_Ptr, &cusize, cumod, device_env_Name); 512 513 if (err == CUDA_SUCCESS) { 514 if ((size_t)cusize != sizeof(device_env)) { 515 DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n", 516 device_env_Name, cusize, sizeof(int32_t)); 517 CUDA_ERR_STRING(err); 518 return NULL; 519 } 520 521 err = cuMemcpyHtoD(device_env_Ptr, &device_env, cusize); 522 if (err != CUDA_SUCCESS) { 523 DP("Error when copying data from host to device. Pointers: " 524 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n", 525 DPxPTR(&device_env), DPxPTR(device_env_Ptr), cusize); 526 CUDA_ERR_STRING(err); 527 return NULL; 528 } 529 530 DP("Sending global device environment data %zu bytes\n", (size_t)cusize); 531 } else { 532 DP("Finding global device environment '%s' - symbol missing.\n", device_env_Name); 533 DP("Continue, considering this is a device RTL which does not accept envrionment setting.\n"); 534 } 535 } 536 537 return DeviceInfo.getOffloadEntriesTable(device_id); 538 } 539 540 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) { 541 if (size == 0) { 542 return NULL; 543 } 544 545 // Set the context we are using. 546 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 547 if (err != CUDA_SUCCESS) { 548 DP("Error while trying to set CUDA current context\n"); 549 CUDA_ERR_STRING(err); 550 return NULL; 551 } 552 553 CUdeviceptr ptr; 554 err = cuMemAlloc(&ptr, size); 555 if (err != CUDA_SUCCESS) { 556 DP("Error while trying to allocate %d\n", err); 557 CUDA_ERR_STRING(err); 558 return NULL; 559 } 560 561 void *vptr = (void *)ptr; 562 return vptr; 563 } 564 565 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, 566 int64_t size) { 567 // Set the context we are using. 568 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 569 if (err != CUDA_SUCCESS) { 570 DP("Error when setting CUDA context\n"); 571 CUDA_ERR_STRING(err); 572 return OFFLOAD_FAIL; 573 } 574 575 err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size); 576 if (err != CUDA_SUCCESS) { 577 DP("Error when copying data from host to device. Pointers: host = " DPxMOD 578 ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr), 579 DPxPTR(tgt_ptr), size); 580 CUDA_ERR_STRING(err); 581 return OFFLOAD_FAIL; 582 } 583 return OFFLOAD_SUCCESS; 584 } 585 586 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, 587 int64_t size) { 588 // Set the context we are using. 589 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 590 if (err != CUDA_SUCCESS) { 591 DP("Error when setting CUDA context\n"); 592 CUDA_ERR_STRING(err); 593 return OFFLOAD_FAIL; 594 } 595 596 err = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size); 597 if (err != CUDA_SUCCESS) { 598 DP("Error when copying data from device to host. Pointers: host = " DPxMOD 599 ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr), 600 DPxPTR(tgt_ptr), size); 601 CUDA_ERR_STRING(err); 602 return OFFLOAD_FAIL; 603 } 604 return OFFLOAD_SUCCESS; 605 } 606 607 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { 608 // Set the context we are using. 609 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 610 if (err != CUDA_SUCCESS) { 611 DP("Error when setting CUDA context\n"); 612 CUDA_ERR_STRING(err); 613 return OFFLOAD_FAIL; 614 } 615 616 err = cuMemFree((CUdeviceptr)tgt_ptr); 617 if (err != CUDA_SUCCESS) { 618 DP("Error when freeing CUDA memory\n"); 619 CUDA_ERR_STRING(err); 620 return OFFLOAD_FAIL; 621 } 622 return OFFLOAD_SUCCESS; 623 } 624 625 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, 626 void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, 627 int32_t thread_limit, uint64_t loop_tripcount) { 628 // Set the context we are using. 629 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 630 if (err != CUDA_SUCCESS) { 631 DP("Error when setting CUDA context\n"); 632 CUDA_ERR_STRING(err); 633 return OFFLOAD_FAIL; 634 } 635 636 // All args are references. 637 std::vector<void *> args(arg_num); 638 std::vector<void *> ptrs(arg_num); 639 640 for (int32_t i = 0; i < arg_num; ++i) { 641 ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]); 642 args[i] = &ptrs[i]; 643 } 644 645 KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr; 646 647 int cudaThreadsPerBlock; 648 649 if (thread_limit > 0) { 650 cudaThreadsPerBlock = thread_limit; 651 DP("Setting CUDA threads per block to requested %d\n", thread_limit); 652 // Add master warp if necessary 653 if (KernelInfo->ExecutionMode == GENERIC) { 654 cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id]; 655 DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]); 656 } 657 } else { 658 cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id]; 659 DP("Setting CUDA threads per block to default %d\n", 660 DeviceInfo.NumThreads[device_id]); 661 } 662 663 if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) { 664 cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id]; 665 DP("Threads per block capped at device limit %d\n", 666 DeviceInfo.ThreadsPerBlock[device_id]); 667 } 668 669 int kernel_limit; 670 err = cuFuncGetAttribute(&kernel_limit, 671 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func); 672 if (err == CUDA_SUCCESS) { 673 if (kernel_limit < cudaThreadsPerBlock) { 674 cudaThreadsPerBlock = kernel_limit; 675 DP("Threads per block capped at kernel limit %d\n", kernel_limit); 676 } 677 } 678 679 int cudaBlocksPerGrid; 680 if (team_num <= 0) { 681 if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) { 682 if (KernelInfo->ExecutionMode == SPMD) { 683 // We have a combined construct, i.e. `target teams distribute parallel 684 // for [simd]`. We launch so many teams so that each thread will 685 // execute one iteration of the loop. 686 // round up to the nearest integer 687 cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1; 688 } else { 689 // If we reach this point, then we have a non-combined construct, i.e. 690 // `teams distribute` with a nested `parallel for` and each team is 691 // assigned one iteration of the `distribute` loop. E.g.: 692 // 693 // #pragma omp target teams distribute 694 // for(...loop_tripcount...) { 695 // #pragma omp parallel for 696 // for(...) {} 697 // } 698 // 699 // Threads within a team will execute the iterations of the `parallel` 700 // loop. 701 cudaBlocksPerGrid = loop_tripcount; 702 } 703 DP("Using %d teams due to loop trip count %" PRIu64 " and number of " 704 "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount, 705 cudaThreadsPerBlock); 706 } else { 707 cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id]; 708 DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]); 709 } 710 } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) { 711 cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id]; 712 DP("Capping number of teams to team limit %d\n", 713 DeviceInfo.BlocksPerGrid[device_id]); 714 } else { 715 cudaBlocksPerGrid = team_num; 716 DP("Using requested number of teams %d\n", team_num); 717 } 718 719 // Run on the device. 720 DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid, 721 cudaThreadsPerBlock); 722 723 err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1, 724 cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, 0, &args[0], 0); 725 if (err != CUDA_SUCCESS) { 726 DP("Device kernel launch failed!\n"); 727 CUDA_ERR_STRING(err); 728 return OFFLOAD_FAIL; 729 } 730 731 DP("Launch of entry point at " DPxMOD " successful!\n", 732 DPxPTR(tgt_entry_ptr)); 733 734 CUresult sync_err = cuCtxSynchronize(); 735 if (sync_err != CUDA_SUCCESS) { 736 DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr)); 737 CUDA_ERR_STRING(sync_err); 738 return OFFLOAD_FAIL; 739 } else { 740 DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr)); 741 } 742 743 return OFFLOAD_SUCCESS; 744 } 745 746 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, 747 void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) { 748 // use one team and the default number of threads. 749 const int32_t team_num = 1; 750 const int32_t thread_limit = 0; 751 return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, 752 tgt_offsets, arg_num, team_num, thread_limit, 0); 753 } 754 755 #ifdef __cplusplus 756 } 757 #endif 758