//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <string>
#include <vector>

#include "omptargetplugin.h"

#ifndef TARGET_NAME
#define TARGET_NAME CUDA
#endif

#ifdef OMPTARGET_DEBUG
// Runtime-selected verbosity; set from the LIBOMPTARGET_DEBUG environment
// variable in the RTLDeviceInfoTy constructor below.
static int DebugLevel = 0;

#define GETNAME2(name) #name
#define GETNAME(name) GETNAME2(name)
// Debug print helper: emits only when LIBOMPTARGET_DEBUG > 0.
#define DP(...) \
  do { \
    if (DebugLevel > 0) { \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \
    } \
  } while (false)
#else // OMPTARGET_DEBUG
#define DP(...) {}
#endif // OMPTARGET_DEBUG

#include "../../common/elf_common.c"

// Utility for retrieving and printing CUDA error string.
#ifdef CUDA_ERROR_REPORT
#define CUDA_ERR_STRING(err) \
  do { \
    const char *errStr; \
    cuGetErrorString(err, &errStr); \
    DP("CUDA error is: %s\n", errStr); \
  } while (0)
#else
#define CUDA_ERR_STRING(err) \
  {}
#endif

/// Keep entries table per device.
// One offload-entry table plus the backing storage for its entries.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

// Execution mode a kernel was compiled for; the value is read from the
// `<kernel>_exec_mode` global in the cubin (see __tgt_rtl_load_binary).
enum ExecutionModeType {
  SPMD, // constructors, destructors,
  // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};

/// Use a single entity to encode a kernel and a set of flags
struct KernelTy {
  CUfunction Func;

  // execution mode of kernel
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

/// Device environment data
/// Manually sync with the deviceRTL side for now, move to a dedicated header
/// file later.
struct omptarget_device_environmentTy {
  int32_t debug_level; // copied to the device's `omptarget_device_environment`
};

/// List that contains all the kernels.
/// FIXME: we may need this to be per device and per library.
/// NOTE: std::list is used (not vector) so pointers to elements stored in
/// entry tables stay valid as kernels are appended.
std::list<KernelTy> KernelsList;

/// Class containing all the device information.
94 class RTLDeviceInfoTy { 95 std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries; 96 97 public: 98 int NumberOfDevices; 99 std::vector<CUmodule> Modules; 100 std::vector<CUcontext> Contexts; 101 102 // Device properties 103 std::vector<int> ThreadsPerBlock; 104 std::vector<int> BlocksPerGrid; 105 std::vector<int> WarpSize; 106 107 // OpenMP properties 108 std::vector<int> NumTeams; 109 std::vector<int> NumThreads; 110 111 // OpenMP Environment properties 112 int EnvNumTeams; 113 int EnvTeamLimit; 114 115 //static int EnvNumThreads; 116 static const int HardTeamLimit = 1<<16; // 64k 117 static const int HardThreadLimit = 1024; 118 static const int DefaultNumTeams = 128; 119 static const int DefaultNumThreads = 128; 120 121 // Record entry point associated with device 122 void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) { 123 assert(device_id < (int32_t)FuncGblEntries.size() && 124 "Unexpected device id!"); 125 FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); 126 127 E.Entries.push_back(entry); 128 } 129 130 // Return true if the entry is associated with device 131 bool findOffloadEntry(int32_t device_id, void *addr) { 132 assert(device_id < (int32_t)FuncGblEntries.size() && 133 "Unexpected device id!"); 134 FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); 135 136 for (auto &it : E.Entries) { 137 if (it.addr == addr) 138 return true; 139 } 140 141 return false; 142 } 143 144 // Return the pointer to the target entries table 145 __tgt_target_table *getOffloadEntriesTable(int32_t device_id) { 146 assert(device_id < (int32_t)FuncGblEntries.size() && 147 "Unexpected device id!"); 148 FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); 149 150 int32_t size = E.Entries.size(); 151 152 // Table is empty 153 if (!size) 154 return 0; 155 156 __tgt_offload_entry *begin = &E.Entries[0]; 157 __tgt_offload_entry *end = &E.Entries[size - 1]; 158 159 // Update table info according to the entries and return the pointer 160 
E.Table.EntriesBegin = begin; 161 E.Table.EntriesEnd = ++end; 162 163 return &E.Table; 164 } 165 166 // Clear entries table for a device 167 void clearOffloadEntriesTable(int32_t device_id) { 168 assert(device_id < (int32_t)FuncGblEntries.size() && 169 "Unexpected device id!"); 170 FuncGblEntries[device_id].emplace_back(); 171 FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); 172 E.Entries.clear(); 173 E.Table.EntriesBegin = E.Table.EntriesEnd = 0; 174 } 175 176 RTLDeviceInfoTy() { 177 #ifdef OMPTARGET_DEBUG 178 if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) { 179 DebugLevel = std::stoi(envStr); 180 } 181 #endif // OMPTARGET_DEBUG 182 183 DP("Start initializing CUDA\n"); 184 185 CUresult err = cuInit(0); 186 if (err != CUDA_SUCCESS) { 187 DP("Error when initializing CUDA\n"); 188 CUDA_ERR_STRING(err); 189 return; 190 } 191 192 NumberOfDevices = 0; 193 194 err = cuDeviceGetCount(&NumberOfDevices); 195 if (err != CUDA_SUCCESS) { 196 DP("Error when getting CUDA device count\n"); 197 CUDA_ERR_STRING(err); 198 return; 199 } 200 201 if (NumberOfDevices == 0) { 202 DP("There are no devices supporting CUDA.\n"); 203 return; 204 } 205 206 FuncGblEntries.resize(NumberOfDevices); 207 Contexts.resize(NumberOfDevices); 208 ThreadsPerBlock.resize(NumberOfDevices); 209 BlocksPerGrid.resize(NumberOfDevices); 210 WarpSize.resize(NumberOfDevices); 211 NumTeams.resize(NumberOfDevices); 212 NumThreads.resize(NumberOfDevices); 213 214 // Get environment variables regarding teams 215 char *envStr = getenv("OMP_TEAM_LIMIT"); 216 if (envStr) { 217 // OMP_TEAM_LIMIT has been set 218 EnvTeamLimit = std::stoi(envStr); 219 DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit); 220 } else { 221 EnvTeamLimit = -1; 222 } 223 envStr = getenv("OMP_NUM_TEAMS"); 224 if (envStr) { 225 // OMP_NUM_TEAMS has been set 226 EnvNumTeams = std::stoi(envStr); 227 DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); 228 } else { 229 EnvNumTeams = -1; 230 } 231 } 232 233 ~RTLDeviceInfoTy() { 234 // Close modules 235 
for (auto &module : Modules) 236 if (module) { 237 CUresult err = cuModuleUnload(module); 238 if (err != CUDA_SUCCESS) { 239 DP("Error when unloading CUDA module\n"); 240 CUDA_ERR_STRING(err); 241 } 242 } 243 244 // Destroy contexts 245 for (auto &ctx : Contexts) 246 if (ctx) { 247 CUresult err = cuCtxDestroy(ctx); 248 if (err != CUDA_SUCCESS) { 249 DP("Error when destroying CUDA context\n"); 250 CUDA_ERR_STRING(err); 251 } 252 } 253 } 254 }; 255 256 static RTLDeviceInfoTy DeviceInfo; 257 258 #ifdef __cplusplus 259 extern "C" { 260 #endif 261 262 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { 263 return elf_check_machine(image, 190); // EM_CUDA = 190. 264 } 265 266 int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; } 267 268 int32_t __tgt_rtl_init_device(int32_t device_id) { 269 270 CUdevice cuDevice; 271 DP("Getting device %d\n", device_id); 272 CUresult err = cuDeviceGet(&cuDevice, device_id); 273 if (err != CUDA_SUCCESS) { 274 DP("Error when getting CUDA device with id = %d\n", device_id); 275 CUDA_ERR_STRING(err); 276 return OFFLOAD_FAIL; 277 } 278 279 // Create the context and save it to use whenever this device is selected. 280 err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC, 281 cuDevice); 282 if (err != CUDA_SUCCESS) { 283 DP("Error when creating a CUDA context\n"); 284 CUDA_ERR_STRING(err); 285 return OFFLOAD_FAIL; 286 } 287 288 // scan properties to determine number of threads/block and blocks/grid. 
289 CUdevprop Properties; 290 err = cuDeviceGetProperties(&Properties, cuDevice); 291 if (err != CUDA_SUCCESS) { 292 DP("Error getting device Properties, use defaults\n"); 293 DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams; 294 DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads; 295 DeviceInfo.WarpSize[device_id] = 32; 296 } else { 297 // Get blocks per grid 298 if (Properties.maxGridSize[0] <= RTLDeviceInfoTy::HardTeamLimit) { 299 DeviceInfo.BlocksPerGrid[device_id] = Properties.maxGridSize[0]; 300 DP("Using %d CUDA blocks per grid\n", Properties.maxGridSize[0]); 301 } else { 302 DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit; 303 DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping " 304 "at the hard limit\n", Properties.maxGridSize[0], 305 RTLDeviceInfoTy::HardTeamLimit); 306 } 307 308 // Get threads per block, exploit threads only along x axis 309 if (Properties.maxThreadsDim[0] <= RTLDeviceInfoTy::HardThreadLimit) { 310 DeviceInfo.ThreadsPerBlock[device_id] = Properties.maxThreadsDim[0]; 311 DP("Using %d CUDA threads per block\n", Properties.maxThreadsDim[0]); 312 if (Properties.maxThreadsDim[0] < Properties.maxThreadsPerBlock) { 313 DP("(fewer than max per block along all xyz dims %d)\n", 314 Properties.maxThreadsPerBlock); 315 } 316 } else { 317 DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit; 318 DP("Max CUDA threads per block %d exceeds the hard thread limit %d, " 319 "capping at the hard limit\n", Properties.maxThreadsDim[0], 320 RTLDeviceInfoTy::HardThreadLimit); 321 } 322 323 // According to the documentation, SIMDWidth is "Warp size in threads". 
324 DeviceInfo.WarpSize[device_id] = Properties.SIMDWidth; 325 } 326 327 // Adjust teams to the env variables 328 if (DeviceInfo.EnvTeamLimit > 0 && 329 DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) { 330 DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit; 331 DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n", 332 DeviceInfo.EnvTeamLimit); 333 } 334 335 DP("Max number of CUDA blocks %d, threads %d & warp size %d\n", 336 DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id], 337 DeviceInfo.WarpSize[device_id]); 338 339 // Set default number of teams 340 if (DeviceInfo.EnvNumTeams > 0) { 341 DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams; 342 DP("Default number of teams set according to environment %d\n", 343 DeviceInfo.EnvNumTeams); 344 } else { 345 DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams; 346 DP("Default number of teams set according to library's default %d\n", 347 RTLDeviceInfoTy::DefaultNumTeams); 348 } 349 if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) { 350 DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id]; 351 DP("Default number of teams exceeds device limit, capping at %d\n", 352 DeviceInfo.BlocksPerGrid[device_id]); 353 } 354 355 // Set default number of threads 356 DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads; 357 DP("Default number of threads set according to library's default %d\n", 358 RTLDeviceInfoTy::DefaultNumThreads); 359 if (DeviceInfo.NumThreads[device_id] > 360 DeviceInfo.ThreadsPerBlock[device_id]) { 361 DeviceInfo.NumTeams[device_id] = DeviceInfo.ThreadsPerBlock[device_id]; 362 DP("Default number of threads exceeds device limit, capping at %d\n", 363 DeviceInfo.ThreadsPerBlock[device_id]); 364 } 365 366 return OFFLOAD_SUCCESS; 367 } 368 369 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, 370 __tgt_device_image *image) { 371 372 // Set the context we are using. 
373 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 374 if (err != CUDA_SUCCESS) { 375 DP("Error when setting a CUDA context for device %d\n", device_id); 376 CUDA_ERR_STRING(err); 377 return NULL; 378 } 379 380 // Clear the offload table as we are going to create a new one. 381 DeviceInfo.clearOffloadEntriesTable(device_id); 382 383 // Create the module and extract the function pointers. 384 385 CUmodule cumod; 386 DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart)); 387 err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL); 388 if (err != CUDA_SUCCESS) { 389 DP("Error when loading CUDA module\n"); 390 CUDA_ERR_STRING(err); 391 return NULL; 392 } 393 394 DP("CUDA module successfully loaded!\n"); 395 DeviceInfo.Modules.push_back(cumod); 396 397 // Find the symbols in the module by name. 398 __tgt_offload_entry *HostBegin = image->EntriesBegin; 399 __tgt_offload_entry *HostEnd = image->EntriesEnd; 400 401 for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) { 402 403 if (!e->addr) { 404 // We return NULL when something like this happens, the host should have 405 // always something in the address to uniquely identify the target region. 
406 DP("Invalid binary: host entry '<null>' (size = %zd)...\n", e->size); 407 408 return NULL; 409 } 410 411 if (e->size) { 412 __tgt_offload_entry entry = *e; 413 414 CUdeviceptr cuptr; 415 size_t cusize; 416 err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name); 417 418 if (err != CUDA_SUCCESS) { 419 DP("Loading global '%s' (Failed)\n", e->name); 420 CUDA_ERR_STRING(err); 421 return NULL; 422 } 423 424 if (cusize != e->size) { 425 DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name, 426 cusize, e->size); 427 CUDA_ERR_STRING(err); 428 return NULL; 429 } 430 431 DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", 432 DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr)); 433 entry.addr = (void *)cuptr; 434 435 DeviceInfo.addOffloadEntry(device_id, entry); 436 437 continue; 438 } 439 440 CUfunction fun; 441 err = cuModuleGetFunction(&fun, cumod, e->name); 442 443 if (err != CUDA_SUCCESS) { 444 DP("Loading '%s' (Failed)\n", e->name); 445 CUDA_ERR_STRING(err); 446 return NULL; 447 } 448 449 DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", 450 DPxPTR(e - HostBegin), e->name, DPxPTR(fun)); 451 452 // default value GENERIC (in case symbol is missing from cubin file) 453 int8_t ExecModeVal = ExecutionModeType::GENERIC; 454 std::string ExecModeNameStr (e->name); 455 ExecModeNameStr += "_exec_mode"; 456 const char *ExecModeName = ExecModeNameStr.c_str(); 457 458 CUdeviceptr ExecModePtr; 459 size_t cusize; 460 err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName); 461 if (err == CUDA_SUCCESS) { 462 if ((size_t)cusize != sizeof(int8_t)) { 463 DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", 464 ExecModeName, cusize, sizeof(int8_t)); 465 CUDA_ERR_STRING(err); 466 return NULL; 467 } 468 469 err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize); 470 if (err != CUDA_SUCCESS) { 471 DP("Error when copying data from device to host. 
Pointers: " 472 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", 473 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize); 474 CUDA_ERR_STRING(err); 475 return NULL; 476 } 477 478 if (ExecModeVal < 0 || ExecModeVal > 1) { 479 DP("Error wrong exec_mode value specified in cubin file: %d\n", 480 ExecModeVal); 481 return NULL; 482 } 483 } else { 484 DP("Loading global exec_mode '%s' - symbol missing, using default value " 485 "GENERIC (1)\n", ExecModeName); 486 CUDA_ERR_STRING(err); 487 } 488 489 KernelsList.push_back(KernelTy(fun, ExecModeVal)); 490 491 __tgt_offload_entry entry = *e; 492 entry.addr = (void *)&KernelsList.back(); 493 DeviceInfo.addOffloadEntry(device_id, entry); 494 } 495 496 // send device environment data to the device 497 { 498 omptarget_device_environmentTy device_env; 499 500 device_env.debug_level = 0; 501 502 #ifdef OMPTARGET_DEBUG 503 if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) { 504 device_env.debug_level = std::stoi(envStr); 505 } 506 #endif 507 508 const char * device_env_Name="omptarget_device_environment"; 509 CUdeviceptr device_env_Ptr; 510 size_t cusize; 511 512 err = cuModuleGetGlobal(&device_env_Ptr, &cusize, cumod, device_env_Name); 513 514 if (err == CUDA_SUCCESS) { 515 if ((size_t)cusize != sizeof(device_env)) { 516 DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n", 517 device_env_Name, cusize, sizeof(int32_t)); 518 CUDA_ERR_STRING(err); 519 return NULL; 520 } 521 522 err = cuMemcpyHtoD(device_env_Ptr, &device_env, cusize); 523 if (err != CUDA_SUCCESS) { 524 DP("Error when copying data from host to device. 
Pointers: " 525 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n", 526 DPxPTR(&device_env), DPxPTR(device_env_Ptr), cusize); 527 CUDA_ERR_STRING(err); 528 return NULL; 529 } 530 531 DP("Sending global device environment data %zu bytes\n", (size_t)cusize); 532 } else { 533 DP("Finding global device environment '%s' - symbol missing.\n", device_env_Name); 534 DP("Continue, considering this is a device RTL which does not accept envrionment setting.\n"); 535 } 536 } 537 538 return DeviceInfo.getOffloadEntriesTable(device_id); 539 } 540 541 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) { 542 if (size == 0) { 543 return NULL; 544 } 545 546 // Set the context we are using. 547 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 548 if (err != CUDA_SUCCESS) { 549 DP("Error while trying to set CUDA current context\n"); 550 CUDA_ERR_STRING(err); 551 return NULL; 552 } 553 554 CUdeviceptr ptr; 555 err = cuMemAlloc(&ptr, size); 556 if (err != CUDA_SUCCESS) { 557 DP("Error while trying to allocate %d\n", err); 558 CUDA_ERR_STRING(err); 559 return NULL; 560 } 561 562 void *vptr = (void *)ptr; 563 return vptr; 564 } 565 566 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, 567 int64_t size) { 568 // Set the context we are using. 569 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 570 if (err != CUDA_SUCCESS) { 571 DP("Error when setting CUDA context\n"); 572 CUDA_ERR_STRING(err); 573 return OFFLOAD_FAIL; 574 } 575 576 err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size); 577 if (err != CUDA_SUCCESS) { 578 DP("Error when copying data from host to device. 
Pointers: host = " DPxMOD 579 ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr), 580 DPxPTR(tgt_ptr), size); 581 CUDA_ERR_STRING(err); 582 return OFFLOAD_FAIL; 583 } 584 return OFFLOAD_SUCCESS; 585 } 586 587 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, 588 int64_t size) { 589 // Set the context we are using. 590 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 591 if (err != CUDA_SUCCESS) { 592 DP("Error when setting CUDA context\n"); 593 CUDA_ERR_STRING(err); 594 return OFFLOAD_FAIL; 595 } 596 597 err = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size); 598 if (err != CUDA_SUCCESS) { 599 DP("Error when copying data from device to host. Pointers: host = " DPxMOD 600 ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr), 601 DPxPTR(tgt_ptr), size); 602 CUDA_ERR_STRING(err); 603 return OFFLOAD_FAIL; 604 } 605 return OFFLOAD_SUCCESS; 606 } 607 608 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { 609 // Set the context we are using. 610 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 611 if (err != CUDA_SUCCESS) { 612 DP("Error when setting CUDA context\n"); 613 CUDA_ERR_STRING(err); 614 return OFFLOAD_FAIL; 615 } 616 617 err = cuMemFree((CUdeviceptr)tgt_ptr); 618 if (err != CUDA_SUCCESS) { 619 DP("Error when freeing CUDA memory\n"); 620 CUDA_ERR_STRING(err); 621 return OFFLOAD_FAIL; 622 } 623 return OFFLOAD_SUCCESS; 624 } 625 626 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, 627 void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, 628 int32_t thread_limit, uint64_t loop_tripcount) { 629 // Set the context we are using. 630 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 631 if (err != CUDA_SUCCESS) { 632 DP("Error when setting CUDA context\n"); 633 CUDA_ERR_STRING(err); 634 return OFFLOAD_FAIL; 635 } 636 637 // All args are references. 
638 std::vector<void *> args(arg_num); 639 std::vector<void *> ptrs(arg_num); 640 641 for (int32_t i = 0; i < arg_num; ++i) { 642 ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]); 643 args[i] = &ptrs[i]; 644 } 645 646 KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr; 647 648 int cudaThreadsPerBlock; 649 650 if (thread_limit > 0) { 651 cudaThreadsPerBlock = thread_limit; 652 DP("Setting CUDA threads per block to requested %d\n", thread_limit); 653 // Add master warp if necessary 654 if (KernelInfo->ExecutionMode == GENERIC) { 655 cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id]; 656 DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]); 657 } 658 } else { 659 cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id]; 660 DP("Setting CUDA threads per block to default %d\n", 661 DeviceInfo.NumThreads[device_id]); 662 } 663 664 if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) { 665 cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id]; 666 DP("Threads per block capped at device limit %d\n", 667 DeviceInfo.ThreadsPerBlock[device_id]); 668 } 669 670 int kernel_limit; 671 err = cuFuncGetAttribute(&kernel_limit, 672 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func); 673 if (err == CUDA_SUCCESS) { 674 if (kernel_limit < cudaThreadsPerBlock) { 675 cudaThreadsPerBlock = kernel_limit; 676 DP("Threads per block capped at kernel limit %d\n", kernel_limit); 677 } 678 } 679 680 int cudaBlocksPerGrid; 681 if (team_num <= 0) { 682 if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) { 683 if (KernelInfo->ExecutionMode == SPMD) { 684 // We have a combined construct, i.e. `target teams distribute parallel 685 // for [simd]`. We launch so many teams so that each thread will 686 // execute one iteration of the loop. 687 // round up to the nearest integer 688 cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1; 689 } else { 690 // If we reach this point, then we have a non-combined construct, i.e. 
691 // `teams distribute` with a nested `parallel for` and each team is 692 // assigned one iteration of the `distribute` loop. E.g.: 693 // 694 // #pragma omp target teams distribute 695 // for(...loop_tripcount...) { 696 // #pragma omp parallel for 697 // for(...) {} 698 // } 699 // 700 // Threads within a team will execute the iterations of the `parallel` 701 // loop. 702 cudaBlocksPerGrid = loop_tripcount; 703 } 704 DP("Using %d teams due to loop trip count %" PRIu64 " and number of " 705 "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount, 706 cudaThreadsPerBlock); 707 } else { 708 cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id]; 709 DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]); 710 } 711 } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) { 712 cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id]; 713 DP("Capping number of teams to team limit %d\n", 714 DeviceInfo.BlocksPerGrid[device_id]); 715 } else { 716 cudaBlocksPerGrid = team_num; 717 DP("Using requested number of teams %d\n", team_num); 718 } 719 720 // Run on the device. 
721 DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid, 722 cudaThreadsPerBlock); 723 724 err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1, 725 cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, 0, &args[0], 0); 726 if (err != CUDA_SUCCESS) { 727 DP("Device kernel launch failed!\n"); 728 CUDA_ERR_STRING(err); 729 return OFFLOAD_FAIL; 730 } 731 732 DP("Launch of entry point at " DPxMOD " successful!\n", 733 DPxPTR(tgt_entry_ptr)); 734 735 CUresult sync_err = cuCtxSynchronize(); 736 if (sync_err != CUDA_SUCCESS) { 737 DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr)); 738 CUDA_ERR_STRING(sync_err); 739 return OFFLOAD_FAIL; 740 } else { 741 DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr)); 742 } 743 744 return OFFLOAD_SUCCESS; 745 } 746 747 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, 748 void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) { 749 // use one team and the default number of threads. 750 const int32_t team_num = 1; 751 const int32_t thread_limit = 0; 752 return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, 753 tgt_offsets, arg_num, team_num, thread_limit, 0); 754 } 755 756 #ifdef __cplusplus 757 } 758 #endif 759