1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is dual licensed under the MIT and the University of Illinois Open 6 // Source Licenses. See LICENSE.txt for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // RTL for CUDA machine 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include <cassert> 15 #include <cstddef> 16 #include <cuda.h> 17 #include <cuda_runtime_api.h> 18 #include <list> 19 #include <string> 20 #include <vector> 21 22 #include "omptargetplugin.h" 23 24 #ifndef TARGET_NAME 25 #define TARGET_NAME CUDA 26 #endif 27 28 #ifdef OMPTARGET_DEBUG 29 static int DebugLevel = 0; 30 31 #define GETNAME2(name) #name 32 #define GETNAME(name) GETNAME2(name) 33 #define DP(...) \ 34 do { \ 35 if (DebugLevel > 0) { \ 36 DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \ 37 } \ 38 } while (false) 39 #else // OMPTARGET_DEBUG 40 #define DP(...) {} 41 #endif // OMPTARGET_DEBUG 42 43 #include "../../common/elf_common.c" 44 45 // Utility for retrieving and printing CUDA error string. 46 #ifdef CUDA_ERROR_REPORT 47 #define CUDA_ERR_STRING(err) \ 48 do { \ 49 const char *errStr; \ 50 cuGetErrorString(err, &errStr); \ 51 DP("CUDA error is: %s\n", errStr); \ 52 } while (0) 53 #else 54 #define CUDA_ERR_STRING(err) \ 55 {} 56 #endif 57 58 /// Keep entries table per device. 59 struct FuncOrGblEntryTy { 60 __tgt_target_table Table; 61 std::vector<__tgt_offload_entry> Entries; 62 }; 63 64 enum ExecutionModeType { 65 SPMD, // constructors, destructors, 66 // combined constructs (`teams distribute parallel for [simd]`) 67 GENERIC, // everything else 68 NONE 69 }; 70 71 /// Use a single entity to encode a kernel and a set of flags 72 struct KernelTy { 73 CUfunction Func; 74 75 // execution mode of kernel 76 // 0 - SPMD mode (without master warp) 77 // 1 - Generic mode (with master warp) 78 int8_t ExecutionMode; 79 80 KernelTy(CUfunction _Func, int8_t _ExecutionMode) 81 : Func(_Func), ExecutionMode(_ExecutionMode) {} 82 }; 83 84 /// List that contains all the kernels. 85 /// FIXME: we may need this to be per device and per library. 86 std::list<KernelTy> KernelsList; 87 88 /// Class containing all the device information. 89 class RTLDeviceInfoTy { 90 std::vector<FuncOrGblEntryTy> FuncGblEntries; 91 92 public: 93 int NumberOfDevices; 94 std::vector<CUmodule> Modules; 95 std::vector<CUcontext> Contexts; 96 97 // Device properties 98 std::vector<int> ThreadsPerBlock; 99 std::vector<int> BlocksPerGrid; 100 std::vector<int> WarpSize; 101 102 // OpenMP properties 103 std::vector<int> NumTeams; 104 std::vector<int> NumThreads; 105 106 // OpenMP Environment properties 107 int EnvNumTeams; 108 int EnvTeamLimit; 109 110 //static int EnvNumThreads; 111 static const int HardTeamLimit = 1<<16; // 64k 112 static const int HardThreadLimit = 1024; 113 static const int DefaultNumTeams = 128; 114 static const int DefaultNumThreads = 128; 115 116 // Record entry point associated with device 117 void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) { 118 assert(device_id < (int32_t)FuncGblEntries.size() && 119 "Unexpected device id!"); 120 FuncOrGblEntryTy &E = FuncGblEntries[device_id]; 121 122 E.Entries.push_back(entry); 123 } 124 125 // Return true if the entry is associated with device 126 bool findOffloadEntry(int32_t device_id, void *addr) { 127 assert(device_id < (int32_t)FuncGblEntries.size() && 128 "Unexpected device id!"); 129 FuncOrGblEntryTy &E = FuncGblEntries[device_id]; 130 131 for (auto &it : E.Entries) { 132 if (it.addr == addr) 133 return true; 134 } 135 136 return false; 137 } 138 139 // Return the pointer to the target entries table 140 __tgt_target_table *getOffloadEntriesTable(int32_t device_id) { 141 assert(device_id < (int32_t)FuncGblEntries.size() && 142 "Unexpected device id!"); 143 FuncOrGblEntryTy &E = FuncGblEntries[device_id]; 144 145 int32_t size = E.Entries.size(); 146 147 // Table is empty 148 if (!size) 149 return 0; 150 151 __tgt_offload_entry *begin = &E.Entries[0]; 152 __tgt_offload_entry *end = &E.Entries[size - 1]; 153 154 // Update table info according to the entries and return the pointer 155 E.Table.EntriesBegin = begin; 156 E.Table.EntriesEnd = ++end; 157 158 return &E.Table; 159 } 160 161 // Clear entries table for a device 162 void clearOffloadEntriesTable(int32_t device_id) { 163 assert(device_id < (int32_t)FuncGblEntries.size() && 164 "Unexpected device id!"); 165 FuncOrGblEntryTy &E = FuncGblEntries[device_id]; 166 E.Entries.clear(); 167 E.Table.EntriesBegin = E.Table.EntriesEnd = 0; 168 } 169 170 RTLDeviceInfoTy() { 171 #ifdef OMPTARGET_DEBUG 172 if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) { 173 DebugLevel = std::stoi(envStr); 174 } 175 #endif // OMPTARGET_DEBUG 176 177 DP("Start initializing CUDA\n"); 178 179 CUresult err = cuInit(0); 180 if (err != CUDA_SUCCESS) { 181 DP("Error when initializing CUDA\n"); 182 CUDA_ERR_STRING(err); 183 return; 184 } 185 186 NumberOfDevices = 0; 187 188 err = cuDeviceGetCount(&NumberOfDevices); 189 if (err != CUDA_SUCCESS) { 190 DP("Error when getting CUDA device count\n"); 191 CUDA_ERR_STRING(err); 192 return; 193 } 194 195 if (NumberOfDevices == 0) { 196 DP("There are no devices supporting CUDA.\n"); 197 return; 198 } 199 200 FuncGblEntries.resize(NumberOfDevices); 201 Contexts.resize(NumberOfDevices); 202 ThreadsPerBlock.resize(NumberOfDevices); 203 BlocksPerGrid.resize(NumberOfDevices); 204 WarpSize.resize(NumberOfDevices); 205 NumTeams.resize(NumberOfDevices); 206 NumThreads.resize(NumberOfDevices); 207 208 // Get environment variables regarding teams 209 char *envStr = getenv("OMP_TEAM_LIMIT"); 210 if (envStr) { 211 // OMP_TEAM_LIMIT has been set 212 EnvTeamLimit = std::stoi(envStr); 213 DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit); 214 } else { 215 EnvTeamLimit = -1; 216 } 217 envStr = getenv("OMP_NUM_TEAMS"); 218 if (envStr) { 219 // OMP_NUM_TEAMS has been set 220 EnvNumTeams = std::stoi(envStr); 221 DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); 222 } else { 223 EnvNumTeams = -1; 224 } 225 } 226 227 ~RTLDeviceInfoTy() { 228 // Close modules 229 for (auto &module : Modules) 230 if (module) { 231 CUresult err = cuModuleUnload(module); 232 if (err != CUDA_SUCCESS) { 233 DP("Error when unloading CUDA module\n"); 234 CUDA_ERR_STRING(err); 235 } 236 } 237 238 // Destroy contexts 239 for (auto &ctx : Contexts) 240 if (ctx) { 241 CUresult err = cuCtxDestroy(ctx); 242 if (err != CUDA_SUCCESS) { 243 DP("Error when destroying CUDA context\n"); 244 CUDA_ERR_STRING(err); 245 } 246 } 247 } 248 }; 249 250 static RTLDeviceInfoTy DeviceInfo; 251 252 #ifdef __cplusplus 253 extern "C" { 254 #endif 255 256 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { 257 return elf_check_machine(image, 190); // EM_CUDA = 190. 258 } 259 260 int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; } 261 262 int32_t __tgt_rtl_init_device(int32_t device_id) { 263 264 CUdevice cuDevice; 265 DP("Getting device %d\n", device_id); 266 CUresult err = cuDeviceGet(&cuDevice, device_id); 267 if (err != CUDA_SUCCESS) { 268 DP("Error when getting CUDA device with id = %d\n", device_id); 269 CUDA_ERR_STRING(err); 270 return OFFLOAD_FAIL; 271 } 272 273 // Create the context and save it to use whenever this device is selected. 274 err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC, 275 cuDevice); 276 if (err != CUDA_SUCCESS) { 277 DP("Error when creating a CUDA context\n"); 278 CUDA_ERR_STRING(err); 279 return OFFLOAD_FAIL; 280 } 281 282 // scan properties to determine number of threads/block and blocks/grid. 283 struct cudaDeviceProp Properties; 284 cudaError_t error = cudaGetDeviceProperties(&Properties, device_id); 285 if (error != cudaSuccess) { 286 DP("Error getting device Properties, use defaults\n"); 287 DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams; 288 DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads; 289 DeviceInfo.WarpSize[device_id] = 32; 290 } else { 291 // Get blocks per grid 292 if (Properties.maxGridSize[0] <= RTLDeviceInfoTy::HardTeamLimit) { 293 DeviceInfo.BlocksPerGrid[device_id] = Properties.maxGridSize[0]; 294 DP("Using %d CUDA blocks per grid\n", Properties.maxGridSize[0]); 295 } else { 296 DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit; 297 DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping " 298 "at the hard limit\n", Properties.maxGridSize[0], 299 RTLDeviceInfoTy::HardTeamLimit); 300 } 301 302 // Get threads per block, exploit threads only along x axis 303 if (Properties.maxThreadsDim[0] <= RTLDeviceInfoTy::HardThreadLimit) { 304 DeviceInfo.ThreadsPerBlock[device_id] = Properties.maxThreadsDim[0]; 305 DP("Using %d CUDA threads per block\n", Properties.maxThreadsDim[0]); 306 if (Properties.maxThreadsDim[0] < Properties.maxThreadsPerBlock) { 307 DP("(fewer than max per block along all xyz dims %d)\n", 308 Properties.maxThreadsPerBlock); 309 } 310 } else { 311 DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit; 312 DP("Max CUDA threads per block %d exceeds the hard thread limit %d, " 313 "capping at the hard limit\n", Properties.maxThreadsDim[0], 314 RTLDeviceInfoTy::HardThreadLimit); 315 } 316 317 // Get warp size 318 DeviceInfo.WarpSize[device_id] = Properties.warpSize; 319 } 320 321 // Adjust teams to the env variables 322 if (DeviceInfo.EnvTeamLimit > 0 && 323 DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) { 324 DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit; 325 DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n", 326 DeviceInfo.EnvTeamLimit); 327 } 328 329 DP("Max number of CUDA blocks %d, threads %d & warp size %d\n", 330 DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id], 331 DeviceInfo.WarpSize[device_id]); 332 333 // Set default number of teams 334 if (DeviceInfo.EnvNumTeams > 0) { 335 DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams; 336 DP("Default number of teams set according to environment %d\n", 337 DeviceInfo.EnvNumTeams); 338 } else { 339 DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams; 340 DP("Default number of teams set according to library's default %d\n", 341 RTLDeviceInfoTy::DefaultNumTeams); 342 } 343 if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) { 344 DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id]; 345 DP("Default number of teams exceeds device limit, capping at %d\n", 346 DeviceInfo.BlocksPerGrid[device_id]); 347 } 348 349 // Set default number of threads 350 DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads; 351 DP("Default number of threads set according to library's default %d\n", 352 RTLDeviceInfoTy::DefaultNumThreads); 353 if (DeviceInfo.NumThreads[device_id] > 354 DeviceInfo.ThreadsPerBlock[device_id]) { 355 DeviceInfo.NumTeams[device_id] = DeviceInfo.ThreadsPerBlock[device_id]; 356 DP("Default number of threads exceeds device limit, capping at %d\n", 357 DeviceInfo.ThreadsPerBlock[device_id]); 358 } 359 360 return OFFLOAD_SUCCESS; 361 } 362 363 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, 364 __tgt_device_image *image) { 365 366 // Set the context we are using. 367 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 368 if (err != CUDA_SUCCESS) { 369 DP("Error when setting a CUDA context for device %d\n", device_id); 370 CUDA_ERR_STRING(err); 371 return NULL; 372 } 373 374 // Clear the offload table as we are going to create a new one. 375 DeviceInfo.clearOffloadEntriesTable(device_id); 376 377 // Create the module and extract the function pointers. 378 379 CUmodule cumod; 380 DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart)); 381 err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL); 382 if (err != CUDA_SUCCESS) { 383 DP("Error when loading CUDA module\n"); 384 CUDA_ERR_STRING(err); 385 return NULL; 386 } 387 388 DP("CUDA module successfully loaded!\n"); 389 DeviceInfo.Modules.push_back(cumod); 390 391 // Find the symbols in the module by name. 392 __tgt_offload_entry *HostBegin = image->EntriesBegin; 393 __tgt_offload_entry *HostEnd = image->EntriesEnd; 394 395 for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) { 396 397 if (!e->addr) { 398 // We return NULL when something like this happens, the host should have 399 // always something in the address to uniquely identify the target region. 400 DP("Invalid binary: host entry '<null>' (size = %zd)...\n", e->size); 401 402 return NULL; 403 } 404 405 if (e->size) { 406 __tgt_offload_entry entry = *e; 407 408 CUdeviceptr cuptr; 409 size_t cusize; 410 err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name); 411 412 if (err != CUDA_SUCCESS) { 413 DP("Loading global '%s' (Failed)\n", e->name); 414 CUDA_ERR_STRING(err); 415 return NULL; 416 } 417 418 if (cusize != e->size) { 419 DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name, 420 cusize, e->size); 421 CUDA_ERR_STRING(err); 422 return NULL; 423 } 424 425 DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", 426 DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr)); 427 entry.addr = (void *)cuptr; 428 429 DeviceInfo.addOffloadEntry(device_id, entry); 430 431 continue; 432 } 433 434 CUfunction fun; 435 err = cuModuleGetFunction(&fun, cumod, e->name); 436 437 if (err != CUDA_SUCCESS) { 438 DP("Loading '%s' (Failed)\n", e->name); 439 CUDA_ERR_STRING(err); 440 return NULL; 441 } 442 443 DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", 444 DPxPTR(e - HostBegin), e->name, DPxPTR(fun)); 445 446 // default value GENERIC (in case symbol is missing from cubin file) 447 int8_t ExecModeVal = ExecutionModeType::GENERIC; 448 std::string ExecModeNameStr (e->name); 449 ExecModeNameStr += "_exec_mode"; 450 const char *ExecModeName = ExecModeNameStr.c_str(); 451 452 CUdeviceptr ExecModePtr; 453 size_t cusize; 454 err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName); 455 if (err == CUDA_SUCCESS) { 456 if ((size_t)cusize != sizeof(int8_t)) { 457 DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", 458 ExecModeName, cusize, sizeof(int8_t)); 459 CUDA_ERR_STRING(err); 460 return NULL; 461 } 462 463 err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize); 464 if (err != CUDA_SUCCESS) { 465 DP("Error when copying data from device to host. Pointers: " 466 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", 467 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize); 468 CUDA_ERR_STRING(err); 469 return NULL; 470 } 471 472 if (ExecModeVal < 0 || ExecModeVal > 1) { 473 DP("Error wrong exec_mode value specified in cubin file: %d\n", 474 ExecModeVal); 475 return NULL; 476 } 477 } else { 478 DP("Loading global exec_mode '%s' - symbol missing, using default value " 479 "GENERIC (1)\n", ExecModeName); 480 CUDA_ERR_STRING(err); 481 } 482 483 KernelsList.push_back(KernelTy(fun, ExecModeVal)); 484 485 __tgt_offload_entry entry = *e; 486 entry.addr = (void *)&KernelsList.back(); 487 DeviceInfo.addOffloadEntry(device_id, entry); 488 } 489 490 return DeviceInfo.getOffloadEntriesTable(device_id); 491 } 492 493 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) { 494 if (size == 0) { 495 return NULL; 496 } 497 498 // Set the context we are using. 499 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 500 if (err != CUDA_SUCCESS) { 501 DP("Error while trying to set CUDA current context\n"); 502 CUDA_ERR_STRING(err); 503 return NULL; 504 } 505 506 CUdeviceptr ptr; 507 err = cuMemAlloc(&ptr, size); 508 if (err != CUDA_SUCCESS) { 509 DP("Error while trying to allocate %d\n", err); 510 CUDA_ERR_STRING(err); 511 return NULL; 512 } 513 514 void *vptr = (void *)ptr; 515 return vptr; 516 } 517 518 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, 519 int64_t size) { 520 // Set the context we are using. 521 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 522 if (err != CUDA_SUCCESS) { 523 DP("Error when setting CUDA context\n"); 524 CUDA_ERR_STRING(err); 525 return OFFLOAD_FAIL; 526 } 527 528 err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size); 529 if (err != CUDA_SUCCESS) { 530 DP("Error when copying data from host to device. Pointers: host = " DPxMOD 531 ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr), 532 DPxPTR(tgt_ptr), size); 533 CUDA_ERR_STRING(err); 534 return OFFLOAD_FAIL; 535 } 536 return OFFLOAD_SUCCESS; 537 } 538 539 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, 540 int64_t size) { 541 // Set the context we are using. 542 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 543 if (err != CUDA_SUCCESS) { 544 DP("Error when setting CUDA context\n"); 545 CUDA_ERR_STRING(err); 546 return OFFLOAD_FAIL; 547 } 548 549 err = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size); 550 if (err != CUDA_SUCCESS) { 551 DP("Error when copying data from device to host. Pointers: host = " DPxMOD 552 ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr), 553 DPxPTR(tgt_ptr), size); 554 CUDA_ERR_STRING(err); 555 return OFFLOAD_FAIL; 556 } 557 return OFFLOAD_SUCCESS; 558 } 559 560 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { 561 // Set the context we are using. 562 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 563 if (err != CUDA_SUCCESS) { 564 DP("Error when setting CUDA context\n"); 565 CUDA_ERR_STRING(err); 566 return OFFLOAD_FAIL; 567 } 568 569 err = cuMemFree((CUdeviceptr)tgt_ptr); 570 if (err != CUDA_SUCCESS) { 571 DP("Error when freeing CUDA memory\n"); 572 CUDA_ERR_STRING(err); 573 return OFFLOAD_FAIL; 574 } 575 return OFFLOAD_SUCCESS; 576 } 577 578 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, 579 void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, 580 int32_t thread_limit, uint64_t loop_tripcount) { 581 // Set the context we are using. 582 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 583 if (err != CUDA_SUCCESS) { 584 DP("Error when setting CUDA context\n"); 585 CUDA_ERR_STRING(err); 586 return OFFLOAD_FAIL; 587 } 588 589 // All args are references. 590 std::vector<void *> args(arg_num); 591 std::vector<void *> ptrs(arg_num); 592 593 for (int32_t i = 0; i < arg_num; ++i) { 594 ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]); 595 args[i] = &ptrs[i]; 596 } 597 598 KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr; 599 600 int cudaThreadsPerBlock; 601 602 if (thread_limit > 0) { 603 cudaThreadsPerBlock = thread_limit; 604 DP("Setting CUDA threads per block to requested %d\n", thread_limit); 605 // Add master warp if necessary 606 if (KernelInfo->ExecutionMode == GENERIC) { 607 cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id]; 608 DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]); 609 } 610 } else { 611 cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id]; 612 DP("Setting CUDA threads per block to default %d\n", 613 DeviceInfo.NumThreads[device_id]); 614 } 615 616 if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) { 617 cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id]; 618 DP("Threads per block capped at device limit %d\n", 619 DeviceInfo.ThreadsPerBlock[device_id]); 620 } 621 622 int kernel_limit; 623 err = cuFuncGetAttribute(&kernel_limit, 624 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func); 625 if (err == CUDA_SUCCESS) { 626 if (kernel_limit < cudaThreadsPerBlock) { 627 cudaThreadsPerBlock = kernel_limit; 628 DP("Threads per block capped at kernel limit %d\n", kernel_limit); 629 } 630 } 631 632 int cudaBlocksPerGrid; 633 if (team_num <= 0) { 634 if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) { 635 if (KernelInfo->ExecutionMode == SPMD) { 636 // We have a combined construct, i.e. `target teams distribute parallel 637 // for [simd]`. We launch so many teams so that each thread will 638 // execute one iteration of the loop. 639 // round up to the nearest integer 640 cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1; 641 } else { 642 // If we reach this point, then we have a non-combined construct, i.e. 643 // `teams distribute` with a nested `parallel for` and each team is 644 // assigned one iteration of the `distribute` loop. E.g.: 645 // 646 // #pragma omp target teams distribute 647 // for(...loop_tripcount...) { 648 // #pragma omp parallel for 649 // for(...) {} 650 // } 651 // 652 // Threads within a team will execute the iterations of the `parallel` 653 // loop. 654 cudaBlocksPerGrid = loop_tripcount; 655 } 656 DP("Using %d teams due to loop trip count %" PRIu64 " and number of " 657 "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount, 658 cudaThreadsPerBlock); 659 } else { 660 cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id]; 661 DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]); 662 } 663 } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) { 664 cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id]; 665 DP("Capping number of teams to team limit %d\n", 666 DeviceInfo.BlocksPerGrid[device_id]); 667 } else { 668 cudaBlocksPerGrid = team_num; 669 DP("Using requested number of teams %d\n", team_num); 670 } 671 672 // Run on the device. 673 DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid, 674 cudaThreadsPerBlock); 675 676 err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1, 677 cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, 0, &args[0], 0); 678 if (err != CUDA_SUCCESS) { 679 DP("Device kernel launch failed!\n"); 680 CUDA_ERR_STRING(err); 681 assert(err == CUDA_SUCCESS && "Unable to launch target execution!"); 682 return OFFLOAD_FAIL; 683 } 684 685 DP("Launch of entry point at " DPxMOD " successful!\n", 686 DPxPTR(tgt_entry_ptr)); 687 688 cudaError_t sync_error = cudaDeviceSynchronize(); 689 if (sync_error != cudaSuccess) { 690 DP("Kernel execution error at " DPxMOD ", %s.\n", DPxPTR(tgt_entry_ptr), 691 cudaGetErrorString(sync_error)); 692 return OFFLOAD_FAIL; 693 } else { 694 DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr)); 695 } 696 697 return OFFLOAD_SUCCESS; 698 } 699 700 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, 701 void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) { 702 // use one team and the default number of threads. 703 const int32_t team_num = 1; 704 const int32_t thread_limit = 0; 705 return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, 706 tgt_offsets, arg_num, team_num, thread_limit, 0); 707 } 708 709 #ifdef __cplusplus 710 } 711 #endif 712