//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cinttypes>
#include <cstddef>
#include <cstdlib>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <list>
#include <string>
#include <vector>

#include "omptarget.h"

#ifndef TARGET_NAME
#define TARGET_NAME CUDA
#endif

#define GETNAME2(name) #name
#define GETNAME(name) GETNAME2(name)
#define DP(...) DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__)

#include "../../common/elf_common.c"

// Utility for retrieving and printing CUDA error string.
#ifdef CUDA_ERROR_REPORT
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    const char *errStr;                                                        \
    cuGetErrorString(err, &errStr);                                            \
    DP("CUDA error is: %s\n", errStr);                                         \
  } while (0)
#else
#define CUDA_ERR_STRING(err)                                                   \
  {}
#endif

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

enum ExecutionModeType {
  SPMD,
  GENERIC,
  NONE
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // Execution mode of the kernel:
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

/// List that contains all the kernels.
/// FIXME: we may need this to be per device and per library.
std::list<KernelTy> KernelsList;

/// Class containing all the device information.
class RTLDeviceInfoTy {
  std::vector<FuncOrGblEntryTy> FuncGblEntries;

public:
  int NumberOfDevices;
  std::vector<CUmodule> Modules;
  std::vector<CUcontext> Contexts;

  // Device properties
  std::vector<int> ThreadsPerBlock;
  std::vector<int> BlocksPerGrid;
  std::vector<int> WarpSize;

  // OpenMP properties
  std::vector<int> NumTeams;
  std::vector<int> NumThreads;

  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;

  //static int EnvNumThreads;
  static const int HardTeamLimit = 1 << 16; // 64k
  static const int HardThreadLimit = 1024;
  static const int DefaultNumTeams = 128;
  static const int DefaultNumThreads = 1024;

  // Record entry point associated with device.
  void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id];

    E.Entries.push_back(entry);
  }

  // Return true if the entry is associated with the device.
  bool findOffloadEntry(int32_t device_id, void *addr) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id];

    for (auto &it : E.Entries) {
      if (it.addr == addr)
        return true;
    }

    return false;
  }

  // Return the pointer to the target entries table.
  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id];

    int32_t size = E.Entries.size();

    // Table is empty.
    if (!size)
      return 0;

    __tgt_offload_entry *begin = &E.Entries[0];
    __tgt_offload_entry *end = &E.Entries[size - 1];

    // Update table info according to the entries and return the pointer.
    E.Table.EntriesBegin = begin;
    E.Table.EntriesEnd = ++end;

    return &E.Table;
  }

  // Clear entries table for a device.
  void clearOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id];
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
  }

  RTLDeviceInfoTy() {
    DP("Start initializing CUDA\n");

    CUresult err = cuInit(0);
    if (err != CUDA_SUCCESS) {
      DP("Error when initializing CUDA\n");
      CUDA_ERR_STRING(err);
      return;
    }

    NumberOfDevices = 0;

    err = cuDeviceGetCount(&NumberOfDevices);
    if (err != CUDA_SUCCESS) {
      DP("Error when getting CUDA device count\n");
      CUDA_ERR_STRING(err);
      return;
    }

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    FuncGblEntries.resize(NumberOfDevices);
    Contexts.resize(NumberOfDevices);
    ThreadsPerBlock.resize(NumberOfDevices);
    BlocksPerGrid.resize(NumberOfDevices);
    WarpSize.resize(NumberOfDevices);
    NumTeams.resize(NumberOfDevices);
    NumThreads.resize(NumberOfDevices);

    // Get environment variables regarding teams.
    char *envStr = getenv("OMP_TEAM_LIMIT");
    if (envStr) {
      // OMP_TEAM_LIMIT has been set.
      EnvTeamLimit = std::stoi(envStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    } else {
      EnvTeamLimit = -1;
    }
    envStr = getenv("OMP_NUM_TEAMS");
    if (envStr) {
      // OMP_NUM_TEAMS has been set.
      EnvNumTeams = std::stoi(envStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    } else {
      EnvNumTeams = -1;
    }
  }

  ~RTLDeviceInfoTy() {
    // Close modules.
    for (auto &module : Modules)
      if (module) {
        CUresult err = cuModuleUnload(module);
        if (err != CUDA_SUCCESS) {
          DP("Error when unloading CUDA module\n");
          CUDA_ERR_STRING(err);
        }
      }

    // Destroy contexts.
    for (auto &ctx : Contexts)
      if (ctx) {
        CUresult err = cuCtxDestroy(ctx);
        if (err != CUDA_SUCCESS) {
          DP("Error when destroying CUDA context\n");
          CUDA_ERR_STRING(err);
        }
      }
  }
};

static RTLDeviceInfoTy DeviceInfo;

#ifdef __cplusplus
extern "C" {
#endif

int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, 190); // EM_CUDA = 190.
}

int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }

int32_t __tgt_rtl_init_device(int32_t device_id) {

  CUdevice cuDevice;
  DP("Getting device %d\n", device_id);
  CUresult err = cuDeviceGet(&cuDevice, device_id);
  if (err != CUDA_SUCCESS) {
    DP("Error when getting CUDA device with id = %d\n", device_id);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // Create the context and save it to use whenever this device is selected.
  err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
                    cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error when creating a CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // Scan properties to determine number of threads/block and blocks/grid.
  struct cudaDeviceProp Properties;
  cudaError_t error = cudaGetDeviceProperties(&Properties, device_id);
  if (error != cudaSuccess) {
    DP("Error getting device Properties, use defaults\n");
    DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
    DeviceInfo.WarpSize[device_id] = 32;
  } else {
    // Get blocks per grid.
    if (Properties.maxGridSize[0] <= RTLDeviceInfoTy::HardTeamLimit) {
      DeviceInfo.BlocksPerGrid[device_id] = Properties.maxGridSize[0];
      DP("Using %d CUDA blocks per grid\n", Properties.maxGridSize[0]);
    } else {
      DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
      DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
         "at the hard limit\n", Properties.maxGridSize[0],
         RTLDeviceInfoTy::HardTeamLimit);
    }

    // Get threads per block, exploit threads only along x axis.
    if (Properties.maxThreadsDim[0] <= RTLDeviceInfoTy::HardThreadLimit) {
      DeviceInfo.ThreadsPerBlock[device_id] = Properties.maxThreadsDim[0];
      DP("Using %d CUDA threads per block\n", Properties.maxThreadsDim[0]);
      if (Properties.maxThreadsDim[0] < Properties.maxThreadsPerBlock) {
        DP("(fewer than max per block along all xyz dims %d)\n",
           Properties.maxThreadsPerBlock);
      }
    } else {
      DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
      DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
         "capping at the hard limit\n", Properties.maxThreadsDim[0],
         RTLDeviceInfoTy::HardThreadLimit);
    }

    // Get warp size.
    DeviceInfo.WarpSize[device_id] = Properties.warpSize;
  }

  // Adjust teams to the env variables.
  if (DeviceInfo.EnvTeamLimit > 0 &&
      DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) {
    DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit;
    DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
       DeviceInfo.EnvTeamLimit);
  }

  DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
     DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id],
     DeviceInfo.WarpSize[device_id]);

  // Set default number of teams.
  if (DeviceInfo.EnvNumTeams > 0) {
    DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
    DP("Default number of teams set according to environment %d\n",
       DeviceInfo.EnvNumTeams);
  } else {
    DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
    DP("Default number of teams set according to library's default %d\n",
       RTLDeviceInfoTy::DefaultNumTeams);
  }
  if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) {
    DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id];
    DP("Default number of teams exceeds device limit, capping at %d\n",
       DeviceInfo.BlocksPerGrid[device_id]);
  }

  // Set default number of threads.
  DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
  DP("Default number of threads set according to library's default %d\n",
     RTLDeviceInfoTy::DefaultNumThreads);
  if (DeviceInfo.NumThreads[device_id] >
      DeviceInfo.ThreadsPerBlock[device_id]) {
    DeviceInfo.NumThreads[device_id] = DeviceInfo.ThreadsPerBlock[device_id];
    DP("Default number of threads exceeds device limit, capping at %d\n",
       DeviceInfo.ThreadsPerBlock[device_id]);
  }

  return OFFLOAD_SUCCESS;
}

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
                                          __tgt_device_image *image) {

  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting a CUDA context for device %d\n", device_id);
    CUDA_ERR_STRING(err);
    return NULL;
  }

  // Clear the offload table as we are going to create a new one.
  DeviceInfo.clearOffloadEntriesTable(device_id);

  // Create the module and extract the function pointers.

  CUmodule cumod;
  DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart));
  err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL);
  if (err != CUDA_SUCCESS) {
    DP("Error when loading CUDA module\n");
    CUDA_ERR_STRING(err);
    return NULL;
  }

  DP("CUDA module successfully loaded!\n");
  DeviceInfo.Modules.push_back(cumod);

  // Find the symbols in the module by name.
  __tgt_offload_entry *HostBegin = image->EntriesBegin;
  __tgt_offload_entry *HostEnd = image->EntriesEnd;

  for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {

    if (!e->addr) {
      // We return NULL when something like this happens: the host should
      // always have something in the address to uniquely identify the
      // target region.
      DP("Invalid binary: host entry '<null>' (size = %zd)...\n", e->size);

      return NULL;
    }

    if (e->size) {
      __tgt_offload_entry entry = *e;

      CUdeviceptr cuptr;
      size_t cusize;
      err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name);

      if (err != CUDA_SUCCESS) {
        DP("Loading global '%s' (Failed)\n", e->name);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      if (cusize != e->size) {
        DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name,
           cusize, e->size);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
         DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr));
      entry.addr = (void *)cuptr;

      DeviceInfo.addOffloadEntry(device_id, entry);

      continue;
    }

    CUfunction fun;
    err = cuModuleGetFunction(&fun, cumod, e->name);

    if (err != CUDA_SUCCESS) {
      DP("Loading '%s' (Failed)\n", e->name);
      CUDA_ERR_STRING(err);
      return NULL;
    }

    DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
       DPxPTR(e - HostBegin), e->name, DPxPTR(fun));

    // Default value GENERIC (in case the symbol is missing from the cubin
    // file).
    int8_t ExecModeVal = ExecutionModeType::GENERIC;
    std::string ExecModeNameStr(e->name);
    ExecModeNameStr += "_exec_mode";
    const char *ExecModeName = ExecModeNameStr.c_str();

    CUdeviceptr ExecModePtr;
    size_t cusize;
    err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName);
    if (err == CUDA_SUCCESS) {
      if ((size_t)cusize != sizeof(int8_t)) {
        DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
           ExecModeName, cusize, sizeof(int8_t));
        CUDA_ERR_STRING(err);
        return NULL;
      }

      err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize);
      if (err != CUDA_SUCCESS) {
        DP("Error when copying data from device to host. Pointers: "
           "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
           DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      if (ExecModeVal < 0 || ExecModeVal > 1) {
        DP("Error wrong exec_mode value specified in cubin file: %d\n",
           ExecModeVal);
        return NULL;
      }
    } else {
      DP("Loading global exec_mode '%s' - symbol missing, using default value "
         "GENERIC (1)\n", ExecModeName);
      CUDA_ERR_STRING(err);
    }

    KernelsList.push_back(KernelTy(fun, ExecModeVal));

    __tgt_offload_entry entry = *e;
    entry.addr = (void *)&KernelsList.back();
    DeviceInfo.addOffloadEntry(device_id, entry);
  }

  return DeviceInfo.getOffloadEntriesTable(device_id);
}

void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size) {
  if (size == 0) {
    return NULL;
  }

  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error while trying to set CUDA current context\n");
    CUDA_ERR_STRING(err);
    return NULL;
  }

  CUdeviceptr ptr;
  err = cuMemAlloc(&ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error while trying to allocate %d\n", err);
    CUDA_ERR_STRING(err);
    return NULL;
  }

  void *vptr = (void *)ptr;
  return vptr;
}

int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
                              int64_t size) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error when copying data from host to device. Pointers: host = " DPxMOD
       ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
       DPxPTR(tgt_ptr), size);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
                                int64_t size) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error when copying data from device to host. Pointers: host = " DPxMOD
       ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
       DPxPTR(tgt_ptr), size);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemFree((CUdeviceptr)tgt_ptr);
  if (err != CUDA_SUCCESS) {
    DP("Error when freeing CUDA memory\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
    void **tgt_args, int32_t arg_num, int32_t team_num, int32_t thread_limit,
    uint64_t loop_tripcount) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // All args are references.
  std::vector<void *> args(arg_num);

  for (int32_t i = 0; i < arg_num; ++i)
    args[i] = &tgt_args[i];

  KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;

  int cudaThreadsPerBlock;

  if (thread_limit > 0) {
    cudaThreadsPerBlock = thread_limit;
    DP("Setting CUDA threads per block to requested %d\n", thread_limit);
  } else {
    cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
    DP("Setting CUDA threads per block to default %d\n",
       DeviceInfo.NumThreads[device_id]);
  }

  // Add master warp if necessary.
  if (KernelInfo->ExecutionMode == GENERIC) {
    cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
    DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
  }

  if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
    cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
    DP("Threads per block capped at device limit %d\n",
       DeviceInfo.ThreadsPerBlock[device_id]);
  }

  int kernel_limit;
  err = cuFuncGetAttribute(&kernel_limit,
      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func);
  if (err == CUDA_SUCCESS) {
    if (kernel_limit < cudaThreadsPerBlock) {
      cudaThreadsPerBlock = kernel_limit;
      DP("Threads per block capped at kernel limit %d\n", kernel_limit);
    }
  }

  int cudaBlocksPerGrid;
  if (team_num <= 0) {
    if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
      // Round up to the nearest integer.
      cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
      DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
         "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,
         cudaThreadsPerBlock);
    } else {
      cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id];
      DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]);
    }
  } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) {
    cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id];
    DP("Capping number of teams to team limit %d\n",
       DeviceInfo.BlocksPerGrid[device_id]);
  } else {
    cudaBlocksPerGrid = team_num;
    DP("Using requested number of teams %d\n", team_num);
  }

  // Run on the device.
  DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
     cudaThreadsPerBlock);

  err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,
      cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, 0, &args[0], 0);
  if (err != CUDA_SUCCESS) {
    DP("Device kernel launch failed!\n");
    CUDA_ERR_STRING(err);
    assert(err == CUDA_SUCCESS && "Unable to launch target execution!");
    return OFFLOAD_FAIL;
  }

  DP("Launch of entry point at " DPxMOD " successful!\n",
     DPxPTR(tgt_entry_ptr));

  if (cudaDeviceSynchronize() != cudaSuccess) {
    DP("Kernel execution error at " DPxMOD ".\n", DPxPTR(tgt_entry_ptr));
    return OFFLOAD_FAIL;
  } else {
    DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr));
  }

  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
    void **tgt_args, int32_t arg_num) {
  // Use one team and the default number of threads.
  const int32_t team_num = 1;
  const int32_t thread_limit = 0;
  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
      arg_num, team_num, thread_limit, 0);
}

#ifdef __cplusplus
}
#endif