1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // RTL for CUDA machine 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include <cassert> 14 #include <cstddef> 15 #include <cuda.h> 16 #include <list> 17 #include <string> 18 #include <vector> 19 20 #include "omptargetplugin.h" 21 22 #ifndef TARGET_NAME 23 #define TARGET_NAME CUDA 24 #endif 25 26 #ifdef OMPTARGET_DEBUG 27 static int DebugLevel = 0; 28 29 #define GETNAME2(name) #name 30 #define GETNAME(name) GETNAME2(name) 31 #define DP(...) \ 32 do { \ 33 if (DebugLevel > 0) { \ 34 DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \ 35 } \ 36 } while (false) 37 #else // OMPTARGET_DEBUG 38 #define DP(...) {} 39 #endif // OMPTARGET_DEBUG 40 41 #include "../../common/elf_common.c" 42 43 // Utility for retrieving and printing CUDA error string. 44 #ifdef CUDA_ERROR_REPORT 45 #define CUDA_ERR_STRING(err) \ 46 do { \ 47 const char *errStr; \ 48 cuGetErrorString(err, &errStr); \ 49 DP("CUDA error is: %s\n", errStr); \ 50 } while (0) 51 #else 52 #define CUDA_ERR_STRING(err) \ 53 {} 54 #endif 55 56 /// Keep entries table per device. 57 struct FuncOrGblEntryTy { 58 __tgt_target_table Table; 59 std::vector<__tgt_offload_entry> Entries; 60 }; 61 62 enum ExecutionModeType { 63 SPMD, // constructors, destructors, 64 // combined constructs (`teams distribute parallel for [simd]`) 65 GENERIC, // everything else 66 NONE 67 }; 68 69 /// Use a single entity to encode a kernel and a set of flags 70 struct KernelTy { 71 CUfunction Func; 72 73 // execution mode of kernel 74 // 0 - SPMD mode (without master warp) 75 // 1 - Generic mode (with master warp) 76 int8_t ExecutionMode; 77 78 KernelTy(CUfunction _Func, int8_t _ExecutionMode) 79 : Func(_Func), ExecutionMode(_ExecutionMode) {} 80 }; 81 82 /// Device envrionment data 83 /// Manually sync with the deviceRTL side for now, move to a dedicated header file later. 84 struct omptarget_device_environmentTy { 85 int32_t debug_level; 86 }; 87 88 /// List that contains all the kernels. 89 /// FIXME: we may need this to be per device and per library. 90 std::list<KernelTy> KernelsList; 91 92 /// Class containing all the device information. 93 class RTLDeviceInfoTy { 94 std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries; 95 96 public: 97 int NumberOfDevices; 98 std::vector<CUmodule> Modules; 99 std::vector<CUcontext> Contexts; 100 101 // Device properties 102 std::vector<int> ThreadsPerBlock; 103 std::vector<int> BlocksPerGrid; 104 std::vector<int> WarpSize; 105 106 // OpenMP properties 107 std::vector<int> NumTeams; 108 std::vector<int> NumThreads; 109 110 // OpenMP Environment properties 111 int EnvNumTeams; 112 int EnvTeamLimit; 113 114 // OpenMP Requires Flags 115 int64_t RequiresFlags; 116 117 //static int EnvNumThreads; 118 static const int HardTeamLimit = 1<<16; // 64k 119 static const int HardThreadLimit = 1024; 120 static const int DefaultNumTeams = 128; 121 static const int DefaultNumThreads = 128; 122 123 // Record entry point associated with device 124 void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) { 125 assert(device_id < (int32_t)FuncGblEntries.size() && 126 "Unexpected device id!"); 127 FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); 128 129 E.Entries.push_back(entry); 130 } 131 132 // Return true if the entry is associated with device 133 bool findOffloadEntry(int32_t device_id, void *addr) { 134 assert(device_id < (int32_t)FuncGblEntries.size() && 135 "Unexpected device id!"); 136 FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); 137 138 for (auto &it : E.Entries) { 139 if (it.addr == addr) 140 return true; 141 } 142 143 return false; 144 } 145 146 // Return the pointer to the target entries table 147 __tgt_target_table *getOffloadEntriesTable(int32_t device_id) { 148 assert(device_id < (int32_t)FuncGblEntries.size() && 149 "Unexpected device id!"); 150 FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); 151 152 int32_t size = E.Entries.size(); 153 154 // Table is empty 155 if (!size) 156 return 0; 157 158 __tgt_offload_entry *begin = &E.Entries[0]; 159 __tgt_offload_entry *end = &E.Entries[size - 1]; 160 161 // Update table info according to the entries and return the pointer 162 E.Table.EntriesBegin = begin; 163 E.Table.EntriesEnd = ++end; 164 165 return &E.Table; 166 } 167 168 // Clear entries table for a device 169 void clearOffloadEntriesTable(int32_t device_id) { 170 assert(device_id < (int32_t)FuncGblEntries.size() && 171 "Unexpected device id!"); 172 FuncGblEntries[device_id].emplace_back(); 173 FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); 174 E.Entries.clear(); 175 E.Table.EntriesBegin = E.Table.EntriesEnd = 0; 176 } 177 178 RTLDeviceInfoTy() { 179 #ifdef OMPTARGET_DEBUG 180 if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) { 181 DebugLevel = std::stoi(envStr); 182 } 183 #endif // OMPTARGET_DEBUG 184 185 DP("Start initializing CUDA\n"); 186 187 CUresult err = cuInit(0); 188 if (err != CUDA_SUCCESS) { 189 DP("Error when initializing CUDA\n"); 190 CUDA_ERR_STRING(err); 191 return; 192 } 193 194 NumberOfDevices = 0; 195 196 err = cuDeviceGetCount(&NumberOfDevices); 197 if (err != CUDA_SUCCESS) { 198 DP("Error when getting CUDA device count\n"); 199 CUDA_ERR_STRING(err); 200 return; 201 } 202 203 if (NumberOfDevices == 0) { 204 DP("There are no devices supporting CUDA.\n"); 205 return; 206 } 207 208 FuncGblEntries.resize(NumberOfDevices); 209 Contexts.resize(NumberOfDevices); 210 ThreadsPerBlock.resize(NumberOfDevices); 211 BlocksPerGrid.resize(NumberOfDevices); 212 WarpSize.resize(NumberOfDevices); 213 NumTeams.resize(NumberOfDevices); 214 NumThreads.resize(NumberOfDevices); 215 216 // Get environment variables regarding teams 217 char *envStr = getenv("OMP_TEAM_LIMIT"); 218 if (envStr) { 219 // OMP_TEAM_LIMIT has been set 220 EnvTeamLimit = std::stoi(envStr); 221 DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit); 222 } else { 223 EnvTeamLimit = -1; 224 } 225 envStr = getenv("OMP_NUM_TEAMS"); 226 if (envStr) { 227 // OMP_NUM_TEAMS has been set 228 EnvNumTeams = std::stoi(envStr); 229 DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); 230 } else { 231 EnvNumTeams = -1; 232 } 233 234 // Default state. 235 RequiresFlags = OMP_REQ_UNDEFINED; 236 } 237 238 ~RTLDeviceInfoTy() { 239 // Close modules 240 for (auto &module : Modules) 241 if (module) { 242 CUresult err = cuModuleUnload(module); 243 if (err != CUDA_SUCCESS) { 244 DP("Error when unloading CUDA module\n"); 245 CUDA_ERR_STRING(err); 246 } 247 } 248 249 // Destroy contexts 250 for (auto &ctx : Contexts) 251 if (ctx) { 252 CUresult err = cuCtxDestroy(ctx); 253 if (err != CUDA_SUCCESS) { 254 DP("Error when destroying CUDA context\n"); 255 CUDA_ERR_STRING(err); 256 } 257 } 258 } 259 }; 260 261 static RTLDeviceInfoTy DeviceInfo; 262 263 #ifdef __cplusplus 264 extern "C" { 265 #endif 266 267 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { 268 return elf_check_machine(image, 190); // EM_CUDA = 190. 269 } 270 271 int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; } 272 273 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { 274 DP("Init requires flags to %ld\n", RequiresFlags); 275 DeviceInfo.RequiresFlags = RequiresFlags; 276 return RequiresFlags; 277 } 278 279 int32_t __tgt_rtl_init_device(int32_t device_id) { 280 281 CUdevice cuDevice; 282 DP("Getting device %d\n", device_id); 283 CUresult err = cuDeviceGet(&cuDevice, device_id); 284 if (err != CUDA_SUCCESS) { 285 DP("Error when getting CUDA device with id = %d\n", device_id); 286 CUDA_ERR_STRING(err); 287 return OFFLOAD_FAIL; 288 } 289 290 // Create the context and save it to use whenever this device is selected. 291 err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC, 292 cuDevice); 293 if (err != CUDA_SUCCESS) { 294 DP("Error when creating a CUDA context\n"); 295 CUDA_ERR_STRING(err); 296 return OFFLOAD_FAIL; 297 } 298 299 // Query attributes to determine number of threads/block and blocks/grid. 300 int maxGridDimX; 301 err = cuDeviceGetAttribute(&maxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, 302 cuDevice); 303 if (err != CUDA_SUCCESS) { 304 DP("Error getting max grid dimension, use default\n"); 305 DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams; 306 } else if (maxGridDimX <= RTLDeviceInfoTy::HardTeamLimit) { 307 DeviceInfo.BlocksPerGrid[device_id] = maxGridDimX; 308 DP("Using %d CUDA blocks per grid\n", maxGridDimX); 309 } else { 310 DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit; 311 DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping " 312 "at the hard limit\n", 313 maxGridDimX, RTLDeviceInfoTy::HardTeamLimit); 314 } 315 316 // We are only exploiting threads along the x axis. 317 int maxBlockDimX; 318 err = cuDeviceGetAttribute(&maxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, 319 cuDevice); 320 if (err != CUDA_SUCCESS) { 321 DP("Error getting max block dimension, use default\n"); 322 DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads; 323 } else if (maxBlockDimX <= RTLDeviceInfoTy::HardThreadLimit) { 324 DeviceInfo.ThreadsPerBlock[device_id] = maxBlockDimX; 325 DP("Using %d CUDA threads per block\n", maxBlockDimX); 326 } else { 327 DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit; 328 DP("Max CUDA threads per block %d exceeds the hard thread limit %d, capping" 329 "at the hard limit\n", 330 maxBlockDimX, RTLDeviceInfoTy::HardThreadLimit); 331 } 332 333 int warpSize; 334 err = 335 cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuDevice); 336 if (err != CUDA_SUCCESS) { 337 DP("Error getting warp size, assume default\n"); 338 DeviceInfo.WarpSize[device_id] = 32; 339 } else { 340 DeviceInfo.WarpSize[device_id] = warpSize; 341 } 342 343 // Adjust teams to the env variables 344 if (DeviceInfo.EnvTeamLimit > 0 && 345 DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) { 346 DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit; 347 DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n", 348 DeviceInfo.EnvTeamLimit); 349 } 350 351 DP("Max number of CUDA blocks %d, threads %d & warp size %d\n", 352 DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id], 353 DeviceInfo.WarpSize[device_id]); 354 355 // Set default number of teams 356 if (DeviceInfo.EnvNumTeams > 0) { 357 DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams; 358 DP("Default number of teams set according to environment %d\n", 359 DeviceInfo.EnvNumTeams); 360 } else { 361 DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams; 362 DP("Default number of teams set according to library's default %d\n", 363 RTLDeviceInfoTy::DefaultNumTeams); 364 } 365 if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) { 366 DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id]; 367 DP("Default number of teams exceeds device limit, capping at %d\n", 368 DeviceInfo.BlocksPerGrid[device_id]); 369 } 370 371 // Set default number of threads 372 DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads; 373 DP("Default number of threads set according to library's default %d\n", 374 RTLDeviceInfoTy::DefaultNumThreads); 375 if (DeviceInfo.NumThreads[device_id] > 376 DeviceInfo.ThreadsPerBlock[device_id]) { 377 DeviceInfo.NumTeams[device_id] = DeviceInfo.ThreadsPerBlock[device_id]; 378 DP("Default number of threads exceeds device limit, capping at %d\n", 379 DeviceInfo.ThreadsPerBlock[device_id]); 380 } 381 382 return OFFLOAD_SUCCESS; 383 } 384 385 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, 386 __tgt_device_image *image) { 387 388 // Set the context we are using. 389 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 390 if (err != CUDA_SUCCESS) { 391 DP("Error when setting a CUDA context for device %d\n", device_id); 392 CUDA_ERR_STRING(err); 393 return NULL; 394 } 395 396 // Clear the offload table as we are going to create a new one. 397 DeviceInfo.clearOffloadEntriesTable(device_id); 398 399 // Create the module and extract the function pointers. 400 401 CUmodule cumod; 402 DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart)); 403 err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL); 404 if (err != CUDA_SUCCESS) { 405 DP("Error when loading CUDA module\n"); 406 CUDA_ERR_STRING(err); 407 return NULL; 408 } 409 410 DP("CUDA module successfully loaded!\n"); 411 DeviceInfo.Modules.push_back(cumod); 412 413 // Find the symbols in the module by name. 414 __tgt_offload_entry *HostBegin = image->EntriesBegin; 415 __tgt_offload_entry *HostEnd = image->EntriesEnd; 416 417 for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) { 418 419 if (!e->addr) { 420 // We return NULL when something like this happens, the host should have 421 // always something in the address to uniquely identify the target region. 422 DP("Invalid binary: host entry '<null>' (size = %zd)...\n", e->size); 423 424 return NULL; 425 } 426 427 if (e->size) { 428 __tgt_offload_entry entry = *e; 429 430 CUdeviceptr cuptr; 431 size_t cusize; 432 err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name); 433 434 if (err != CUDA_SUCCESS) { 435 DP("Loading global '%s' (Failed)\n", e->name); 436 CUDA_ERR_STRING(err); 437 return NULL; 438 } 439 440 if (cusize != e->size) { 441 DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name, 442 cusize, e->size); 443 CUDA_ERR_STRING(err); 444 return NULL; 445 } 446 447 DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", 448 DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr)); 449 entry.addr = (void *)cuptr; 450 451 if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && 452 e->flags & OMP_DECLARE_TARGET_LINK) { 453 // If unified memory is present any target link variables 454 // can access host addresses directly. There is no longer a 455 // need for device copies. 456 cuMemcpyHtoD(cuptr, e->addr, sizeof(void *)); 457 DP("Copy linked variable host address (" DPxMOD ")" 458 "to device address (" DPxMOD ")\n", 459 DPxPTR(*((void**)e->addr)), DPxPTR(cuptr)); 460 } 461 462 DeviceInfo.addOffloadEntry(device_id, entry); 463 464 continue; 465 } 466 467 CUfunction fun; 468 err = cuModuleGetFunction(&fun, cumod, e->name); 469 470 if (err != CUDA_SUCCESS) { 471 DP("Loading '%s' (Failed)\n", e->name); 472 CUDA_ERR_STRING(err); 473 return NULL; 474 } 475 476 DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", 477 DPxPTR(e - HostBegin), e->name, DPxPTR(fun)); 478 479 // default value GENERIC (in case symbol is missing from cubin file) 480 int8_t ExecModeVal = ExecutionModeType::GENERIC; 481 std::string ExecModeNameStr (e->name); 482 ExecModeNameStr += "_exec_mode"; 483 const char *ExecModeName = ExecModeNameStr.c_str(); 484 485 CUdeviceptr ExecModePtr; 486 size_t cusize; 487 err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName); 488 if (err == CUDA_SUCCESS) { 489 if ((size_t)cusize != sizeof(int8_t)) { 490 DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", 491 ExecModeName, cusize, sizeof(int8_t)); 492 CUDA_ERR_STRING(err); 493 return NULL; 494 } 495 496 err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize); 497 if (err != CUDA_SUCCESS) { 498 DP("Error when copying data from device to host. Pointers: " 499 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", 500 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize); 501 CUDA_ERR_STRING(err); 502 return NULL; 503 } 504 505 if (ExecModeVal < 0 || ExecModeVal > 1) { 506 DP("Error wrong exec_mode value specified in cubin file: %d\n", 507 ExecModeVal); 508 return NULL; 509 } 510 } else { 511 DP("Loading global exec_mode '%s' - symbol missing, using default value " 512 "GENERIC (1)\n", ExecModeName); 513 CUDA_ERR_STRING(err); 514 } 515 516 KernelsList.push_back(KernelTy(fun, ExecModeVal)); 517 518 __tgt_offload_entry entry = *e; 519 entry.addr = (void *)&KernelsList.back(); 520 DeviceInfo.addOffloadEntry(device_id, entry); 521 } 522 523 // send device environment data to the device 524 { 525 omptarget_device_environmentTy device_env; 526 527 device_env.debug_level = 0; 528 529 #ifdef OMPTARGET_DEBUG 530 if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) { 531 device_env.debug_level = std::stoi(envStr); 532 } 533 #endif 534 535 const char * device_env_Name="omptarget_device_environment"; 536 CUdeviceptr device_env_Ptr; 537 size_t cusize; 538 539 err = cuModuleGetGlobal(&device_env_Ptr, &cusize, cumod, device_env_Name); 540 541 if (err == CUDA_SUCCESS) { 542 if ((size_t)cusize != sizeof(device_env)) { 543 DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n", 544 device_env_Name, cusize, sizeof(int32_t)); 545 CUDA_ERR_STRING(err); 546 return NULL; 547 } 548 549 err = cuMemcpyHtoD(device_env_Ptr, &device_env, cusize); 550 if (err != CUDA_SUCCESS) { 551 DP("Error when copying data from host to device. Pointers: " 552 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n", 553 DPxPTR(&device_env), DPxPTR(device_env_Ptr), cusize); 554 CUDA_ERR_STRING(err); 555 return NULL; 556 } 557 558 DP("Sending global device environment data %zu bytes\n", (size_t)cusize); 559 } else { 560 DP("Finding global device environment '%s' - symbol missing.\n", device_env_Name); 561 DP("Continue, considering this is a device RTL which does not accept envrionment setting.\n"); 562 } 563 } 564 565 return DeviceInfo.getOffloadEntriesTable(device_id); 566 } 567 568 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) { 569 if (size == 0) { 570 return NULL; 571 } 572 573 // Set the context we are using. 574 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 575 if (err != CUDA_SUCCESS) { 576 DP("Error while trying to set CUDA current context\n"); 577 CUDA_ERR_STRING(err); 578 return NULL; 579 } 580 581 CUdeviceptr ptr; 582 err = cuMemAlloc(&ptr, size); 583 if (err != CUDA_SUCCESS) { 584 DP("Error while trying to allocate %d\n", err); 585 CUDA_ERR_STRING(err); 586 return NULL; 587 } 588 589 void *vptr = (void *)ptr; 590 return vptr; 591 } 592 593 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, 594 int64_t size) { 595 // Set the context we are using. 596 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 597 if (err != CUDA_SUCCESS) { 598 DP("Error when setting CUDA context\n"); 599 CUDA_ERR_STRING(err); 600 return OFFLOAD_FAIL; 601 } 602 603 err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size); 604 if (err != CUDA_SUCCESS) { 605 DP("Error when copying data from host to device. Pointers: host = " DPxMOD 606 ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr), 607 DPxPTR(tgt_ptr), size); 608 CUDA_ERR_STRING(err); 609 return OFFLOAD_FAIL; 610 } 611 return OFFLOAD_SUCCESS; 612 } 613 614 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, 615 int64_t size) { 616 // Set the context we are using. 617 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 618 if (err != CUDA_SUCCESS) { 619 DP("Error when setting CUDA context\n"); 620 CUDA_ERR_STRING(err); 621 return OFFLOAD_FAIL; 622 } 623 624 err = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size); 625 if (err != CUDA_SUCCESS) { 626 DP("Error when copying data from device to host. Pointers: host = " DPxMOD 627 ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr), 628 DPxPTR(tgt_ptr), size); 629 CUDA_ERR_STRING(err); 630 return OFFLOAD_FAIL; 631 } 632 return OFFLOAD_SUCCESS; 633 } 634 635 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { 636 // Set the context we are using. 637 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 638 if (err != CUDA_SUCCESS) { 639 DP("Error when setting CUDA context\n"); 640 CUDA_ERR_STRING(err); 641 return OFFLOAD_FAIL; 642 } 643 644 err = cuMemFree((CUdeviceptr)tgt_ptr); 645 if (err != CUDA_SUCCESS) { 646 DP("Error when freeing CUDA memory\n"); 647 CUDA_ERR_STRING(err); 648 return OFFLOAD_FAIL; 649 } 650 return OFFLOAD_SUCCESS; 651 } 652 653 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, 654 void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, 655 int32_t thread_limit, uint64_t loop_tripcount) { 656 // Set the context we are using. 657 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 658 if (err != CUDA_SUCCESS) { 659 DP("Error when setting CUDA context\n"); 660 CUDA_ERR_STRING(err); 661 return OFFLOAD_FAIL; 662 } 663 664 // All args are references. 665 std::vector<void *> args(arg_num); 666 std::vector<void *> ptrs(arg_num); 667 668 for (int32_t i = 0; i < arg_num; ++i) { 669 ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]); 670 args[i] = &ptrs[i]; 671 } 672 673 KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr; 674 675 int cudaThreadsPerBlock; 676 677 if (thread_limit > 0) { 678 cudaThreadsPerBlock = thread_limit; 679 DP("Setting CUDA threads per block to requested %d\n", thread_limit); 680 // Add master warp if necessary 681 if (KernelInfo->ExecutionMode == GENERIC) { 682 cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id]; 683 DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]); 684 } 685 } else { 686 cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id]; 687 DP("Setting CUDA threads per block to default %d\n", 688 DeviceInfo.NumThreads[device_id]); 689 } 690 691 if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) { 692 cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id]; 693 DP("Threads per block capped at device limit %d\n", 694 DeviceInfo.ThreadsPerBlock[device_id]); 695 } 696 697 int kernel_limit; 698 err = cuFuncGetAttribute(&kernel_limit, 699 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func); 700 if (err == CUDA_SUCCESS) { 701 if (kernel_limit < cudaThreadsPerBlock) { 702 cudaThreadsPerBlock = kernel_limit; 703 DP("Threads per block capped at kernel limit %d\n", kernel_limit); 704 } 705 } 706 707 int cudaBlocksPerGrid; 708 if (team_num <= 0) { 709 if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) { 710 if (KernelInfo->ExecutionMode == SPMD) { 711 // We have a combined construct, i.e. `target teams distribute parallel 712 // for [simd]`. We launch so many teams so that each thread will 713 // execute one iteration of the loop. 714 // round up to the nearest integer 715 cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1; 716 } else { 717 // If we reach this point, then we have a non-combined construct, i.e. 718 // `teams distribute` with a nested `parallel for` and each team is 719 // assigned one iteration of the `distribute` loop. E.g.: 720 // 721 // #pragma omp target teams distribute 722 // for(...loop_tripcount...) { 723 // #pragma omp parallel for 724 // for(...) {} 725 // } 726 // 727 // Threads within a team will execute the iterations of the `parallel` 728 // loop. 729 cudaBlocksPerGrid = loop_tripcount; 730 } 731 DP("Using %d teams due to loop trip count %" PRIu64 " and number of " 732 "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount, 733 cudaThreadsPerBlock); 734 } else { 735 cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id]; 736 DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]); 737 } 738 } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) { 739 cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id]; 740 DP("Capping number of teams to team limit %d\n", 741 DeviceInfo.BlocksPerGrid[device_id]); 742 } else { 743 cudaBlocksPerGrid = team_num; 744 DP("Using requested number of teams %d\n", team_num); 745 } 746 747 // Run on the device. 748 DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid, 749 cudaThreadsPerBlock); 750 751 err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1, 752 cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, 0, &args[0], 0); 753 if (err != CUDA_SUCCESS) { 754 DP("Device kernel launch failed!\n"); 755 CUDA_ERR_STRING(err); 756 return OFFLOAD_FAIL; 757 } 758 759 DP("Launch of entry point at " DPxMOD " successful!\n", 760 DPxPTR(tgt_entry_ptr)); 761 762 CUresult sync_err = cuCtxSynchronize(); 763 if (sync_err != CUDA_SUCCESS) { 764 DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr)); 765 CUDA_ERR_STRING(sync_err); 766 return OFFLOAD_FAIL; 767 } else { 768 DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr)); 769 } 770 771 return OFFLOAD_SUCCESS; 772 } 773 774 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, 775 void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) { 776 // use one team and the default number of threads. 777 const int32_t team_num = 1; 778 const int32_t thread_limit = 0; 779 return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, 780 tgt_offsets, arg_num, team_num, thread_limit, 0); 781 } 782 783 #ifdef __cplusplus 784 } 785 #endif 786