//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <list>
#include <string>
#include <vector>

#include "omptargetplugin.h"

#ifndef TARGET_NAME
#define TARGET_NAME CUDA
#endif

#ifdef OMPTARGET_DEBUG
static int DebugLevel = 0;

#define GETNAME2(name) #name
#define GETNAME(name) GETNAME2(name)
#define DP(...) \
  do { \
    if (DebugLevel > 0) { \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \
    } \
  } while (false)

// Utility for retrieving and printing CUDA error string.
#define CUDA_ERR_STRING(err) \
  do { \
    if (DebugLevel > 0) { \
      const char *errStr; \
      cuGetErrorString(err, &errStr); \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", "CUDA error is: %s\n", errStr); \
    } \
  } while (false)
#else // OMPTARGET_DEBUG
#define DP(...) {}
#define CUDA_ERR_STRING(err) {}
#endif // OMPTARGET_DEBUG

#include "../../common/elf_common.c"

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

enum ExecutionModeType {
  SPMD,    // constructors, destructors,
           // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // execution mode of kernel
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

/// Device environment data.
/// Manually kept in sync with the deviceRTL side for now; move to a dedicated
/// header file later.
struct omptarget_device_environmentTy {
  int32_t debug_level;
};

/// List that contains all the kernels.
/// FIXME: we may need this to be per device and per library.
std::list<KernelTy> KernelsList;
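
// Note on KernelsList: __tgt_rtl_load_binary stores the address of each
// KernelTy (&KernelsList.back()) in the offload entries table, and
// __tgt_rtl_run_target_team_region later casts tgt_entry_ptr back to a
// KernelTy *. A std::list keeps element addresses stable across push_back
// (unlike std::vector), so those stored pointers remain valid.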

/// Class containing all the device information.
class RTLDeviceInfoTy {
  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;

public:
  int NumberOfDevices;
  std::vector<CUmodule> Modules;
  std::vector<CUcontext> Contexts;

  // Device properties
  std::vector<int> ThreadsPerBlock;
  std::vector<int> BlocksPerGrid;
  std::vector<int> WarpSize;

  // OpenMP properties
  std::vector<int> NumTeams;
  std::vector<int> NumThreads;

  // OpenMP Environment properties
  int EnvNumTeams;
  int EnvTeamLimit;

  // OpenMP Requires Flags
  int64_t RequiresFlags;

  //static int EnvNumThreads;
  static const int HardTeamLimit = 1 << 16; // 64k
  static const int HardThreadLimit = 1024;
  static const int DefaultNumTeams = 128;
  static const int DefaultNumThreads = 128;

  // Record entry point associated with device
  void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    E.Entries.push_back(entry);
  }

  // Return true if the entry is associated with device
  bool findOffloadEntry(int32_t device_id, void *addr) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    for (auto &it : E.Entries) {
      if (it.addr == addr)
        return true;
    }

    return false;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    int32_t size = E.Entries.size();

    // Table is empty
    if (!size)
      return 0;

    __tgt_offload_entry *begin = &E.Entries[0];
    __tgt_offload_entry *end = &E.Entries[size - 1];

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = begin;
    E.Table.EntriesEnd = ++end;

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncGblEntries[device_id].emplace_back();
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
  }

  RTLDeviceInfoTy() {
#ifdef OMPTARGET_DEBUG
    if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
      DebugLevel = std::stoi(envStr);
    }
#endif // OMPTARGET_DEBUG

    DP("Start initializing CUDA\n");

    CUresult err = cuInit(0);
    if (err != CUDA_SUCCESS) {
      DP("Error when initializing CUDA\n");
      CUDA_ERR_STRING(err);
      return;
    }

    NumberOfDevices = 0;

    err = cuDeviceGetCount(&NumberOfDevices);
    if (err != CUDA_SUCCESS) {
      DP("Error when getting CUDA device count\n");
      CUDA_ERR_STRING(err);
      return;
    }

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    FuncGblEntries.resize(NumberOfDevices);
    Contexts.resize(NumberOfDevices);
    ThreadsPerBlock.resize(NumberOfDevices);
    BlocksPerGrid.resize(NumberOfDevices);
    WarpSize.resize(NumberOfDevices);
    NumTeams.resize(NumberOfDevices);
    NumThreads.resize(NumberOfDevices);

    // Get environment variables regarding teams
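    // For example (illustrative values only), launching an application with
    //   LIBOMPTARGET_DEBUG=1 OMP_TEAM_LIMIT=1024 OMP_NUM_TEAMS=256 ./app
    // enables the DP() traces above and overrides the team defaults parsed
    // below.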
    char *envStr = getenv("OMP_TEAM_LIMIT");
    if (envStr) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(envStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    } else {
      EnvTeamLimit = -1;
    }
    envStr = getenv("OMP_NUM_TEAMS");
    if (envStr) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(envStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    } else {
      EnvNumTeams = -1;
    }

    // Default state.
    RequiresFlags = OMP_REQ_UNDEFINED;
  }

  ~RTLDeviceInfoTy() {
    // Close modules
    for (auto &module : Modules)
      if (module) {
        CUresult err = cuModuleUnload(module);
        if (err != CUDA_SUCCESS) {
          DP("Error when unloading CUDA module\n");
          CUDA_ERR_STRING(err);
        }
      }

    // Destroy contexts
    for (auto &ctx : Contexts)
      if (ctx) {
        CUresult err = cuCtxDestroy(ctx);
        if (err != CUDA_SUCCESS) {
          DP("Error when destroying CUDA context\n");
          CUDA_ERR_STRING(err);
        }
      }
  }
};

static RTLDeviceInfoTy DeviceInfo;

#ifdef __cplusplus
extern "C" {
#endif

int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, 190); // EM_CUDA = 190.
}

int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }

int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %ld\n", RequiresFlags);
  DeviceInfo.RequiresFlags = RequiresFlags;
  return RequiresFlags;
}

int32_t __tgt_rtl_init_device(int32_t device_id) {

  CUdevice cuDevice;
  DP("Getting device %d\n", device_id);
  CUresult err = cuDeviceGet(&cuDevice, device_id);
  if (err != CUDA_SUCCESS) {
    DP("Error when getting CUDA device with id = %d\n", device_id);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // Create the context and save it to use whenever this device is selected.
  err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
                    cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error when creating a CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // Query attributes to determine number of threads/block and blocks/grid.
  int maxGridDimX;
  err = cuDeviceGetAttribute(&maxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                             cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error getting max grid dimension, using default\n");
    DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
  } else if (maxGridDimX <= RTLDeviceInfoTy::HardTeamLimit) {
    DeviceInfo.BlocksPerGrid[device_id] = maxGridDimX;
    DP("Using %d CUDA blocks per grid\n", maxGridDimX);
  } else {
    DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
    DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
       "at the hard limit\n",
       maxGridDimX, RTLDeviceInfoTy::HardTeamLimit);
  }
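
  // Illustration (hypothetical device): a GPU reporting a maxGridDimX of
  // 2147483647 exceeds HardTeamLimit (1 << 16 = 65536), so BlocksPerGrid is
  // capped at 65536 by the branch above.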

  // We are only exploiting threads along the x axis.
  int maxBlockDimX;
  err = cuDeviceGetAttribute(&maxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
                             cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error getting max block dimension, using default\n");
    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
  } else if (maxBlockDimX <= RTLDeviceInfoTy::HardThreadLimit) {
    DeviceInfo.ThreadsPerBlock[device_id] = maxBlockDimX;
    DP("Using %d CUDA threads per block\n", maxBlockDimX);
  } else {
    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
    DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
       "capping at the hard limit\n",
       maxBlockDimX, RTLDeviceInfoTy::HardThreadLimit);
  }

  int warpSize;
  err =
      cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error getting warp size, assuming default\n");
    DeviceInfo.WarpSize[device_id] = 32;
  } else {
    DeviceInfo.WarpSize[device_id] = warpSize;
  }

  // Adjust teams to the env variables
  if (DeviceInfo.EnvTeamLimit > 0 &&
      DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) {
    DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit;
    DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
       DeviceInfo.EnvTeamLimit);
  }

  DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
     DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id],
     DeviceInfo.WarpSize[device_id]);

  // Set default number of teams
  if (DeviceInfo.EnvNumTeams > 0) {
    DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
    DP("Default number of teams set according to environment %d\n",
       DeviceInfo.EnvNumTeams);
  } else {
    DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
    DP("Default number of teams set according to library's default %d\n",
       RTLDeviceInfoTy::DefaultNumTeams);
  }
  if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) {
    DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id];
    DP("Default number of teams exceeds device limit, capping at %d\n",
       DeviceInfo.BlocksPerGrid[device_id]);
  }

  // Set default number of threads
  DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
  DP("Default number of threads set according to library's default %d\n",
     RTLDeviceInfoTy::DefaultNumThreads);
  if (DeviceInfo.NumThreads[device_id] >
      DeviceInfo.ThreadsPerBlock[device_id]) {
    DeviceInfo.NumThreads[device_id] = DeviceInfo.ThreadsPerBlock[device_id];
    DP("Default number of threads exceeds device limit, capping at %d\n",
       DeviceInfo.ThreadsPerBlock[device_id]);
  }

  return OFFLOAD_SUCCESS;
}

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
                                          __tgt_device_image *image) {

  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting a CUDA context for device %d\n", device_id);
    CUDA_ERR_STRING(err);
    return NULL;
  }

  // Clear the offload table as we are going to create a new one.
  DeviceInfo.clearOffloadEntriesTable(device_id);

  // Create the module and extract the function pointers.
  CUmodule cumod;
  DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart));
  err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL);
  if (err != CUDA_SUCCESS) {
    DP("Error when loading CUDA module\n");
    CUDA_ERR_STRING(err);
    return NULL;
  }

  DP("CUDA module successfully loaded!\n");
  DeviceInfo.Modules.push_back(cumod);

  // Find the symbols in the module by name.
  __tgt_offload_entry *HostBegin = image->EntriesBegin;
  __tgt_offload_entry *HostEnd = image->EntriesEnd;

  for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {

    if (!e->addr) {
      // Return NULL in this case; the host should always provide a non-null
      // address to uniquely identify the target region.
      DP("Invalid binary: host entry '<null>' (size = %zd)...\n", e->size);

      return NULL;
    }

    if (e->size) {
      __tgt_offload_entry entry = *e;

      CUdeviceptr cuptr;
      size_t cusize;
      err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name);

      if (err != CUDA_SUCCESS) {
        DP("Loading global '%s' (Failed)\n", e->name);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      if (cusize != e->size) {
        DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name,
           cusize, e->size);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
         DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr));
      entry.addr = (void *)cuptr;

      // Note: In the current implementation declare target variables
      // can either be link or to. This means that once unified
      // memory is activated via the requires directive, the variable
      // can be used directly from the host in both cases.
      // TODO: when variable types other than to or link are added,
      // the below condition should be changed to explicitly
      // check for to and link variable types:
      // (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
      //  (e->flags & OMP_DECLARE_TARGET_LINK ||
      //   e->flags == OMP_DECLARE_TARGET_TO))
      if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
        // If unified memory is present any target link or to variables
        // can access host addresses directly. There is no longer a
        // need for device copies.
        cuMemcpyHtoD(cuptr, e->addr, sizeof(void *));
        DP("Copy linked variable host address (" DPxMOD ") "
           "to device address (" DPxMOD ")\n",
           DPxPTR(*((void **)e->addr)), DPxPTR(cuptr));
      }

      DeviceInfo.addOffloadEntry(device_id, entry);

      continue;
    }

    CUfunction fun;
    err = cuModuleGetFunction(&fun, cumod, e->name);

    if (err != CUDA_SUCCESS) {
      DP("Loading '%s' (Failed)\n", e->name);
      CUDA_ERR_STRING(err);
      return NULL;
    }

    DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
       DPxPTR(e - HostBegin), e->name, DPxPTR(fun));

    // default value GENERIC (in case symbol is missing from cubin file)
    int8_t ExecModeVal = ExecutionModeType::GENERIC;
    std::string ExecModeNameStr(e->name);
    ExecModeNameStr += "_exec_mode";
    const char *ExecModeName = ExecModeNameStr.c_str();

    CUdeviceptr ExecModePtr;
    size_t cusize;
    err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName);
    if (err == CUDA_SUCCESS) {
      if ((size_t)cusize != sizeof(int8_t)) {
        DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
           ExecModeName, cusize, sizeof(int8_t));
        CUDA_ERR_STRING(err);
        return NULL;
      }

      err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize);
      if (err != CUDA_SUCCESS) {
        DP("Error when copying data from device to host. Pointers: "
           "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
           DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      if (ExecModeVal < 0 || ExecModeVal > 1) {
        DP("Error: wrong exec_mode value specified in cubin file: %d\n",
           ExecModeVal);
        return NULL;
      }
    } else {
      DP("Loading global exec_mode '%s' - symbol missing, using default value "
         "GENERIC (1)\n", ExecModeName);
      CUDA_ERR_STRING(err);
    }

    KernelsList.push_back(KernelTy(fun, ExecModeVal));

    __tgt_offload_entry entry = *e;
    entry.addr = (void *)&KernelsList.back();
    DeviceInfo.addOffloadEntry(device_id, entry);
  }

  // send device environment data to the device
  {
    omptarget_device_environmentTy device_env;

    device_env.debug_level = 0;

#ifdef OMPTARGET_DEBUG
    if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) {
      device_env.debug_level = std::stoi(envStr);
    }
#endif

    const char *device_env_Name = "omptarget_device_environment";
    CUdeviceptr device_env_Ptr;
    size_t cusize;

    err = cuModuleGetGlobal(&device_env_Ptr, &cusize, cumod, device_env_Name);

    if (err == CUDA_SUCCESS) {
      if ((size_t)cusize != sizeof(device_env)) {
        DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n",
           device_env_Name, cusize, sizeof(device_env));
        CUDA_ERR_STRING(err);
        return NULL;
      }

      err = cuMemcpyHtoD(device_env_Ptr, &device_env, cusize);
      if (err != CUDA_SUCCESS) {
        DP("Error when copying data from host to device. Pointers: "
           "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
           DPxPTR(&device_env), DPxPTR(device_env_Ptr), cusize);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      DP("Sending global device environment data %zu bytes\n", (size_t)cusize);
    } else {
      DP("Finding global device environment '%s' - symbol missing.\n",
         device_env_Name);
      DP("Continuing anyway; this is presumably a device RTL that does not "
         "accept an environment setting.\n");
    }
  }

  return DeviceInfo.getOffloadEntriesTable(device_id);
}

void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
  if (size == 0) {
    return NULL;
  }

  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error while trying to set CUDA current context\n");
    CUDA_ERR_STRING(err);
    return NULL;
  }

  CUdeviceptr ptr;
  err = cuMemAlloc(&ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error while trying to allocate device memory (error code %d)\n", err);
    CUDA_ERR_STRING(err);
    return NULL;
  }

  void *vptr = (void *)ptr;
  return vptr;
}

int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
                              int64_t size) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error when copying data from host to device. Pointers: host = " DPxMOD
       ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
       DPxPTR(tgt_ptr), size);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
                                int64_t size) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error when copying data from device to host. Pointers: host = " DPxMOD
       ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr),
       DPxPTR(tgt_ptr), size);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemFree((CUdeviceptr)tgt_ptr);
  if (err != CUDA_SUCCESS) {
    DP("Error when freeing CUDA memory\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount) {
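  // tgt_entry_ptr is the address of the KernelTy that __tgt_rtl_load_binary
  // registered in the offload entries table; tgt_args/tgt_offsets describe
  // the arg_num kernel arguments that are adjusted and marshalled below.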

  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // All args are references.
  std::vector<void *> args(arg_num);
  std::vector<void *> ptrs(arg_num);

  for (int32_t i = 0; i < arg_num; ++i) {
    ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
    args[i] = &ptrs[i];
  }

  KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;

  int cudaThreadsPerBlock;

  if (thread_limit > 0) {
    cudaThreadsPerBlock = thread_limit;
    DP("Setting CUDA threads per block to requested %d\n", thread_limit);
    // Add master warp if necessary
    if (KernelInfo->ExecutionMode == GENERIC) {
      cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
      DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
    }
  } else {
    cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
    DP("Setting CUDA threads per block to default %d\n",
       DeviceInfo.NumThreads[device_id]);
  }

  if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
    cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
    DP("Threads per block capped at device limit %d\n",
       DeviceInfo.ThreadsPerBlock[device_id]);
  }

  int kernel_limit;
  err = cuFuncGetAttribute(&kernel_limit,
      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func);
  if (err == CUDA_SUCCESS) {
    if (kernel_limit < cudaThreadsPerBlock) {
      cudaThreadsPerBlock = kernel_limit;
      DP("Threads per block capped at kernel limit %d\n", kernel_limit);
    }
  }

  int cudaBlocksPerGrid;
  if (team_num <= 0) {
    if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
      if (KernelInfo->ExecutionMode == SPMD) {
        // We have a combined construct, i.e. `target teams distribute
        // parallel for [simd]`. We launch enough teams so that each thread
        // will execute one iteration of the loop.
        // Round up to the nearest integer.
        cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
      } else {
        // If we reach this point, then we have a non-combined construct, i.e.
        // `teams distribute` with a nested `parallel for`, and each team is
        // assigned one iteration of the `distribute` loop. E.g.:
        //
        // #pragma omp target teams distribute
        // for(...loop_tripcount...) {
        //   #pragma omp parallel for
        //   for(...) {}
        // }
        //
        // Threads within a team will execute the iterations of the `parallel`
        // loop.
        cudaBlocksPerGrid = loop_tripcount;
      }
      DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
         "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,
         cudaThreadsPerBlock);
    } else {
      cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id];
      DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]);
    }
  } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) {
    cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id];
    DP("Capping number of teams to team limit %d\n",
       DeviceInfo.BlocksPerGrid[device_id]);
  } else {
    cudaBlocksPerGrid = team_num;
    DP("Using requested number of teams %d\n", team_num);
  }
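  // Worked example (hypothetical values): in SPMD mode with team_num <= 0,
  // no OMP_NUM_TEAMS set, loop_tripcount = 10000 and cudaThreadsPerBlock =
  // 128, the computation above yields ((10000 - 1) / 128) + 1 = 79 blocks,
  // i.e. roughly one loop iteration per CUDA thread.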

  // Run on the device.
  DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
     cudaThreadsPerBlock);

  err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,
                       cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/,
                       0, &args[0], 0);
  if (err != CUDA_SUCCESS) {
    DP("Device kernel launch failed!\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  DP("Launch of entry point at " DPxMOD " successful!\n",
     DPxPTR(tgt_entry_ptr));

  CUresult sync_err = cuCtxSynchronize();
  if (sync_err != CUDA_SUCCESS) {
    DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr));
    CUDA_ERR_STRING(sync_err);
    return OFFLOAD_FAIL;
  } else {
    DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr));
  }

  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
  // use one team and the default number of threads.
  const int32_t team_num = 1;
  const int32_t thread_limit = 0;
  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
      tgt_offsets, arg_num, team_num, thread_limit, 0);
}

#ifdef __cplusplus
}
#endif
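
// Typical call sequence from libomptarget into this plugin (illustrative
// sketch only; the host runtime drives these calls and the exact order may
// differ):
//   __tgt_rtl_is_valid_binary -> __tgt_rtl_number_of_devices ->
//   __tgt_rtl_init_requires -> __tgt_rtl_init_device ->
//   __tgt_rtl_load_binary -> __tgt_rtl_data_alloc / __tgt_rtl_data_submit ->
//   __tgt_rtl_run_target_region or __tgt_rtl_run_target_team_region ->
//   __tgt_rtl_data_retrieve -> __tgt_rtl_data_delete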