//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <atomic>
#include <cassert>
#include <cinttypes>
#include <cstddef>
#include <cstdlib>
#include <cuda.h>
#include <list>
#include <memory>
#include <string>
#include <vector>

#include "omptargetplugin.h"

#ifndef TARGET_NAME
#define TARGET_NAME CUDA
#endif

#ifdef OMPTARGET_DEBUG
static int DebugLevel = 0;

#define GETNAME2(name) #name
#define GETNAME(name) GETNAME2(name)
#define DP(...)                                                                \
  do {                                                                         \
    if (DebugLevel > 0) {                                                      \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__);              \
    }                                                                          \
  } while (false)

// Utility for retrieving and printing CUDA error string.
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    if (DebugLevel > 0) {                                                      \
      const char *errStr;                                                      \
      cuGetErrorString(err, &errStr);                                          \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", "CUDA error is: %s\n",     \
             errStr);                                                          \
    }                                                                          \
  } while (false)
#else // OMPTARGET_DEBUG
#define DP(...) {}
#define CUDA_ERR_STRING(err) {}
#endif // OMPTARGET_DEBUG

#include "../../common/elf_common.c"

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

enum ExecutionModeType {
  SPMD,    // constructors, destructors,
           // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // Execution mode of the kernel:
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

/// Device environment data.
/// Manually kept in sync with the deviceRTL side for now; move to a dedicated
/// header file later.
struct omptarget_device_environmentTy {
  int32_t debug_level;
};

/// List that contains all the kernels.
/// FIXME: we may need this to be per device and per library.
std::list<KernelTy> KernelsList;

/// Class containing all the device information.
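/// Per-device state (contexts, streams, entry tables and launch limits) is
/// kept in parallel vectors indexed by device id; they are sized in the
/// constructor once the number of CUDA devices is known.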
class RTLDeviceInfoTy {
  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
  std::vector<std::unique_ptr<std::atomic_uint>> NextStreamId;

public:
  int NumberOfDevices;
  std::vector<CUmodule> Modules;
  std::vector<CUcontext> Contexts;
  std::vector<std::vector<CUstream>> Streams;

  // Device properties
  std::vector<int> ThreadsPerBlock;
  std::vector<int> BlocksPerGrid;
  std::vector<int> WarpSize;

  // OpenMP properties
  std::vector<int> NumTeams;
  std::vector<int> NumThreads;

  // OpenMP Environment properties
  int EnvNumTeams;
  int EnvTeamLimit;
  int EnvNumStreams;

  // OpenMP Requires Flags
  int64_t RequiresFlags;

  //static int EnvNumThreads;
  static const int HardTeamLimit = 1 << 16; // 64k
  static const int HardThreadLimit = 1024;
  static const int DefaultNumTeams = 128;
  static const int DefaultNumThreads = 128;

  // Record entry point associated with device
  void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    E.Entries.push_back(entry);
  }

  // Return true if the entry is associated with device
  bool findOffloadEntry(int32_t device_id, void *addr) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    for (auto &it : E.Entries) {
      if (it.addr == addr)
        return true;
    }

    return false;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();

    int32_t size = E.Entries.size();

    // Table is empty
    if (!size)
      return 0;

    __tgt_offload_entry *begin = &E.Entries[0];
    __tgt_offload_entry *end = &E.Entries[size - 1];

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = begin;
    E.Table.EntriesEnd = ++end;

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(int32_t device_id) {
    assert(device_id < (int32_t)FuncGblEntries.size() &&
           "Unexpected device id!");
    FuncGblEntries[device_id].emplace_back();
    FuncOrGblEntryTy &E = FuncGblEntries[device_id].back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
  }

  // Get the next stream on a given device in a round robin manner
  CUstream &getNextStream(const int DeviceId) {
    assert(DeviceId >= 0 &&
           static_cast<size_t>(DeviceId) < NextStreamId.size() &&
           "Unexpected device id!");
    const unsigned int Id = NextStreamId[DeviceId]->fetch_add(1);
    return Streams[DeviceId][Id % EnvNumStreams];
  }

  RTLDeviceInfoTy() {
#ifdef OMPTARGET_DEBUG
    if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
      DebugLevel = std::stoi(envStr);
    }
#endif // OMPTARGET_DEBUG

    DP("Start initializing CUDA\n");

    CUresult err = cuInit(0);
    if (err != CUDA_SUCCESS) {
      DP("Error when initializing CUDA\n");
      CUDA_ERR_STRING(err);
      return;
    }

    NumberOfDevices = 0;

    err = cuDeviceGetCount(&NumberOfDevices);
    if (err != CUDA_SUCCESS) {
      DP("Error when getting CUDA device count\n");
      CUDA_ERR_STRING(err);
      return;
    }

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    FuncGblEntries.resize(NumberOfDevices);
    Contexts.resize(NumberOfDevices);
    Streams.resize(NumberOfDevices);
    NextStreamId.resize(NumberOfDevices);
    ThreadsPerBlock.resize(NumberOfDevices);
    BlocksPerGrid.resize(NumberOfDevices);
    WarpSize.resize(NumberOfDevices);
    NumTeams.resize(NumberOfDevices);
    NumThreads.resize(NumberOfDevices);

    // Get environment variables regarding teams
    char *envStr = getenv("OMP_TEAM_LIMIT");
    if (envStr) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(envStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    } else {
      EnvTeamLimit = -1;
    }
    envStr = getenv("OMP_NUM_TEAMS");
    if (envStr) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(envStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    } else {
      EnvNumTeams = -1;
    }

    // By default let's create 256 streams per device
    EnvNumStreams = 256;
    envStr = getenv("LIBOMPTARGET_NUM_STREAMS");
    if (envStr) {
      EnvNumStreams = std::stoi(envStr);
    }

    // Initialize streams for each device
    for (std::vector<CUstream> &S : Streams) {
      S.resize(EnvNumStreams);
    }

    // Initialize the next stream id
    for (std::unique_ptr<std::atomic_uint> &Ptr : NextStreamId) {
      Ptr = std::make_unique<std::atomic_uint>(0);
    }

    // Default state.
    RequiresFlags = OMP_REQ_UNDEFINED;
  }

  ~RTLDeviceInfoTy() {
    // Close modules
    for (auto &module : Modules)
      if (module) {
        CUresult err = cuModuleUnload(module);
        if (err != CUDA_SUCCESS) {
          DP("Error when unloading CUDA module\n");
          CUDA_ERR_STRING(err);
        }
      }

    // Destroy streams before contexts
    for (int I = 0; I < NumberOfDevices; ++I) {
      CUresult err = cuCtxSetCurrent(Contexts[I]);
      if (err != CUDA_SUCCESS) {
        DP("Error when setting current CUDA context\n");
        CUDA_ERR_STRING(err);
      }

      for (auto &S : Streams[I])
        if (S) {
          err = cuStreamDestroy(S);
          if (err != CUDA_SUCCESS) {
            DP("Error when destroying CUDA stream\n");
            CUDA_ERR_STRING(err);
          }
        }
    }

    // Destroy contexts
    for (auto &ctx : Contexts)
      if (ctx) {
        CUresult err = cuCtxDestroy(ctx);
        if (err != CUDA_SUCCESS) {
          DP("Error when destroying CUDA context\n");
          CUDA_ERR_STRING(err);
        }
      }
  }
};

static RTLDeviceInfoTy DeviceInfo;

#ifdef __cplusplus
extern "C" {
#endif

int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) {
  return elf_check_machine(image, 190); // EM_CUDA = 190.
}

int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; }

int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  DP("Init requires flags to %ld\n", RequiresFlags);
  DeviceInfo.RequiresFlags = RequiresFlags;
  return RequiresFlags;
}

int32_t __tgt_rtl_init_device(int32_t device_id) {

  CUdevice cuDevice;
  DP("Getting device %d\n", device_id);
  CUresult err = cuDeviceGet(&cuDevice, device_id);
  if (err != CUDA_SUCCESS) {
    DP("Error when getting CUDA device with id = %d\n", device_id);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // Create the context and save it to use whenever this device is selected.
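  // CU_CTX_SCHED_BLOCKING_SYNC makes the host thread block on a
  // synchronization primitive rather than spin while it waits for work on the
  // device to complete.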
  err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC,
                    cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error when creating a CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting current CUDA context\n");
    CUDA_ERR_STRING(err);
  }

  for (CUstream &Stream : DeviceInfo.Streams[device_id]) {
    err = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING);
    if (err != CUDA_SUCCESS) {
      DP("Error when creating CUDA stream\n");
      CUDA_ERR_STRING(err);
    }
  }

  // Query attributes to determine number of threads/block and blocks/grid.
  int maxGridDimX;
  err = cuDeviceGetAttribute(&maxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                             cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error getting max grid dimension, use default\n");
    DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
  } else if (maxGridDimX <= RTLDeviceInfoTy::HardTeamLimit) {
    DeviceInfo.BlocksPerGrid[device_id] = maxGridDimX;
    DP("Using %d CUDA blocks per grid\n", maxGridDimX);
  } else {
    DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit;
    DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
       "at the hard limit\n",
       maxGridDimX, RTLDeviceInfoTy::HardTeamLimit);
  }

  // We are only exploiting threads along the x axis.
  int maxBlockDimX;
  err = cuDeviceGetAttribute(&maxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
                             cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error getting max block dimension, use default\n");
    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
  } else if (maxBlockDimX <= RTLDeviceInfoTy::HardThreadLimit) {
    DeviceInfo.ThreadsPerBlock[device_id] = maxBlockDimX;
    DP("Using %d CUDA threads per block\n", maxBlockDimX);
  } else {
    DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit;
    DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
       "capping at the hard limit\n",
       maxBlockDimX, RTLDeviceInfoTy::HardThreadLimit);
  }

  int warpSize;
  err =
      cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuDevice);
  if (err != CUDA_SUCCESS) {
    DP("Error getting warp size, assume default\n");
    DeviceInfo.WarpSize[device_id] = 32;
  } else {
    DeviceInfo.WarpSize[device_id] = warpSize;
  }

  // Adjust teams to the env variables
  if (DeviceInfo.EnvTeamLimit > 0 &&
      DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) {
    DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit;
    DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
       DeviceInfo.EnvTeamLimit);
  }

  DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
     DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id],
     DeviceInfo.WarpSize[device_id]);

  // Set default number of teams
  if (DeviceInfo.EnvNumTeams > 0) {
    DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams;
    DP("Default number of teams set according to environment %d\n",
       DeviceInfo.EnvNumTeams);
  } else {
    DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams;
    DP("Default number of teams set according to library's default %d\n",
       RTLDeviceInfoTy::DefaultNumTeams);
  }
  if (DeviceInfo.NumTeams[device_id] >
      DeviceInfo.BlocksPerGrid[device_id]) {
    DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id];
    DP("Default number of teams exceeds device limit, capping at %d\n",
       DeviceInfo.BlocksPerGrid[device_id]);
  }

  // Set default number of threads
  DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads;
  DP("Default number of threads set according to library's default %d\n",
     RTLDeviceInfoTy::DefaultNumThreads);
  if (DeviceInfo.NumThreads[device_id] >
      DeviceInfo.ThreadsPerBlock[device_id]) {
    DeviceInfo.NumThreads[device_id] = DeviceInfo.ThreadsPerBlock[device_id];
    DP("Default number of threads exceeds device limit, capping at %d\n",
       DeviceInfo.ThreadsPerBlock[device_id]);
  }

  return OFFLOAD_SUCCESS;
}

__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
                                          __tgt_device_image *image) {

  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting a CUDA context for device %d\n", device_id);
    CUDA_ERR_STRING(err);
    return NULL;
  }

  // Clear the offload table as we are going to create a new one.
  DeviceInfo.clearOffloadEntriesTable(device_id);

  // Create the module and extract the function pointers.

  CUmodule cumod;
  DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart));
  err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL);
  if (err != CUDA_SUCCESS) {
    DP("Error when loading CUDA module\n");
    CUDA_ERR_STRING(err);
    return NULL;
  }

  DP("CUDA module successfully loaded!\n");
  DeviceInfo.Modules.push_back(cumod);

  // Find the symbols in the module by name.
  __tgt_offload_entry *HostBegin = image->EntriesBegin;
  __tgt_offload_entry *HostEnd = image->EntriesEnd;

  for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) {

    if (!e->addr) {
      // We return NULL when something like this happens; the host should
      // always have an address that uniquely identifies the target region.
      DP("Invalid binary: host entry '<null>' (size = %zd)...\n", e->size);

      return NULL;
    }

    if (e->size) {
      __tgt_offload_entry entry = *e;

      CUdeviceptr cuptr;
      size_t cusize;
      err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name);

      if (err != CUDA_SUCCESS) {
        DP("Loading global '%s' (Failed)\n", e->name);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      if (cusize != e->size) {
        DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name,
           cusize, e->size);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
         DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr));
      entry.addr = (void *)cuptr;

      // Note: In the current implementation declare target variables
      // can either be link or to. This means that once unified
      // memory is activated via the requires directive, the variable
      // can be used directly from the host in both cases.
      // TODO: when variable types other than to or link are added,
      // the below condition should be changed to explicitly
      // check for to and link variable types:
      //  (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
      //   (e->flags & OMP_DECLARE_TARGET_LINK ||
      //    e->flags == OMP_DECLARE_TARGET_TO))
      if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
        // If unified memory is present any target link or to variables
        // can access host addresses directly. There is no longer a
        // need for device copies.
        cuMemcpyHtoD(cuptr, e->addr, sizeof(void *));
        DP("Copy linked variable host address (" DPxMOD ") "
           "to device address (" DPxMOD ")\n",
           DPxPTR(*((void **)e->addr)), DPxPTR(cuptr));
      }

      DeviceInfo.addOffloadEntry(device_id, entry);

      continue;
    }

    CUfunction fun;
    err = cuModuleGetFunction(&fun, cumod, e->name);

    if (err != CUDA_SUCCESS) {
      DP("Loading '%s' (Failed)\n", e->name);
      CUDA_ERR_STRING(err);
      return NULL;
    }

    DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
       DPxPTR(e - HostBegin), e->name, DPxPTR(fun));

    // Default value GENERIC (in case the symbol is missing from the cubin
    // file).
    int8_t ExecModeVal = ExecutionModeType::GENERIC;
    std::string ExecModeNameStr(e->name);
    ExecModeNameStr += "_exec_mode";
    const char *ExecModeName = ExecModeNameStr.c_str();

    CUdeviceptr ExecModePtr;
    size_t cusize;
    err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName);
    if (err == CUDA_SUCCESS) {
      if ((size_t)cusize != sizeof(int8_t)) {
        DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
           ExecModeName, cusize, sizeof(int8_t));
        CUDA_ERR_STRING(err);
        return NULL;
      }

      err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize);
      if (err != CUDA_SUCCESS) {
        DP("Error when copying data from device to host. Pointers: "
           "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
           DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      if (ExecModeVal < 0 || ExecModeVal > 1) {
        DP("Error wrong exec_mode value specified in cubin file: %d\n",
           ExecModeVal);
        return NULL;
      }
    } else {
      DP("Loading global exec_mode '%s' - symbol missing, using default value "
         "GENERIC (1)\n", ExecModeName);
      CUDA_ERR_STRING(err);
    }

    KernelsList.push_back(KernelTy(fun, ExecModeVal));

    __tgt_offload_entry entry = *e;
    entry.addr = (void *)&KernelsList.back();
    DeviceInfo.addOffloadEntry(device_id, entry);
  }

  // Send device environment data to the device.
  {
    omptarget_device_environmentTy device_env;

    device_env.debug_level = 0;

#ifdef OMPTARGET_DEBUG
    if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) {
      device_env.debug_level = std::stoi(envStr);
    }
#endif

    const char *device_env_Name = "omptarget_device_environment";
    CUdeviceptr device_env_Ptr;
    size_t cusize;

    err = cuModuleGetGlobal(&device_env_Ptr, &cusize, cumod, device_env_Name);

    if (err == CUDA_SUCCESS) {
      if ((size_t)cusize != sizeof(device_env)) {
        DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n",
           device_env_Name, cusize, sizeof(device_env));
        CUDA_ERR_STRING(err);
        return NULL;
      }

      err = cuMemcpyHtoD(device_env_Ptr, &device_env, cusize);
      if (err != CUDA_SUCCESS) {
        DP("Error when copying data from host to device. Pointers: "
           "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
           DPxPTR(&device_env), DPxPTR(device_env_Ptr), cusize);
        CUDA_ERR_STRING(err);
        return NULL;
      }

      DP("Sending global device environment data %zu bytes\n", (size_t)cusize);
    } else {
      DP("Finding global device environment '%s' - symbol missing.\n",
         device_env_Name);
      DP("Continue, considering this is a device RTL which does not accept "
         "environment setting.\n");
    }
  }

  return DeviceInfo.getOffloadEntriesTable(device_id);
}

void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
  if (size == 0) {
    return NULL;
  }

  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error while trying to set CUDA current context\n");
    CUDA_ERR_STRING(err);
    return NULL;
  }

  CUdeviceptr ptr;
  err = cuMemAlloc(&ptr, size);
  if (err != CUDA_SUCCESS) {
    DP("Error while trying to allocate %d\n", err);
    CUDA_ERR_STRING(err);
    return NULL;
  }

  void *vptr = (void *)ptr;
  return vptr;
}

int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr,
                              int64_t size) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  CUstream &Stream = DeviceInfo.getNextStream(device_id);

  err = cuMemcpyHtoDAsync((CUdeviceptr)tgt_ptr, hst_ptr, size, Stream);
  if (err != CUDA_SUCCESS) {
    DP("Error when copying data from host to device. Pointers: host = " DPxMOD
       ", device = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuStreamSynchronize(Stream);
  if (err != CUDA_SUCCESS) {
    DP("Error when synchronizing async data transfer from host to device. "
       "Pointers: host = " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr,
                                int64_t size) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  CUstream &Stream = DeviceInfo.getNextStream(device_id);

  err = cuMemcpyDtoHAsync(hst_ptr, (CUdeviceptr)tgt_ptr, size, Stream);
  if (err != CUDA_SUCCESS) {
    DP("Error when copying data from device to host. Pointers: host = " DPxMOD
       ", device = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuStreamSynchronize(Stream);
  if (err != CUDA_SUCCESS) {
    DP("Error when synchronizing async data transfer from device to host. "
       "Pointers: host = " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
       DPxPTR(hst_ptr), DPxPTR(tgt_ptr), size);
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  err = cuMemFree((CUdeviceptr)tgt_ptr);
  if (err != CUDA_SUCCESS) {
    DP("Error when freeing CUDA memory\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
    int32_t thread_limit, uint64_t loop_tripcount) {
  // Set the context we are using.
  CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
  if (err != CUDA_SUCCESS) {
    DP("Error when setting CUDA context\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  // All args are references.
  std::vector<void *> args(arg_num);
  std::vector<void *> ptrs(arg_num);

  for (int32_t i = 0; i < arg_num; ++i) {
    ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
    args[i] = &ptrs[i];
  }

  KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;

  int cudaThreadsPerBlock;

  if (thread_limit > 0) {
    cudaThreadsPerBlock = thread_limit;
    DP("Setting CUDA threads per block to requested %d\n", thread_limit);
    // Add master warp if necessary
    if (KernelInfo->ExecutionMode == GENERIC) {
      cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id];
      DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]);
    }
  } else {
    cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id];
    DP("Setting CUDA threads per block to default %d\n",
       DeviceInfo.NumThreads[device_id]);
  }

  if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) {
    cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id];
    DP("Threads per block capped at device limit %d\n",
       DeviceInfo.ThreadsPerBlock[device_id]);
  }

  int kernel_limit;
  err = cuFuncGetAttribute(&kernel_limit,
                           CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                           KernelInfo->Func);
  if (err == CUDA_SUCCESS) {
    if (kernel_limit < cudaThreadsPerBlock) {
      cudaThreadsPerBlock = kernel_limit;
      DP("Threads per block capped at kernel limit %d\n", kernel_limit);
    }
  }

  int cudaBlocksPerGrid;
  if (team_num <= 0) {
    if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) {
      if (KernelInfo->ExecutionMode == SPMD) {
        // We have a combined construct, i.e. `target teams distribute parallel
        // for [simd]`. We launch enough teams so that each thread will
        // execute one iteration of the loop.
        // Round up to the nearest integer.
        cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1;
      } else {
        // If we reach this point, then we have a non-combined construct, i.e.
        // `teams distribute` with a nested `parallel for` and each team is
        // assigned one iteration of the `distribute` loop. E.g.:
        //
        // #pragma omp target teams distribute
        // for(...loop_tripcount...) {
        //   #pragma omp parallel for
        //   for(...) {}
        // }
        //
        // Threads within a team will execute the iterations of the `parallel`
        // loop.
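        // Each team therefore gets exactly one iteration of the `distribute`
        // loop, so the number of blocks equals the trip count (e.g. a trip
        // count of 1024 launches 1024 teams).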
        cudaBlocksPerGrid = loop_tripcount;
      }
      DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
         "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount,
         cudaThreadsPerBlock);
    } else {
      cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id];
      DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]);
    }
  } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) {
    cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id];
    DP("Capping number of teams to team limit %d\n",
       DeviceInfo.BlocksPerGrid[device_id]);
  } else {
    cudaBlocksPerGrid = team_num;
    DP("Using requested number of teams %d\n", team_num);
  }

  // Run on the device.
  DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid,
     cudaThreadsPerBlock);

  CUstream &Stream = DeviceInfo.getNextStream(device_id);

  err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1,
                       cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/,
                       Stream, &args[0], 0);
  if (err != CUDA_SUCCESS) {
    DP("Device kernel launch failed!\n");
    CUDA_ERR_STRING(err);
    return OFFLOAD_FAIL;
  }

  DP("Launch of entry point at " DPxMOD " successful!\n",
     DPxPTR(tgt_entry_ptr));

  CUresult sync_err = cuStreamSynchronize(Stream);
  if (sync_err != CUDA_SUCCESS) {
    DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr));
    CUDA_ERR_STRING(sync_err);
    return OFFLOAD_FAIL;
  } else {
    DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr));
  }

  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
    void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
  // Use one team and the default number of threads.
  const int32_t team_num = 1;
  const int32_t thread_limit = 0;
  return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
                                          tgt_offsets, arg_num, team_num,
                                          thread_limit, 0);
}

#ifdef __cplusplus
}
#endif