//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <algorithm>
#include <cassert>
#include <cinttypes>
#include <cstddef>
#include <cstdlib>
#include <cuda.h>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

#include "omptargetplugin.h"

#ifndef TARGET_NAME
#define TARGET_NAME CUDA
#endif

#ifdef OMPTARGET_DEBUG
static int DebugLevel = 0;

#define GETNAME2(name) #name
#define GETNAME(name) GETNAME2(name)
#define DP(...)                                                                \
  do {                                                                         \
    if (DebugLevel > 0) {                                                      \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__);              \
    }                                                                          \
  } while (false)

// Utility for retrieving and printing the CUDA error string.
#define CUDA_ERR_STRING(err)                                                   \
  do {                                                                         \
    if (DebugLevel > 0) {                                                      \
      const char *errStr;                                                      \
      cuGetErrorString(err, &errStr);                                          \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", "CUDA error is: %s\n",     \
             errStr);                                                          \
    }                                                                          \
  } while (false)
#else // OMPTARGET_DEBUG
#define DP(...) {}
#define CUDA_ERR_STRING(err) {}
#endif // OMPTARGET_DEBUG
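
// The debug machinery above is only active when the plugin is built with
// OMPTARGET_DEBUG; DebugLevel is read from LIBOMPTARGET_DEBUG in the
// DeviceRTLTy constructor below. An illustrative invocation (the binary name
// is hypothetical):
//   LIBOMPTARGET_DEBUG=1 ./offload_app
// passes the prefix "Target CUDA RTL" to DEBUGP for every message.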

#include "../../common/elf_common.c"

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

enum ExecutionModeType {
  SPMD,    // constructors, destructors,
           // combined constructs (`teams distribute parallel for [simd]`)
  GENERIC, // everything else
  NONE
};

/// Use a single entity to encode a kernel and a set of flags.
struct KernelTy {
  CUfunction Func;

  // Execution mode of the kernel:
  // 0 - SPMD mode (without master warp)
  // 1 - Generic mode (with master warp)
  int8_t ExecutionMode;

  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
      : Func(_Func), ExecutionMode(_ExecutionMode) {}
};

/// Device environment data.
/// Manually kept in sync with the deviceRTL side for now; move to a dedicated
/// header file later.
struct omptarget_device_environmentTy {
  int32_t debug_level;
};

/// List that contains all the kernels.
/// FIXME: we may need this to be per device and per library.
std::list<KernelTy> KernelsList;

namespace {
bool checkResult(CUresult Err, const char *ErrMsg) {
  if (Err == CUDA_SUCCESS)
    return true;

  DP(ErrMsg);
  CUDA_ERR_STRING(Err);
  return false;
}

// Structure containing per-device data.
struct DeviceDataTy {
  std::list<FuncOrGblEntryTy> FuncGblEntries;
  CUcontext Context = nullptr;
  // Device properties
  int ThreadsPerBlock = 0;
  int BlocksPerGrid = 0;
  int WarpSize = 0;
  // OpenMP properties
  int NumTeams = 0;
  int NumThreads = 0;
};

class StreamManagerTy {
  int NumberOfDevices;
  // The initial size of the stream pool
  int EnvNumInitialStreams;
  // Per-device stream mutex
  std::vector<std::unique_ptr<std::mutex>> StreamMtx;
  // Per-device stream id indicating the next available stream in the pool
  std::vector<int> NextStreamId;
  // Per-device stream pool
  std::vector<std::vector<CUstream>> StreamPool;
  // Reference to per-device data
  std::vector<DeviceDataTy> &DeviceData;

  // If there is no CUstream left in the pool, we resize the pool to allocate
  // more CUstreams. This function must be called while holding the per-device
  // mutex, and the pool is never shrunk.
  void resizeStreamPool(const int DeviceId, const size_t NewSize) {
    std::vector<CUstream> &Pool = StreamPool[DeviceId];
    const size_t CurrentSize = Pool.size();
    assert(NewSize > CurrentSize && "new size is not larger than current size");

    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) {
      // Return early if we cannot switch to the right context, so that we do
      // not create a bunch of streams that do not belong to the right device.
      // The offloading will fail later because the selected CUstream is
      // nullptr.
      return;
    }

    Pool.resize(NewSize, nullptr);

    for (size_t I = CurrentSize; I < NewSize; ++I) {
      checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING),
                  "Error returned from cuStreamCreate\n");
    }
  }

public:
  StreamManagerTy(const int NumberOfDevices,
                  std::vector<DeviceDataTy> &DeviceData)
      : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32),
        DeviceData(DeviceData) {
    StreamPool.resize(NumberOfDevices);
    NextStreamId.resize(NumberOfDevices);
    StreamMtx.resize(NumberOfDevices);

    if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS"))
      EnvNumInitialStreams = std::stoi(EnvStr);

    // Initialize the next stream id
    std::fill(NextStreamId.begin(), NextStreamId.end(), 0);

    // Initialize the stream mutexes
    for (std::unique_ptr<std::mutex> &Ptr : StreamMtx)
      Ptr = std::make_unique<std::mutex>();
  }

  ~StreamManagerTy() {
    // Destroy streams
    for (int I = 0; I < NumberOfDevices; ++I) {
      checkResult(cuCtxSetCurrent(DeviceData[I].Context),
                  "Error returned from cuCtxSetCurrent\n");

      for (CUstream &S : StreamPool[I]) {
        if (S)
          checkResult(cuStreamDestroy(S),
                      "Error returned from cuStreamDestroy\n");
      }
    }
  }

  // Get a CUstream from the pool. The per-device next stream id always points
  // to the next available CUstream. That is, CUstreams [0, id-1] have been
  // assigned and [id, ...) are still available. If there is no CUstream left,
  // we request more CUstreams from the CUDA RT. Each time a CUstream is
  // assigned, the id increases by one.
  //   xxxxxs+++++++++
  //        ^
  //        id
  // After assignment, the pool becomes the following and s is assigned.
  //   xxxxxs+++++++++
  //         ^
  //         id
  CUstream getStream(const int DeviceId) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    // No CUstream left in the pool, we need to request more from the CUDA RT
    if (Id == StreamPool[DeviceId].size()) {
      // By default we double the stream pool every time
      resizeStreamPool(DeviceId, Id * 2);
    }
    return StreamPool[DeviceId][Id++];
  }

  // Return a CUstream back to the pool. As mentioned above, the per-device
  // next stream id always points to the next available CUstream, so when we
  // return a CUstream, we first decrease the id and then copy the CUstream
  // back.
  // It is worth noting that streams may be returned in a different order than
  // they were assigned, so at some point there may be two identical CUstreams
  // in the pool.
  //   xxax+a+++++
  //       ^
  //       id
  // However, this does not matter, because they are always on the two sides
  // of id. The one on the left will eventually be overwritten by another
  // returned CUstream. Therefore, after several executions, the order of the
  // pool may differ from its initial state.
  void returnStream(const int DeviceId, CUstream Stream) {
    const std::lock_guard<std::mutex> Lock(*StreamMtx[DeviceId]);
    int &Id = NextStreamId[DeviceId];
    assert(Id > 0 && "Wrong stream ID");
    StreamPool[DeviceId][--Id] = Stream;
  }

  bool initializeDeviceStreamPool(const int DeviceId) {
    assert(StreamPool[DeviceId].empty() && "stream pool has been initialized");

    resizeStreamPool(DeviceId, EnvNumInitialStreams);

    // Check the size of the stream pool
    if (StreamPool[DeviceId].size() != EnvNumInitialStreams)
      return false;

    // Check whether each stream is valid
    for (CUstream &S : StreamPool[DeviceId])
      if (!S)
        return false;

    return true;
  }
};
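
// Illustrative use of the pool above (a sketch, not code that runs here; the
// names SM and Dev are hypothetical):
//   CUstream S = SM.getStream(Dev);   // NextStreamId[Dev]: 0 -> 1
//   /* enqueue async copies / a kernel launch on S */
//   SM.returnStream(Dev, S);          // NextStreamId[Dev]: 1 -> 0
// With the default EnvNumInitialStreams of 32, once 32 streams are checked
// out the next getStream() call doubles the pool via resizeStreamPool(Dev, 64).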

class DeviceRTLTy {
  int NumberOfDevices;
  // OpenMP environment properties
  int EnvNumTeams;
  int EnvTeamLimit;
  // OpenMP requires flags
  int64_t RequiresFlags;

  static constexpr const int HardTeamLimit = 1U << 16U; // 64k
  static constexpr const int HardThreadLimit = 1024;
  static constexpr const int DefaultNumTeams = 128;
  static constexpr const int DefaultNumThreads = 128;

  std::unique_ptr<StreamManagerTy> StreamManager;
  std::vector<DeviceDataTy> DeviceData;
  std::vector<CUmodule> Modules;

  // Record entry point associated with device
  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.push_back(entry);
  }

  // Return true if the entry is associated with device
  bool findOffloadEntry(const int DeviceId, const void *Addr) const {
    for (const __tgt_offload_entry &Itr :
         DeviceData[DeviceId].FuncGblEntries.back().Entries)
      if (Itr.addr == Addr)
        return true;

    return false;
  }

  // Return the pointer to the target entries table
  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();

    if (E.Entries.empty())
      return nullptr;

    // Update table info according to the entries and return the pointer
    E.Table.EntriesBegin = E.Entries.data();
    E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();

    return &E.Table;
  }

  // Clear entries table for a device
  void clearOffloadEntriesTable(const int DeviceId) {
    DeviceData[DeviceId].FuncGblEntries.emplace_back();
    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
    E.Entries.clear();
    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
  }

  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const {
    assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr");

    if (!AsyncInfoPtr->Queue)
      AsyncInfoPtr->Queue = StreamManager->getStream(DeviceId);

    return reinterpret_cast<CUstream>(AsyncInfoPtr->Queue);
  }
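
  // Note on the helper above (an explanatory sketch; AsyncInfo is a
  // hypothetical caller-side variable): the stream is attached to the
  // __tgt_async_info lazily on first use and reused by every operation that
  // carries the same async info, e.g.
  //   __tgt_async_info AsyncInfo;                   // Queue == nullptr
  //   dataSubmit(Dev, Tgt, Hst, Size, &AsyncInfo);  // grabs a stream
  //   runTargetTeamRegion(Dev, ..., &AsyncInfo);    // reuses that stream
  //   synchronize(Dev, &AsyncInfo);                 // waits on the stream
  // synchronize() then hands the stream back to the StreamManager and resets
  // AsyncInfo.Queue to nullptr.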

public:
  // This class should not be copied
  DeviceRTLTy(const DeviceRTLTy &) = delete;
  DeviceRTLTy(DeviceRTLTy &&) = delete;

  DeviceRTLTy()
      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
        RequiresFlags(OMP_REQ_UNDEFINED) {
#ifdef OMPTARGET_DEBUG
    if (const char *EnvStr = getenv("LIBOMPTARGET_DEBUG"))
      DebugLevel = std::stoi(EnvStr);
#endif // OMPTARGET_DEBUG

    DP("Start initializing CUDA\n");

    CUresult Err = cuInit(0);
    if (!checkResult(Err, "Error returned from cuInit\n")) {
      return;
    }

    Err = cuDeviceGetCount(&NumberOfDevices);
    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
      return;

    if (NumberOfDevices == 0) {
      DP("There are no devices supporting CUDA.\n");
      return;
    }

    DeviceData.resize(NumberOfDevices);

    // Get environment variables regarding teams
    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
      // OMP_TEAM_LIMIT has been set
      EnvTeamLimit = std::stoi(EnvStr);
      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
    }
    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
      // OMP_NUM_TEAMS has been set
      EnvNumTeams = std::stoi(EnvStr);
      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
    }

    StreamManager =
        std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);
  }
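
  // The environment variables parsed above take effect in initDevice():
  // for instance (illustrative values), running with
  //   OMP_NUM_TEAMS=256 OMP_TEAM_LIMIT=4096
  // makes 256 the default team count per device and caps BlocksPerGrid at
  // 4096, provided the device limits permit it.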

  ~DeviceRTLTy() {
    // Destroy the stream manager first, so the contexts it relies on are not
    // destroyed before it
    StreamManager = nullptr;

    for (CUmodule &M : Modules)
      // Close module
      if (M)
        checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");

    for (DeviceDataTy &D : DeviceData) {
      // Destroy context
      if (D.Context)
        checkResult(cuCtxDestroy(D.Context),
                    "Error returned from cuCtxDestroy\n");
    }
  }

  // Check whether a given DeviceId is valid
  bool isValidDeviceId(const int DeviceId) const {
    return DeviceId >= 0 && DeviceId < NumberOfDevices;
  }

  int getNumOfDevices() const { return NumberOfDevices; }

  void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }

  int initDevice(const int DeviceId) {
    CUdevice Device;

    DP("Getting device %d\n", DeviceId);
    CUresult Err = cuDeviceGet(&Device, DeviceId);
    if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
      return OFFLOAD_FAIL;

    // Create the context and save it to use whenever this device is selected.
    Err = cuCtxCreate(&DeviceData[DeviceId].Context, CU_CTX_SCHED_BLOCKING_SYNC,
                      Device);
    if (!checkResult(Err, "Error returned from cuCtxCreate\n"))
      return OFFLOAD_FAIL;

    Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return OFFLOAD_FAIL;

    // Initialize the stream pool
    if (!StreamManager->initializeDeviceStreamPool(DeviceId))
      return OFFLOAD_FAIL;

    // Query attributes to determine the number of threads/block and
    // blocks/grid.
    int MaxGridDimX;
    Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                               Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max grid dimension, using default value %d\n",
         DeviceRTLTy::DefaultNumTeams);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
    } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) {
      DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
      DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
    } else {
      DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping "
         "at the hard limit\n",
         MaxGridDimX, DeviceRTLTy::HardTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit;
    }

    // We are only exploiting threads along the x axis.
    int MaxBlockDimX;
    Err = cuDeviceGetAttribute(&MaxBlockDimX,
                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting max block dimension, using default value %d\n",
         DeviceRTLTy::DefaultNumThreads);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
    } else if (MaxBlockDimX <= DeviceRTLTy::HardThreadLimit) {
      DP("Using %d CUDA threads per block\n", MaxBlockDimX);
      DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
    } else {
      DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
         "capping at the hard limit\n",
         MaxBlockDimX, DeviceRTLTy::HardThreadLimit);
      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
    }

    // Get and set the warp size
    int WarpSize;
    Err =
        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
    if (Err != CUDA_SUCCESS) {
      DP("Error getting warp size, assuming default value 32\n");
      DeviceData[DeviceId].WarpSize = 32;
    } else {
      DP("Using warp size %d\n", WarpSize);
      DeviceData[DeviceId].WarpSize = WarpSize;
    }

    // Adjust teams to the environment variables
    if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
      DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
         EnvTeamLimit);
      DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
    }

    DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
       DeviceData[DeviceId].BlocksPerGrid, DeviceData[DeviceId].ThreadsPerBlock,
       DeviceData[DeviceId].WarpSize);

    // Set the default number of teams
    if (EnvNumTeams > 0) {
      DP("Default number of teams set according to environment %d\n",
         EnvNumTeams);
      DeviceData[DeviceId].NumTeams = EnvNumTeams;
    } else {
      DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
      DP("Default number of teams set according to library's default %d\n",
         DeviceRTLTy::DefaultNumTeams);
    }

    if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
      DP("Default number of teams exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].BlocksPerGrid);
      DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
    }

    // Set the default number of threads
    DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
    DP("Default number of threads set according to library's default %d\n",
       DeviceRTLTy::DefaultNumThreads);
    if (DeviceData[DeviceId].NumThreads >
        DeviceData[DeviceId].ThreadsPerBlock) {
      DP("Default number of threads exceeds device limit, capping at %d\n",
         DeviceData[DeviceId].ThreadsPerBlock);
      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
    }

    return OFFLOAD_SUCCESS;
  }
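
  // Worked example of the capping logic above (illustrative numbers): a
  // device reporting maxGridDimX = 2147483647 and maxThreadsPerBlock = 1024
  // ends up with BlocksPerGrid = 65536 (HardTeamLimit) and ThreadsPerBlock =
  // 1024; adding OMP_TEAM_LIMIT=4096 lowers BlocksPerGrid to 4096, while
  // NumTeams and NumThreads default to 128 each unless OMP_NUM_TEAMS
  // overrides the former.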

  __tgt_target_table *loadBinary(const int DeviceId,
                                 const __tgt_device_image *Image) {
    // Set the context we are using
    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
      return nullptr;

    // Clear the offload table as we are going to create a new one.
    clearOffloadEntriesTable(DeviceId);

    // Create the module and extract the function pointers.
    CUmodule Module;
    DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
    Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
    if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
      return nullptr;

    DP("CUDA module successfully loaded!\n");

    Modules.push_back(Module);

    // Find the symbols in the module by name.
    const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
    const __tgt_offload_entry *HostEnd = Image->EntriesEnd;

    for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
      if (!E->addr) {
        // We return nullptr when this happens: the host should always provide
        // an address that uniquely identifies the target region.
        DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
        return nullptr;
      }

      if (E->size) {
        __tgt_offload_entry Entry = *E;
        CUdeviceptr CUPtr;
        size_t CUSize;
        Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
        // We keep this style here because we need the name
        if (Err != CUDA_SUCCESS) {
          DP("Loading global '%s' (Failed)\n", E->name);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (CUSize != E->size) {
          DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
             CUSize, E->size);
          return nullptr;
        }

        DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
           DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));

        Entry.addr = (void *)(CUPtr);

        // Note: In the current implementation declare target variables
        // can either be link or to. This means that once unified
        // memory is activated via the requires directive, the variable
        // can be used directly from the host in both cases.
        // TODO: when variable types other than to or link are added,
        // the below condition should be changed to explicitly
        // check for to and link variable types:
        // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
        //  (e->flags & OMP_DECLARE_TARGET_LINK ||
        //   e->flags == OMP_DECLARE_TARGET_TO))
        if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
          // If unified memory is present any target link or to variables
          // can access host addresses directly. There is no longer a
          // need for device copies.
          cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
          DP("Copy linked variable host address (" DPxMOD
             ") to device address (" DPxMOD ")\n",
             DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
        }

        addOffloadEntry(DeviceId, Entry);

        continue;
      }

      CUfunction Func;
      Err = cuModuleGetFunction(&Func, Module, E->name);
      // We keep this style here because we need the name
      if (Err != CUDA_SUCCESS) {
        DP("Loading '%s' (Failed)\n", E->name);
        CUDA_ERR_STRING(Err);
        return nullptr;
      }

      DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
         DPxPTR(E - HostBegin), E->name, DPxPTR(Func));

      // Default value is GENERIC (in case the symbol is missing from the
      // cubin file)
      int8_t ExecModeVal = ExecutionModeType::GENERIC;
      std::string ExecModeNameStr(E->name);
      ExecModeNameStr += "_exec_mode";
      const char *ExecModeName = ExecModeNameStr.c_str();

      CUdeviceptr ExecModePtr;
      size_t CUSize;
      Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
      if (Err == CUDA_SUCCESS) {
        if (CUSize != sizeof(int8_t)) {
          DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
             ExecModeName, CUSize, sizeof(int8_t));
          return nullptr;
        }

        Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
        if (Err != CUDA_SUCCESS) {
          DP("Error when copying data from device to host. Pointers: "
             "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
             DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
          CUDA_ERR_STRING(Err);
          return nullptr;
        }

        if (ExecModeVal < 0 || ExecModeVal > 1) {
          DP("Error: wrong exec_mode value specified in the cubin file: %d\n",
             ExecModeVal);
          return nullptr;
        }
      } else {
        DP("Loading global exec_mode '%s' - symbol missing, using default "
           "value GENERIC (1)\n",
           ExecModeName);
        CUDA_ERR_STRING(Err);
      }

      KernelsList.emplace_back(Func, ExecModeVal);

      __tgt_offload_entry Entry = *E;
      Entry.addr = &KernelsList.back();
      addOffloadEntry(DeviceId, Entry);
    }
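
    // To illustrate the lookup in the loop above (the entry name is
    // hypothetical): for a kernel entry named "__omp_offloading_42_foo_l10",
    // the plugin also looks for a one-byte device global named
    // "__omp_offloading_42_foo_l10_exec_mode" holding 0 (SPMD) or 1
    // (GENERIC); if the symbol is absent, GENERIC is assumed.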
Pointers: " 661 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n", 662 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize); 663 CUDA_ERR_STRING(Err); 664 return nullptr; 665 } 666 667 DP("Sending global device environment data %zu bytes\n", CUSize); 668 } else { 669 DP("Finding global device environment '%s' - symbol missing.\n", 670 DeviceEnvName); 671 DP("Continue, considering this is a device RTL which does not accept " 672 "environment setting.\n"); 673 } 674 } 675 676 return getOffloadEntriesTable(DeviceId); 677 } 678 679 void *dataAlloc(const int DeviceId, const int64_t Size) const { 680 if (Size == 0) 681 return nullptr; 682 683 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 684 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 685 return nullptr; 686 687 CUdeviceptr DevicePtr; 688 Err = cuMemAlloc(&DevicePtr, Size); 689 if (!checkResult(Err, "Error returned from cuMemAlloc\n")) 690 return nullptr; 691 692 return (void *)DevicePtr; 693 } 694 695 int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr, 696 const int64_t Size, __tgt_async_info *AsyncInfoPtr) const { 697 assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); 698 699 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 700 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 701 return OFFLOAD_FAIL; 702 703 CUstream Stream = getStream(DeviceId, AsyncInfoPtr); 704 705 Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream); 706 if (Err != CUDA_SUCCESS) { 707 DP("Error when copying data from host to device. Pointers: host = " DPxMOD 708 ", device = " DPxMOD ", size = %" PRId64 "\n", 709 DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); 710 CUDA_ERR_STRING(Err); 711 return OFFLOAD_FAIL; 712 } 713 714 return OFFLOAD_SUCCESS; 715 } 716 717 int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr, 718 const int64_t Size, __tgt_async_info *AsyncInfoPtr) const { 719 assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); 720 721 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 722 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 723 return OFFLOAD_FAIL; 724 725 CUstream Stream = getStream(DeviceId, AsyncInfoPtr); 726 727 Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream); 728 if (Err != CUDA_SUCCESS) { 729 DP("Error when copying data from device to host. Pointers: host = " DPxMOD 730 ", device = " DPxMOD ", size = %" PRId64 "\n", 731 DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); 732 CUDA_ERR_STRING(Err); 733 return OFFLOAD_FAIL; 734 } 735 736 return OFFLOAD_SUCCESS; 737 } 738 739 int dataDelete(const int DeviceId, void *TgtPtr) const { 740 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 741 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 742 return OFFLOAD_FAIL; 743 744 Err = cuMemFree((CUdeviceptr)TgtPtr); 745 if (!checkResult(Err, "Error returned from cuMemFree\n")) 746 return OFFLOAD_FAIL; 747 748 return OFFLOAD_SUCCESS; 749 } 750 751 int runTargetTeamRegion(const int DeviceId, const void *TgtEntryPtr, 752 void **TgtArgs, ptrdiff_t *TgtOffsets, 753 const int ArgNum, const int TeamNum, 754 const int ThreadLimit, 755 const unsigned int LoopTripCount, 756 __tgt_async_info *AsyncInfo) const { 757 CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); 758 if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) 759 return OFFLOAD_FAIL; 760 761 // All args are references. 
762 std::vector<void *> Args(ArgNum); 763 std::vector<void *> Ptrs(ArgNum); 764 765 for (int I = 0; I < ArgNum; ++I) { 766 Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]); 767 Args[I] = &Ptrs[I]; 768 } 769 770 const KernelTy *KernelInfo = 771 reinterpret_cast<const KernelTy *>(TgtEntryPtr); 772 773 unsigned int CudaThreadsPerBlock; 774 if (ThreadLimit > 0) { 775 DP("Setting CUDA threads per block to requested %d\n", ThreadLimit); 776 CudaThreadsPerBlock = ThreadLimit; 777 // Add master warp if necessary 778 if (KernelInfo->ExecutionMode == GENERIC) { 779 DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize); 780 CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize; 781 } 782 } else { 783 DP("Setting CUDA threads per block to default %d\n", 784 DeviceData[DeviceId].NumThreads); 785 CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads; 786 } 787 788 if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) { 789 DP("Threads per block capped at device limit %d\n", 790 DeviceData[DeviceId].ThreadsPerBlock); 791 CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock; 792 } 793 794 int KernelLimit; 795 Err = cuFuncGetAttribute(&KernelLimit, 796 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, 797 KernelInfo->Func); 798 if (Err == CUDA_SUCCESS && KernelLimit < CudaThreadsPerBlock) { 799 DP("Threads per block capped at kernel limit %d\n", KernelLimit); 800 CudaThreadsPerBlock = KernelLimit; 801 } 802 803 unsigned int CudaBlocksPerGrid; 804 if (TeamNum <= 0) { 805 if (LoopTripCount > 0 && EnvNumTeams < 0) { 806 if (KernelInfo->ExecutionMode == SPMD) { 807 // We have a combined construct, i.e. `target teams distribute 808 // parallel for [simd]`. We launch so many teams so that each thread 809 // will execute one iteration of the loop. round up to the nearest 810 // integer 811 CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1; 812 } else { 813 // If we reach this point, then we have a non-combined construct, i.e. 814 // `teams distribute` with a nested `parallel for` and each team is 815 // assigned one iteration of the `distribute` loop. E.g.: 816 // 817 // #pragma omp target teams distribute 818 // for(...loop_tripcount...) { 819 // #pragma omp parallel for 820 // for(...) {} 821 // } 822 // 823 // Threads within a team will execute the iterations of the `parallel` 824 // loop. 825 CudaBlocksPerGrid = LoopTripCount; 826 } 827 DP("Using %d teams due to loop trip count %" PRIu64 828 " and number of threads per block %d\n", 829 CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock); 830 } else { 831 DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams); 832 CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams; 833 } 834 } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) { 835 DP("Capping number of teams to team limit %d\n", 836 DeviceData[DeviceId].BlocksPerGrid); 837 CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid; 838 } else { 839 DP("Using requested number of teams %d\n", TeamNum); 840 CudaBlocksPerGrid = TeamNum; 841 } 842 843 // Run on the device. 
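
    // Worked example of the SPMD trip-count path above (illustrative
    // numbers): with LoopTripCount = 10000 and CudaThreadsPerBlock = 128, the
    // launch uses ((10000 - 1) / 128) + 1 = 79 blocks, so 79 * 128 = 10112
    // threads cover the 10000 iterations.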
844 DP("Launch kernel with %d blocks and %d threads\n", CudaBlocksPerGrid, 845 CudaThreadsPerBlock); 846 847 CUstream Stream = getStream(DeviceId, AsyncInfo); 848 Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1, 849 /* gridDimZ */ 1, CudaThreadsPerBlock, 850 /* blockDimY */ 1, /* blockDimZ */ 1, 851 /* sharedMemBytes */ 0, Stream, &Args[0], nullptr); 852 if (!checkResult(Err, "Error returned from cuLaunchKernel\n")) 853 return OFFLOAD_FAIL; 854 855 DP("Launch of entry point at " DPxMOD " successful!\n", 856 DPxPTR(TgtEntryPtr)); 857 858 return OFFLOAD_SUCCESS; 859 } 860 861 int synchronize(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const { 862 CUstream Stream = reinterpret_cast<CUstream>(AsyncInfoPtr->Queue); 863 CUresult Err = cuStreamSynchronize(Stream); 864 if (Err != CUDA_SUCCESS) { 865 DP("Error when synchronizing stream. stream = " DPxMOD 866 ", async info ptr = " DPxMOD "\n", 867 DPxPTR(Stream), DPxPTR(AsyncInfoPtr)); 868 CUDA_ERR_STRING(Err); 869 return OFFLOAD_FAIL; 870 } 871 872 // Once the stream is synchronized, return it to stream pool and reset 873 // async_info. This is to make sure the synchronization only works for its 874 // own tasks. 875 StreamManager->returnStream( 876 DeviceId, reinterpret_cast<CUstream>(AsyncInfoPtr->Queue)); 877 AsyncInfoPtr->Queue = nullptr; 878 879 return OFFLOAD_SUCCESS; 880 } 881 }; 882 883 DeviceRTLTy DeviceRTL; 884 } // namespace 885 886 // Exposed library API function 887 #ifdef __cplusplus 888 extern "C" { 889 #endif 890 891 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { 892 return elf_check_machine(image, /* EM_CUDA */ 190); 893 } 894 895 int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); } 896 897 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { 898 DP("Init requires flags to %ld\n", RequiresFlags); 899 DeviceRTL.setRequiresFlag(RequiresFlags); 900 return RequiresFlags; 901 } 902 903 int32_t __tgt_rtl_init_device(int32_t device_id) { 904 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 905 906 return DeviceRTL.initDevice(device_id); 907 } 908 909 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, 910 __tgt_device_image *image) { 911 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 912 913 return DeviceRTL.loadBinary(device_id, image); 914 } 915 916 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *) { 917 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 918 919 return DeviceRTL.dataAlloc(device_id, size); 920 } 921 922 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, 923 int64_t size) { 924 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 925 926 __tgt_async_info async_info; 927 const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr, 928 size, &async_info); 929 if (rc != OFFLOAD_SUCCESS) 930 return OFFLOAD_FAIL; 931 932 return __tgt_rtl_synchronize(device_id, &async_info); 933 } 934 935 int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr, 936 void *hst_ptr, int64_t size, 937 __tgt_async_info *async_info_ptr) { 938 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 939 assert(async_info_ptr && "async_info_ptr is nullptr"); 940 941 return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size, 942 async_info_ptr); 943 } 944 945 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, 946 int64_t size) { 947 
assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 948 949 __tgt_async_info async_info; 950 const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr, 951 size, &async_info); 952 if (rc != OFFLOAD_SUCCESS) 953 return OFFLOAD_FAIL; 954 955 return __tgt_rtl_synchronize(device_id, &async_info); 956 } 957 958 int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr, 959 void *tgt_ptr, int64_t size, 960 __tgt_async_info *async_info_ptr) { 961 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 962 assert(async_info_ptr && "async_info_ptr is nullptr"); 963 964 return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size, 965 async_info_ptr); 966 } 967 968 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { 969 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 970 971 return DeviceRTL.dataDelete(device_id, tgt_ptr); 972 } 973 974 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, 975 void **tgt_args, 976 ptrdiff_t *tgt_offsets, 977 int32_t arg_num, int32_t team_num, 978 int32_t thread_limit, 979 uint64_t loop_tripcount) { 980 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 981 982 __tgt_async_info async_info; 983 const int32_t rc = __tgt_rtl_run_target_team_region_async( 984 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, 985 thread_limit, loop_tripcount, &async_info); 986 if (rc != OFFLOAD_SUCCESS) 987 return OFFLOAD_FAIL; 988 989 return __tgt_rtl_synchronize(device_id, &async_info); 990 } 991 992 int32_t __tgt_rtl_run_target_team_region_async( 993 int32_t device_id, void *tgt_entry_ptr, void **tgt_args, 994 ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, 995 int32_t thread_limit, uint64_t loop_tripcount, 996 __tgt_async_info *async_info_ptr) { 997 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 998 999 return DeviceRTL.runTargetTeamRegion( 1000 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, 1001 thread_limit, loop_tripcount, async_info_ptr); 1002 } 1003 1004 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, 1005 void **tgt_args, ptrdiff_t *tgt_offsets, 1006 int32_t arg_num) { 1007 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1008 1009 __tgt_async_info async_info; 1010 const int32_t rc = __tgt_rtl_run_target_region_async( 1011 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &async_info); 1012 if (rc != OFFLOAD_SUCCESS) 1013 return OFFLOAD_FAIL; 1014 1015 return __tgt_rtl_synchronize(device_id, &async_info); 1016 } 1017 1018 int32_t __tgt_rtl_run_target_region_async(int32_t device_id, 1019 void *tgt_entry_ptr, void **tgt_args, 1020 ptrdiff_t *tgt_offsets, 1021 int32_t arg_num, 1022 __tgt_async_info *async_info_ptr) { 1023 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1024 1025 return __tgt_rtl_run_target_team_region_async( 1026 device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, 1027 /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0, 1028 async_info_ptr); 1029 } 1030 1031 int32_t __tgt_rtl_synchronize(int32_t device_id, 1032 __tgt_async_info *async_info_ptr) { 1033 assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); 1034 assert(async_info_ptr && "async_info_ptr is nullptr"); 1035 assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr"); 1036 1037 return DeviceRTL.synchronize(device_id, async_info_ptr); 1038 } 1039 1040 
#ifdef __cplusplus 1041 } 1042 #endif 1043