1 //===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // RTL for CUDA machine 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include <cassert> 14 #include <cstddef> 15 #include <cuda.h> 16 #include <list> 17 #include <string> 18 #include <vector> 19 20 #include "omptargetplugin.h" 21 22 #ifndef TARGET_NAME 23 #define TARGET_NAME CUDA 24 #endif 25 26 #ifdef OMPTARGET_DEBUG 27 static int DebugLevel = 0; 28 29 #define GETNAME2(name) #name 30 #define GETNAME(name) GETNAME2(name) 31 #define DP(...) \ 32 do { \ 33 if (DebugLevel > 0) { \ 34 DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \ 35 } \ 36 } while (false) 37 #else // OMPTARGET_DEBUG 38 #define DP(...) {} 39 #endif // OMPTARGET_DEBUG 40 41 #include "../../common/elf_common.c" 42 43 // Utility for retrieving and printing CUDA error string. 44 #ifdef CUDA_ERROR_REPORT 45 #define CUDA_ERR_STRING(err) \ 46 do { \ 47 const char *errStr; \ 48 cuGetErrorString(err, &errStr); \ 49 DP("CUDA error is: %s\n", errStr); \ 50 } while (0) 51 #else 52 #define CUDA_ERR_STRING(err) \ 53 {} 54 #endif 55 56 /// Keep entries table per device. 57 struct FuncOrGblEntryTy { 58 __tgt_target_table Table; 59 std::vector<__tgt_offload_entry> Entries; 60 }; 61 62 enum ExecutionModeType { 63 SPMD, // constructors, destructors, 64 // combined constructs (`teams distribute parallel for [simd]`) 65 GENERIC, // everything else 66 NONE 67 }; 68 69 /// Use a single entity to encode a kernel and a set of flags 70 struct KernelTy { 71 CUfunction Func; 72 73 // execution mode of kernel 74 // 0 - SPMD mode (without master warp) 75 // 1 - Generic mode (with master warp) 76 int8_t ExecutionMode; 77 78 KernelTy(CUfunction _Func, int8_t _ExecutionMode) 79 : Func(_Func), ExecutionMode(_ExecutionMode) {} 80 }; 81 82 /// Device envrionment data 83 /// Manually sync with the deviceRTL side for now, move to a dedicated header file later. 84 struct omptarget_device_environmentTy { 85 int32_t debug_level; 86 }; 87 88 /// List that contains all the kernels. 89 /// FIXME: we may need this to be per device and per library. 90 std::list<KernelTy> KernelsList; 91 92 /// Class containing all the device information. 93 class RTLDeviceInfoTy { 94 std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries; 95 96 public: 97 int NumberOfDevices; 98 std::vector<CUmodule> Modules; 99 std::vector<CUcontext> Contexts; 100 101 // Device properties 102 std::vector<int> ThreadsPerBlock; 103 std::vector<int> BlocksPerGrid; 104 std::vector<int> WarpSize; 105 106 // OpenMP properties 107 std::vector<int> NumTeams; 108 std::vector<int> NumThreads; 109 110 // OpenMP Environment properties 111 int EnvNumTeams; 112 int EnvTeamLimit; 113 114 // OpenMP Requires Flags 115 int64_t RequiresFlags; 116 117 //static int EnvNumThreads; 118 static const int HardTeamLimit = 1<<16; // 64k 119 static const int HardThreadLimit = 1024; 120 static const int DefaultNumTeams = 128; 121 static const int DefaultNumThreads = 128; 122 123 // Record entry point associated with device 124 void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) { 125 assert(device_id < (int32_t)FuncGblEntries.size() && 126 "Unexpected device id!"); 127 FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); 128 129 E.Entries.push_back(entry); 130 } 131 132 // Return true if the entry is associated with device 133 bool findOffloadEntry(int32_t device_id, void *addr) { 134 assert(device_id < (int32_t)FuncGblEntries.size() && 135 "Unexpected device id!"); 136 FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); 137 138 for (auto &it : E.Entries) { 139 if (it.addr == addr) 140 return true; 141 } 142 143 return false; 144 } 145 146 // Return the pointer to the target entries table 147 __tgt_target_table *getOffloadEntriesTable(int32_t device_id) { 148 assert(device_id < (int32_t)FuncGblEntries.size() && 149 "Unexpected device id!"); 150 FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); 151 152 int32_t size = E.Entries.size(); 153 154 // Table is empty 155 if (!size) 156 return 0; 157 158 __tgt_offload_entry *begin = &E.Entries[0]; 159 __tgt_offload_entry *end = &E.Entries[size - 1]; 160 161 // Update table info according to the entries and return the pointer 162 E.Table.EntriesBegin = begin; 163 E.Table.EntriesEnd = ++end; 164 165 return &E.Table; 166 } 167 168 // Clear entries table for a device 169 void clearOffloadEntriesTable(int32_t device_id) { 170 assert(device_id < (int32_t)FuncGblEntries.size() && 171 "Unexpected device id!"); 172 FuncGblEntries[device_id].emplace_back(); 173 FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); 174 E.Entries.clear(); 175 E.Table.EntriesBegin = E.Table.EntriesEnd = 0; 176 } 177 178 RTLDeviceInfoTy() { 179 #ifdef OMPTARGET_DEBUG 180 if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) { 181 DebugLevel = std::stoi(envStr); 182 } 183 #endif // OMPTARGET_DEBUG 184 185 DP("Start initializing CUDA\n"); 186 187 CUresult err = cuInit(0); 188 if (err != CUDA_SUCCESS) { 189 DP("Error when initializing CUDA\n"); 190 CUDA_ERR_STRING(err); 191 return; 192 } 193 194 NumberOfDevices = 0; 195 196 err = cuDeviceGetCount(&NumberOfDevices); 197 if (err != CUDA_SUCCESS) { 198 DP("Error when getting CUDA device count\n"); 199 CUDA_ERR_STRING(err); 200 return; 201 } 202 203 if (NumberOfDevices == 0) { 204 DP("There are no devices supporting CUDA.\n"); 205 return; 206 } 207 208 FuncGblEntries.resize(NumberOfDevices); 209 Contexts.resize(NumberOfDevices); 210 ThreadsPerBlock.resize(NumberOfDevices); 211 BlocksPerGrid.resize(NumberOfDevices); 212 WarpSize.resize(NumberOfDevices); 213 NumTeams.resize(NumberOfDevices); 214 NumThreads.resize(NumberOfDevices); 215 216 // Get environment variables regarding teams 217 char *envStr = getenv("OMP_TEAM_LIMIT"); 218 if (envStr) { 219 // OMP_TEAM_LIMIT has been set 220 EnvTeamLimit = std::stoi(envStr); 221 DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit); 222 } else { 223 EnvTeamLimit = -1; 224 } 225 envStr = getenv("OMP_NUM_TEAMS"); 226 if (envStr) { 227 // OMP_NUM_TEAMS has been set 228 EnvNumTeams = std::stoi(envStr); 229 DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); 230 } else { 231 EnvNumTeams = -1; 232 } 233 234 // Default state. 235 RequiresFlags = OMP_REQ_UNDEFINED; 236 } 237 238 ~RTLDeviceInfoTy() { 239 // Close modules 240 for (auto &module : Modules) 241 if (module) { 242 CUresult err = cuModuleUnload(module); 243 if (err != CUDA_SUCCESS) { 244 DP("Error when unloading CUDA module\n"); 245 CUDA_ERR_STRING(err); 246 } 247 } 248 249 // Destroy contexts 250 for (auto &ctx : Contexts) 251 if (ctx) { 252 CUresult err = cuCtxDestroy(ctx); 253 if (err != CUDA_SUCCESS) { 254 DP("Error when destroying CUDA context\n"); 255 CUDA_ERR_STRING(err); 256 } 257 } 258 } 259 }; 260 261 static RTLDeviceInfoTy DeviceInfo; 262 263 #ifdef __cplusplus 264 extern "C" { 265 #endif 266 267 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { 268 return elf_check_machine(image, 190); // EM_CUDA = 190. 269 } 270 271 int32_t __tgt_rtl_number_of_devices() { return DeviceInfo.NumberOfDevices; } 272 273 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { 274 DP("Init requires flags to %ld\n", RequiresFlags); 275 DeviceInfo.RequiresFlags = RequiresFlags; 276 return RequiresFlags; 277 } 278 279 int32_t __tgt_rtl_init_device(int32_t device_id) { 280 281 CUdevice cuDevice; 282 DP("Getting device %d\n", device_id); 283 CUresult err = cuDeviceGet(&cuDevice, device_id); 284 if (err != CUDA_SUCCESS) { 285 DP("Error when getting CUDA device with id = %d\n", device_id); 286 CUDA_ERR_STRING(err); 287 return OFFLOAD_FAIL; 288 } 289 290 // Create the context and save it to use whenever this device is selected. 291 err = cuCtxCreate(&DeviceInfo.Contexts[device_id], CU_CTX_SCHED_BLOCKING_SYNC, 292 cuDevice); 293 if (err != CUDA_SUCCESS) { 294 DP("Error when creating a CUDA context\n"); 295 CUDA_ERR_STRING(err); 296 return OFFLOAD_FAIL; 297 } 298 299 // Query attributes to determine number of threads/block and blocks/grid. 300 int maxGridDimX; 301 err = cuDeviceGetAttribute(&maxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, 302 cuDevice); 303 if (err != CUDA_SUCCESS) { 304 DP("Error getting max grid dimension, use default\n"); 305 DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::DefaultNumTeams; 306 } else if (maxGridDimX <= RTLDeviceInfoTy::HardTeamLimit) { 307 DeviceInfo.BlocksPerGrid[device_id] = maxGridDimX; 308 DP("Using %d CUDA blocks per grid\n", maxGridDimX); 309 } else { 310 DeviceInfo.BlocksPerGrid[device_id] = RTLDeviceInfoTy::HardTeamLimit; 311 DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping " 312 "at the hard limit\n", 313 maxGridDimX, RTLDeviceInfoTy::HardTeamLimit); 314 } 315 316 // We are only exploiting threads along the x axis. 317 int maxBlockDimX; 318 err = cuDeviceGetAttribute(&maxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, 319 cuDevice); 320 if (err != CUDA_SUCCESS) { 321 DP("Error getting max block dimension, use default\n"); 322 DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::DefaultNumThreads; 323 } else if (maxBlockDimX <= RTLDeviceInfoTy::HardThreadLimit) { 324 DeviceInfo.ThreadsPerBlock[device_id] = maxBlockDimX; 325 DP("Using %d CUDA threads per block\n", maxBlockDimX); 326 } else { 327 DeviceInfo.ThreadsPerBlock[device_id] = RTLDeviceInfoTy::HardThreadLimit; 328 DP("Max CUDA threads per block %d exceeds the hard thread limit %d, capping" 329 "at the hard limit\n", 330 maxBlockDimX, RTLDeviceInfoTy::HardThreadLimit); 331 } 332 333 int warpSize; 334 err = 335 cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuDevice); 336 if (err != CUDA_SUCCESS) { 337 DP("Error getting warp size, assume default\n"); 338 DeviceInfo.WarpSize[device_id] = 32; 339 } else { 340 DeviceInfo.WarpSize[device_id] = warpSize; 341 } 342 343 // Adjust teams to the env variables 344 if (DeviceInfo.EnvTeamLimit > 0 && 345 DeviceInfo.BlocksPerGrid[device_id] > DeviceInfo.EnvTeamLimit) { 346 DeviceInfo.BlocksPerGrid[device_id] = DeviceInfo.EnvTeamLimit; 347 DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n", 348 DeviceInfo.EnvTeamLimit); 349 } 350 351 DP("Max number of CUDA blocks %d, threads %d & warp size %d\n", 352 DeviceInfo.BlocksPerGrid[device_id], DeviceInfo.ThreadsPerBlock[device_id], 353 DeviceInfo.WarpSize[device_id]); 354 355 // Set default number of teams 356 if (DeviceInfo.EnvNumTeams > 0) { 357 DeviceInfo.NumTeams[device_id] = DeviceInfo.EnvNumTeams; 358 DP("Default number of teams set according to environment %d\n", 359 DeviceInfo.EnvNumTeams); 360 } else { 361 DeviceInfo.NumTeams[device_id] = RTLDeviceInfoTy::DefaultNumTeams; 362 DP("Default number of teams set according to library's default %d\n", 363 RTLDeviceInfoTy::DefaultNumTeams); 364 } 365 if (DeviceInfo.NumTeams[device_id] > DeviceInfo.BlocksPerGrid[device_id]) { 366 DeviceInfo.NumTeams[device_id] = DeviceInfo.BlocksPerGrid[device_id]; 367 DP("Default number of teams exceeds device limit, capping at %d\n", 368 DeviceInfo.BlocksPerGrid[device_id]); 369 } 370 371 // Set default number of threads 372 DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::DefaultNumThreads; 373 DP("Default number of threads set according to library's default %d\n", 374 RTLDeviceInfoTy::DefaultNumThreads); 375 if (DeviceInfo.NumThreads[device_id] > 376 DeviceInfo.ThreadsPerBlock[device_id]) { 377 DeviceInfo.NumTeams[device_id] = DeviceInfo.ThreadsPerBlock[device_id]; 378 DP("Default number of threads exceeds device limit, capping at %d\n", 379 DeviceInfo.ThreadsPerBlock[device_id]); 380 } 381 382 return OFFLOAD_SUCCESS; 383 } 384 385 __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, 386 __tgt_device_image *image) { 387 388 // Set the context we are using. 389 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 390 if (err != CUDA_SUCCESS) { 391 DP("Error when setting a CUDA context for device %d\n", device_id); 392 CUDA_ERR_STRING(err); 393 return NULL; 394 } 395 396 // Clear the offload table as we are going to create a new one. 397 DeviceInfo.clearOffloadEntriesTable(device_id); 398 399 // Create the module and extract the function pointers. 400 401 CUmodule cumod; 402 DP("Load data from image " DPxMOD "\n", DPxPTR(image->ImageStart)); 403 err = cuModuleLoadDataEx(&cumod, image->ImageStart, 0, NULL, NULL); 404 if (err != CUDA_SUCCESS) { 405 DP("Error when loading CUDA module\n"); 406 CUDA_ERR_STRING(err); 407 return NULL; 408 } 409 410 DP("CUDA module successfully loaded!\n"); 411 DeviceInfo.Modules.push_back(cumod); 412 413 // Find the symbols in the module by name. 414 __tgt_offload_entry *HostBegin = image->EntriesBegin; 415 __tgt_offload_entry *HostEnd = image->EntriesEnd; 416 417 for (__tgt_offload_entry *e = HostBegin; e != HostEnd; ++e) { 418 419 if (!e->addr) { 420 // We return NULL when something like this happens, the host should have 421 // always something in the address to uniquely identify the target region. 422 DP("Invalid binary: host entry '<null>' (size = %zd)...\n", e->size); 423 424 return NULL; 425 } 426 427 if (e->size) { 428 __tgt_offload_entry entry = *e; 429 430 CUdeviceptr cuptr; 431 size_t cusize; 432 err = cuModuleGetGlobal(&cuptr, &cusize, cumod, e->name); 433 434 if (err != CUDA_SUCCESS) { 435 DP("Loading global '%s' (Failed)\n", e->name); 436 CUDA_ERR_STRING(err); 437 return NULL; 438 } 439 440 if (cusize != e->size) { 441 DP("Loading global '%s' - size mismatch (%zd != %zd)\n", e->name, 442 cusize, e->size); 443 CUDA_ERR_STRING(err); 444 return NULL; 445 } 446 447 DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", 448 DPxPTR(e - HostBegin), e->name, DPxPTR(cuptr)); 449 entry.addr = (void *)cuptr; 450 451 // Note: In the current implementation declare target variables 452 // can either be link or to. This means that once unified 453 // memory is activated via the requires directive, the variable 454 // can be used directly from the host in both cases. 455 // TODO: when variables types other than to or link are added, 456 // the below condition should be changed to explicitely 457 // check for to and link variables types: 458 // (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && 459 // (e->flags & OMP_DECLARE_TARGET_LINK || 460 // e->flags == OMP_DECLARE_TARGET_TO)) 461 if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { 462 // If unified memory is present any target link or to variables 463 // can access host addresses directly. There is no longer a 464 // need for device copies. 465 cuMemcpyHtoD(cuptr, e->addr, sizeof(void *)); 466 DP("Copy linked variable host address (" DPxMOD ")" 467 "to device address (" DPxMOD ")\n", 468 DPxPTR(*((void**)e->addr)), DPxPTR(cuptr)); 469 } 470 471 DeviceInfo.addOffloadEntry(device_id, entry); 472 473 continue; 474 } 475 476 CUfunction fun; 477 err = cuModuleGetFunction(&fun, cumod, e->name); 478 479 if (err != CUDA_SUCCESS) { 480 DP("Loading '%s' (Failed)\n", e->name); 481 CUDA_ERR_STRING(err); 482 return NULL; 483 } 484 485 DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", 486 DPxPTR(e - HostBegin), e->name, DPxPTR(fun)); 487 488 // default value GENERIC (in case symbol is missing from cubin file) 489 int8_t ExecModeVal = ExecutionModeType::GENERIC; 490 std::string ExecModeNameStr (e->name); 491 ExecModeNameStr += "_exec_mode"; 492 const char *ExecModeName = ExecModeNameStr.c_str(); 493 494 CUdeviceptr ExecModePtr; 495 size_t cusize; 496 err = cuModuleGetGlobal(&ExecModePtr, &cusize, cumod, ExecModeName); 497 if (err == CUDA_SUCCESS) { 498 if ((size_t)cusize != sizeof(int8_t)) { 499 DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", 500 ExecModeName, cusize, sizeof(int8_t)); 501 CUDA_ERR_STRING(err); 502 return NULL; 503 } 504 505 err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, cusize); 506 if (err != CUDA_SUCCESS) { 507 DP("Error when copying data from device to host. Pointers: " 508 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", 509 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), cusize); 510 CUDA_ERR_STRING(err); 511 return NULL; 512 } 513 514 if (ExecModeVal < 0 || ExecModeVal > 1) { 515 DP("Error wrong exec_mode value specified in cubin file: %d\n", 516 ExecModeVal); 517 return NULL; 518 } 519 } else { 520 DP("Loading global exec_mode '%s' - symbol missing, using default value " 521 "GENERIC (1)\n", ExecModeName); 522 CUDA_ERR_STRING(err); 523 } 524 525 KernelsList.push_back(KernelTy(fun, ExecModeVal)); 526 527 __tgt_offload_entry entry = *e; 528 entry.addr = (void *)&KernelsList.back(); 529 DeviceInfo.addOffloadEntry(device_id, entry); 530 } 531 532 // send device environment data to the device 533 { 534 omptarget_device_environmentTy device_env; 535 536 device_env.debug_level = 0; 537 538 #ifdef OMPTARGET_DEBUG 539 if (char *envStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) { 540 device_env.debug_level = std::stoi(envStr); 541 } 542 #endif 543 544 const char * device_env_Name="omptarget_device_environment"; 545 CUdeviceptr device_env_Ptr; 546 size_t cusize; 547 548 err = cuModuleGetGlobal(&device_env_Ptr, &cusize, cumod, device_env_Name); 549 550 if (err == CUDA_SUCCESS) { 551 if ((size_t)cusize != sizeof(device_env)) { 552 DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n", 553 device_env_Name, cusize, sizeof(int32_t)); 554 CUDA_ERR_STRING(err); 555 return NULL; 556 } 557 558 err = cuMemcpyHtoD(device_env_Ptr, &device_env, cusize); 559 if (err != CUDA_SUCCESS) { 560 DP("Error when copying data from host to device. Pointers: " 561 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n", 562 DPxPTR(&device_env), DPxPTR(device_env_Ptr), cusize); 563 CUDA_ERR_STRING(err); 564 return NULL; 565 } 566 567 DP("Sending global device environment data %zu bytes\n", (size_t)cusize); 568 } else { 569 DP("Finding global device environment '%s' - symbol missing.\n", device_env_Name); 570 DP("Continue, considering this is a device RTL which does not accept envrionment setting.\n"); 571 } 572 } 573 574 return DeviceInfo.getOffloadEntriesTable(device_id); 575 } 576 577 void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) { 578 if (size == 0) { 579 return NULL; 580 } 581 582 // Set the context we are using. 583 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 584 if (err != CUDA_SUCCESS) { 585 DP("Error while trying to set CUDA current context\n"); 586 CUDA_ERR_STRING(err); 587 return NULL; 588 } 589 590 CUdeviceptr ptr; 591 err = cuMemAlloc(&ptr, size); 592 if (err != CUDA_SUCCESS) { 593 DP("Error while trying to allocate %d\n", err); 594 CUDA_ERR_STRING(err); 595 return NULL; 596 } 597 598 void *vptr = (void *)ptr; 599 return vptr; 600 } 601 602 int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, 603 int64_t size) { 604 // Set the context we are using. 605 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 606 if (err != CUDA_SUCCESS) { 607 DP("Error when setting CUDA context\n"); 608 CUDA_ERR_STRING(err); 609 return OFFLOAD_FAIL; 610 } 611 612 err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size); 613 if (err != CUDA_SUCCESS) { 614 DP("Error when copying data from host to device. Pointers: host = " DPxMOD 615 ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr), 616 DPxPTR(tgt_ptr), size); 617 CUDA_ERR_STRING(err); 618 return OFFLOAD_FAIL; 619 } 620 return OFFLOAD_SUCCESS; 621 } 622 623 int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, 624 int64_t size) { 625 // Set the context we are using. 626 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 627 if (err != CUDA_SUCCESS) { 628 DP("Error when setting CUDA context\n"); 629 CUDA_ERR_STRING(err); 630 return OFFLOAD_FAIL; 631 } 632 633 err = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size); 634 if (err != CUDA_SUCCESS) { 635 DP("Error when copying data from device to host. Pointers: host = " DPxMOD 636 ", device = " DPxMOD ", size = %" PRId64 "\n", DPxPTR(hst_ptr), 637 DPxPTR(tgt_ptr), size); 638 CUDA_ERR_STRING(err); 639 return OFFLOAD_FAIL; 640 } 641 return OFFLOAD_SUCCESS; 642 } 643 644 int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { 645 // Set the context we are using. 646 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 647 if (err != CUDA_SUCCESS) { 648 DP("Error when setting CUDA context\n"); 649 CUDA_ERR_STRING(err); 650 return OFFLOAD_FAIL; 651 } 652 653 err = cuMemFree((CUdeviceptr)tgt_ptr); 654 if (err != CUDA_SUCCESS) { 655 DP("Error when freeing CUDA memory\n"); 656 CUDA_ERR_STRING(err); 657 return OFFLOAD_FAIL; 658 } 659 return OFFLOAD_SUCCESS; 660 } 661 662 int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, 663 void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, 664 int32_t thread_limit, uint64_t loop_tripcount) { 665 // Set the context we are using. 666 CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); 667 if (err != CUDA_SUCCESS) { 668 DP("Error when setting CUDA context\n"); 669 CUDA_ERR_STRING(err); 670 return OFFLOAD_FAIL; 671 } 672 673 // All args are references. 674 std::vector<void *> args(arg_num); 675 std::vector<void *> ptrs(arg_num); 676 677 for (int32_t i = 0; i < arg_num; ++i) { 678 ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]); 679 args[i] = &ptrs[i]; 680 } 681 682 KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr; 683 684 int cudaThreadsPerBlock; 685 686 if (thread_limit > 0) { 687 cudaThreadsPerBlock = thread_limit; 688 DP("Setting CUDA threads per block to requested %d\n", thread_limit); 689 // Add master warp if necessary 690 if (KernelInfo->ExecutionMode == GENERIC) { 691 cudaThreadsPerBlock += DeviceInfo.WarpSize[device_id]; 692 DP("Adding master warp: +%d threads\n", DeviceInfo.WarpSize[device_id]); 693 } 694 } else { 695 cudaThreadsPerBlock = DeviceInfo.NumThreads[device_id]; 696 DP("Setting CUDA threads per block to default %d\n", 697 DeviceInfo.NumThreads[device_id]); 698 } 699 700 if (cudaThreadsPerBlock > DeviceInfo.ThreadsPerBlock[device_id]) { 701 cudaThreadsPerBlock = DeviceInfo.ThreadsPerBlock[device_id]; 702 DP("Threads per block capped at device limit %d\n", 703 DeviceInfo.ThreadsPerBlock[device_id]); 704 } 705 706 int kernel_limit; 707 err = cuFuncGetAttribute(&kernel_limit, 708 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, KernelInfo->Func); 709 if (err == CUDA_SUCCESS) { 710 if (kernel_limit < cudaThreadsPerBlock) { 711 cudaThreadsPerBlock = kernel_limit; 712 DP("Threads per block capped at kernel limit %d\n", kernel_limit); 713 } 714 } 715 716 int cudaBlocksPerGrid; 717 if (team_num <= 0) { 718 if (loop_tripcount > 0 && DeviceInfo.EnvNumTeams < 0) { 719 if (KernelInfo->ExecutionMode == SPMD) { 720 // We have a combined construct, i.e. `target teams distribute parallel 721 // for [simd]`. We launch so many teams so that each thread will 722 // execute one iteration of the loop. 723 // round up to the nearest integer 724 cudaBlocksPerGrid = ((loop_tripcount - 1) / cudaThreadsPerBlock) + 1; 725 } else { 726 // If we reach this point, then we have a non-combined construct, i.e. 727 // `teams distribute` with a nested `parallel for` and each team is 728 // assigned one iteration of the `distribute` loop. E.g.: 729 // 730 // #pragma omp target teams distribute 731 // for(...loop_tripcount...) { 732 // #pragma omp parallel for 733 // for(...) {} 734 // } 735 // 736 // Threads within a team will execute the iterations of the `parallel` 737 // loop. 738 cudaBlocksPerGrid = loop_tripcount; 739 } 740 DP("Using %d teams due to loop trip count %" PRIu64 " and number of " 741 "threads per block %d\n", cudaBlocksPerGrid, loop_tripcount, 742 cudaThreadsPerBlock); 743 } else { 744 cudaBlocksPerGrid = DeviceInfo.NumTeams[device_id]; 745 DP("Using default number of teams %d\n", DeviceInfo.NumTeams[device_id]); 746 } 747 } else if (team_num > DeviceInfo.BlocksPerGrid[device_id]) { 748 cudaBlocksPerGrid = DeviceInfo.BlocksPerGrid[device_id]; 749 DP("Capping number of teams to team limit %d\n", 750 DeviceInfo.BlocksPerGrid[device_id]); 751 } else { 752 cudaBlocksPerGrid = team_num; 753 DP("Using requested number of teams %d\n", team_num); 754 } 755 756 // Run on the device. 757 DP("Launch kernel with %d blocks and %d threads\n", cudaBlocksPerGrid, 758 cudaThreadsPerBlock); 759 760 err = cuLaunchKernel(KernelInfo->Func, cudaBlocksPerGrid, 1, 1, 761 cudaThreadsPerBlock, 1, 1, 0 /*bytes of shared memory*/, 0, &args[0], 0); 762 if (err != CUDA_SUCCESS) { 763 DP("Device kernel launch failed!\n"); 764 CUDA_ERR_STRING(err); 765 return OFFLOAD_FAIL; 766 } 767 768 DP("Launch of entry point at " DPxMOD " successful!\n", 769 DPxPTR(tgt_entry_ptr)); 770 771 CUresult sync_err = cuCtxSynchronize(); 772 if (sync_err != CUDA_SUCCESS) { 773 DP("Kernel execution error at " DPxMOD "!\n", DPxPTR(tgt_entry_ptr)); 774 CUDA_ERR_STRING(sync_err); 775 return OFFLOAD_FAIL; 776 } else { 777 DP("Kernel execution at " DPxMOD " successful!\n", DPxPTR(tgt_entry_ptr)); 778 } 779 780 return OFFLOAD_SUCCESS; 781 } 782 783 int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, 784 void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) { 785 // use one team and the default number of threads. 786 const int32_t team_num = 1; 787 const int32_t thread_limit = 0; 788 return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, 789 tgt_offsets, arg_num, team_num, thread_limit, 0); 790 } 791 792 #ifdef __cplusplus 793 } 794 #endif 795