//===-RTLs/nec-aurora/src/rtl.cpp - Target RTLs Implementation - C++ -*-======//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
//
// RTL for NEC Aurora TSUBASA machines
//
//===----------------------------------------------------------------------===//

#include "omptargetplugin.h"

#include <algorithm>
#include <cassert>
#include <cerrno>
#include <cinttypes>
#include <cstring>
#include <list>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <sys/stat.h>
#include <ve_offload.h>
#include <vector>
#include <veosinfo/veosinfo.h>

#ifndef TARGET_ELF_ID
#define TARGET_ELF_ID 0
#endif

#ifdef OMPTARGET_DEBUG
static int DebugLevel = 0;

#define GETNAME2(name) #name
#define GETNAME(name) GETNAME2(name)
#define DP(...)                                                                \
  do {                                                                         \
    if (DebugLevel > 0) {                                                      \
      DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__);              \
    }                                                                          \
  } while (false)
#else // OMPTARGET_DEBUG
#define DP(...)                                                                \
  {}
#endif // OMPTARGET_DEBUG

#include "../../common/elf_common.c"

struct DynLibTy {
  // Name of the temporary file the target image was written to. Stored by
  // value so that it outlives the stack buffer it is created from.
  std::string FileName;
  uint64_t VeoLibHandle;
};

/// Keep entries table per device.
struct FuncOrGblEntryTy {
  __tgt_target_table Table;
  std::vector<__tgt_offload_entry> Entries;
};

class RTLDeviceInfoTy {
  std::vector<std::list<FuncOrGblEntryTy>> FuncOrGblEntry;

public:
  std::vector<struct veo_proc_handle *> ProcHandles;
  std::vector<struct veo_thr_ctxt *> Contexts;
  std::vector<uint64_t> LibraryHandles;
  std::list<DynLibTy> DynLibs;
  // Maps OpenMP device IDs to VE node IDs.
  std::vector<int> NodeIds;

  void buildOffloadTableFromHost(int32_t device_id, uint64_t VeoLibHandle,
                                 __tgt_offload_entry *HostBegin,
                                 __tgt_offload_entry *HostEnd) {
    FuncOrGblEntry[device_id].emplace_back();
    std::vector<__tgt_offload_entry> &T =
        FuncOrGblEntry[device_id].back().Entries;
    T.clear();
    for (__tgt_offload_entry *i = HostBegin; i != HostEnd; ++i) {
      char *SymbolName = i->name;
      // We do not have enough access to the target memory to conveniently
      // parse the offload table there, so we look up every symbol from the
      // host table instead.
      DP("Looking up symbol: %s\n", SymbolName);
      uint64_t SymbolTargetAddr =
          veo_get_sym(ProcHandles[device_id], VeoLibHandle, SymbolName);
      __tgt_offload_entry Entry;

      if (!SymbolTargetAddr) {
        DP("Symbol %s not found in target image\n", SymbolName);
        Entry = {NULL, NULL, 0, 0, 0};
      } else {
        DP("Found symbol %s successfully in target image (addr: %p)\n",
           SymbolName, reinterpret_cast<void *>(SymbolTargetAddr));
        Entry = {reinterpret_cast<void *>(SymbolTargetAddr), i->name, i->size,
                 i->flags, 0};
      }

      T.push_back(Entry);
    }

    FuncOrGblEntry[device_id].back().Table.EntriesBegin = &T.front();
    FuncOrGblEntry[device_id].back().Table.EntriesEnd = &T.back() + 1;
  }

  __tgt_target_table *getOffloadTable(int32_t device_id) {
    return &FuncOrGblEntry[device_id].back().Table;
  }

  RTLDeviceInfoTy() {
#ifdef OMPTARGET_DEBUG
    if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) {
      DebugLevel = std::stoi(envStr);
    }
#endif // OMPTARGET_DEBUG

    struct ve_nodeinfo node_info;
    ve_node_info(&node_info);

    // Build a predictable mapping between VE node ids
    // and OpenMP device ids. This is necessary because nodes can be missing
    // or offline, so the (active) node ids are not necessarily consecutive.
    // The entries in ve_nodeinfo may also not be in the order of their node
    // ids.
    for (int i = 0; i < node_info.total_node_count; ++i) {
      if (node_info.status[i] == 0) {
        NodeIds.push_back(node_info.nodeid[i]);
      }
    }

    // Because the entries in ve_nodeinfo may not be in the order of their node
    // ids, we sort NodeIds to get a predictable mapping.
    std::sort(NodeIds.begin(), NodeIds.end());

    int NumDevices = NodeIds.size();
    DP("Found %i VE devices\n", NumDevices);
    ProcHandles.resize(NumDevices, NULL);
    Contexts.resize(NumDevices, NULL);
    FuncOrGblEntry.resize(NumDevices);
    LibraryHandles.resize(NumDevices);
  }

  ~RTLDeviceInfoTy() {
    for (auto &ctx : Contexts) {
      if (ctx != NULL) {
        if (veo_context_close(ctx) != 0) {
          DP("Failed to close VEO context.\n");
        }
      }
    }

    for (auto &hdl : ProcHandles) {
      if (hdl != NULL) {
        veo_proc_destroy(hdl);
      }
    }

    // Remove the temporary files the target images were written to.
    for (auto &lib : DynLibs) {
      if (!lib.FileName.empty()) {
        remove(lib.FileName.c_str());
      }
    }
  }
};

static RTLDeviceInfoTy DeviceInfo;

// Start an asynchronous VEO call to FuncAddr and wait for its result.
static int target_run_function_wait(uint32_t DeviceID, uint64_t FuncAddr,
                                    struct veo_args *args, uint64_t *RetVal) {
  DP("Running function with entry point %p\n",
     reinterpret_cast<void *>(FuncAddr));
  uint64_t RequestHandle =
      veo_call_async(DeviceInfo.Contexts[DeviceID], FuncAddr, args);
  if (RequestHandle == VEO_REQUEST_ID_INVALID) {
    DP("Execution of entry point %p failed\n",
       reinterpret_cast<void *>(FuncAddr));
    return OFFLOAD_FAIL;
  }

  DP("Function at address %p called (VEO request ID: %" PRIu64 ")\n",
     reinterpret_cast<void *>(FuncAddr), RequestHandle);

  int ret = veo_call_wait_result(DeviceInfo.Contexts[DeviceID], RequestHandle,
                                 RetVal);
  if (ret != 0) {
    DP("Waiting for entry point %p failed (Error code %d)\n",
       reinterpret_cast<void *>(FuncAddr), ret);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

// Return the number of available devices of the type supported by the
// target RTL.
int32_t __tgt_rtl_number_of_devices(void) { return DeviceInfo.NodeIds.size(); }

// Return an integer different from zero if the provided device image can be
// supported by the runtime. The functionality is similar to comparing the
// result of __tgt_rtl_load_binary to NULL. However, this is meant to be a
// lightweight query to determine if the RTL is suitable for an image without
// having to load the library, which can be expensive.
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
#if TARGET_ELF_ID < 1
  return 0;
#else
  return elf_check_machine(Image, TARGET_ELF_ID);
#endif
}

// Initialize the specified device. In case of success return 0; otherwise
// return an error code.
int32_t __tgt_rtl_init_device(int32_t ID) {
  DP("Available VEO version: %i\n", veo_api_version());

  // At the moment we do not really initialize the device here (i.e. create a
  // process or context on it); that happens in __tgt_rtl_load_binary.
  // The reason is that for a statically linked binary the VEO API needs the
  // binary already when the process is created, whereas a dynamically linked
  // binary can be loaded later, after the process exists.
  // At this stage we cannot tell whether the binary is dynamically or
  // statically linked, so we defer process creation until we know.
  return OFFLOAD_SUCCESS;
}

// Pass an executable image section described by image to the specified
// device and prepare an address table of target entities. In case of error,
// return NULL. Otherwise, return a pointer to the built address table.
// Individual entries in the table may also be NULL, when the corresponding
// offload region is not supported on the target device.
__tgt_target_table *__tgt_rtl_load_binary(int32_t ID,
                                          __tgt_device_image *Image) {
  DP("Dev %d: load binary from " DPxMOD " image\n", ID,
     DPxPTR(Image->ImageStart));

  assert(ID >= 0 && "bad dev id");

  size_t ImageSize = (size_t)Image->ImageEnd - (size_t)Image->ImageStart;
  size_t NumEntries = (size_t)(Image->EntriesEnd - Image->EntriesBegin);
  DP("Expecting to have %zu entries defined.\n", NumEntries);

  // Load the target image and get the entry points. We dump the image to a
  // temporary file and let VEO load it from there:
  //
  // 1) Create a temporary file with the image contents.
  // 2) Load it with veo_load_library() (or veo_proc_create_static() for a
  //    statically linked image) and look up the symbols with veo_get_sym().
  char tmp_name[] = "/tmp/tmpfile_XXXXXX";
  int tmp_fd = mkstemp(tmp_name);

  if (tmp_fd == -1) {
    return NULL;
  }

  FILE *ftmp = fdopen(tmp_fd, "wb");

  if (!ftmp) {
    DP("fdopen() for %s failed. Could not write target image\n", tmp_name);
    return NULL;
  }

  fwrite(Image->ImageStart, ImageSize, 1, ftmp);

  // At least for the static case we need to make the file executable.
  chmod(tmp_name, 0700);

  DP("Wrote target image to %s. ImageSize=%zu\n", tmp_name, ImageSize);

  fclose(ftmp);

  // See the comment in __tgt_rtl_init_device.
  bool is_dyn = true;
  if (DeviceInfo.ProcHandles[ID] == NULL) {
    struct veo_proc_handle *proc_handle;
    is_dyn = elf_is_dynamic(Image);
    // For a dynamically linked image, we create the process handle, then the
    // thread context, and then load the image.
    // For a statically linked image, we need to create the process handle and
    // load the image at the same time with veo_proc_create_static().
    if (is_dyn) {
      proc_handle = veo_proc_create(DeviceInfo.NodeIds[ID]);
      if (!proc_handle) {
        DP("veo_proc_create() failed for device %d\n", ID);
        return NULL;
      }
    } else {
      proc_handle = veo_proc_create_static(DeviceInfo.NodeIds[ID], tmp_name);
      if (!proc_handle) {
        DP("veo_proc_create_static() failed for device %d, image=%s\n", ID,
           tmp_name);
        return NULL;
      }
    }
    DeviceInfo.ProcHandles[ID] = proc_handle;
  }

  if (DeviceInfo.Contexts[ID] == NULL) {
    struct veo_thr_ctxt *ctx = veo_context_open(DeviceInfo.ProcHandles[ID]);

    if (!ctx) {
      DP("veo_context_open() failed: %s\n", std::strerror(errno));
      return NULL;
    }

    DeviceInfo.Contexts[ID] = ctx;
  }

  DP("Aurora device successfully initialized with loaded binary: "
     "proc_handle=%p, ctx=%p\n",
     DeviceInfo.ProcHandles[ID], DeviceInfo.Contexts[ID]);

  uint64_t LibHandle = 0UL;
  if (is_dyn) {
    LibHandle = veo_load_library(DeviceInfo.ProcHandles[ID], tmp_name);

    if (!LibHandle) {
      DP("veo_load_library() failed: LibHandle=%" PRIu64
         " Name=%s. Set env VEORUN_BIN for statically linked target code.\n",
         LibHandle, tmp_name);
      return NULL;
    }

    DP("Successfully loaded library dynamically\n");
  } else {
    DP("Symbol table is expected to have been created by "
       "veo_proc_create_static()\n");
  }

  DynLibTy Lib = {tmp_name, LibHandle};
  DeviceInfo.DynLibs.push_back(Lib);
  DeviceInfo.LibraryHandles[ID] = LibHandle;

  DeviceInfo.buildOffloadTableFromHost(ID, LibHandle, Image->EntriesBegin,
                                       Image->EntriesEnd);

  return DeviceInfo.getOffloadTable(ID);
}

// Allocate data on the particular target device, of the specified size.
// HostPtr is an address of the host data the allocated target data
// will be associated with (HostPtr may be NULL if it is not known at
// allocation time, like for example it would be for target data that
// is allocated by the omp_target_alloc() API). Return the address of the
// allocated data on the target that will be used by libomptarget.so to
// initialize the target data mapping structures. These addresses are
// used to generate a table of target variables to pass to
// __tgt_rtl_run_region(). __tgt_rtl_data_alloc() returns NULL in
// case an error occurred on the target device.
void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr) {
  int ret;
  uint64_t addr = 0;

  if (DeviceInfo.ProcHandles[ID] == NULL) {
    struct veo_proc_handle *proc_handle;
    proc_handle = veo_proc_create(DeviceInfo.NodeIds[ID]);
    if (!proc_handle) {
      DP("veo_proc_create() failed for device %d\n", ID);
      return NULL;
    }
    DeviceInfo.ProcHandles[ID] = proc_handle;
    DP("Aurora device successfully initialized: proc_handle=%p\n",
       proc_handle);
  }

  ret = veo_alloc_mem(DeviceInfo.ProcHandles[ID], &addr, Size);
  DP("Allocate target memory: device=%d, target addr=%p, size=%" PRId64 "\n",
     ID, reinterpret_cast<void *>(addr), Size);
  if (ret != 0) {
    DP("veo_alloc_mem(%d, %p, %" PRId64 ") failed with error code %d\n", ID,
       reinterpret_cast<void *>(addr), Size, ret);
    return NULL;
  }

  return reinterpret_cast<void *>(addr);
}

// Pass the data content to the target device using the target address.
// In case of success, return zero. Otherwise, return an error code.
int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr,
                              int64_t Size) {
  int ret = veo_write_mem(DeviceInfo.ProcHandles[ID], (uint64_t)TargetPtr,
                          HostPtr, (size_t)Size);
  if (ret != 0) {
    DP("veo_write_mem() failed with error code %d\n", ret);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

// Retrieve the data content from the target device using its address.
// In case of success, return zero. Otherwise, return an error code.
int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr,
                                int64_t Size) {
  int ret = veo_read_mem(DeviceInfo.ProcHandles[ID], HostPtr,
                         (uint64_t)TargetPtr, Size);
  if (ret != 0) {
    DP("veo_read_mem() failed with error code %d\n", ret);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

// De-allocate the data referenced by target ptr on the device. In case of
// success, return zero. Otherwise, return an error code.
int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr) {
  int ret = veo_free_mem(DeviceInfo.ProcHandles[ID], (uint64_t)TargetPtr);

  if (ret != 0) {
    DP("veo_free_mem() failed with error code %d\n", ret);
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

// Similar to __tgt_rtl_run_target_region, but additionally specify the
// number of teams to be created and a number of threads in each team.
int32_t __tgt_rtl_run_target_team_region(int32_t ID, void *Entry, void **Args,
                                         ptrdiff_t *Offsets, int32_t NumArgs,
                                         int32_t NumTeams, int32_t ThreadLimit,
                                         uint64_t loop_tripcount) {
  int ret;

  // Ignore the team number and thread limit.
  std::vector<void *> ptrs(NumArgs);

  struct veo_args *TargetArgs;
  TargetArgs = veo_args_alloc();

  if (TargetArgs == NULL) {
    DP("Could not allocate VEO args\n");
    return OFFLOAD_FAIL;
  }

  for (int i = 0; i < NumArgs; ++i) {
    // Add the offset to the target address before passing it to the outlined
    // function on the device side.
    ptrs[i] = (void *)((intptr_t)Args[i] + Offsets[i]);
    ret = veo_args_set_u64(TargetArgs, i, (intptr_t)ptrs[i]);

    if (ret != 0) {
      DP("veo_args_set_u64() has returned %d for argnum=%d and value %p\n",
         ret, i, ptrs[i]);
      veo_args_free(TargetArgs);
      return OFFLOAD_FAIL;
    }
  }

  uint64_t RetVal;
  if (target_run_function_wait(ID, reinterpret_cast<uint64_t>(Entry),
                               TargetArgs, &RetVal) != OFFLOAD_SUCCESS) {
    veo_args_free(TargetArgs);
    return OFFLOAD_FAIL;
  }
  veo_args_free(TargetArgs);
  return OFFLOAD_SUCCESS;
}

// Transfer control to the offloaded entry Entry on the target device.
// Args and Offsets are arrays of NumArgs size of target addresses and
// offsets. An offset should be added to the target address before passing it
// to the outlined function on the device side. In case of success, return
// zero. Otherwise, return an error code.
int32_t __tgt_rtl_run_target_region(int32_t ID, void *Entry, void **Args,
                                    ptrdiff_t *Offsets, int32_t NumArgs) {
  return __tgt_rtl_run_target_team_region(ID, Entry, Args, Offsets, NumArgs, 1,
                                          1, 0);
}