1 //===-RTLs/nec-aurora/src/rtl.cpp - Target RTLs Implementation - C++ -*-======// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is dual licensed under the MIT and the University of Illinois Open 6 // Source Licenses. See LICENSE.txt for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // RTL for NEC Aurora TSUBASA machines 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include <algorithm> 15 #include <cassert> 16 #include <cerrno> 17 #include <cstring> 18 #include <list> 19 #include <stdlib.h> 20 #include <string> 21 #include <sys/stat.h> 22 #include <ve_offload.h> 23 #include <vector> 24 #include <veosinfo/veosinfo.h> 25 26 #include "Debug.h" 27 #include "omptargetplugin.h" 28 29 #ifndef TARGET_NAME 30 #define TARGET_NAME VE 31 #endif 32 33 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" 34 35 #ifndef TARGET_ELF_ID 36 #define TARGET_ELF_ID 0 37 #endif 38 39 #include "elf_common.h" 40 41 struct DynLibTy { 42 char *FileName; 43 uint64_t VeoLibHandle; 44 }; 45 46 /// Keep entries table per device. 47 struct FuncOrGblEntryTy { 48 __tgt_target_table Table; 49 std::vector<__tgt_offload_entry> Entries; 50 }; 51 52 class RTLDeviceInfoTy { 53 std::vector<std::list<FuncOrGblEntryTy>> FuncOrGblEntry; 54 55 public: 56 std::vector<struct veo_proc_handle *> ProcHandles; 57 std::vector<struct veo_thr_ctxt *> Contexts; 58 std::vector<uint64_t> LibraryHandles; 59 std::list<DynLibTy> DynLibs; 60 // Maps OpenMP device Ids to Ve nodeids 61 std::vector<int> NodeIds; 62 63 void buildOffloadTableFromHost(int32_t device_id, uint64_t VeoLibHandle, 64 __tgt_offload_entry *HostBegin, 65 __tgt_offload_entry *HostEnd) { 66 FuncOrGblEntry[device_id].emplace_back(); 67 std::vector<__tgt_offload_entry> &T = 68 FuncOrGblEntry[device_id].back().Entries; 69 T.clear(); 70 for (__tgt_offload_entry *i = HostBegin; i != HostEnd; ++i) { 71 char *SymbolName = i->name; 72 // we have not enough access to the target memory to conveniently parse 73 // the offload table there so we need to lookup every symbol with the host 74 // table 75 DP("Looking up symbol: %s\n", SymbolName); 76 uint64_t SymbolTargetAddr = 77 veo_get_sym(ProcHandles[device_id], VeoLibHandle, SymbolName); 78 __tgt_offload_entry Entry; 79 80 if (!SymbolTargetAddr) { 81 DP("Symbol %s not found in target image\n", SymbolName); 82 Entry = {NULL, NULL, 0, 0, 0}; 83 } else { 84 DP("Found symbol %s successfully in target image (addr: %p)\n", 85 SymbolName, reinterpret_cast<void *>(SymbolTargetAddr)); 86 Entry = {reinterpret_cast<void *>(SymbolTargetAddr), i->name, i->size, 87 i->flags, 0}; 88 } 89 90 T.push_back(Entry); 91 } 92 93 FuncOrGblEntry[device_id].back().Table.EntriesBegin = &T.front(); 94 FuncOrGblEntry[device_id].back().Table.EntriesEnd = &T.back() + 1; 95 } 96 97 __tgt_target_table *getOffloadTable(int32_t device_id) { 98 return &FuncOrGblEntry[device_id].back().Table; 99 } 100 101 RTLDeviceInfoTy() { 102 103 struct ve_nodeinfo node_info; 104 ve_node_info(&node_info); 105 106 // Build a predictable mapping between VE node ids and OpenMP device ids. 107 // This is necessary, because nodes can be missing or offline and (active) 108 // node ids are thus not consecutive. The entries in ve_nodeinfo may also 109 // not be in the order of their node ids. 110 for (int i = 0; i < node_info.total_node_count; ++i) { 111 if (node_info.status[i] == 0) { 112 NodeIds.push_back(node_info.nodeid[i]); 113 } 114 } 115 116 // Because the entries in ve_nodeinfo may not be in the order of their node 117 // ids, we sort NodeIds to get a predictable mapping. 118 std::sort(NodeIds.begin(), NodeIds.end()); 119 120 int NumDevices = NodeIds.size(); 121 DP("Found %i VE devices\n", NumDevices); 122 ProcHandles.resize(NumDevices, NULL); 123 Contexts.resize(NumDevices, NULL); 124 FuncOrGblEntry.resize(NumDevices); 125 LibraryHandles.resize(NumDevices); 126 } 127 128 ~RTLDeviceInfoTy() { 129 for (auto &ctx : Contexts) { 130 if (ctx != NULL) { 131 if (veo_context_close(ctx) != 0) { 132 DP("Failed to close VEO context.\n"); 133 } 134 } 135 } 136 137 for (auto &hdl : ProcHandles) { 138 if (hdl != NULL) { 139 veo_proc_destroy(hdl); 140 } 141 } 142 143 for (auto &lib : DynLibs) { 144 if (lib.FileName) { 145 remove(lib.FileName); 146 } 147 } 148 } 149 }; 150 151 static RTLDeviceInfoTy DeviceInfo; 152 153 static int target_run_function_wait(uint32_t DeviceID, uint64_t FuncAddr, 154 struct veo_args *args, uint64_t *RetVal) { 155 DP("Running function with entry point %p\n", 156 reinterpret_cast<void *>(FuncAddr)); 157 uint64_t RequestHandle = 158 veo_call_async(DeviceInfo.Contexts[DeviceID], FuncAddr, args); 159 if (RequestHandle == VEO_REQUEST_ID_INVALID) { 160 DP("Execution of entry point %p failed\n", 161 reinterpret_cast<void *>(FuncAddr)); 162 return OFFLOAD_FAIL; 163 } 164 165 DP("Function at address %p called (VEO request ID: %" PRIu64 ")\n", 166 reinterpret_cast<void *>(FuncAddr), RequestHandle); 167 168 int ret = veo_call_wait_result(DeviceInfo.Contexts[DeviceID], RequestHandle, 169 RetVal); 170 if (ret != 0) { 171 DP("Waiting for entry point %p failed (Error code %d)\n", 172 reinterpret_cast<void *>(FuncAddr), ret); 173 return OFFLOAD_FAIL; 174 } 175 return OFFLOAD_SUCCESS; 176 } 177 178 // Return the number of available devices of the type supported by the 179 // target RTL. 180 int32_t __tgt_rtl_number_of_devices(void) { return DeviceInfo.NodeIds.size(); } 181 182 // Return an integer different from zero if the provided device image can be 183 // supported by the runtime. The functionality is similar to comparing the 184 // result of __tgt__rtl__load__binary to NULL. However, this is meant to be a 185 // lightweight query to determine if the RTL is suitable for an image without 186 // having to load the library, which can be expensive. 187 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) { 188 #if TARGET_ELF_ID < 1 189 return 0; 190 #else 191 return elf_check_machine(Image, TARGET_ELF_ID); 192 #endif 193 } 194 195 // Initialize the specified device. In case of success return 0; otherwise 196 // return an error code. 197 int32_t __tgt_rtl_init_device(int32_t ID) { 198 DP("Available VEO version: %i\n", veo_api_version()); 199 200 // At the moment we do not really initialize (i.e. create a process or 201 // context on) the device here, but in "__tgt_rtl_load_binary". 202 // The reason for this is, that, when we create a process for a statically 203 // linked binary, the VEO api needs us to already supply the binary (but we 204 // can load a dynamically linked binary later, after we create the process). 205 // At this stage, we cannot check if we have a dynamically or statically 206 // linked binary so we defer process creation until we know. 207 return OFFLOAD_SUCCESS; 208 } 209 210 // Pass an executable image section described by image to the specified 211 // device and prepare an address table of target entities. In case of error, 212 // return NULL. Otherwise, return a pointer to the built address table. 213 // Individual entries in the table may also be NULL, when the corresponding 214 // offload region is not supported on the target device. 215 __tgt_target_table *__tgt_rtl_load_binary(int32_t ID, 216 __tgt_device_image *Image) { 217 DP("Dev %d: load binary from " DPxMOD " image\n", ID, 218 DPxPTR(Image->ImageStart)); 219 220 assert(ID >= 0 && "bad dev id"); 221 222 size_t ImageSize = (size_t)Image->ImageEnd - (size_t)Image->ImageStart; 223 size_t NumEntries = (size_t)(Image->EntriesEnd - Image->EntriesBegin); 224 DP("Expecting to have %zd entries defined.\n", NumEntries); 225 226 // load dynamic library and get the entry points. We use the dl library 227 // to do the loading of the library, but we could do it directly to avoid the 228 // dump to the temporary file. 229 // 230 // 1) Create tmp file with the library contents. 231 // 2) Use dlopen to load the file and dlsym to retrieve the symbols. 232 char tmp_name[] = "/tmp/tmpfile_XXXXXX"; 233 int tmp_fd = mkstemp(tmp_name); 234 235 if (tmp_fd == -1) { 236 return NULL; 237 } 238 239 FILE *ftmp = fdopen(tmp_fd, "wb"); 240 241 if (!ftmp) { 242 DP("fdopen() for %s failed. Could not write target image\n", tmp_name); 243 return NULL; 244 } 245 246 fwrite(Image->ImageStart, ImageSize, 1, ftmp); 247 248 // at least for the static case we need to change the permissions 249 chmod(tmp_name, 0700); 250 251 DP("Wrote target image to %s. ImageSize=%zu\n", tmp_name, ImageSize); 252 253 fclose(ftmp); 254 255 // See comment in "__tgt_rtl_init_device" 256 bool is_dyn = true; 257 if (DeviceInfo.ProcHandles[ID] == NULL) { 258 struct veo_proc_handle *proc_handle; 259 is_dyn = elf_is_dynamic(Image); 260 // If we have a dynamically linked image, we create the process handle, then 261 // the thread, and then load the image. 262 // If we have a statically linked image, we need to create the process 263 // handle and load the image at the same time with veo_proc_create_static(). 264 if (is_dyn) { 265 proc_handle = veo_proc_create(DeviceInfo.NodeIds[ID]); 266 if (!proc_handle) { 267 DP("veo_proc_create() failed for device %d\n", ID); 268 return NULL; 269 } 270 } else { 271 proc_handle = veo_proc_create_static(DeviceInfo.NodeIds[ID], tmp_name); 272 if (!proc_handle) { 273 DP("veo_proc_create_static() failed for device %d, image=%s\n", ID, 274 tmp_name); 275 return NULL; 276 } 277 } 278 DeviceInfo.ProcHandles[ID] = proc_handle; 279 } 280 281 if (DeviceInfo.Contexts[ID] == NULL) { 282 struct veo_thr_ctxt *ctx = veo_context_open(DeviceInfo.ProcHandles[ID]); 283 284 if (!ctx) { 285 DP("veo_context_open() failed: %s\n", std::strerror(errno)); 286 return NULL; 287 } 288 289 DeviceInfo.Contexts[ID] = ctx; 290 } 291 292 DP("Aurora device successfully initialized with loaded binary: " 293 "proc_handle=%p, ctx=%p\n", 294 DeviceInfo.ProcHandles[ID], DeviceInfo.Contexts[ID]); 295 296 uint64_t LibHandle = 0UL; 297 if (is_dyn) { 298 LibHandle = veo_load_library(DeviceInfo.ProcHandles[ID], tmp_name); 299 300 if (!LibHandle) { 301 DP("veo_load_library() failed: LibHandle=%" PRIu64 302 " Name=%s. Set env VEORUN_BIN for static linked target code.\n", 303 LibHandle, tmp_name); 304 return NULL; 305 } 306 307 DP("Successfully loaded library dynamically\n"); 308 } else { 309 DP("Symbol table is expected to have been created by " 310 "veo_create_proc_static()\n"); 311 } 312 313 DynLibTy Lib = {tmp_name, LibHandle}; 314 DeviceInfo.DynLibs.push_back(Lib); 315 DeviceInfo.LibraryHandles[ID] = LibHandle; 316 317 DeviceInfo.buildOffloadTableFromHost(ID, LibHandle, Image->EntriesBegin, 318 Image->EntriesEnd); 319 320 return DeviceInfo.getOffloadTable(ID); 321 } 322 323 // Allocate data on the particular target device, of the specified size. 324 // HostPtr is a address of the host data the allocated target data 325 // will be associated with (HostPtr may be NULL if it is not known at 326 // allocation time, like for example it would be for target data that 327 // is allocated by omp_target_alloc() API). Return address of the 328 // allocated data on the target that will be used by libomptarget.so to 329 // initialize the target data mapping structures. These addresses are 330 // used to generate a table of target variables to pass to 331 // __tgt_rtl_run_region(). The __tgt_rtl_data_alloc() returns NULL in 332 // case an error occurred on the target device. 333 void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr, 334 int32_t kind) { 335 int ret; 336 uint64_t addr; 337 338 if (kind != TARGET_ALLOC_DEFAULT) { 339 REPORT("Invalid target data allocation kind or requested allocator not " 340 "implemented yet\n"); 341 return NULL; 342 } 343 344 if (DeviceInfo.ProcHandles[ID] == NULL) { 345 struct veo_proc_handle *proc_handle; 346 proc_handle = veo_proc_create(DeviceInfo.NodeIds[ID]); 347 if (!proc_handle) { 348 DP("veo_proc_create() failed for device %d\n", ID); 349 return NULL; 350 } 351 DeviceInfo.ProcHandles[ID] = proc_handle; 352 DP("Aurora device successfully initialized: proc_handle=%p", proc_handle); 353 } 354 355 ret = veo_alloc_mem(DeviceInfo.ProcHandles[ID], &addr, Size); 356 DP("Allocate target memory: device=%d, target addr=%p, size=%" PRIu64 "\n", 357 ID, reinterpret_cast<void *>(addr), Size); 358 if (ret != 0) { 359 DP("veo_alloc_mem(%d, %p, %" PRIu64 ") failed with error code %d\n", ID, 360 reinterpret_cast<void *>(addr), Size, ret); 361 return NULL; 362 } 363 364 return reinterpret_cast<void *>(addr); 365 } 366 367 // Pass the data content to the target device using the target address. 368 // In case of success, return zero. Otherwise, return an error code. 369 int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr, 370 int64_t Size) { 371 int ret = veo_write_mem(DeviceInfo.ProcHandles[ID], (uint64_t)TargetPtr, 372 HostPtr, (size_t)Size); 373 if (ret != 0) { 374 DP("veo_write_mem() failed with error code %d\n", ret); 375 return OFFLOAD_FAIL; 376 } 377 return OFFLOAD_SUCCESS; 378 } 379 380 // Retrieve the data content from the target device using its address. 381 // In case of success, return zero. Otherwise, return an error code. 382 int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr, 383 int64_t Size) { 384 int ret = veo_read_mem(DeviceInfo.ProcHandles[ID], HostPtr, 385 (uint64_t)TargetPtr, Size); 386 if (ret != 0) { 387 DP("veo_read_mem() failed with error code %d\n", ret); 388 return OFFLOAD_FAIL; 389 } 390 return OFFLOAD_SUCCESS; 391 } 392 393 // De-allocate the data referenced by target ptr on the device. In case of 394 // success, return zero. Otherwise, return an error code. 395 int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr) { 396 int ret = veo_free_mem(DeviceInfo.ProcHandles[ID], (uint64_t)TargetPtr); 397 398 if (ret != 0) { 399 DP("veo_free_mem() failed with error code %d\n", ret); 400 return OFFLOAD_FAIL; 401 } 402 return OFFLOAD_SUCCESS; 403 } 404 405 // Similar to __tgt_rtl_run_target_region, but additionally specify the 406 // number of teams to be created and a number of threads in each team. 407 int32_t __tgt_rtl_run_target_team_region(int32_t ID, void *Entry, void **Args, 408 ptrdiff_t *Offsets, int32_t NumArgs, 409 int32_t NumTeams, int32_t ThreadLimit, 410 uint64_t loop_tripcount) { 411 int ret; 412 413 // ignore team num and thread limit. 414 std::vector<void *> ptrs(NumArgs); 415 416 struct veo_args *TargetArgs; 417 TargetArgs = veo_args_alloc(); 418 419 if (TargetArgs == NULL) { 420 DP("Could not allocate VEO args\n"); 421 return OFFLOAD_FAIL; 422 } 423 424 for (int i = 0; i < NumArgs; ++i) { 425 ret = veo_args_set_u64(TargetArgs, i, (intptr_t)Args[i]); 426 427 if (ret != 0) { 428 DP("veo_args_set_u64() has returned %d for argnum=%d and value %p\n", ret, 429 i, Args[i]); 430 return OFFLOAD_FAIL; 431 } 432 } 433 434 uint64_t RetVal; 435 if (target_run_function_wait(ID, reinterpret_cast<uint64_t>(Entry), 436 TargetArgs, &RetVal) != OFFLOAD_SUCCESS) { 437 veo_args_free(TargetArgs); 438 return OFFLOAD_FAIL; 439 } 440 veo_args_free(TargetArgs); 441 return OFFLOAD_SUCCESS; 442 } 443 444 // Transfer control to the offloaded entry Entry on the target device. 445 // Args and Offsets are arrays of NumArgs size of target addresses and 446 // offsets. An offset should be added to the target address before passing it 447 // to the outlined function on device side. In case of success, return zero. 448 // Otherwise, return an error code. 449 int32_t __tgt_rtl_run_target_region(int32_t ID, void *Entry, void **Args, 450 ptrdiff_t *Offsets, int32_t NumArgs) { 451 return __tgt_rtl_run_target_team_region(ID, Entry, Args, Offsets, NumArgs, 1, 452 1, 0); 453 } 454 455 int32_t __tgt_rtl_supports_empty_images() { return 1; } 456 457 // VEC plugin's internal InfoLevel. 458 std::atomic<uint32_t> InfoLevel; 459