1 //===-RTLs/nec-aurora/src/rtl.cpp - Target RTLs Implementation - C++ -*-======//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is dual licensed under the MIT and the University of Illinois Open
6 // Source Licenses. See LICENSE.txt for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // RTL for NEC Aurora TSUBASA machines
11 //
12 //===----------------------------------------------------------------------===//
13 
#include "Debug.h"
#include "omptargetplugin.h"

#include <algorithm>
#include <cassert>
#include <cerrno>
#include <cstdio>
#include <cstring>
#include <list>
#include <stdlib.h>
#include <string>
#include <sys/stat.h>
#include <unistd.h>
#include <ve_offload.h>
#include <vector>
#include <veosinfo/veosinfo.h>
28 
29 #ifndef TARGET_ELF_ID
30 #define TARGET_ELF_ID 0
31 #endif
32 
33 #define TARGET_NAME VE
34 
35 #define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
36 
37 #include "../../common/elf_common.c"
38 
/// Bookkeeping record for one target image that was handed to VEO.
///
/// Instances are brace-initialized as {FileName, VeoLibHandle}, so the order
/// of the members must not change.
struct DynLibTy {
  // Path of the temporary file the image was written to. The destructor of
  // RTLDeviceInfoTy passes this pointer to remove(); the struct itself neither
  // copies nor frees the string.
  char *FileName;
  // Handle returned by veo_load_library(), or 0 when the image was not loaded
  // as a dynamic library.
  uint64_t VeoLibHandle;
};
43 
/// Keep entries table per device.
///
/// Entries owns the storage of the translated offload entries; Table holds the
/// begin/end pointers into Entries that are handed back to libomptarget.
struct FuncOrGblEntryTy {
  // Begin/end view onto Entries (set up by buildOffloadTableFromHost()).
  __tgt_target_table Table;
  // One entry per host offload entry, carrying the target-side address.
  std::vector<__tgt_offload_entry> Entries;
};
49 
50 class RTLDeviceInfoTy {
51   std::vector<std::list<FuncOrGblEntryTy>> FuncOrGblEntry;
52 
53 public:
54   std::vector<struct veo_proc_handle *> ProcHandles;
55   std::vector<struct veo_thr_ctxt *> Contexts;
56   std::vector<uint64_t> LibraryHandles;
57   std::list<DynLibTy> DynLibs;
58   // Maps OpenMP device Ids to Ve nodeids
59   std::vector<int> NodeIds;
60 
61   void buildOffloadTableFromHost(int32_t device_id, uint64_t VeoLibHandle,
62                                  __tgt_offload_entry *HostBegin,
63                                  __tgt_offload_entry *HostEnd) {
64     FuncOrGblEntry[device_id].emplace_back();
65     std::vector<__tgt_offload_entry> &T =
66         FuncOrGblEntry[device_id].back().Entries;
67     T.clear();
68     for (__tgt_offload_entry *i = HostBegin; i != HostEnd; ++i) {
69       char *SymbolName = i->name;
70       // we have not enough access to the target memory to conveniently parse
71       // the offload table there so we need to lookup every symbol with the host
72       // table
73       DP("Looking up symbol: %s\n", SymbolName);
74       uint64_t SymbolTargetAddr =
75           veo_get_sym(ProcHandles[device_id], VeoLibHandle, SymbolName);
76       __tgt_offload_entry Entry;
77 
78       if (!SymbolTargetAddr) {
79         DP("Symbol %s not found in target image\n", SymbolName);
80         Entry = {NULL, NULL, 0, 0, 0};
81       } else {
82         DP("Found symbol %s successfully in target image (addr: %p)\n",
83            SymbolName, reinterpret_cast<void *>(SymbolTargetAddr));
84         Entry = { reinterpret_cast<void *>(SymbolTargetAddr),
85                   i->name,
86                   i->size,
87                   i->flags,
88                   0 };
89       }
90 
91       T.push_back(Entry);
92     }
93 
94     FuncOrGblEntry[device_id].back().Table.EntriesBegin = &T.front();
95     FuncOrGblEntry[device_id].back().Table.EntriesEnd = &T.back() + 1;
96   }
97 
98   __tgt_target_table *getOffloadTable(int32_t device_id) {
99     return &FuncOrGblEntry[device_id].back().Table;
100   }
101 
102   RTLDeviceInfoTy() {
103 
104     struct ve_nodeinfo node_info;
105     ve_node_info(&node_info);
106 
107     // Build a predictable mapping between VE node ids and OpenMP device ids.
108     // This is necessary, because nodes can be missing or offline and (active)
109     // node ids are thus not consecutive. The entries in ve_nodeinfo may also
110     // not be in the order of their node ids.
111     for (int i = 0; i < node_info.total_node_count; ++i) {
112       if (node_info.status[i] == 0) {
113         NodeIds.push_back(node_info.nodeid[i]);
114       }
115     }
116 
117     // Because the entries in ve_nodeinfo may not be in the order of their node
118     // ids, we sort NodeIds to get a predictable mapping.
119     std::sort(NodeIds.begin(), NodeIds.end());
120 
121     int NumDevices = NodeIds.size();
122     DP("Found %i VE devices\n", NumDevices);
123     ProcHandles.resize(NumDevices, NULL);
124     Contexts.resize(NumDevices, NULL);
125     FuncOrGblEntry.resize(NumDevices);
126     LibraryHandles.resize(NumDevices);
127   }
128 
129   ~RTLDeviceInfoTy() {
130     for (auto &ctx : Contexts) {
131       if (ctx != NULL) {
132         if (veo_context_close(ctx) != 0) {
133           DP("Failed to close VEO context.\n");
134         }
135       }
136     }
137 
138     for (auto &hdl : ProcHandles) {
139       if (hdl != NULL) {
140         veo_proc_destroy(hdl);
141       }
142     }
143 
144     for (auto &lib : DynLibs) {
145       if (lib.FileName) {
146         remove(lib.FileName);
147       }
148     }
149   }
150 };
151 
// The single, file-local device registry used by all __tgt_rtl_* entry points
// below. Its constructor queries the VE nodes, its destructor releases the VEO
// resources and removes the temporary image files.
static RTLDeviceInfoTy DeviceInfo;
153 
154 static int target_run_function_wait(uint32_t DeviceID, uint64_t FuncAddr,
155                                     struct veo_args *args, uint64_t *RetVal) {
156   DP("Running function with entry point %p\n",
157      reinterpret_cast<void *>(FuncAddr));
158   uint64_t RequestHandle =
159       veo_call_async(DeviceInfo.Contexts[DeviceID], FuncAddr, args);
160   if (RequestHandle == VEO_REQUEST_ID_INVALID) {
161     DP("Execution of entry point %p failed\n",
162        reinterpret_cast<void *>(FuncAddr));
163     return OFFLOAD_FAIL;
164   }
165 
166   DP("Function at address %p called (VEO request ID: %" PRIu64 ")\n",
167      reinterpret_cast<void *>(FuncAddr), RequestHandle);
168 
169   int ret = veo_call_wait_result(DeviceInfo.Contexts[DeviceID], RequestHandle,
170                                  RetVal);
171   if (ret != 0) {
172     DP("Waiting for entry point %p failed (Error code %d)\n",
173        reinterpret_cast<void *>(FuncAddr), ret);
174     return OFFLOAD_FAIL;
175   }
176   return OFFLOAD_SUCCESS;
177 }
178 
179 
180 // Return the number of available devices of the type supported by the
181 // target RTL.
182 int32_t __tgt_rtl_number_of_devices(void) { return DeviceInfo.NodeIds.size(); }
183 
184 // Return an integer different from zero if the provided device image can be
185 // supported by the runtime. The functionality is similar to comparing the
186 // result of __tgt__rtl__load__binary to NULL. However, this is meant to be a
187 // lightweight query to determine if the RTL is suitable for an image without
188 // having to load the library, which can be expensive.
189 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
190 #if TARGET_ELF_ID < 1
191   return 0;
192 #else
193   return elf_check_machine(Image, TARGET_ELF_ID);
194 #endif
195 }
196 
197 // Initialize the specified device. In case of success return 0; otherwise
198 // return an error code.
199 int32_t __tgt_rtl_init_device(int32_t ID) {
200   DP("Available VEO version: %i\n", veo_api_version());
201 
202   // At the moment we do not really initialize (i.e. create a process or
203   // context on) the device here, but in "__tgt_rtl_load_binary".
204   // The reason for this is, that, when we create a process for a statically
205   // linked binary, the VEO api needs us to already supply the binary (but we
206   // can load a dynamically linked binary later, after we create the process).
207   // At this stage, we cannot check if we have a dynamically or statically
208   // linked binary so we defer process creation until we know.
209   return OFFLOAD_SUCCESS;
210 }
211 
212 // Pass an executable image section described by image to the specified
213 // device and prepare an address table of target entities. In case of error,
214 // return NULL. Otherwise, return a pointer to the built address table.
215 // Individual entries in the table may also be NULL, when the corresponding
216 // offload region is not supported on the target device.
217 __tgt_target_table *__tgt_rtl_load_binary(int32_t ID,
218                                           __tgt_device_image *Image) {
219   DP("Dev %d: load binary from " DPxMOD " image\n", ID,
220      DPxPTR(Image->ImageStart));
221 
222   assert(ID >= 0 && "bad dev id");
223 
224   size_t ImageSize = (size_t)Image->ImageEnd - (size_t)Image->ImageStart;
225   size_t NumEntries = (size_t)(Image->EntriesEnd - Image->EntriesBegin);
226   DP("Expecting to have %zd entries defined.\n", NumEntries);
227 
228   // load dynamic library and get the entry points. We use the dl library
229   // to do the loading of the library, but we could do it directly to avoid the
230   // dump to the temporary file.
231   //
232   // 1) Create tmp file with the library contents.
233   // 2) Use dlopen to load the file and dlsym to retrieve the symbols.
234   char tmp_name[] = "/tmp/tmpfile_XXXXXX";
235   int tmp_fd = mkstemp(tmp_name);
236 
237   if (tmp_fd == -1) {
238     return NULL;
239   }
240 
241   FILE *ftmp = fdopen(tmp_fd, "wb");
242 
243   if (!ftmp) {
244     DP("fdopen() for %s failed. Could not write target image\n", tmp_name);
245     return NULL;
246   }
247 
248   fwrite(Image->ImageStart, ImageSize, 1, ftmp);
249 
250   // at least for the static case we need to change the permissions
251   chmod(tmp_name, 0700);
252 
253   DP("Wrote target image to %s. ImageSize=%zu\n", tmp_name, ImageSize);
254 
255   fclose(ftmp);
256 
257   // See comment in "__tgt_rtl_init_device"
258   bool is_dyn = true;
259   if (DeviceInfo.ProcHandles[ID] == NULL) {
260     struct veo_proc_handle *proc_handle;
261     is_dyn = elf_is_dynamic(Image);
262     // If we have a dynamically linked image, we create the process handle, then
263     // the thread, and then load the image.
264     // If we have a statically linked image, we need to create the process
265     // handle and load the image at the same time with veo_proc_create_static().
266     if (is_dyn) {
267       proc_handle = veo_proc_create(DeviceInfo.NodeIds[ID]);
268       if (!proc_handle) {
269         DP("veo_proc_create() failed for device %d\n", ID);
270         return NULL;
271       }
272     } else {
273       proc_handle = veo_proc_create_static(DeviceInfo.NodeIds[ID], tmp_name);
274       if (!proc_handle) {
275         DP("veo_proc_create_static() failed for device %d, image=%s\n", ID,
276            tmp_name);
277         return NULL;
278       }
279     }
280     DeviceInfo.ProcHandles[ID] = proc_handle;
281   }
282 
283   if (DeviceInfo.Contexts[ID] == NULL) {
284     struct veo_thr_ctxt *ctx = veo_context_open(DeviceInfo.ProcHandles[ID]);
285 
286     if (!ctx) {
287       DP("veo_context_open() failed: %s\n", std::strerror(errno));
288       return NULL;
289     }
290 
291     DeviceInfo.Contexts[ID] = ctx;
292   }
293 
294   DP("Aurora device successfully initialized with loaded binary: "
295      "proc_handle=%p, ctx=%p\n",
296      DeviceInfo.ProcHandles[ID], DeviceInfo.Contexts[ID]);
297 
298   uint64_t LibHandle = 0UL;
299   if (is_dyn) {
300     LibHandle = veo_load_library(DeviceInfo.ProcHandles[ID], tmp_name);
301 
302     if (!LibHandle) {
303       DP("veo_load_library() failed: LibHandle=%" PRIu64
304          " Name=%s. Set env VEORUN_BIN for static linked target code.\n",
305          LibHandle, tmp_name);
306       return NULL;
307     }
308 
309     DP("Successfully loaded library dynamically\n");
310   } else {
311     DP("Symbol table is expected to have been created by "
312        "veo_create_proc_static()\n");
313   }
314 
315   DynLibTy Lib = {tmp_name, LibHandle};
316   DeviceInfo.DynLibs.push_back(Lib);
317   DeviceInfo.LibraryHandles[ID] = LibHandle;
318 
319   DeviceInfo.buildOffloadTableFromHost(ID, LibHandle, Image->EntriesBegin,
320                                        Image->EntriesEnd);
321 
322   return DeviceInfo.getOffloadTable(ID);
323 }
324 
325 // Allocate data on the particular target device, of the specified size.
326 // HostPtr is a address of the host data the allocated target data
327 // will be associated with (HostPtr may be NULL if it is not known at
328 // allocation time, like for example it would be for target data that
329 // is allocated by omp_target_alloc() API). Return address of the
330 // allocated data on the target that will be used by libomptarget.so to
331 // initialize the target data mapping structures. These addresses are
332 // used to generate a table of target variables to pass to
333 // __tgt_rtl_run_region(). The __tgt_rtl_data_alloc() returns NULL in
334 // case an error occurred on the target device.
335 void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr) {
336   int ret;
337   uint64_t addr;
338 
339   if (DeviceInfo.ProcHandles[ID] == NULL) {
340     struct veo_proc_handle *proc_handle;
341     proc_handle = veo_proc_create(DeviceInfo.NodeIds[ID]);
342     if (!proc_handle) {
343       DP("veo_proc_create() failed for device %d\n", ID);
344       return NULL;
345     }
346     DeviceInfo.ProcHandles[ID] = proc_handle;
347     DP("Aurora device successfully initialized: proc_handle=%p", proc_handle);
348   }
349 
350   ret = veo_alloc_mem(DeviceInfo.ProcHandles[ID], &addr, Size);
351   DP("Allocate target memory: device=%d, target addr=%p, size=%" PRIu64 "\n",
352      ID, reinterpret_cast<void *>(addr), Size);
353   if (ret != 0) {
354     DP("veo_alloc_mem(%d, %p, %" PRIu64 ") failed with error code %d\n",
355        ID, reinterpret_cast<void *>(addr), Size, ret);
356     return NULL;
357   }
358 
359   return reinterpret_cast<void *>(addr);
360 }
361 
362 // Pass the data content to the target device using the target address.
363 // In case of success, return zero. Otherwise, return an error code.
364 int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr,
365                               int64_t Size) {
366   int ret = veo_write_mem(DeviceInfo.ProcHandles[ID], (uint64_t)TargetPtr,
367                           HostPtr, (size_t)Size);
368   if (ret != 0) {
369     DP("veo_write_mem() failed with error code %d\n", ret);
370     return OFFLOAD_FAIL;
371   }
372   return OFFLOAD_SUCCESS;
373 }
374 
375 // Retrieve the data content from the target device using its address.
376 // In case of success, return zero. Otherwise, return an error code.
377 int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr,
378                                 int64_t Size) {
379   int ret = veo_read_mem(DeviceInfo.ProcHandles[ID], HostPtr,
380                          (uint64_t)TargetPtr, Size);
381   if (ret != 0) {
382     DP("veo_read_mem() failed with error code %d\n", ret);
383     return OFFLOAD_FAIL;
384   }
385   return OFFLOAD_SUCCESS;
386 }
387 
388 // De-allocate the data referenced by target ptr on the device. In case of
389 // success, return zero. Otherwise, return an error code.
390 int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr) {
391   int ret =  veo_free_mem(DeviceInfo.ProcHandles[ID], (uint64_t)TargetPtr);
392 
393   if (ret != 0) {
394     DP("veo_free_mem() failed with error code %d\n", ret);
395     return OFFLOAD_FAIL;
396   }
397   return OFFLOAD_SUCCESS;
398 }
399 
400 // Similar to __tgt_rtl_run_target_region, but additionally specify the
401 // number of teams to be created and a number of threads in each team.
402 int32_t __tgt_rtl_run_target_team_region(int32_t ID, void *Entry, void **Args,
403                                          ptrdiff_t *Offsets, int32_t NumArgs,
404                                          int32_t NumTeams, int32_t ThreadLimit,
405                                          uint64_t loop_tripcount) {
406   int ret;
407 
408   // ignore team num and thread limit.
409   std::vector<void *> ptrs(NumArgs);
410 
411   struct veo_args *TargetArgs;
412   TargetArgs = veo_args_alloc();
413 
414   if (TargetArgs == NULL) {
415     DP("Could not allocate VEO args\n");
416     return OFFLOAD_FAIL;
417   }
418 
419   for (int i = 0; i < NumArgs; ++i) {
420     ret = veo_args_set_u64(TargetArgs, i, (intptr_t)Args[i]);
421 
422     if (ret != 0) {
423       DP("veo_args_set_u64() has returned %d for argnum=%d and value %p\n",
424          ret, i, Args[i]);
425       return OFFLOAD_FAIL;
426     }
427   }
428 
429   uint64_t RetVal;
430   if (target_run_function_wait(ID, reinterpret_cast<uint64_t>(Entry),
431                                TargetArgs, &RetVal) != OFFLOAD_SUCCESS) {
432     veo_args_free(TargetArgs);
433     return OFFLOAD_FAIL;
434   }
435   veo_args_free(TargetArgs);
436   return OFFLOAD_SUCCESS;
437 }
438 
439 // Transfer control to the offloaded entry Entry on the target device.
440 // Args and Offsets are arrays of NumArgs size of target addresses and
441 // offsets. An offset should be added to the target address before passing it
442 // to the outlined function on device side. In case of success, return zero.
443 // Otherwise, return an error code.
444 int32_t __tgt_rtl_run_target_region(int32_t ID, void *Entry, void **Args,
445                                     ptrdiff_t *Offsets, int32_t NumArgs) {
446   return __tgt_rtl_run_target_team_region(ID, Entry, Args, Offsets, NumArgs, 1,
447                                           1, 0);
448 }
449