1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright 2018 6WIND S.A.
3 * Copyright 2018 Mellanox Technologies, Ltd
4 */
5
6 #include <errno.h>
7 #include <linux/if_link.h>
8 #include <linux/rtnetlink.h>
9 #include <linux/genetlink.h>
10 #include <net/if.h>
11 #include <rdma/rdma_netlink.h>
12 #include <stdbool.h>
13 #include <stdint.h>
14 #include <stdlib.h>
15 #include <stdalign.h>
16 #include <string.h>
17 #include <sys/socket.h>
18 #include <unistd.h>
19
20 #include <rte_errno.h>
21
22 #include "mlx5_nl.h"
23 #include "mlx5_common_utils.h"
24 #include "mlx5_malloc.h"
25 #ifdef HAVE_DEVLINK
26 #include <linux/devlink.h>
27 #endif
28
29
30 /* Size of the buffer to receive kernel messages */
31 #define MLX5_NL_BUF_SIZE (32 * 1024)
32 /* Send buffer size for the Netlink socket */
33 #define MLX5_SEND_BUF_SIZE 32768
34 /* Receive buffer size for the Netlink socket */
35 #define MLX5_RECV_BUF_SIZE 32768
36
37 /** Parameters of VLAN devices created by driver. */
38 #define MLX5_VMWA_VLAN_DEVICE_PFX "evmlx"
/*
 * Define NDA_RTA the same way the iproute2 sources do.
 *
 * See include/libnetlink.h in the iproute2 sources.
 */
44 #ifndef MLX5_NDA_RTA
45 #define MLX5_NDA_RTA(r) \
46 ((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg))))
47 #endif
/*
 * Define NLMSG_TAIL the same way the iproute2 sources do.
 *
 * See include/libnetlink.h in the iproute2 sources.
 */
53 #ifndef NLMSG_TAIL
54 #define NLMSG_TAIL(nmsg) \
55 ((struct rtattr *)(((char *)(nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len)))
56 #endif
57 /*
58 * The following definitions are normally found in rdma/rdma_netlink.h,
59 * however they are so recent that most systems do not expose them yet.
60 */
61 #ifndef HAVE_RDMA_NL_NLDEV
62 #define RDMA_NL_NLDEV 5
63 #endif
64 #ifndef HAVE_RDMA_NLDEV_CMD_GET
65 #define RDMA_NLDEV_CMD_GET 1
66 #endif
67 #ifndef HAVE_RDMA_NLDEV_CMD_PORT_GET
68 #define RDMA_NLDEV_CMD_PORT_GET 5
69 #endif
70 #ifndef HAVE_RDMA_NLDEV_ATTR_DEV_INDEX
71 #define RDMA_NLDEV_ATTR_DEV_INDEX 1
72 #endif
73 #ifndef HAVE_RDMA_NLDEV_ATTR_DEV_NAME
74 #define RDMA_NLDEV_ATTR_DEV_NAME 2
75 #endif
76 #ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX
77 #define RDMA_NLDEV_ATTR_PORT_INDEX 3
78 #endif
79 #ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX
80 #define RDMA_NLDEV_ATTR_NDEV_INDEX 50
81 #endif
82
83 /* These are normally found in linux/if_link.h. */
84 #ifndef HAVE_IFLA_NUM_VF
85 #define IFLA_NUM_VF 21
86 #endif
87 #ifndef HAVE_IFLA_EXT_MASK
88 #define IFLA_EXT_MASK 29
89 #endif
90 #ifndef HAVE_IFLA_PHYS_SWITCH_ID
91 #define IFLA_PHYS_SWITCH_ID 36
92 #endif
93 #ifndef HAVE_IFLA_PHYS_PORT_NAME
94 #define IFLA_PHYS_PORT_NAME 38
95 #endif
96
/*
 * Some Devlink definitions may be missing from the headers of older
 * kernels; provide fallback values for the ones used here.
 */
101 #ifndef DEVLINK_GENL_NAME
102 #define DEVLINK_GENL_NAME "devlink"
103 #endif
104 #ifndef DEVLINK_GENL_VERSION
105 #define DEVLINK_GENL_VERSION 1
106 #endif
107 #ifndef DEVLINK_ATTR_BUS_NAME
108 #define DEVLINK_ATTR_BUS_NAME 1
109 #endif
110 #ifndef DEVLINK_ATTR_DEV_NAME
111 #define DEVLINK_ATTR_DEV_NAME 2
112 #endif
113 #ifndef DEVLINK_ATTR_PARAM
114 #define DEVLINK_ATTR_PARAM 80
115 #endif
116 #ifndef DEVLINK_ATTR_PARAM_NAME
117 #define DEVLINK_ATTR_PARAM_NAME 81
118 #endif
119 #ifndef DEVLINK_ATTR_PARAM_TYPE
120 #define DEVLINK_ATTR_PARAM_TYPE 83
121 #endif
122 #ifndef DEVLINK_ATTR_PARAM_VALUES_LIST
123 #define DEVLINK_ATTR_PARAM_VALUES_LIST 84
124 #endif
125 #ifndef DEVLINK_ATTR_PARAM_VALUE
126 #define DEVLINK_ATTR_PARAM_VALUE 85
127 #endif
128 #ifndef DEVLINK_ATTR_PARAM_VALUE_DATA
129 #define DEVLINK_ATTR_PARAM_VALUE_DATA 86
130 #endif
131 #ifndef DEVLINK_ATTR_PARAM_VALUE_CMODE
132 #define DEVLINK_ATTR_PARAM_VALUE_CMODE 87
133 #endif
134 #ifndef DEVLINK_PARAM_CMODE_DRIVERINIT
135 #define DEVLINK_PARAM_CMODE_DRIVERINIT 1
136 #endif
137 #ifndef DEVLINK_CMD_RELOAD
138 #define DEVLINK_CMD_RELOAD 37
139 #endif
140 #ifndef DEVLINK_CMD_PARAM_GET
141 #define DEVLINK_CMD_PARAM_GET 38
142 #endif
143 #ifndef DEVLINK_CMD_PARAM_SET
144 #define DEVLINK_CMD_PARAM_SET 39
145 #endif
146 #ifndef NLA_FLAG
147 #define NLA_FLAG 6
148 #endif
149
150 /* Add/remove MAC address through Netlink */
151 struct mlx5_nl_mac_addr {
152 struct rte_ether_addr (*mac)[];
153 /**< MAC address handled by the device. */
154 int mac_n; /**< Number of addresses in the array. */
155 };
156
/*
 * Bits stored in mlx5_nl_ifindex_data::flags by mlx5_nl_cmdget_cb(),
 * one per attribute recognized in a reply.
 */
#define MLX5_NL_CMD_GET_IB_NAME (1 << 0)
#define MLX5_NL_CMD_GET_IB_INDEX (1 << 1)
#define MLX5_NL_CMD_GET_NET_INDEX (1 << 2)
#define MLX5_NL_CMD_GET_PORT_INDEX (1 << 3)

/** Query context shared with mlx5_nl_cmdget_cb(). */
struct mlx5_nl_ifindex_data {
	/** IB device name to look for (in). */
	const char *name;
	/** MLX5_NL_CMD_GET_* bits of the attributes found (out). */
	uint32_t flags;
	/** IB device index (out). */
	uint32_t ibindex;
	/** Network interface index (out). */
	uint32_t ifindex;
	/** IB device max port number (out). */
	uint32_t portnum;
};
170
/*
 * Counter backing the Netlink sequence numbers handed out below.
 * NOTE(review): it has external linkage although only this file is seen
 * using it - confirm no other translation unit refers to it before
 * narrowing it to static.
 */
uint32_t atomic_sn;

/* Atomically increment the counter and yield the new sequence number. */
#define MLX5_NL_SN_GENERATE \
	(__atomic_add_fetch(&atomic_sn, 1, __ATOMIC_RELAXED))
175
176 /**
177 * Opens a Netlink socket.
178 *
179 * @param protocol
180 * Netlink protocol (e.g. NETLINK_ROUTE, NETLINK_RDMA).
181 *
182 * @return
183 * A file descriptor on success, a negative errno value otherwise and
184 * rte_errno is set.
185 */
186 int
mlx5_nl_init(int protocol)187 mlx5_nl_init(int protocol)
188 {
189 int fd;
190 int sndbuf_size = MLX5_SEND_BUF_SIZE;
191 int rcvbuf_size = MLX5_RECV_BUF_SIZE;
192 struct sockaddr_nl local = {
193 .nl_family = AF_NETLINK,
194 };
195 int ret;
196
197 fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol);
198 if (fd == -1) {
199 rte_errno = errno;
200 return -rte_errno;
201 }
202 ret = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, sizeof(int));
203 if (ret == -1) {
204 rte_errno = errno;
205 goto error;
206 }
207 ret = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, sizeof(int));
208 if (ret == -1) {
209 rte_errno = errno;
210 goto error;
211 }
212 ret = bind(fd, (struct sockaddr *)&local, sizeof(local));
213 if (ret == -1) {
214 rte_errno = errno;
215 goto error;
216 }
217 return fd;
218 error:
219 close(fd);
220 return -rte_errno;
221 }
222
223 /**
224 * Send a request message to the kernel on the Netlink socket.
225 *
226 * @param[in] nlsk_fd
227 * Netlink socket file descriptor.
228 * @param[in] nh
229 * The Netlink message send to the kernel.
230 * @param[in] ssn
231 * Sequence number.
232 * @param[in] req
233 * Pointer to the request structure.
234 * @param[in] len
235 * Length of the request in bytes.
236 *
237 * @return
238 * The number of sent bytes on success, a negative errno value otherwise and
239 * rte_errno is set.
240 */
241 static int
mlx5_nl_request(int nlsk_fd,struct nlmsghdr * nh,uint32_t sn,void * req,int len)242 mlx5_nl_request(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn, void *req,
243 int len)
244 {
245 struct sockaddr_nl sa = {
246 .nl_family = AF_NETLINK,
247 };
248 struct iovec iov[2] = {
249 { .iov_base = nh, .iov_len = sizeof(*nh), },
250 { .iov_base = req, .iov_len = len, },
251 };
252 struct msghdr msg = {
253 .msg_name = &sa,
254 .msg_namelen = sizeof(sa),
255 .msg_iov = iov,
256 .msg_iovlen = 2,
257 };
258 int send_bytes;
259
260 nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
261 nh->nlmsg_seq = sn;
262 send_bytes = sendmsg(nlsk_fd, &msg, 0);
263 if (send_bytes < 0) {
264 rte_errno = errno;
265 return -rte_errno;
266 }
267 return send_bytes;
268 }
269
270 /**
271 * Send a message to the kernel on the Netlink socket.
272 *
273 * @param[in] nlsk_fd
274 * The Netlink socket file descriptor used for communication.
275 * @param[in] nh
276 * The Netlink message send to the kernel.
277 * @param[in] sn
278 * Sequence number.
279 *
280 * @return
281 * The number of sent bytes on success, a negative errno value otherwise and
282 * rte_errno is set.
283 */
284 static int
mlx5_nl_send(int nlsk_fd,struct nlmsghdr * nh,uint32_t sn)285 mlx5_nl_send(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn)
286 {
287 struct sockaddr_nl sa = {
288 .nl_family = AF_NETLINK,
289 };
290 struct iovec iov = {
291 .iov_base = nh,
292 .iov_len = nh->nlmsg_len,
293 };
294 struct msghdr msg = {
295 .msg_name = &sa,
296 .msg_namelen = sizeof(sa),
297 .msg_iov = &iov,
298 .msg_iovlen = 1,
299 };
300 int send_bytes;
301
302 nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
303 nh->nlmsg_seq = sn;
304 send_bytes = sendmsg(nlsk_fd, &msg, 0);
305 if (send_bytes < 0) {
306 rte_errno = errno;
307 return -rte_errno;
308 }
309 return send_bytes;
310 }
311
312 /**
313 * Receive a message from the kernel on the Netlink socket, following
314 * mlx5_nl_send().
315 *
316 * @param[in] nlsk_fd
317 * The Netlink socket file descriptor used for communication.
318 * @param[in] sn
319 * Sequence number.
320 * @param[in] cb
321 * The callback function to call for each Netlink message received.
322 * @param[in, out] arg
323 * Custom arguments for the callback.
324 *
325 * @return
326 * 0 on success, a negative errno value otherwise and rte_errno is set.
327 */
328 static int
mlx5_nl_recv(int nlsk_fd,uint32_t sn,int (* cb)(struct nlmsghdr *,void * arg),void * arg)329 mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg),
330 void *arg)
331 {
332 struct sockaddr_nl sa;
333 void *buf = mlx5_malloc(0, MLX5_RECV_BUF_SIZE, 0, SOCKET_ID_ANY);
334 struct iovec iov = {
335 .iov_base = buf,
336 .iov_len = MLX5_RECV_BUF_SIZE,
337 };
338 struct msghdr msg = {
339 .msg_name = &sa,
340 .msg_namelen = sizeof(sa),
341 .msg_iov = &iov,
342 /* One message at a time */
343 .msg_iovlen = 1,
344 };
345 int multipart = 0;
346 int ret = 0;
347
348 if (!buf) {
349 rte_errno = ENOMEM;
350 return -rte_errno;
351 }
352 do {
353 struct nlmsghdr *nh;
354 int recv_bytes = 0;
355
356 do {
357 recv_bytes = recvmsg(nlsk_fd, &msg, 0);
358 if (recv_bytes == -1) {
359 rte_errno = errno;
360 ret = -rte_errno;
361 goto exit;
362 }
363 nh = (struct nlmsghdr *)buf;
364 } while (nh->nlmsg_seq != sn);
365 for (;
366 NLMSG_OK(nh, (unsigned int)recv_bytes);
367 nh = NLMSG_NEXT(nh, recv_bytes)) {
368 if (nh->nlmsg_type == NLMSG_ERROR) {
369 struct nlmsgerr *err_data = NLMSG_DATA(nh);
370
371 if (err_data->error < 0) {
372 rte_errno = -err_data->error;
373 ret = -rte_errno;
374 goto exit;
375 }
376 /* Ack message. */
377 ret = 0;
378 goto exit;
379 }
380 /* Multi-part msgs and their trailing DONE message. */
381 if (nh->nlmsg_flags & NLM_F_MULTI) {
382 if (nh->nlmsg_type == NLMSG_DONE) {
383 ret = 0;
384 goto exit;
385 }
386 multipart = 1;
387 }
388 if (cb) {
389 ret = cb(nh, arg);
390 if (ret < 0)
391 goto exit;
392 }
393 }
394 } while (multipart);
395 exit:
396 mlx5_free(buf);
397 return ret;
398 }
399
400 /**
401 * Parse Netlink message to retrieve the bridge MAC address.
402 *
403 * @param nh
404 * Pointer to Netlink Message Header.
405 * @param arg
406 * PMD data register with this callback.
407 *
408 * @return
409 * 0 on success, a negative errno value otherwise and rte_errno is set.
410 */
411 static int
mlx5_nl_mac_addr_cb(struct nlmsghdr * nh,void * arg)412 mlx5_nl_mac_addr_cb(struct nlmsghdr *nh, void *arg)
413 {
414 struct mlx5_nl_mac_addr *data = arg;
415 struct ndmsg *r = NLMSG_DATA(nh);
416 struct rtattr *attribute;
417 int len;
418
419 len = nh->nlmsg_len - NLMSG_LENGTH(sizeof(*r));
420 for (attribute = MLX5_NDA_RTA(r);
421 RTA_OK(attribute, len);
422 attribute = RTA_NEXT(attribute, len)) {
423 if (attribute->rta_type == NDA_LLADDR) {
424 if (data->mac_n == MLX5_MAX_MAC_ADDRESSES) {
425 DRV_LOG(WARNING,
426 "not enough room to finalize the"
427 " request");
428 rte_errno = ENOMEM;
429 return -rte_errno;
430 }
431 #ifdef RTE_LIBRTE_MLX5_DEBUG
432 char m[RTE_ETHER_ADDR_FMT_SIZE];
433
434 rte_ether_format_addr(m, RTE_ETHER_ADDR_FMT_SIZE,
435 RTA_DATA(attribute));
436 DRV_LOG(DEBUG, "bridge MAC address %s", m);
437 #endif
438 memcpy(&(*data->mac)[data->mac_n++],
439 RTA_DATA(attribute), RTE_ETHER_ADDR_LEN);
440 }
441 }
442 return 0;
443 }
444
445 /**
446 * Get bridge MAC addresses.
447 *
448 * @param[in] nlsk_fd
449 * Netlink socket file descriptor.
450 * @param[in] iface_idx
451 * Net device interface index.
452 * @param mac[out]
453 * Pointer to the array table of MAC addresses to fill.
454 * Its size should be of MLX5_MAX_MAC_ADDRESSES.
455 * @param mac_n[out]
456 * Number of entries filled in MAC array.
457 *
458 * @return
459 * 0 on success, a negative errno value otherwise and rte_errno is set.
460 */
461 static int
mlx5_nl_mac_addr_list(int nlsk_fd,unsigned int iface_idx,struct rte_ether_addr (* mac)[],int * mac_n)462 mlx5_nl_mac_addr_list(int nlsk_fd, unsigned int iface_idx,
463 struct rte_ether_addr (*mac)[], int *mac_n)
464 {
465 struct {
466 struct nlmsghdr hdr;
467 struct ifinfomsg ifm;
468 } req = {
469 .hdr = {
470 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
471 .nlmsg_type = RTM_GETNEIGH,
472 .nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
473 },
474 .ifm = {
475 .ifi_family = PF_BRIDGE,
476 .ifi_index = iface_idx,
477 },
478 };
479 struct mlx5_nl_mac_addr data = {
480 .mac = mac,
481 .mac_n = 0,
482 };
483 uint32_t sn = MLX5_NL_SN_GENERATE;
484 int ret;
485
486 if (nlsk_fd == -1)
487 return 0;
488 ret = mlx5_nl_request(nlsk_fd, &req.hdr, sn, &req.ifm,
489 sizeof(struct ifinfomsg));
490 if (ret < 0)
491 goto error;
492 ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_mac_addr_cb, &data);
493 if (ret < 0)
494 goto error;
495 *mac_n = data.mac_n;
496 return 0;
497 error:
498 DRV_LOG(DEBUG, "Interface %u cannot retrieve MAC address list %s",
499 iface_idx, strerror(rte_errno));
500 return -rte_errno;
501 }
502
503 /**
504 * Modify the MAC address neighbour table with Netlink.
505 *
506 * @param[in] nlsk_fd
507 * Netlink socket file descriptor.
508 * @param[in] iface_idx
509 * Net device interface index.
510 * @param mac
511 * MAC address to consider.
512 * @param add
513 * 1 to add the MAC address, 0 to remove the MAC address.
514 *
515 * @return
516 * 0 on success, a negative errno value otherwise and rte_errno is set.
517 */
518 static int
mlx5_nl_mac_addr_modify(int nlsk_fd,unsigned int iface_idx,struct rte_ether_addr * mac,int add)519 mlx5_nl_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
520 struct rte_ether_addr *mac, int add)
521 {
522 struct {
523 struct nlmsghdr hdr;
524 struct ndmsg ndm;
525 struct rtattr rta;
526 uint8_t buffer[RTE_ETHER_ADDR_LEN];
527 } req = {
528 .hdr = {
529 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)),
530 .nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
531 NLM_F_EXCL | NLM_F_ACK,
532 .nlmsg_type = add ? RTM_NEWNEIGH : RTM_DELNEIGH,
533 },
534 .ndm = {
535 .ndm_family = PF_BRIDGE,
536 .ndm_state = NUD_NOARP | NUD_PERMANENT,
537 .ndm_ifindex = iface_idx,
538 .ndm_flags = NTF_SELF,
539 },
540 .rta = {
541 .rta_type = NDA_LLADDR,
542 .rta_len = RTA_LENGTH(RTE_ETHER_ADDR_LEN),
543 },
544 };
545 uint32_t sn = MLX5_NL_SN_GENERATE;
546 int ret;
547
548 if (nlsk_fd == -1)
549 return 0;
550 memcpy(RTA_DATA(&req.rta), mac, RTE_ETHER_ADDR_LEN);
551 req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
552 RTA_ALIGN(req.rta.rta_len);
553 ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
554 if (ret < 0)
555 goto error;
556 ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
557 if (ret < 0)
558 goto error;
559 return 0;
560 error:
561 #ifdef RTE_LIBRTE_MLX5_DEBUG
562 {
563 char m[RTE_ETHER_ADDR_FMT_SIZE];
564
565 rte_ether_format_addr(m, RTE_ETHER_ADDR_FMT_SIZE, mac);
566 DRV_LOG(DEBUG,
567 "Interface %u cannot %s MAC address %s %s",
568 iface_idx,
569 add ? "add" : "remove", m, strerror(rte_errno));
570 }
571 #endif
572 return -rte_errno;
573 }
574
575 /**
576 * Modify the VF MAC address neighbour table with Netlink.
577 *
578 * @param[in] nlsk_fd
579 * Netlink socket file descriptor.
580 * @param[in] iface_idx
581 * Net device interface index.
582 * @param mac
583 * MAC address to consider.
584 * @param vf_index
585 * VF index.
586 *
587 * @return
588 * 0 on success, a negative errno value otherwise and rte_errno is set.
589 */
590 int
mlx5_nl_vf_mac_addr_modify(int nlsk_fd,unsigned int iface_idx,struct rte_ether_addr * mac,int vf_index)591 mlx5_nl_vf_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
592 struct rte_ether_addr *mac, int vf_index)
593 {
594 int ret;
595 struct {
596 struct nlmsghdr hdr;
597 struct ifinfomsg ifm;
598 struct rtattr vf_list_rta;
599 struct rtattr vf_info_rta;
600 struct rtattr vf_mac_rta;
601 struct ifla_vf_mac ivm;
602 } req = {
603 .hdr = {
604 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
605 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
606 .nlmsg_type = RTM_BASE,
607 },
608 .ifm = {
609 .ifi_index = iface_idx,
610 },
611 .vf_list_rta = {
612 .rta_type = IFLA_VFINFO_LIST,
613 .rta_len = RTA_ALIGN(RTA_LENGTH(0)),
614 },
615 .vf_info_rta = {
616 .rta_type = IFLA_VF_INFO,
617 .rta_len = RTA_ALIGN(RTA_LENGTH(0)),
618 },
619 .vf_mac_rta = {
620 .rta_type = IFLA_VF_MAC,
621 },
622 };
623 struct ifla_vf_mac ivm = {
624 .vf = vf_index,
625 };
626 uint32_t sn = MLX5_NL_SN_GENERATE;
627
628 memcpy(&ivm.mac, mac, RTE_ETHER_ADDR_LEN);
629 memcpy(RTA_DATA(&req.vf_mac_rta), &ivm, sizeof(ivm));
630
631 req.vf_mac_rta.rta_len = RTA_LENGTH(sizeof(ivm));
632 req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
633 RTA_ALIGN(req.vf_list_rta.rta_len) +
634 RTA_ALIGN(req.vf_info_rta.rta_len) +
635 RTA_ALIGN(req.vf_mac_rta.rta_len);
636 req.vf_list_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
637 &req.vf_list_rta);
638 req.vf_info_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
639 &req.vf_info_rta);
640
641 if (nlsk_fd < 0)
642 return -1;
643 ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
644 if (ret < 0)
645 goto error;
646 ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
647 if (ret < 0)
648 goto error;
649 return 0;
650 error:
651 DRV_LOG(ERR,
652 "representor %u cannot set VF MAC address "
653 "%02X:%02X:%02X:%02X:%02X:%02X : %s",
654 vf_index,
655 mac->addr_bytes[0], mac->addr_bytes[1],
656 mac->addr_bytes[2], mac->addr_bytes[3],
657 mac->addr_bytes[4], mac->addr_bytes[5],
658 strerror(rte_errno));
659 return -rte_errno;
660 }
661
662 /**
663 * Add a MAC address.
664 *
665 * @param[in] nlsk_fd
666 * Netlink socket file descriptor.
667 * @param[in] iface_idx
668 * Net device interface index.
669 * @param mac_own
670 * BITFIELD_DECLARE array to store the mac.
671 * @param mac
672 * MAC address to register.
673 * @param index
674 * MAC address index.
675 *
676 * @return
677 * 0 on success, a negative errno value otherwise and rte_errno is set.
678 */
679 int
mlx5_nl_mac_addr_add(int nlsk_fd,unsigned int iface_idx,uint64_t * mac_own,struct rte_ether_addr * mac,uint32_t index)680 mlx5_nl_mac_addr_add(int nlsk_fd, unsigned int iface_idx,
681 uint64_t *mac_own, struct rte_ether_addr *mac,
682 uint32_t index)
683 {
684 int ret;
685
686 ret = mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 1);
687 if (!ret) {
688 MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES);
689 if (index >= MLX5_MAX_MAC_ADDRESSES)
690 return -EINVAL;
691
692 BITFIELD_SET(mac_own, index);
693 }
694 if (ret == -EEXIST)
695 return 0;
696 return ret;
697 }
698
699 /**
700 * Remove a MAC address.
701 *
702 * @param[in] nlsk_fd
703 * Netlink socket file descriptor.
704 * @param[in] iface_idx
705 * Net device interface index.
706 * @param mac_own
707 * BITFIELD_DECLARE array to store the mac.
708 * @param mac
709 * MAC address to remove.
710 * @param index
711 * MAC address index.
712 *
713 * @return
714 * 0 on success, a negative errno value otherwise and rte_errno is set.
715 */
716 int
mlx5_nl_mac_addr_remove(int nlsk_fd,unsigned int iface_idx,uint64_t * mac_own,struct rte_ether_addr * mac,uint32_t index)717 mlx5_nl_mac_addr_remove(int nlsk_fd, unsigned int iface_idx, uint64_t *mac_own,
718 struct rte_ether_addr *mac, uint32_t index)
719 {
720 MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES);
721 if (index >= MLX5_MAX_MAC_ADDRESSES)
722 return -EINVAL;
723
724 BITFIELD_RESET(mac_own, index);
725 return mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 0);
726 }
727
728 /**
729 * Synchronize Netlink bridge table to the internal table.
730 *
731 * @param[in] nlsk_fd
732 * Netlink socket file descriptor.
733 * @param[in] iface_idx
734 * Net device interface index.
735 * @param mac_addrs
736 * Mac addresses array to sync.
737 * @param n
738 * @p mac_addrs array size.
739 */
740 void
mlx5_nl_mac_addr_sync(int nlsk_fd,unsigned int iface_idx,struct rte_ether_addr * mac_addrs,int n)741 mlx5_nl_mac_addr_sync(int nlsk_fd, unsigned int iface_idx,
742 struct rte_ether_addr *mac_addrs, int n)
743 {
744 struct rte_ether_addr macs[n];
745 int macs_n = 0;
746 int i;
747 int ret;
748
749 ret = mlx5_nl_mac_addr_list(nlsk_fd, iface_idx, &macs, &macs_n);
750 if (ret)
751 return;
752 for (i = 0; i != macs_n; ++i) {
753 int j;
754
755 /* Verify the address is not in the array yet. */
756 for (j = 0; j != n; ++j)
757 if (rte_is_same_ether_addr(&macs[i], &mac_addrs[j]))
758 break;
759 if (j != n)
760 continue;
761 /* Find the first entry available. */
762 for (j = 0; j != n; ++j) {
763 if (rte_is_zero_ether_addr(&mac_addrs[j])) {
764 mac_addrs[j] = macs[i];
765 break;
766 }
767 }
768 }
769 }
770
771 /**
772 * Flush all added MAC addresses.
773 *
774 * @param[in] nlsk_fd
775 * Netlink socket file descriptor.
776 * @param[in] iface_idx
777 * Net device interface index.
778 * @param[in] mac_addrs
779 * Mac addresses array to flush.
780 * @param n
781 * @p mac_addrs array size.
782 * @param mac_own
783 * BITFIELD_DECLARE array to store the mac.
784 */
785 void
mlx5_nl_mac_addr_flush(int nlsk_fd,unsigned int iface_idx,struct rte_ether_addr * mac_addrs,int n,uint64_t * mac_own)786 mlx5_nl_mac_addr_flush(int nlsk_fd, unsigned int iface_idx,
787 struct rte_ether_addr *mac_addrs, int n,
788 uint64_t *mac_own)
789 {
790 int i;
791
792 if (n <= 0 || n > MLX5_MAX_MAC_ADDRESSES)
793 return;
794
795 for (i = n - 1; i >= 0; --i) {
796 struct rte_ether_addr *m = &mac_addrs[i];
797
798 if (BITFIELD_ISSET(mac_own, i))
799 mlx5_nl_mac_addr_remove(nlsk_fd, iface_idx, mac_own, m,
800 i);
801 }
802 }
803
804 /**
805 * Enable promiscuous / all multicast mode through Netlink.
806 *
807 * @param[in] nlsk_fd
808 * Netlink socket file descriptor.
809 * @param[in] iface_idx
810 * Net device interface index.
811 * @param flags
812 * IFF_PROMISC for promiscuous, IFF_ALLMULTI for allmulti.
813 * @param enable
814 * Nonzero to enable, disable otherwise.
815 *
816 * @return
817 * 0 on success, a negative errno value otherwise and rte_errno is set.
818 */
819 static int
mlx5_nl_device_flags(int nlsk_fd,unsigned int iface_idx,uint32_t flags,int enable)820 mlx5_nl_device_flags(int nlsk_fd, unsigned int iface_idx, uint32_t flags,
821 int enable)
822 {
823 struct {
824 struct nlmsghdr hdr;
825 struct ifinfomsg ifi;
826 } req = {
827 .hdr = {
828 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
829 .nlmsg_type = RTM_NEWLINK,
830 .nlmsg_flags = NLM_F_REQUEST,
831 },
832 .ifi = {
833 .ifi_flags = enable ? flags : 0,
834 .ifi_change = flags,
835 .ifi_index = iface_idx,
836 },
837 };
838 uint32_t sn = MLX5_NL_SN_GENERATE;
839 int ret;
840
841 MLX5_ASSERT(!(flags & ~(IFF_PROMISC | IFF_ALLMULTI)));
842 if (nlsk_fd < 0)
843 return 0;
844 ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
845 if (ret < 0)
846 return ret;
847 return 0;
848 }
849
850 /**
851 * Enable promiscuous mode through Netlink.
852 *
853 * @param[in] nlsk_fd
854 * Netlink socket file descriptor.
855 * @param[in] iface_idx
856 * Net device interface index.
857 * @param enable
858 * Nonzero to enable, disable otherwise.
859 *
860 * @return
861 * 0 on success, a negative errno value otherwise and rte_errno is set.
862 */
863 int
mlx5_nl_promisc(int nlsk_fd,unsigned int iface_idx,int enable)864 mlx5_nl_promisc(int nlsk_fd, unsigned int iface_idx, int enable)
865 {
866 int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_PROMISC, enable);
867
868 if (ret)
869 DRV_LOG(DEBUG,
870 "Interface %u cannot %s promisc mode: Netlink error %s",
871 iface_idx, enable ? "enable" : "disable",
872 strerror(rte_errno));
873 return ret;
874 }
875
876 /**
877 * Enable all multicast mode through Netlink.
878 *
879 * @param[in] nlsk_fd
880 * Netlink socket file descriptor.
881 * @param[in] iface_idx
882 * Net device interface index.
883 * @param enable
884 * Nonzero to enable, disable otherwise.
885 *
886 * @return
887 * 0 on success, a negative errno value otherwise and rte_errno is set.
888 */
889 int
mlx5_nl_allmulti(int nlsk_fd,unsigned int iface_idx,int enable)890 mlx5_nl_allmulti(int nlsk_fd, unsigned int iface_idx, int enable)
891 {
892 int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_ALLMULTI,
893 enable);
894
895 if (ret)
896 DRV_LOG(DEBUG,
897 "Interface %u cannot %s allmulti : Netlink error %s",
898 iface_idx, enable ? "enable" : "disable",
899 strerror(rte_errno));
900 return ret;
901 }
902
903 /**
904 * Process network interface information from Netlink message.
905 *
906 * @param nh
907 * Pointer to Netlink message header.
908 * @param arg
909 * Opaque data pointer for this callback.
910 *
911 * @return
912 * 0 on success, a negative errno value otherwise and rte_errno is set.
913 */
914 static int
mlx5_nl_cmdget_cb(struct nlmsghdr * nh,void * arg)915 mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
916 {
917 struct mlx5_nl_ifindex_data *data = arg;
918 struct mlx5_nl_ifindex_data local = {
919 .flags = 0,
920 };
921 size_t off = NLMSG_HDRLEN;
922
923 if (nh->nlmsg_type !=
924 RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET) &&
925 nh->nlmsg_type !=
926 RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET))
927 goto error;
928 while (off < nh->nlmsg_len) {
929 struct nlattr *na = (void *)((uintptr_t)nh + off);
930 void *payload = (void *)((uintptr_t)na + NLA_HDRLEN);
931
932 if (na->nla_len > nh->nlmsg_len - off)
933 goto error;
934 switch (na->nla_type) {
935 case RDMA_NLDEV_ATTR_DEV_INDEX:
936 local.ibindex = *(uint32_t *)payload;
937 local.flags |= MLX5_NL_CMD_GET_IB_INDEX;
938 break;
939 case RDMA_NLDEV_ATTR_DEV_NAME:
940 if (!strcmp(payload, data->name))
941 local.flags |= MLX5_NL_CMD_GET_IB_NAME;
942 break;
943 case RDMA_NLDEV_ATTR_NDEV_INDEX:
944 local.ifindex = *(uint32_t *)payload;
945 local.flags |= MLX5_NL_CMD_GET_NET_INDEX;
946 break;
947 case RDMA_NLDEV_ATTR_PORT_INDEX:
948 local.portnum = *(uint32_t *)payload;
949 local.flags |= MLX5_NL_CMD_GET_PORT_INDEX;
950 break;
951 default:
952 break;
953 }
954 off += NLA_ALIGN(na->nla_len);
955 }
956 /*
957 * It is possible to have multiple messages for all
958 * Infiniband devices in the system with appropriate name.
959 * So we should gather parameters locally and copy to
960 * query context only in case of coinciding device name.
961 */
962 if (local.flags & MLX5_NL_CMD_GET_IB_NAME) {
963 data->flags = local.flags;
964 data->ibindex = local.ibindex;
965 data->ifindex = local.ifindex;
966 data->portnum = local.portnum;
967 }
968 return 0;
969 error:
970 rte_errno = EINVAL;
971 return -rte_errno;
972 }
973
974 /**
975 * Get index of network interface associated with some IB device.
976 *
977 * This is the only somewhat safe method to avoid resorting to heuristics
978 * when faced with port representors. Unfortunately it requires at least
979 * Linux 4.17.
980 *
981 * @param nl
982 * Netlink socket of the RDMA kind (NETLINK_RDMA).
983 * @param[in] name
984 * IB device name.
985 * @param[in] pindex
986 * IB device port index, starting from 1
987 * @return
988 * A valid (nonzero) interface index on success, 0 otherwise and rte_errno
989 * is set.
990 */
991 unsigned int
mlx5_nl_ifindex(int nl,const char * name,uint32_t pindex)992 mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
993 {
994 struct mlx5_nl_ifindex_data data = {
995 .name = name,
996 .flags = 0,
997 .ibindex = 0, /* Determined during first pass. */
998 .ifindex = 0, /* Determined during second pass. */
999 };
1000 union {
1001 struct nlmsghdr nh;
1002 uint8_t buf[NLMSG_HDRLEN +
1003 NLA_HDRLEN + NLA_ALIGN(sizeof(data.ibindex)) +
1004 NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];
1005 } req = {
1006 .nh = {
1007 .nlmsg_len = NLMSG_LENGTH(0),
1008 .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1009 RDMA_NLDEV_CMD_GET),
1010 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
1011 },
1012 };
1013 struct nlattr *na;
1014 uint32_t sn = MLX5_NL_SN_GENERATE;
1015 int ret;
1016
1017 ret = mlx5_nl_send(nl, &req.nh, sn);
1018 if (ret < 0)
1019 return 0;
1020 ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
1021 if (ret < 0)
1022 return 0;
1023 if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
1024 !(data.flags & MLX5_NL_CMD_GET_IB_INDEX))
1025 goto error;
1026 data.flags = 0;
1027 sn = MLX5_NL_SN_GENERATE;
1028 req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1029 RDMA_NLDEV_CMD_PORT_GET);
1030 req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1031 req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN);
1032 na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN);
1033 na->nla_len = NLA_HDRLEN + sizeof(data.ibindex);
1034 na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX;
1035 memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
1036 &data.ibindex, sizeof(data.ibindex));
1037 na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len));
1038 na->nla_len = NLA_HDRLEN + sizeof(pindex);
1039 na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX;
1040 memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
1041 &pindex, sizeof(pindex));
1042 ret = mlx5_nl_send(nl, &req.nh, sn);
1043 if (ret < 0)
1044 return 0;
1045 ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
1046 if (ret < 0)
1047 return 0;
1048 if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
1049 !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
1050 !(data.flags & MLX5_NL_CMD_GET_NET_INDEX) ||
1051 !data.ifindex)
1052 goto error;
1053 return data.ifindex;
1054 error:
1055 rte_errno = ENODEV;
1056 return 0;
1057 }
1058
1059 /**
1060 * Get the number of physical ports of given IB device.
1061 *
1062 * @param nl
1063 * Netlink socket of the RDMA kind (NETLINK_RDMA).
1064 * @param[in] name
1065 * IB device name.
1066 *
1067 * @return
1068 * A valid (nonzero) number of ports on success, 0 otherwise
1069 * and rte_errno is set.
1070 */
1071 unsigned int
mlx5_nl_portnum(int nl,const char * name)1072 mlx5_nl_portnum(int nl, const char *name)
1073 {
1074 struct mlx5_nl_ifindex_data data = {
1075 .flags = 0,
1076 .name = name,
1077 .ifindex = 0,
1078 .portnum = 0,
1079 };
1080 struct nlmsghdr req = {
1081 .nlmsg_len = NLMSG_LENGTH(0),
1082 .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1083 RDMA_NLDEV_CMD_GET),
1084 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
1085 };
1086 uint32_t sn = MLX5_NL_SN_GENERATE;
1087 int ret;
1088
1089 ret = mlx5_nl_send(nl, &req, sn);
1090 if (ret < 0)
1091 return 0;
1092 ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
1093 if (ret < 0)
1094 return 0;
1095 if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
1096 !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
1097 !(data.flags & MLX5_NL_CMD_GET_PORT_INDEX)) {
1098 rte_errno = ENODEV;
1099 return 0;
1100 }
1101 if (!data.portnum)
1102 rte_errno = EINVAL;
1103 return data.portnum;
1104 }
1105
1106 /**
1107 * Analyze gathered port parameters via Netlink to recognize master
1108 * and representor devices for E-Switch configuration.
1109 *
1110 * @param[in] num_vf_set
1111 * flag of presence of number of VFs port attribute.
1112 * @param[inout] switch_info
1113 * Port information, including port name as a number and port name
1114 * type if recognized
1115 *
1116 * @return
1117 * master and representor flags are set in switch_info according to
1118 * recognized parameters (if any).
1119 */
1120 static void
mlx5_nl_check_switch_info(bool num_vf_set,struct mlx5_switch_info * switch_info)1121 mlx5_nl_check_switch_info(bool num_vf_set,
1122 struct mlx5_switch_info *switch_info)
1123 {
1124 switch (switch_info->name_type) {
1125 case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
1126 /*
1127 * Name is not recognized, assume the master,
1128 * check the number of VFs key presence.
1129 */
1130 switch_info->master = num_vf_set;
1131 break;
1132 case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
1133 /*
1134 * Name is not set, this assumes the legacy naming
1135 * schema for master, just check if there is a
1136 * number of VFs key.
1137 */
1138 switch_info->master = num_vf_set;
1139 break;
1140 case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
1141 /* New uplink naming schema recognized. */
1142 switch_info->master = 1;
1143 break;
1144 case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
1145 /* Legacy representors naming schema. */
1146 switch_info->representor = !num_vf_set;
1147 break;
1148 case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
1149 /* Fallthrough */
1150 case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
1151 /* New representors naming schema. */
1152 switch_info->representor = 1;
1153 break;
1154 }
1155 }
1156
1157 /**
1158 * Process switch information from Netlink message.
1159 *
1160 * @param nh
1161 * Pointer to Netlink message header.
1162 * @param arg
1163 * Opaque data pointer for this callback.
1164 *
1165 * @return
1166 * 0 on success, a negative errno value otherwise and rte_errno is set.
1167 */
1168 static int
mlx5_nl_switch_info_cb(struct nlmsghdr * nh,void * arg)1169 mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg)
1170 {
1171 struct mlx5_switch_info info = {
1172 .master = 0,
1173 .representor = 0,
1174 .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
1175 .port_name = 0,
1176 .switch_id = 0,
1177 };
1178 size_t off = NLMSG_LENGTH(sizeof(struct ifinfomsg));
1179 bool switch_id_set = false;
1180 bool num_vf_set = false;
1181
1182 if (nh->nlmsg_type != RTM_NEWLINK)
1183 goto error;
1184 while (off < nh->nlmsg_len) {
1185 struct rtattr *ra = (void *)((uintptr_t)nh + off);
1186 void *payload = RTA_DATA(ra);
1187 unsigned int i;
1188
1189 if (ra->rta_len > nh->nlmsg_len - off)
1190 goto error;
1191 switch (ra->rta_type) {
1192 case IFLA_NUM_VF:
1193 num_vf_set = true;
1194 break;
1195 case IFLA_PHYS_PORT_NAME:
1196 mlx5_translate_port_name((char *)payload, &info);
1197 break;
1198 case IFLA_PHYS_SWITCH_ID:
1199 info.switch_id = 0;
1200 for (i = 0; i < RTA_PAYLOAD(ra); ++i) {
1201 info.switch_id <<= 8;
1202 info.switch_id |= ((uint8_t *)payload)[i];
1203 }
1204 switch_id_set = true;
1205 break;
1206 }
1207 off += RTA_ALIGN(ra->rta_len);
1208 }
1209 if (switch_id_set) {
1210 /* We have some E-Switch configuration. */
1211 mlx5_nl_check_switch_info(num_vf_set, &info);
1212 }
1213 MLX5_ASSERT(!(info.master && info.representor));
1214 memcpy(arg, &info, sizeof(info));
1215 return 0;
1216 error:
1217 rte_errno = EINVAL;
1218 return -rte_errno;
1219 }
1220
1221 /**
1222 * Get switch information associated with network interface.
1223 *
1224 * @param nl
1225 * Netlink socket of the ROUTE kind (NETLINK_ROUTE).
1226 * @param ifindex
1227 * Network interface index.
1228 * @param[out] info
1229 * Switch information object, populated in case of success.
1230 *
1231 * @return
1232 * 0 on success, a negative errno value otherwise and rte_errno is set.
1233 */
1234 int
mlx5_nl_switch_info(int nl,unsigned int ifindex,struct mlx5_switch_info * info)1235 mlx5_nl_switch_info(int nl, unsigned int ifindex,
1236 struct mlx5_switch_info *info)
1237 {
1238 struct {
1239 struct nlmsghdr nh;
1240 struct ifinfomsg info;
1241 struct rtattr rta;
1242 uint32_t extmask;
1243 } req = {
1244 .nh = {
1245 .nlmsg_len = NLMSG_LENGTH
1246 (sizeof(req.info) +
1247 RTA_LENGTH(sizeof(uint32_t))),
1248 .nlmsg_type = RTM_GETLINK,
1249 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1250 },
1251 .info = {
1252 .ifi_family = AF_UNSPEC,
1253 .ifi_index = ifindex,
1254 },
1255 .rta = {
1256 .rta_type = IFLA_EXT_MASK,
1257 .rta_len = RTA_LENGTH(sizeof(int32_t)),
1258 },
1259 .extmask = RTE_LE32(1),
1260 };
1261 uint32_t sn = MLX5_NL_SN_GENERATE;
1262 int ret;
1263
1264 ret = mlx5_nl_send(nl, &req.nh, sn);
1265 if (ret >= 0)
1266 ret = mlx5_nl_recv(nl, sn, mlx5_nl_switch_info_cb, info);
1267 if (info->master && info->representor) {
1268 DRV_LOG(ERR, "ifindex %u device is recognized as master"
1269 " and as representor", ifindex);
1270 rte_errno = ENODEV;
1271 ret = -rte_errno;
1272 }
1273 return ret;
1274 }
1275
1276 /*
1277 * Delete VLAN network device by ifindex.
1278 *
1279 * @param[in] tcf
1280 * Context object initialized by mlx5_nl_vlan_vmwa_init().
1281 * @param[in] ifindex
1282 * Interface index of network device to delete.
1283 */
1284 void
mlx5_nl_vlan_vmwa_delete(struct mlx5_nl_vlan_vmwa_context * vmwa,uint32_t ifindex)1285 mlx5_nl_vlan_vmwa_delete(struct mlx5_nl_vlan_vmwa_context *vmwa,
1286 uint32_t ifindex)
1287 {
1288 uint32_t sn = MLX5_NL_SN_GENERATE;
1289 int ret;
1290 struct {
1291 struct nlmsghdr nh;
1292 struct ifinfomsg info;
1293 } req = {
1294 .nh = {
1295 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
1296 .nlmsg_type = RTM_DELLINK,
1297 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1298 },
1299 .info = {
1300 .ifi_family = AF_UNSPEC,
1301 .ifi_index = ifindex,
1302 },
1303 };
1304
1305 if (ifindex) {
1306 ret = mlx5_nl_send(vmwa->nl_socket, &req.nh, sn);
1307 if (ret >= 0)
1308 ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL);
1309 if (ret < 0)
1310 DRV_LOG(WARNING, "netlink: error deleting VLAN WA"
1311 " ifindex %u, %d", ifindex, ret);
1312 }
1313 }
1314
/* Set of subroutines to build Netlink message. */

/*
 * Return the address just past the current (aligned) end of the message,
 * i.e. the place where the next attribute would be appended.
 */
static struct nlattr *
nl_msg_tail(struct nlmsghdr *nlh)
{
	uint8_t *base = (uint8_t *)nlh;
	const uint32_t used = NLMSG_ALIGN(nlh->nlmsg_len);

	return (struct nlattr *)(base + used);
}
1322
/*
 * Append one attribute of the given type with alen bytes of payload to the
 * message and grow nlmsg_len by the aligned attribute size. No bound check
 * is done here: the caller must provide a large enough buffer.
 */
static void
nl_attr_put(struct nlmsghdr *nlh, int type, const void *data, int alen)
{
	struct nlattr *attr = nl_msg_tail(nlh);
	uint8_t *payload = (uint8_t *)attr + sizeof(struct nlattr);

	attr->nla_len = NLMSG_ALIGN(sizeof(struct nlattr)) + alen;
	attr->nla_type = type;
	/* data may be NULL when alen is zero, so skip the copy then. */
	if (alen)
		memcpy(payload, data, alen);
	nlh->nlmsg_len += NLMSG_ALIGN(attr->nla_len);
}
1335
/*
 * Open a nested attribute: append an empty attribute of the given type and
 * return its address so nl_attr_nest_end() can fix up its length later.
 */
static struct nlattr *
nl_attr_nest_start(struct nlmsghdr *nlh, int type)
{
	/* Remember where the container header lands before appending it. */
	struct nlattr *container = nl_msg_tail(nlh);

	nl_attr_put(nlh, type, NULL, 0);
	return container;
}
1344
/*
 * Close a nested attribute: set its length to the distance from its header
 * to the current tail of the message.
 */
static void
nl_attr_nest_end(struct nlmsghdr *nlh, struct nlattr *nest)
{
	const uint8_t *first = (const uint8_t *)nest;
	const uint8_t *end = (const uint8_t *)nl_msg_tail(nlh);

	nest->nla_len = end - first;
}
1350
1351 /*
1352 * Create network VLAN device with specified VLAN tag.
1353 *
1354 * @param[in] tcf
1355 * Context object initialized by mlx5_nl_vlan_vmwa_init().
1356 * @param[in] ifindex
1357 * Base network interface index.
1358 * @param[in] tag
1359 * VLAN tag for VLAN network device to create.
1360 */
1361 uint32_t
mlx5_nl_vlan_vmwa_create(struct mlx5_nl_vlan_vmwa_context * vmwa,uint32_t ifindex,uint16_t tag)1362 mlx5_nl_vlan_vmwa_create(struct mlx5_nl_vlan_vmwa_context *vmwa,
1363 uint32_t ifindex, uint16_t tag)
1364 {
1365 struct nlmsghdr *nlh;
1366 struct ifinfomsg *ifm;
1367 char name[sizeof(MLX5_VMWA_VLAN_DEVICE_PFX) + 32];
1368
1369 __rte_cache_aligned
1370 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1371 NLMSG_ALIGN(sizeof(struct ifinfomsg)) +
1372 NLMSG_ALIGN(sizeof(struct nlattr)) * 8 +
1373 NLMSG_ALIGN(sizeof(uint32_t)) +
1374 NLMSG_ALIGN(sizeof(name)) +
1375 NLMSG_ALIGN(sizeof("vlan")) +
1376 NLMSG_ALIGN(sizeof(uint32_t)) +
1377 NLMSG_ALIGN(sizeof(uint16_t)) + 16];
1378 struct nlattr *na_info;
1379 struct nlattr *na_vlan;
1380 uint32_t sn = MLX5_NL_SN_GENERATE;
1381 int ret;
1382
1383 memset(buf, 0, sizeof(buf));
1384 nlh = (struct nlmsghdr *)buf;
1385 nlh->nlmsg_len = sizeof(struct nlmsghdr);
1386 nlh->nlmsg_type = RTM_NEWLINK;
1387 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
1388 NLM_F_EXCL | NLM_F_ACK;
1389 ifm = (struct ifinfomsg *)nl_msg_tail(nlh);
1390 nlh->nlmsg_len += sizeof(struct ifinfomsg);
1391 ifm->ifi_family = AF_UNSPEC;
1392 ifm->ifi_type = 0;
1393 ifm->ifi_index = 0;
1394 ifm->ifi_flags = IFF_UP;
1395 ifm->ifi_change = 0xffffffff;
1396 nl_attr_put(nlh, IFLA_LINK, &ifindex, sizeof(ifindex));
1397 ret = snprintf(name, sizeof(name), "%s.%u.%u",
1398 MLX5_VMWA_VLAN_DEVICE_PFX, ifindex, tag);
1399 nl_attr_put(nlh, IFLA_IFNAME, name, ret + 1);
1400 na_info = nl_attr_nest_start(nlh, IFLA_LINKINFO);
1401 nl_attr_put(nlh, IFLA_INFO_KIND, "vlan", sizeof("vlan"));
1402 na_vlan = nl_attr_nest_start(nlh, IFLA_INFO_DATA);
1403 nl_attr_put(nlh, IFLA_VLAN_ID, &tag, sizeof(tag));
1404 nl_attr_nest_end(nlh, na_vlan);
1405 nl_attr_nest_end(nlh, na_info);
1406 MLX5_ASSERT(sizeof(buf) >= nlh->nlmsg_len);
1407 ret = mlx5_nl_send(vmwa->nl_socket, nlh, sn);
1408 if (ret >= 0)
1409 ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL);
1410 if (ret < 0) {
1411 DRV_LOG(WARNING, "netlink: VLAN %s create failure (%d)", name,
1412 ret);
1413 }
1414 /* Try to get ifindex of created or pre-existing device. */
1415 ret = if_nametoindex(name);
1416 if (!ret) {
1417 DRV_LOG(WARNING, "VLAN %s failed to get index (%d)", name,
1418 errno);
1419 return 0;
1420 }
1421 return ret;
1422 }
1423
1424 /**
1425 * Parse Netlink message to retrieve the general family ID.
1426 *
1427 * @param nh
1428 * Pointer to Netlink Message Header.
1429 * @param arg
1430 * PMD data register with this callback.
1431 *
1432 * @return
1433 * 0 on success, a negative errno value otherwise and rte_errno is set.
1434 */
1435 static int
mlx5_nl_family_id_cb(struct nlmsghdr * nh,void * arg)1436 mlx5_nl_family_id_cb(struct nlmsghdr *nh, void *arg)
1437 {
1438
1439 struct nlattr *tail = RTE_PTR_ADD(nh, nh->nlmsg_len);
1440 struct nlattr *nla = RTE_PTR_ADD(nh, NLMSG_ALIGN(sizeof(*nh)) +
1441 NLMSG_ALIGN(sizeof(struct genlmsghdr)));
1442
1443 for (; nla->nla_len && nla < tail;
1444 nla = RTE_PTR_ADD(nla, NLMSG_ALIGN(nla->nla_len))) {
1445 if (nla->nla_type == CTRL_ATTR_FAMILY_ID) {
1446 *(uint16_t *)arg = *(uint16_t *)(nla + 1);
1447 return 0;
1448 }
1449 }
1450 return -EINVAL;
1451 }
1452
1453 #define MLX5_NL_MAX_ATTR_SIZE 100
1454 /**
1455 * Get generic netlink family ID.
1456 *
1457 * @param[in] nlsk_fd
1458 * Netlink socket file descriptor.
1459 * @param[in] name
1460 * The family name.
1461 *
1462 * @return
1463 * ID >= 0 on success and @p enable is updated, a negative errno value
1464 * otherwise and rte_errno is set.
1465 */
1466 static int
mlx5_nl_generic_family_id_get(int nlsk_fd,const char * name)1467 mlx5_nl_generic_family_id_get(int nlsk_fd, const char *name)
1468 {
1469 struct nlmsghdr *nlh;
1470 struct genlmsghdr *genl;
1471 uint32_t sn = MLX5_NL_SN_GENERATE;
1472 int name_size = strlen(name) + 1;
1473 int ret;
1474 uint16_t id = -1;
1475 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1476 NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1477 NLMSG_ALIGN(sizeof(struct nlattr)) +
1478 NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE)];
1479
1480 memset(buf, 0, sizeof(buf));
1481 nlh = (struct nlmsghdr *)buf;
1482 nlh->nlmsg_len = sizeof(struct nlmsghdr);
1483 nlh->nlmsg_type = GENL_ID_CTRL;
1484 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1485 genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1486 nlh->nlmsg_len += sizeof(struct genlmsghdr);
1487 genl->cmd = CTRL_CMD_GETFAMILY;
1488 genl->version = 1;
1489 nl_attr_put(nlh, CTRL_ATTR_FAMILY_NAME, name, name_size);
1490 ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1491 if (ret >= 0)
1492 ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_family_id_cb, &id);
1493 if (ret < 0) {
1494 DRV_LOG(DEBUG, "Failed to get Netlink %s family ID: %d.", name,
1495 ret);
1496 return ret;
1497 }
1498 DRV_LOG(DEBUG, "Netlink \"%s\" family ID is %u.", name, id);
1499 return (int)id;
1500 }
1501
1502 /**
1503 * Get Devlink family ID.
1504 *
1505 * @param[in] nlsk_fd
1506 * Netlink socket file descriptor.
1507 *
1508 * @return
1509 * ID >= 0 on success and @p enable is updated, a negative errno value
1510 * otherwise and rte_errno is set.
1511 */
1512
1513 int
mlx5_nl_devlink_family_id_get(int nlsk_fd)1514 mlx5_nl_devlink_family_id_get(int nlsk_fd)
1515 {
1516 return mlx5_nl_generic_family_id_get(nlsk_fd, DEVLINK_GENL_NAME);
1517 }
1518
1519 /**
1520 * Parse Netlink message to retrieve the ROCE enable status.
1521 *
1522 * @param nh
1523 * Pointer to Netlink Message Header.
1524 * @param arg
1525 * PMD data register with this callback.
1526 *
1527 * @return
1528 * 0 on success, a negative errno value otherwise and rte_errno is set.
1529 */
1530 static int
mlx5_nl_roce_cb(struct nlmsghdr * nh,void * arg)1531 mlx5_nl_roce_cb(struct nlmsghdr *nh, void *arg)
1532 {
1533
1534 int ret = -EINVAL;
1535 int *enable = arg;
1536 struct nlattr *tail = RTE_PTR_ADD(nh, nh->nlmsg_len);
1537 struct nlattr *nla = RTE_PTR_ADD(nh, NLMSG_ALIGN(sizeof(*nh)) +
1538 NLMSG_ALIGN(sizeof(struct genlmsghdr)));
1539
1540 while (nla->nla_len && nla < tail) {
1541 switch (nla->nla_type) {
1542 /* Expected nested attributes case. */
1543 case DEVLINK_ATTR_PARAM:
1544 case DEVLINK_ATTR_PARAM_VALUES_LIST:
1545 case DEVLINK_ATTR_PARAM_VALUE:
1546 ret = 0;
1547 nla += 1;
1548 break;
1549 case DEVLINK_ATTR_PARAM_VALUE_DATA:
1550 *enable = 1;
1551 return 0;
1552 default:
1553 nla = RTE_PTR_ADD(nla, NLMSG_ALIGN(nla->nla_len));
1554 }
1555 }
1556 *enable = 0;
1557 return ret;
1558 }
1559
1560 /**
1561 * Get ROCE enable status through Netlink.
1562 *
1563 * @param[in] nlsk_fd
1564 * Netlink socket file descriptor.
1565 * @param[in] family_id
1566 * the Devlink family ID.
1567 * @param pci_addr
1568 * The device PCI address.
1569 * @param[out] enable
1570 * Where to store the enable status.
1571 *
1572 * @return
1573 * 0 on success and @p enable is updated, a negative errno value otherwise
1574 * and rte_errno is set.
1575 */
1576 int
mlx5_nl_enable_roce_get(int nlsk_fd,int family_id,const char * pci_addr,int * enable)1577 mlx5_nl_enable_roce_get(int nlsk_fd, int family_id, const char *pci_addr,
1578 int *enable)
1579 {
1580 struct nlmsghdr *nlh;
1581 struct genlmsghdr *genl;
1582 uint32_t sn = MLX5_NL_SN_GENERATE;
1583 int ret;
1584 int cur_en = 0;
1585 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1586 NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1587 NLMSG_ALIGN(sizeof(struct nlattr)) * 4 +
1588 NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 4];
1589
1590 memset(buf, 0, sizeof(buf));
1591 nlh = (struct nlmsghdr *)buf;
1592 nlh->nlmsg_len = sizeof(struct nlmsghdr);
1593 nlh->nlmsg_type = family_id;
1594 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1595 genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1596 nlh->nlmsg_len += sizeof(struct genlmsghdr);
1597 genl->cmd = DEVLINK_CMD_PARAM_GET;
1598 genl->version = DEVLINK_GENL_VERSION;
1599 nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1600 nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1601 nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME, "enable_roce", 12);
1602 ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1603 if (ret >= 0)
1604 ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_roce_cb, &cur_en);
1605 if (ret < 0) {
1606 DRV_LOG(DEBUG, "Failed to get ROCE enable on device %s: %d.",
1607 pci_addr, ret);
1608 return ret;
1609 }
1610 *enable = cur_en;
1611 DRV_LOG(DEBUG, "ROCE is %sabled for device \"%s\".",
1612 cur_en ? "en" : "dis", pci_addr);
1613 return ret;
1614 }
1615
1616 /**
1617 * Reload mlx5 device kernel driver through Netlink.
1618 *
1619 * @param[in] nlsk_fd
1620 * Netlink socket file descriptor.
1621 * @param[in] family_id
1622 * the Devlink family ID.
1623 * @param pci_addr
1624 * The device PCI address.
1625 * @param[out] enable
1626 * The enable status to set.
1627 *
1628 * @return
1629 * 0 on success, a negative errno value otherwise and rte_errno is set.
1630 */
1631 int
mlx5_nl_driver_reload(int nlsk_fd,int family_id,const char * pci_addr)1632 mlx5_nl_driver_reload(int nlsk_fd, int family_id, const char *pci_addr)
1633 {
1634 struct nlmsghdr *nlh;
1635 struct genlmsghdr *genl;
1636 uint32_t sn = MLX5_NL_SN_GENERATE;
1637 int ret;
1638 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1639 NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1640 NLMSG_ALIGN(sizeof(struct nlattr)) * 2 +
1641 NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 2];
1642
1643 memset(buf, 0, sizeof(buf));
1644 nlh = (struct nlmsghdr *)buf;
1645 nlh->nlmsg_len = sizeof(struct nlmsghdr);
1646 nlh->nlmsg_type = family_id;
1647 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1648 genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1649 nlh->nlmsg_len += sizeof(struct genlmsghdr);
1650 genl->cmd = DEVLINK_CMD_RELOAD;
1651 genl->version = DEVLINK_GENL_VERSION;
1652 nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1653 nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1654 ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1655 if (ret >= 0)
1656 ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
1657 if (ret < 0) {
1658 DRV_LOG(DEBUG, "Failed to reload %s device by Netlink - %d",
1659 pci_addr, ret);
1660 return ret;
1661 }
1662 DRV_LOG(DEBUG, "Device \"%s\" was reloaded by Netlink successfully.",
1663 pci_addr);
1664 return 0;
1665 }
1666
1667 /**
1668 * Set ROCE enable status through Netlink.
1669 *
1670 * @param[in] nlsk_fd
1671 * Netlink socket file descriptor.
1672 * @param[in] family_id
1673 * the Devlink family ID.
1674 * @param pci_addr
1675 * The device PCI address.
1676 * @param[out] enable
1677 * The enable status to set.
1678 *
1679 * @return
1680 * 0 on success, a negative errno value otherwise and rte_errno is set.
1681 */
1682 int
mlx5_nl_enable_roce_set(int nlsk_fd,int family_id,const char * pci_addr,int enable)1683 mlx5_nl_enable_roce_set(int nlsk_fd, int family_id, const char *pci_addr,
1684 int enable)
1685 {
1686 struct nlmsghdr *nlh;
1687 struct genlmsghdr *genl;
1688 uint32_t sn = MLX5_NL_SN_GENERATE;
1689 int ret;
1690 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1691 NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1692 NLMSG_ALIGN(sizeof(struct nlattr)) * 6 +
1693 NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 6];
1694 uint8_t cmode = DEVLINK_PARAM_CMODE_DRIVERINIT;
1695 uint8_t ptype = NLA_FLAG;
1696 ;
1697
1698 memset(buf, 0, sizeof(buf));
1699 nlh = (struct nlmsghdr *)buf;
1700 nlh->nlmsg_len = sizeof(struct nlmsghdr);
1701 nlh->nlmsg_type = family_id;
1702 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1703 genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1704 nlh->nlmsg_len += sizeof(struct genlmsghdr);
1705 genl->cmd = DEVLINK_CMD_PARAM_SET;
1706 genl->version = DEVLINK_GENL_VERSION;
1707 nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1708 nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1709 nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME, "enable_roce", 12);
1710 nl_attr_put(nlh, DEVLINK_ATTR_PARAM_VALUE_CMODE, &cmode, sizeof(cmode));
1711 nl_attr_put(nlh, DEVLINK_ATTR_PARAM_TYPE, &ptype, sizeof(ptype));
1712 if (enable)
1713 nl_attr_put(nlh, DEVLINK_ATTR_PARAM_VALUE_DATA, NULL, 0);
1714 ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1715 if (ret >= 0)
1716 ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
1717 if (ret < 0) {
1718 DRV_LOG(DEBUG, "Failed to %sable ROCE for device %s by Netlink:"
1719 " %d.", enable ? "en" : "dis", pci_addr, ret);
1720 return ret;
1721 }
1722 DRV_LOG(DEBUG, "Device %s ROCE was %sabled by Netlink successfully.",
1723 pci_addr, enable ? "en" : "dis");
1724 /* Now, need to reload the driver. */
1725 return mlx5_nl_driver_reload(nlsk_fd, family_id, pci_addr);
1726 }
1727