1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright 2018 6WIND S.A.
3 * Copyright 2018 Mellanox Technologies, Ltd
4 */
5
6 #include <errno.h>
7 #include <linux/if_link.h>
8 #include <linux/rtnetlink.h>
9 #include <linux/genetlink.h>
10 #include <net/if.h>
11 #include <rdma/rdma_netlink.h>
12 #include <stdbool.h>
13 #include <stdint.h>
14 #include <stdlib.h>
15 #include <stdalign.h>
16 #include <string.h>
17 #include <sys/socket.h>
18 #include <unistd.h>
19
20 #include <rte_errno.h>
21
22 #include "mlx5_nl.h"
23 #include "../mlx5_common_log.h"
24 #include "mlx5_malloc.h"
25 #ifdef HAVE_DEVLINK
26 #include <linux/devlink.h>
27 #endif
28
29
/* Size in bytes of the buffer used to receive kernel messages. */
#define MLX5_NL_BUF_SIZE (32 * 1024)
/* Send buffer size for the Netlink socket, in bytes. */
#define MLX5_SEND_BUF_SIZE (32 * 1024)
/* Receive buffer size for the Netlink socket, in bytes. */
#define MLX5_RECV_BUF_SIZE (32 * 1024)
/* Maximal physical port name length. */
#define MLX5_PHYS_PORT_NAME_MAX 128

/** Parameters of the VLAN devices created by the driver. */
#define MLX5_VMWA_VLAN_DEVICE_PFX "evmlx"
/*
 * Equivalent of NDA_RTA as defined in the iproute2 sources (see
 * include/libnetlink.h): address located NLMSG_ALIGN(sizeof(struct ndmsg))
 * bytes after @p r, typed as a route attribute.
 */
#ifndef MLX5_NDA_RTA
#define MLX5_NDA_RTA(r) \
	((struct rtattr *)(((char *)(r)) + NLMSG_ALIGN(sizeof(struct ndmsg))))
#endif
/*
 * NLMSG_TAIL as defined in the iproute2 sources (see
 * include/libnetlink.h): address located at the aligned nlmsg_len offset
 * from the message header, typed as a route attribute.
 */
#ifndef NLMSG_TAIL
#define NLMSG_TAIL(nmsg) \
	((struct rtattr *)(((char *)(nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len)))
#endif
/*
 * Fallback values for definitions normally provided by
 * rdma/rdma_netlink.h. They are recent enough that many systems do not
 * expose them yet; each one is only defined when the matching HAVE_*
 * macro is not set.
 */
#ifndef HAVE_RDMA_NL_NLDEV
#define RDMA_NL_NLDEV 5
#endif
#ifndef HAVE_RDMA_NLDEV_CMD_GET
#define RDMA_NLDEV_CMD_GET 1
#endif
#ifndef HAVE_RDMA_NLDEV_CMD_PORT_GET
#define RDMA_NLDEV_CMD_PORT_GET 5
#endif
#ifndef HAVE_RDMA_NLDEV_ATTR_DEV_INDEX
#define RDMA_NLDEV_ATTR_DEV_INDEX 1
#endif
#ifndef HAVE_RDMA_NLDEV_ATTR_DEV_NAME
#define RDMA_NLDEV_ATTR_DEV_NAME 2
#endif
#ifndef HAVE_RDMA_NLDEV_ATTR_PORT_INDEX
#define RDMA_NLDEV_ATTR_PORT_INDEX 3
#endif
#ifndef HAVE_RDMA_NLDEV_ATTR_PORT_STATE
#define RDMA_NLDEV_ATTR_PORT_STATE 12
#endif
#ifndef HAVE_RDMA_NLDEV_ATTR_NDEV_INDEX
#define RDMA_NLDEV_ATTR_NDEV_INDEX 50
#endif

/* Fallback values for definitions normally found in linux/if_link.h. */
#ifndef HAVE_IFLA_NUM_VF
#define IFLA_NUM_VF 21
#endif
#ifndef HAVE_IFLA_EXT_MASK
#define IFLA_EXT_MASK 29
#endif
#ifndef HAVE_IFLA_PHYS_SWITCH_ID
#define IFLA_PHYS_SWITCH_ID 36
#endif
#ifndef HAVE_IFLA_PHYS_PORT_NAME
#define IFLA_PHYS_PORT_NAME 38
#endif
101
/*
 * Some Devlink definitions may be missing from old kernel headers;
 * provide a fallback for every one that is not already defined.
 */
#ifndef DEVLINK_GENL_NAME
#define DEVLINK_GENL_NAME "devlink"
#endif
#ifndef DEVLINK_GENL_VERSION
#define DEVLINK_GENL_VERSION 1
#endif
#ifndef DEVLINK_ATTR_BUS_NAME
#define DEVLINK_ATTR_BUS_NAME 1
#endif
#ifndef DEVLINK_ATTR_DEV_NAME
#define DEVLINK_ATTR_DEV_NAME 2
#endif
#ifndef DEVLINK_ATTR_PARAM
#define DEVLINK_ATTR_PARAM 80
#endif
#ifndef DEVLINK_ATTR_PARAM_NAME
#define DEVLINK_ATTR_PARAM_NAME 81
#endif
#ifndef DEVLINK_ATTR_PARAM_TYPE
#define DEVLINK_ATTR_PARAM_TYPE 83
#endif
#ifndef DEVLINK_ATTR_PARAM_VALUES_LIST
#define DEVLINK_ATTR_PARAM_VALUES_LIST 84
#endif
#ifndef DEVLINK_ATTR_PARAM_VALUE
#define DEVLINK_ATTR_PARAM_VALUE 85
#endif
#ifndef DEVLINK_ATTR_PARAM_VALUE_DATA
#define DEVLINK_ATTR_PARAM_VALUE_DATA 86
#endif
#ifndef DEVLINK_ATTR_PARAM_VALUE_CMODE
#define DEVLINK_ATTR_PARAM_VALUE_CMODE 87
#endif
#ifndef DEVLINK_PARAM_CMODE_DRIVERINIT
#define DEVLINK_PARAM_CMODE_DRIVERINIT 1
#endif
#ifndef DEVLINK_CMD_RELOAD
#define DEVLINK_CMD_RELOAD 37
#endif
#ifndef DEVLINK_CMD_PARAM_GET
#define DEVLINK_CMD_PARAM_GET 38
#endif
#ifndef DEVLINK_CMD_PARAM_SET
#define DEVLINK_CMD_PARAM_SET 39
#endif
#ifndef NLA_FLAG
#define NLA_FLAG 6
#endif
154
155 /* Add/remove MAC address through Netlink */
156 struct mlx5_nl_mac_addr {
157 struct rte_ether_addr (*mac)[];
158 /**< MAC address handled by the device. */
159 int mac_n; /**< Number of addresses in the array. */
160 };
161
/* Bits stored in mlx5_nl_port_info::flags, one per attribute found. */
#define MLX5_NL_CMD_GET_IB_NAME (1 << 0)
#define MLX5_NL_CMD_GET_IB_INDEX (1 << 1)
#define MLX5_NL_CMD_GET_NET_INDEX (1 << 2)
#define MLX5_NL_CMD_GET_PORT_INDEX (1 << 3)
#define MLX5_NL_CMD_GET_PORT_STATE (1 << 4)

/** Data structure used by mlx5_nl_cmdget_cb(). */
struct mlx5_nl_port_info {
	/** IB device name (in). */
	const char *name;
	/** Found attribute flags (out). */
	uint32_t flags;
	/** IB device index (out). */
	uint32_t ibindex;
	/** Network interface index (out). */
	uint32_t ifindex;
	/** IB device max port number (out). */
	uint32_t portnum;
	/** IB device port state (out). */
	uint16_t state;
};
177
/* Counter the Netlink sequence numbers are drawn from. */
uint32_t atomic_sn;

/*
 * Generate Netlink sequence number: atomically increment the counter
 * (relaxed ordering) and yield the incremented value.
 */
#define MLX5_NL_SN_GENERATE \
	__atomic_add_fetch(&atomic_sn, 1, __ATOMIC_RELAXED)
182
183 /**
184 * Opens a Netlink socket.
185 *
186 * @param protocol
187 * Netlink protocol (e.g. NETLINK_ROUTE, NETLINK_RDMA).
188 * @param groups
189 * Groups to listen (e.g. RTMGRP_LINK), can be 0.
190 *
191 * @return
192 * A file descriptor on success, a negative errno value otherwise and
193 * rte_errno is set.
194 */
195 int
mlx5_nl_init(int protocol,int groups)196 mlx5_nl_init(int protocol, int groups)
197 {
198 int fd;
199 int buf_size;
200 socklen_t opt_size;
201 struct sockaddr_nl local = {
202 .nl_family = AF_NETLINK,
203 .nl_groups = groups,
204 };
205 int ret;
206
207 fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, protocol);
208 if (fd == -1) {
209 rte_errno = errno;
210 return -rte_errno;
211 }
212 opt_size = sizeof(buf_size);
213 ret = getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &buf_size, &opt_size);
214 if (ret == -1) {
215 rte_errno = errno;
216 goto error;
217 }
218 DRV_LOG(DEBUG, "Netlink socket send buffer: %d", buf_size);
219 if (buf_size < MLX5_SEND_BUF_SIZE) {
220 ret = setsockopt(fd, SOL_SOCKET, SO_SNDBUF,
221 &buf_size, sizeof(buf_size));
222 if (ret == -1) {
223 rte_errno = errno;
224 goto error;
225 }
226 }
227 opt_size = sizeof(buf_size);
228 ret = getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &buf_size, &opt_size);
229 if (ret == -1) {
230 rte_errno = errno;
231 goto error;
232 }
233 DRV_LOG(DEBUG, "Netlink socket recv buffer: %d", buf_size);
234 if (buf_size < MLX5_RECV_BUF_SIZE) {
235 ret = setsockopt(fd, SOL_SOCKET, SO_RCVBUF,
236 &buf_size, sizeof(buf_size));
237 if (ret == -1) {
238 rte_errno = errno;
239 goto error;
240 }
241 }
242 ret = bind(fd, (struct sockaddr *)&local, sizeof(local));
243 if (ret == -1) {
244 rte_errno = errno;
245 goto error;
246 }
247 return fd;
248 error:
249 close(fd);
250 return -rte_errno;
251 }
252
253 /**
254 * Send a request message to the kernel on the Netlink socket.
255 *
256 * @param[in] nlsk_fd
257 * Netlink socket file descriptor.
258 * @param[in] nh
259 * The Netlink message send to the kernel.
260 * @param[in] ssn
261 * Sequence number.
262 * @param[in] req
263 * Pointer to the request structure.
264 * @param[in] len
265 * Length of the request in bytes.
266 *
267 * @return
268 * The number of sent bytes on success, a negative errno value otherwise and
269 * rte_errno is set.
270 */
271 static int
mlx5_nl_request(int nlsk_fd,struct nlmsghdr * nh,uint32_t sn,void * req,int len)272 mlx5_nl_request(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn, void *req,
273 int len)
274 {
275 struct sockaddr_nl sa = {
276 .nl_family = AF_NETLINK,
277 };
278 struct iovec iov[2] = {
279 { .iov_base = nh, .iov_len = sizeof(*nh), },
280 { .iov_base = req, .iov_len = len, },
281 };
282 struct msghdr msg = {
283 .msg_name = &sa,
284 .msg_namelen = sizeof(sa),
285 .msg_iov = iov,
286 .msg_iovlen = 2,
287 };
288 int send_bytes;
289
290 nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
291 nh->nlmsg_seq = sn;
292 send_bytes = sendmsg(nlsk_fd, &msg, 0);
293 if (send_bytes < 0) {
294 rte_errno = errno;
295 return -rte_errno;
296 }
297 return send_bytes;
298 }
299
300 /**
301 * Send a message to the kernel on the Netlink socket.
302 *
303 * @param[in] nlsk_fd
304 * The Netlink socket file descriptor used for communication.
305 * @param[in] nh
306 * The Netlink message send to the kernel.
307 * @param[in] sn
308 * Sequence number.
309 *
310 * @return
311 * The number of sent bytes on success, a negative errno value otherwise and
312 * rte_errno is set.
313 */
314 static int
mlx5_nl_send(int nlsk_fd,struct nlmsghdr * nh,uint32_t sn)315 mlx5_nl_send(int nlsk_fd, struct nlmsghdr *nh, uint32_t sn)
316 {
317 struct sockaddr_nl sa = {
318 .nl_family = AF_NETLINK,
319 };
320 struct iovec iov = {
321 .iov_base = nh,
322 .iov_len = nh->nlmsg_len,
323 };
324 struct msghdr msg = {
325 .msg_name = &sa,
326 .msg_namelen = sizeof(sa),
327 .msg_iov = &iov,
328 .msg_iovlen = 1,
329 };
330 int send_bytes;
331
332 nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
333 nh->nlmsg_seq = sn;
334 send_bytes = sendmsg(nlsk_fd, &msg, 0);
335 if (send_bytes < 0) {
336 rte_errno = errno;
337 return -rte_errno;
338 }
339 return send_bytes;
340 }
341
342 /**
343 * Receive a message from the kernel on the Netlink socket, following
344 * mlx5_nl_send().
345 *
346 * @param[in] nlsk_fd
347 * The Netlink socket file descriptor used for communication.
348 * @param[in] sn
349 * Sequence number.
350 * @param[in] cb
351 * The callback function to call for each Netlink message received.
352 * @param[in, out] arg
353 * Custom arguments for the callback.
354 *
355 * @return
356 * 0 on success, a negative errno value otherwise and rte_errno is set.
357 */
358 static int
mlx5_nl_recv(int nlsk_fd,uint32_t sn,int (* cb)(struct nlmsghdr *,void * arg),void * arg)359 mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg),
360 void *arg)
361 {
362 struct sockaddr_nl sa;
363 struct iovec iov;
364 struct msghdr msg = {
365 .msg_name = &sa,
366 .msg_namelen = sizeof(sa),
367 .msg_iov = &iov,
368 /* One message at a time */
369 .msg_iovlen = 1,
370 };
371 void *buf = NULL;
372 int multipart = 0;
373 int ret = 0;
374
375 do {
376 struct nlmsghdr *nh;
377 int recv_bytes;
378
379 do {
380 /* Query length of incoming message. */
381 iov.iov_base = NULL;
382 iov.iov_len = 0;
383 recv_bytes = recvmsg(nlsk_fd, &msg,
384 MSG_PEEK | MSG_TRUNC);
385 if (recv_bytes < 0) {
386 rte_errno = errno;
387 ret = -rte_errno;
388 goto exit;
389 }
390 if (recv_bytes == 0) {
391 rte_errno = ENODATA;
392 ret = -rte_errno;
393 goto exit;
394 }
395 /* Allocate buffer to fetch the message. */
396 if (recv_bytes < MLX5_RECV_BUF_SIZE)
397 recv_bytes = MLX5_RECV_BUF_SIZE;
398 mlx5_free(buf);
399 buf = mlx5_malloc(0, recv_bytes, 0, SOCKET_ID_ANY);
400 if (!buf) {
401 rte_errno = ENOMEM;
402 ret = -rte_errno;
403 goto exit;
404 }
405 /* Fetch the message. */
406 iov.iov_base = buf;
407 iov.iov_len = recv_bytes;
408 recv_bytes = recvmsg(nlsk_fd, &msg, 0);
409 if (recv_bytes == -1) {
410 rte_errno = errno;
411 ret = -rte_errno;
412 goto exit;
413 }
414 nh = (struct nlmsghdr *)buf;
415 } while (nh->nlmsg_seq != sn);
416 for (;
417 NLMSG_OK(nh, (unsigned int)recv_bytes);
418 nh = NLMSG_NEXT(nh, recv_bytes)) {
419 if (nh->nlmsg_type == NLMSG_ERROR) {
420 struct nlmsgerr *err_data = NLMSG_DATA(nh);
421
422 if (err_data->error < 0) {
423 rte_errno = -err_data->error;
424 ret = -rte_errno;
425 goto exit;
426 }
427 /* Ack message. */
428 ret = 0;
429 goto exit;
430 }
431 /* Multi-part msgs and their trailing DONE message. */
432 if (nh->nlmsg_flags & NLM_F_MULTI) {
433 if (nh->nlmsg_type == NLMSG_DONE) {
434 ret = 0;
435 goto exit;
436 }
437 multipart = 1;
438 }
439 if (cb) {
440 ret = cb(nh, arg);
441 if (ret < 0)
442 goto exit;
443 }
444 }
445 } while (multipart);
446 exit:
447 mlx5_free(buf);
448 return ret;
449 }
450
451 /**
452 * Parse Netlink message to retrieve the bridge MAC address.
453 *
454 * @param nh
455 * Pointer to Netlink Message Header.
456 * @param arg
457 * PMD data register with this callback.
458 *
459 * @return
460 * 0 on success, a negative errno value otherwise and rte_errno is set.
461 */
462 static int
mlx5_nl_mac_addr_cb(struct nlmsghdr * nh,void * arg)463 mlx5_nl_mac_addr_cb(struct nlmsghdr *nh, void *arg)
464 {
465 struct mlx5_nl_mac_addr *data = arg;
466 struct ndmsg *r = NLMSG_DATA(nh);
467 struct rtattr *attribute;
468 int len;
469
470 len = nh->nlmsg_len - NLMSG_LENGTH(sizeof(*r));
471 for (attribute = MLX5_NDA_RTA(r);
472 RTA_OK(attribute, len);
473 attribute = RTA_NEXT(attribute, len)) {
474 if (attribute->rta_type == NDA_LLADDR) {
475 if (data->mac_n == MLX5_MAX_MAC_ADDRESSES) {
476 DRV_LOG(WARNING,
477 "not enough room to finalize the"
478 " request");
479 rte_errno = ENOMEM;
480 return -rte_errno;
481 }
482 #ifdef RTE_LIBRTE_MLX5_DEBUG
483 char m[RTE_ETHER_ADDR_FMT_SIZE];
484
485 rte_ether_format_addr(m, RTE_ETHER_ADDR_FMT_SIZE,
486 RTA_DATA(attribute));
487 DRV_LOG(DEBUG, "bridge MAC address %s", m);
488 #endif
489 memcpy(&(*data->mac)[data->mac_n++],
490 RTA_DATA(attribute), RTE_ETHER_ADDR_LEN);
491 }
492 }
493 return 0;
494 }
495
496 /**
497 * Get bridge MAC addresses.
498 *
499 * @param[in] nlsk_fd
500 * Netlink socket file descriptor.
501 * @param[in] iface_idx
502 * Net device interface index.
503 * @param mac[out]
504 * Pointer to the array table of MAC addresses to fill.
505 * Its size should be of MLX5_MAX_MAC_ADDRESSES.
506 * @param mac_n[out]
507 * Number of entries filled in MAC array.
508 *
509 * @return
510 * 0 on success, a negative errno value otherwise and rte_errno is set.
511 */
512 static int
mlx5_nl_mac_addr_list(int nlsk_fd,unsigned int iface_idx,struct rte_ether_addr (* mac)[],int * mac_n)513 mlx5_nl_mac_addr_list(int nlsk_fd, unsigned int iface_idx,
514 struct rte_ether_addr (*mac)[], int *mac_n)
515 {
516 struct {
517 struct nlmsghdr hdr;
518 struct ifinfomsg ifm;
519 } req = {
520 .hdr = {
521 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
522 .nlmsg_type = RTM_GETNEIGH,
523 .nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
524 },
525 .ifm = {
526 .ifi_family = PF_BRIDGE,
527 .ifi_index = iface_idx,
528 },
529 };
530 struct mlx5_nl_mac_addr data = {
531 .mac = mac,
532 .mac_n = 0,
533 };
534 uint32_t sn = MLX5_NL_SN_GENERATE;
535 int ret;
536
537 if (nlsk_fd == -1)
538 return 0;
539 ret = mlx5_nl_request(nlsk_fd, &req.hdr, sn, &req.ifm,
540 sizeof(struct ifinfomsg));
541 if (ret < 0)
542 goto error;
543 ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_mac_addr_cb, &data);
544 if (ret < 0)
545 goto error;
546 *mac_n = data.mac_n;
547 return 0;
548 error:
549 DRV_LOG(DEBUG, "Interface %u cannot retrieve MAC address list %s",
550 iface_idx, strerror(rte_errno));
551 return -rte_errno;
552 }
553
554 /**
555 * Modify the MAC address neighbour table with Netlink.
556 *
557 * @param[in] nlsk_fd
558 * Netlink socket file descriptor.
559 * @param[in] iface_idx
560 * Net device interface index.
561 * @param mac
562 * MAC address to consider.
563 * @param add
564 * 1 to add the MAC address, 0 to remove the MAC address.
565 *
566 * @return
567 * 0 on success, a negative errno value otherwise and rte_errno is set.
568 */
569 static int
mlx5_nl_mac_addr_modify(int nlsk_fd,unsigned int iface_idx,struct rte_ether_addr * mac,int add)570 mlx5_nl_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
571 struct rte_ether_addr *mac, int add)
572 {
573 struct {
574 struct nlmsghdr hdr;
575 struct ndmsg ndm;
576 struct rtattr rta;
577 uint8_t buffer[RTE_ETHER_ADDR_LEN];
578 } req = {
579 .hdr = {
580 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)),
581 .nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
582 NLM_F_EXCL | NLM_F_ACK,
583 .nlmsg_type = add ? RTM_NEWNEIGH : RTM_DELNEIGH,
584 },
585 .ndm = {
586 .ndm_family = PF_BRIDGE,
587 .ndm_state = NUD_NOARP | NUD_PERMANENT,
588 .ndm_ifindex = iface_idx,
589 .ndm_flags = NTF_SELF,
590 },
591 .rta = {
592 .rta_type = NDA_LLADDR,
593 .rta_len = RTA_LENGTH(RTE_ETHER_ADDR_LEN),
594 },
595 };
596 uint32_t sn = MLX5_NL_SN_GENERATE;
597 int ret;
598
599 if (nlsk_fd == -1)
600 return 0;
601 memcpy(RTA_DATA(&req.rta), mac, RTE_ETHER_ADDR_LEN);
602 req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
603 RTA_ALIGN(req.rta.rta_len);
604 ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
605 if (ret < 0)
606 goto error;
607 ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
608 if (ret < 0)
609 goto error;
610 return 0;
611 error:
612 #ifdef RTE_LIBRTE_MLX5_DEBUG
613 {
614 char m[RTE_ETHER_ADDR_FMT_SIZE];
615
616 rte_ether_format_addr(m, RTE_ETHER_ADDR_FMT_SIZE, mac);
617 DRV_LOG(DEBUG,
618 "Interface %u cannot %s MAC address %s %s",
619 iface_idx,
620 add ? "add" : "remove", m, strerror(rte_errno));
621 }
622 #endif
623 return -rte_errno;
624 }
625
626 /**
627 * Modify the VF MAC address neighbour table with Netlink.
628 *
629 * @param[in] nlsk_fd
630 * Netlink socket file descriptor.
631 * @param[in] iface_idx
632 * Net device interface index.
633 * @param mac
634 * MAC address to consider.
635 * @param vf_index
636 * VF index.
637 *
638 * @return
639 * 0 on success, a negative errno value otherwise and rte_errno is set.
640 */
641 int
mlx5_nl_vf_mac_addr_modify(int nlsk_fd,unsigned int iface_idx,struct rte_ether_addr * mac,int vf_index)642 mlx5_nl_vf_mac_addr_modify(int nlsk_fd, unsigned int iface_idx,
643 struct rte_ether_addr *mac, int vf_index)
644 {
645 int ret;
646 struct {
647 struct nlmsghdr hdr;
648 struct ifinfomsg ifm;
649 struct rtattr vf_list_rta;
650 struct rtattr vf_info_rta;
651 struct rtattr vf_mac_rta;
652 struct ifla_vf_mac ivm;
653 } req = {
654 .hdr = {
655 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
656 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
657 .nlmsg_type = RTM_BASE,
658 },
659 .ifm = {
660 .ifi_index = iface_idx,
661 },
662 .vf_list_rta = {
663 .rta_type = IFLA_VFINFO_LIST,
664 .rta_len = RTA_ALIGN(RTA_LENGTH(0)),
665 },
666 .vf_info_rta = {
667 .rta_type = IFLA_VF_INFO,
668 .rta_len = RTA_ALIGN(RTA_LENGTH(0)),
669 },
670 .vf_mac_rta = {
671 .rta_type = IFLA_VF_MAC,
672 },
673 };
674 struct ifla_vf_mac ivm = {
675 .vf = vf_index,
676 };
677 uint32_t sn = MLX5_NL_SN_GENERATE;
678
679 memcpy(&ivm.mac, mac, RTE_ETHER_ADDR_LEN);
680 memcpy(RTA_DATA(&req.vf_mac_rta), &ivm, sizeof(ivm));
681
682 req.vf_mac_rta.rta_len = RTA_LENGTH(sizeof(ivm));
683 req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
684 RTA_ALIGN(req.vf_list_rta.rta_len) +
685 RTA_ALIGN(req.vf_info_rta.rta_len) +
686 RTA_ALIGN(req.vf_mac_rta.rta_len);
687 req.vf_list_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
688 &req.vf_list_rta);
689 req.vf_info_rta.rta_len = RTE_PTR_DIFF(NLMSG_TAIL(&req.hdr),
690 &req.vf_info_rta);
691
692 if (nlsk_fd < 0)
693 return -1;
694 ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
695 if (ret < 0)
696 goto error;
697 ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
698 if (ret < 0)
699 goto error;
700 return 0;
701 error:
702 DRV_LOG(ERR,
703 "representor %u cannot set VF MAC address "
704 RTE_ETHER_ADDR_PRT_FMT " : %s",
705 vf_index,
706 RTE_ETHER_ADDR_BYTES(mac),
707 strerror(rte_errno));
708 return -rte_errno;
709 }
710
711 /**
712 * Add a MAC address.
713 *
714 * @param[in] nlsk_fd
715 * Netlink socket file descriptor.
716 * @param[in] iface_idx
717 * Net device interface index.
718 * @param mac_own
719 * BITFIELD_DECLARE array to store the mac.
720 * @param mac
721 * MAC address to register.
722 * @param index
723 * MAC address index.
724 *
725 * @return
726 * 0 on success, a negative errno value otherwise and rte_errno is set.
727 */
728 int
mlx5_nl_mac_addr_add(int nlsk_fd,unsigned int iface_idx,uint64_t * mac_own,struct rte_ether_addr * mac,uint32_t index)729 mlx5_nl_mac_addr_add(int nlsk_fd, unsigned int iface_idx,
730 uint64_t *mac_own, struct rte_ether_addr *mac,
731 uint32_t index)
732 {
733 int ret;
734
735 ret = mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 1);
736 if (!ret) {
737 MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES);
738 if (index >= MLX5_MAX_MAC_ADDRESSES)
739 return -EINVAL;
740
741 BITFIELD_SET(mac_own, index);
742 }
743 if (ret == -EEXIST)
744 return 0;
745 return ret;
746 }
747
748 /**
749 * Remove a MAC address.
750 *
751 * @param[in] nlsk_fd
752 * Netlink socket file descriptor.
753 * @param[in] iface_idx
754 * Net device interface index.
755 * @param mac_own
756 * BITFIELD_DECLARE array to store the mac.
757 * @param mac
758 * MAC address to remove.
759 * @param index
760 * MAC address index.
761 *
762 * @return
763 * 0 on success, a negative errno value otherwise and rte_errno is set.
764 */
765 int
mlx5_nl_mac_addr_remove(int nlsk_fd,unsigned int iface_idx,uint64_t * mac_own,struct rte_ether_addr * mac,uint32_t index)766 mlx5_nl_mac_addr_remove(int nlsk_fd, unsigned int iface_idx, uint64_t *mac_own,
767 struct rte_ether_addr *mac, uint32_t index)
768 {
769 MLX5_ASSERT(index < MLX5_MAX_MAC_ADDRESSES);
770 if (index >= MLX5_MAX_MAC_ADDRESSES)
771 return -EINVAL;
772
773 BITFIELD_RESET(mac_own, index);
774 return mlx5_nl_mac_addr_modify(nlsk_fd, iface_idx, mac, 0);
775 }
776
777 /**
778 * Synchronize Netlink bridge table to the internal table.
779 *
780 * @param[in] nlsk_fd
781 * Netlink socket file descriptor.
782 * @param[in] iface_idx
783 * Net device interface index.
784 * @param mac_addrs
785 * Mac addresses array to sync.
786 * @param n
787 * @p mac_addrs array size.
788 */
789 void
mlx5_nl_mac_addr_sync(int nlsk_fd,unsigned int iface_idx,struct rte_ether_addr * mac_addrs,int n)790 mlx5_nl_mac_addr_sync(int nlsk_fd, unsigned int iface_idx,
791 struct rte_ether_addr *mac_addrs, int n)
792 {
793 struct rte_ether_addr macs[n];
794 int macs_n = 0;
795 int i;
796 int ret;
797
798 memset(macs, 0, n * sizeof(macs[0]));
799 ret = mlx5_nl_mac_addr_list(nlsk_fd, iface_idx, &macs, &macs_n);
800 if (ret)
801 return;
802 for (i = 0; i != macs_n; ++i) {
803 int j;
804
805 /* Verify the address is not in the array yet. */
806 for (j = 0; j != n; ++j)
807 if (rte_is_same_ether_addr(&macs[i], &mac_addrs[j]))
808 break;
809 if (j != n)
810 continue;
811 if (rte_is_multicast_ether_addr(&macs[i])) {
812 /* Find the first entry available. */
813 for (j = MLX5_MAX_UC_MAC_ADDRESSES; j != n; ++j) {
814 if (rte_is_zero_ether_addr(&mac_addrs[j])) {
815 mac_addrs[j] = macs[i];
816 break;
817 }
818 }
819 } else {
820 /* Find the first entry available. */
821 for (j = 0; j != MLX5_MAX_UC_MAC_ADDRESSES; ++j) {
822 if (rte_is_zero_ether_addr(&mac_addrs[j])) {
823 mac_addrs[j] = macs[i];
824 break;
825 }
826 }
827 }
828 }
829 }
830
831 /**
832 * Flush all added MAC addresses.
833 *
834 * @param[in] nlsk_fd
835 * Netlink socket file descriptor.
836 * @param[in] iface_idx
837 * Net device interface index.
838 * @param[in] mac_addrs
839 * Mac addresses array to flush.
840 * @param n
841 * @p mac_addrs array size.
842 * @param mac_own
843 * BITFIELD_DECLARE array to store the mac.
844 */
845 void
mlx5_nl_mac_addr_flush(int nlsk_fd,unsigned int iface_idx,struct rte_ether_addr * mac_addrs,int n,uint64_t * mac_own)846 mlx5_nl_mac_addr_flush(int nlsk_fd, unsigned int iface_idx,
847 struct rte_ether_addr *mac_addrs, int n,
848 uint64_t *mac_own)
849 {
850 int i;
851
852 if (n <= 0 || n > MLX5_MAX_MAC_ADDRESSES)
853 return;
854
855 for (i = n - 1; i >= 0; --i) {
856 struct rte_ether_addr *m = &mac_addrs[i];
857
858 if (BITFIELD_ISSET(mac_own, i))
859 mlx5_nl_mac_addr_remove(nlsk_fd, iface_idx, mac_own, m,
860 i);
861 }
862 }
863
864 /**
865 * Enable promiscuous / all multicast mode through Netlink.
866 *
867 * @param[in] nlsk_fd
868 * Netlink socket file descriptor.
869 * @param[in] iface_idx
870 * Net device interface index.
871 * @param flags
872 * IFF_PROMISC for promiscuous, IFF_ALLMULTI for allmulti.
873 * @param enable
874 * Nonzero to enable, disable otherwise.
875 *
876 * @return
877 * 0 on success, a negative errno value otherwise and rte_errno is set.
878 */
879 static int
mlx5_nl_device_flags(int nlsk_fd,unsigned int iface_idx,uint32_t flags,int enable)880 mlx5_nl_device_flags(int nlsk_fd, unsigned int iface_idx, uint32_t flags,
881 int enable)
882 {
883 struct {
884 struct nlmsghdr hdr;
885 struct ifinfomsg ifi;
886 } req = {
887 .hdr = {
888 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
889 .nlmsg_type = RTM_NEWLINK,
890 .nlmsg_flags = NLM_F_REQUEST,
891 },
892 .ifi = {
893 .ifi_flags = enable ? flags : 0,
894 .ifi_change = flags,
895 .ifi_index = iface_idx,
896 },
897 };
898 uint32_t sn = MLX5_NL_SN_GENERATE;
899 int ret;
900
901 MLX5_ASSERT(!(flags & ~(IFF_PROMISC | IFF_ALLMULTI)));
902 if (nlsk_fd < 0)
903 return 0;
904 ret = mlx5_nl_send(nlsk_fd, &req.hdr, sn);
905 if (ret < 0)
906 return ret;
907 return 0;
908 }
909
910 /**
911 * Enable promiscuous mode through Netlink.
912 *
913 * @param[in] nlsk_fd
914 * Netlink socket file descriptor.
915 * @param[in] iface_idx
916 * Net device interface index.
917 * @param enable
918 * Nonzero to enable, disable otherwise.
919 *
920 * @return
921 * 0 on success, a negative errno value otherwise and rte_errno is set.
922 */
923 int
mlx5_nl_promisc(int nlsk_fd,unsigned int iface_idx,int enable)924 mlx5_nl_promisc(int nlsk_fd, unsigned int iface_idx, int enable)
925 {
926 int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_PROMISC, enable);
927
928 if (ret)
929 DRV_LOG(DEBUG,
930 "Interface %u cannot %s promisc mode: Netlink error %s",
931 iface_idx, enable ? "enable" : "disable",
932 strerror(rte_errno));
933 return ret;
934 }
935
936 /**
937 * Enable all multicast mode through Netlink.
938 *
939 * @param[in] nlsk_fd
940 * Netlink socket file descriptor.
941 * @param[in] iface_idx
942 * Net device interface index.
943 * @param enable
944 * Nonzero to enable, disable otherwise.
945 *
946 * @return
947 * 0 on success, a negative errno value otherwise and rte_errno is set.
948 */
949 int
mlx5_nl_allmulti(int nlsk_fd,unsigned int iface_idx,int enable)950 mlx5_nl_allmulti(int nlsk_fd, unsigned int iface_idx, int enable)
951 {
952 int ret = mlx5_nl_device_flags(nlsk_fd, iface_idx, IFF_ALLMULTI,
953 enable);
954
955 if (ret)
956 DRV_LOG(DEBUG,
957 "Interface %u cannot %s allmulti : Netlink error %s",
958 iface_idx, enable ? "enable" : "disable",
959 strerror(rte_errno));
960 return ret;
961 }
962
963 /**
964 * Process network interface information from Netlink message.
965 *
966 * @param nh
967 * Pointer to Netlink message header.
968 * @param arg
969 * Opaque data pointer for this callback.
970 *
971 * @return
972 * 0 on success, a negative errno value otherwise and rte_errno is set.
973 */
974 static int
mlx5_nl_cmdget_cb(struct nlmsghdr * nh,void * arg)975 mlx5_nl_cmdget_cb(struct nlmsghdr *nh, void *arg)
976 {
977 struct mlx5_nl_port_info *data = arg;
978 struct mlx5_nl_port_info local = {
979 .flags = 0,
980 };
981 size_t off = NLMSG_HDRLEN;
982
983 if (nh->nlmsg_type !=
984 RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET) &&
985 nh->nlmsg_type !=
986 RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_PORT_GET))
987 goto error;
988 while (off < nh->nlmsg_len) {
989 struct nlattr *na = (void *)((uintptr_t)nh + off);
990 void *payload = (void *)((uintptr_t)na + NLA_HDRLEN);
991
992 if (na->nla_len > nh->nlmsg_len - off)
993 goto error;
994 switch (na->nla_type) {
995 case RDMA_NLDEV_ATTR_DEV_INDEX:
996 local.ibindex = *(uint32_t *)payload;
997 local.flags |= MLX5_NL_CMD_GET_IB_INDEX;
998 break;
999 case RDMA_NLDEV_ATTR_DEV_NAME:
1000 if (!strcmp(payload, data->name))
1001 local.flags |= MLX5_NL_CMD_GET_IB_NAME;
1002 break;
1003 case RDMA_NLDEV_ATTR_NDEV_INDEX:
1004 local.ifindex = *(uint32_t *)payload;
1005 local.flags |= MLX5_NL_CMD_GET_NET_INDEX;
1006 break;
1007 case RDMA_NLDEV_ATTR_PORT_INDEX:
1008 local.portnum = *(uint32_t *)payload;
1009 local.flags |= MLX5_NL_CMD_GET_PORT_INDEX;
1010 break;
1011 case RDMA_NLDEV_ATTR_PORT_STATE:
1012 local.state = *(uint8_t *)payload;
1013 local.flags |= MLX5_NL_CMD_GET_PORT_STATE;
1014 break;
1015 default:
1016 break;
1017 }
1018 off += NLA_ALIGN(na->nla_len);
1019 }
1020 /*
1021 * It is possible to have multiple messages for all
1022 * Infiniband devices in the system with appropriate name.
1023 * So we should gather parameters locally and copy to
1024 * query context only in case of coinciding device name.
1025 */
1026 if (local.flags & MLX5_NL_CMD_GET_IB_NAME) {
1027 data->flags = local.flags;
1028 data->ibindex = local.ibindex;
1029 data->ifindex = local.ifindex;
1030 data->portnum = local.portnum;
1031 data->state = local.state;
1032 }
1033 return 0;
1034 error:
1035 rte_errno = EINVAL;
1036 return -rte_errno;
1037 }
1038
1039 /**
1040 * Get port info of network interface associated with some IB device.
1041 *
1042 * This is the only somewhat safe method to avoid resorting to heuristics
1043 * when faced with port representors. Unfortunately it requires at least
1044 * Linux 4.17.
1045 *
1046 * @param nl
1047 * Netlink socket of the RDMA kind (NETLINK_RDMA).
1048 * @param[in] pindex
1049 * IB device port index, starting from 1
1050 * @param[out] data
1051 * Pointer to port info.
1052 * @return
1053 * 0 on success, negative on error and rte_errno is set.
1054 */
1055 static int
mlx5_nl_port_info(int nl,uint32_t pindex,struct mlx5_nl_port_info * data)1056 mlx5_nl_port_info(int nl, uint32_t pindex, struct mlx5_nl_port_info *data)
1057 {
1058 union {
1059 struct nlmsghdr nh;
1060 uint8_t buf[NLMSG_HDRLEN +
1061 NLA_HDRLEN + NLA_ALIGN(sizeof(data->ibindex)) +
1062 NLA_HDRLEN + NLA_ALIGN(sizeof(pindex))];
1063 } req = {
1064 .nh = {
1065 .nlmsg_len = NLMSG_LENGTH(0),
1066 .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1067 RDMA_NLDEV_CMD_GET),
1068 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
1069 },
1070 };
1071 struct nlattr *na;
1072 uint32_t sn = MLX5_NL_SN_GENERATE;
1073 int ret;
1074
1075 ret = mlx5_nl_send(nl, &req.nh, sn);
1076 if (ret < 0)
1077 return ret;
1078 ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, data);
1079 if (ret < 0)
1080 return ret;
1081 if (!(data->flags & MLX5_NL_CMD_GET_IB_NAME) ||
1082 !(data->flags & MLX5_NL_CMD_GET_IB_INDEX))
1083 goto error;
1084 data->flags = 0;
1085 sn = MLX5_NL_SN_GENERATE;
1086 req.nh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1087 RDMA_NLDEV_CMD_PORT_GET);
1088 req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1089 req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.buf) - NLMSG_HDRLEN);
1090 na = (void *)((uintptr_t)req.buf + NLMSG_HDRLEN);
1091 na->nla_len = NLA_HDRLEN + sizeof(data->ibindex);
1092 na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX;
1093 memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
1094 &data->ibindex, sizeof(data->ibindex));
1095 na = (void *)((uintptr_t)na + NLA_ALIGN(na->nla_len));
1096 na->nla_len = NLA_HDRLEN + sizeof(pindex);
1097 na->nla_type = RDMA_NLDEV_ATTR_PORT_INDEX;
1098 memcpy((void *)((uintptr_t)na + NLA_HDRLEN),
1099 &pindex, sizeof(pindex));
1100 ret = mlx5_nl_send(nl, &req.nh, sn);
1101 if (ret < 0)
1102 return ret;
1103 ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, data);
1104 if (ret < 0)
1105 return ret;
1106 if (!(data->flags & MLX5_NL_CMD_GET_IB_NAME) ||
1107 !(data->flags & MLX5_NL_CMD_GET_IB_INDEX) ||
1108 !(data->flags & MLX5_NL_CMD_GET_NET_INDEX) ||
1109 !data->ifindex)
1110 goto error;
1111 return 1;
1112 error:
1113 rte_errno = ENODEV;
1114 return -rte_errno;
1115 }
1116
1117 /**
1118 * Get index of network interface associated with some IB device.
1119 *
1120 * This is the only somewhat safe method to avoid resorting to heuristics
1121 * when faced with port representors. Unfortunately it requires at least
1122 * Linux 4.17.
1123 *
1124 * @param nl
1125 * Netlink socket of the RDMA kind (NETLINK_RDMA).
1126 * @param[in] name
1127 * IB device name.
1128 * @param[in] pindex
1129 * IB device port index, starting from 1
1130 * @return
1131 * A valid (nonzero) interface index on success, 0 otherwise and rte_errno
1132 * is set.
1133 */
1134 unsigned int
mlx5_nl_ifindex(int nl,const char * name,uint32_t pindex)1135 mlx5_nl_ifindex(int nl, const char *name, uint32_t pindex)
1136 {
1137 struct mlx5_nl_port_info data = {
1138 .ifindex = 0,
1139 .name = name,
1140 };
1141
1142 if (mlx5_nl_port_info(nl, pindex, &data) < 0)
1143 return 0;
1144 return data.ifindex;
1145 }
1146
1147 /**
1148 * Get IB device port state.
1149 *
1150 * This is the only somewhat safe method to get info for port number >= 255.
1151 * Unfortunately it requires at least Linux 4.17.
1152 *
1153 * @param nl
1154 * Netlink socket of the RDMA kind (NETLINK_RDMA).
1155 * @param[in] name
1156 * IB device name.
1157 * @param[in] pindex
1158 * IB device port index, starting from 1
1159 * @return
1160 * Port state (ibv_port_state) on success, negative on error
1161 * and rte_errno is set.
1162 */
1163 int
mlx5_nl_port_state(int nl,const char * name,uint32_t pindex)1164 mlx5_nl_port_state(int nl, const char *name, uint32_t pindex)
1165 {
1166 struct mlx5_nl_port_info data = {
1167 .state = 0,
1168 .name = name,
1169 };
1170
1171 if (mlx5_nl_port_info(nl, pindex, &data) < 0)
1172 return -rte_errno;
1173 if ((data.flags & MLX5_NL_CMD_GET_PORT_STATE) == 0) {
1174 rte_errno = ENOTSUP;
1175 return -rte_errno;
1176 }
1177 return (int)data.state;
1178 }
1179
1180 /**
1181 * Get the number of physical ports of given IB device.
1182 *
1183 * @param nl
1184 * Netlink socket of the RDMA kind (NETLINK_RDMA).
1185 * @param[in] name
1186 * IB device name.
1187 *
1188 * @return
1189 * A valid (nonzero) number of ports on success, 0 otherwise
1190 * and rte_errno is set.
1191 */
1192 unsigned int
mlx5_nl_portnum(int nl,const char * name)1193 mlx5_nl_portnum(int nl, const char *name)
1194 {
1195 struct mlx5_nl_port_info data = {
1196 .flags = 0,
1197 .name = name,
1198 .ifindex = 0,
1199 .portnum = 0,
1200 };
1201 struct nlmsghdr req = {
1202 .nlmsg_len = NLMSG_LENGTH(0),
1203 .nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
1204 RDMA_NLDEV_CMD_GET),
1205 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
1206 };
1207 uint32_t sn = MLX5_NL_SN_GENERATE;
1208 int ret;
1209
1210 ret = mlx5_nl_send(nl, &req, sn);
1211 if (ret < 0)
1212 return 0;
1213 ret = mlx5_nl_recv(nl, sn, mlx5_nl_cmdget_cb, &data);
1214 if (ret < 0)
1215 return 0;
1216 if (!(data.flags & MLX5_NL_CMD_GET_IB_NAME) ||
1217 !(data.flags & MLX5_NL_CMD_GET_IB_INDEX) ||
1218 !(data.flags & MLX5_NL_CMD_GET_PORT_INDEX)) {
1219 rte_errno = ENODEV;
1220 return 0;
1221 }
1222 if (!data.portnum)
1223 rte_errno = EINVAL;
1224 return data.portnum;
1225 }
1226
1227 /**
1228 * Analyze gathered port parameters via Netlink to recognize master
1229 * and representor devices for E-Switch configuration.
1230 *
1231 * @param[in] num_vf_set
1232 * flag of presence of number of VFs port attribute.
1233 * @param[inout] switch_info
1234 * Port information, including port name as a number and port name
1235 * type if recognized
1236 *
1237 * @return
1238 * master and representor flags are set in switch_info according to
1239 * recognized parameters (if any).
1240 */
1241 static void
mlx5_nl_check_switch_info(bool num_vf_set,struct mlx5_switch_info * switch_info)1242 mlx5_nl_check_switch_info(bool num_vf_set,
1243 struct mlx5_switch_info *switch_info)
1244 {
1245 switch (switch_info->name_type) {
1246 case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
1247 /*
1248 * Name is not recognized, assume the master,
1249 * check the number of VFs key presence.
1250 */
1251 switch_info->master = num_vf_set;
1252 break;
1253 case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
1254 /*
1255 * Name is not set, this assumes the legacy naming
1256 * schema for master, just check if there is a
1257 * number of VFs key.
1258 */
1259 switch_info->master = num_vf_set;
1260 break;
1261 case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
1262 /* New uplink naming schema recognized. */
1263 switch_info->master = 1;
1264 break;
1265 case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
1266 /* Legacy representors naming schema. */
1267 switch_info->representor = !num_vf_set;
1268 break;
1269 case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
1270 /* Fallthrough */
1271 case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
1272 /* Fallthrough */
1273 case MLX5_PHYS_PORT_NAME_TYPE_PFSF:
1274 /* New representors naming schema. */
1275 switch_info->representor = 1;
1276 break;
1277 }
1278 }
1279
1280 /**
1281 * Process switch information from Netlink message.
1282 *
1283 * @param nh
1284 * Pointer to Netlink message header.
1285 * @param arg
1286 * Opaque data pointer for this callback.
1287 *
1288 * @return
1289 * 0 on success, a negative errno value otherwise and rte_errno is set.
1290 */
1291 static int
mlx5_nl_switch_info_cb(struct nlmsghdr * nh,void * arg)1292 mlx5_nl_switch_info_cb(struct nlmsghdr *nh, void *arg)
1293 {
1294 struct mlx5_switch_info info = {
1295 .master = 0,
1296 .representor = 0,
1297 .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
1298 .port_name = 0,
1299 .switch_id = 0,
1300 };
1301 size_t off = NLMSG_LENGTH(sizeof(struct ifinfomsg));
1302 bool switch_id_set = false;
1303 bool num_vf_set = false;
1304 int len;
1305
1306 if (nh->nlmsg_type != RTM_NEWLINK)
1307 goto error;
1308 while (off < nh->nlmsg_len) {
1309 struct rtattr *ra = (void *)((uintptr_t)nh + off);
1310 void *payload = RTA_DATA(ra);
1311 unsigned int i;
1312
1313 if (ra->rta_len > nh->nlmsg_len - off)
1314 goto error;
1315 switch (ra->rta_type) {
1316 case IFLA_NUM_VF:
1317 num_vf_set = true;
1318 break;
1319 case IFLA_PHYS_PORT_NAME:
1320 len = RTA_PAYLOAD(ra);
1321 /* Some kernels do not pad attributes with zero. */
1322 if (len > 0 && len < MLX5_PHYS_PORT_NAME_MAX) {
1323 char name[MLX5_PHYS_PORT_NAME_MAX];
1324
1325 /*
1326 * We can't just patch the message with padding
1327 * zero - it might corrupt the following items
1328 * in the message, we have to copy the string
1329 * by attribute length and pad the copied one.
1330 */
1331 memcpy(name, payload, len);
1332 name[len] = 0;
1333 mlx5_translate_port_name(name, &info);
1334 } else {
1335 info.name_type =
1336 MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN;
1337 }
1338 break;
1339 case IFLA_PHYS_SWITCH_ID:
1340 info.switch_id = 0;
1341 for (i = 0; i < RTA_PAYLOAD(ra); ++i) {
1342 info.switch_id <<= 8;
1343 info.switch_id |= ((uint8_t *)payload)[i];
1344 }
1345 switch_id_set = true;
1346 break;
1347 }
1348 off += RTA_ALIGN(ra->rta_len);
1349 }
1350 if (switch_id_set) {
1351 /* We have some E-Switch configuration. */
1352 mlx5_nl_check_switch_info(num_vf_set, &info);
1353 }
1354 MLX5_ASSERT(!(info.master && info.representor));
1355 memcpy(arg, &info, sizeof(info));
1356 return 0;
1357 error:
1358 rte_errno = EINVAL;
1359 return -rte_errno;
1360 }
1361
1362 /**
1363 * Get switch information associated with network interface.
1364 *
1365 * @param nl
1366 * Netlink socket of the ROUTE kind (NETLINK_ROUTE).
1367 * @param ifindex
1368 * Network interface index.
1369 * @param[out] info
1370 * Switch information object, populated in case of success.
1371 *
1372 * @return
1373 * 0 on success, a negative errno value otherwise and rte_errno is set.
1374 */
1375 int
mlx5_nl_switch_info(int nl,unsigned int ifindex,struct mlx5_switch_info * info)1376 mlx5_nl_switch_info(int nl, unsigned int ifindex,
1377 struct mlx5_switch_info *info)
1378 {
1379 struct {
1380 struct nlmsghdr nh;
1381 struct ifinfomsg info;
1382 struct rtattr rta;
1383 uint32_t extmask;
1384 } req = {
1385 .nh = {
1386 .nlmsg_len = NLMSG_LENGTH
1387 (sizeof(req.info) +
1388 RTA_LENGTH(sizeof(uint32_t))),
1389 .nlmsg_type = RTM_GETLINK,
1390 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1391 },
1392 .info = {
1393 .ifi_family = AF_UNSPEC,
1394 .ifi_index = ifindex,
1395 },
1396 .rta = {
1397 .rta_type = IFLA_EXT_MASK,
1398 .rta_len = RTA_LENGTH(sizeof(int32_t)),
1399 },
1400 .extmask = RTE_LE32(1),
1401 };
1402 uint32_t sn = MLX5_NL_SN_GENERATE;
1403 int ret;
1404
1405 ret = mlx5_nl_send(nl, &req.nh, sn);
1406 if (ret >= 0)
1407 ret = mlx5_nl_recv(nl, sn, mlx5_nl_switch_info_cb, info);
1408 if (info->master && info->representor) {
1409 DRV_LOG(ERR, "ifindex %u device is recognized as master"
1410 " and as representor", ifindex);
1411 rte_errno = ENODEV;
1412 ret = -rte_errno;
1413 }
1414 return ret;
1415 }
1416
1417 /*
1418 * Delete VLAN network device by ifindex.
1419 *
1420 * @param[in] tcf
1421 * Context object initialized by mlx5_nl_vlan_vmwa_init().
1422 * @param[in] ifindex
1423 * Interface index of network device to delete.
1424 */
1425 void
mlx5_nl_vlan_vmwa_delete(struct mlx5_nl_vlan_vmwa_context * vmwa,uint32_t ifindex)1426 mlx5_nl_vlan_vmwa_delete(struct mlx5_nl_vlan_vmwa_context *vmwa,
1427 uint32_t ifindex)
1428 {
1429 uint32_t sn = MLX5_NL_SN_GENERATE;
1430 int ret;
1431 struct {
1432 struct nlmsghdr nh;
1433 struct ifinfomsg info;
1434 } req = {
1435 .nh = {
1436 .nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
1437 .nlmsg_type = RTM_DELLINK,
1438 .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
1439 },
1440 .info = {
1441 .ifi_family = AF_UNSPEC,
1442 .ifi_index = ifindex,
1443 },
1444 };
1445
1446 if (ifindex) {
1447 ret = mlx5_nl_send(vmwa->nl_socket, &req.nh, sn);
1448 if (ret >= 0)
1449 ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL);
1450 if (ret < 0)
1451 DRV_LOG(WARNING, "netlink: error deleting VLAN WA"
1452 " ifindex %u, %d", ifindex, ret);
1453 }
1454 }
1455
/* Set of subroutines to build Netlink message. */

/**
 * Get the address just past the current content of a Netlink message.
 *
 * @param nlh
 *   Netlink message header; its nlmsg_len is rounded up with NLMSG_ALIGN().
 *
 * @return
 *   Pointer located NLMSG_ALIGN(nlmsg_len) bytes after @p nlh.
 */
static struct nlattr *
nl_msg_tail(struct nlmsghdr *nlh)
{
	uint8_t *base = (uint8_t *)nlh;

	return (struct nlattr *)(base + NLMSG_ALIGN(nlh->nlmsg_len));
}
1463
/**
 * Append an attribute at the tail of a Netlink message.
 *
 * The message length is grown by the aligned attribute size. The caller
 * must provide a buffer large enough, no bound is checked here.
 *
 * @param nlh
 *   Netlink message header to extend.
 * @param type
 *   Attribute type.
 * @param data
 *   Attribute payload, not read when @p alen is zero.
 * @param alen
 *   Payload length in bytes.
 */
static void
nl_attr_put(struct nlmsghdr *nlh, int type, const void *data, int alen)
{
	struct nlattr *attr = nl_msg_tail(nlh);
	uint8_t *payload = (uint8_t *)attr + sizeof(*attr);

	attr->nla_type = type;
	attr->nla_len = NLMSG_ALIGN(sizeof(*attr)) + alen;
	nlh->nlmsg_len += NLMSG_ALIGN(attr->nla_len);
	if (alen != 0)
		memcpy(payload, data, alen);
}
1476
/**
 * Open a nested attribute at the tail of a Netlink message.
 *
 * An empty attribute of the given type is appended; its length is fixed up
 * later by nl_attr_nest_end().
 *
 * @param nlh
 *   Netlink message header to extend.
 * @param type
 *   Type of the nested (container) attribute.
 *
 * @return
 *   Pointer to the container attribute just appended.
 */
static struct nlattr *
nl_attr_nest_start(struct nlmsghdr *nlh, int type)
{
	/* Remember where the container starts before the tail moves. */
	struct nlattr *container = nl_msg_tail(nlh);

	nl_attr_put(nlh, type, NULL, 0);
	return container;
}
1485
/**
 * Close a nested attribute opened by nl_attr_nest_start().
 *
 * The container length is set to the distance between its header and the
 * current tail of the message.
 *
 * @param nlh
 *   Netlink message header.
 * @param nest
 *   Container attribute returned by nl_attr_nest_start().
 */
static void
nl_attr_nest_end(struct nlmsghdr *nlh, struct nlattr *nest)
{
	const uint8_t *begin = (const uint8_t *)nest;
	const uint8_t *end = (const uint8_t *)nl_msg_tail(nlh);

	nest->nla_len = end - begin;
}
1491
1492 /*
1493 * Create network VLAN device with specified VLAN tag.
1494 *
1495 * @param[in] tcf
1496 * Context object initialized by mlx5_nl_vlan_vmwa_init().
1497 * @param[in] ifindex
1498 * Base network interface index.
1499 * @param[in] tag
1500 * VLAN tag for VLAN network device to create.
1501 */
1502 uint32_t
mlx5_nl_vlan_vmwa_create(struct mlx5_nl_vlan_vmwa_context * vmwa,uint32_t ifindex,uint16_t tag)1503 mlx5_nl_vlan_vmwa_create(struct mlx5_nl_vlan_vmwa_context *vmwa,
1504 uint32_t ifindex, uint16_t tag)
1505 {
1506 struct nlmsghdr *nlh;
1507 struct ifinfomsg *ifm;
1508 char name[sizeof(MLX5_VMWA_VLAN_DEVICE_PFX) + 32];
1509
1510 __rte_cache_aligned
1511 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1512 NLMSG_ALIGN(sizeof(struct ifinfomsg)) +
1513 NLMSG_ALIGN(sizeof(struct nlattr)) * 8 +
1514 NLMSG_ALIGN(sizeof(uint32_t)) +
1515 NLMSG_ALIGN(sizeof(name)) +
1516 NLMSG_ALIGN(sizeof("vlan")) +
1517 NLMSG_ALIGN(sizeof(uint32_t)) +
1518 NLMSG_ALIGN(sizeof(uint16_t)) + 16];
1519 struct nlattr *na_info;
1520 struct nlattr *na_vlan;
1521 uint32_t sn = MLX5_NL_SN_GENERATE;
1522 int ret;
1523
1524 memset(buf, 0, sizeof(buf));
1525 nlh = (struct nlmsghdr *)buf;
1526 nlh->nlmsg_len = sizeof(struct nlmsghdr);
1527 nlh->nlmsg_type = RTM_NEWLINK;
1528 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
1529 NLM_F_EXCL | NLM_F_ACK;
1530 ifm = (struct ifinfomsg *)nl_msg_tail(nlh);
1531 nlh->nlmsg_len += sizeof(struct ifinfomsg);
1532 ifm->ifi_family = AF_UNSPEC;
1533 ifm->ifi_type = 0;
1534 ifm->ifi_index = 0;
1535 ifm->ifi_flags = IFF_UP;
1536 ifm->ifi_change = 0xffffffff;
1537 nl_attr_put(nlh, IFLA_LINK, &ifindex, sizeof(ifindex));
1538 ret = snprintf(name, sizeof(name), "%s.%u.%u",
1539 MLX5_VMWA_VLAN_DEVICE_PFX, ifindex, tag);
1540 nl_attr_put(nlh, IFLA_IFNAME, name, ret + 1);
1541 na_info = nl_attr_nest_start(nlh, IFLA_LINKINFO);
1542 nl_attr_put(nlh, IFLA_INFO_KIND, "vlan", sizeof("vlan"));
1543 na_vlan = nl_attr_nest_start(nlh, IFLA_INFO_DATA);
1544 nl_attr_put(nlh, IFLA_VLAN_ID, &tag, sizeof(tag));
1545 nl_attr_nest_end(nlh, na_vlan);
1546 nl_attr_nest_end(nlh, na_info);
1547 MLX5_ASSERT(sizeof(buf) >= nlh->nlmsg_len);
1548 ret = mlx5_nl_send(vmwa->nl_socket, nlh, sn);
1549 if (ret >= 0)
1550 ret = mlx5_nl_recv(vmwa->nl_socket, sn, NULL, NULL);
1551 if (ret < 0) {
1552 DRV_LOG(WARNING, "netlink: VLAN %s create failure (%d)", name,
1553 ret);
1554 }
1555 /* Try to get ifindex of created or pre-existing device. */
1556 ret = if_nametoindex(name);
1557 if (!ret) {
1558 DRV_LOG(WARNING, "VLAN %s failed to get index (%d)", name,
1559 errno);
1560 return 0;
1561 }
1562 return ret;
1563 }
1564
1565 /**
1566 * Parse Netlink message to retrieve the general family ID.
1567 *
1568 * @param nh
1569 * Pointer to Netlink Message Header.
1570 * @param arg
1571 * PMD data register with this callback.
1572 *
1573 * @return
1574 * 0 on success, a negative errno value otherwise and rte_errno is set.
1575 */
1576 static int
mlx5_nl_family_id_cb(struct nlmsghdr * nh,void * arg)1577 mlx5_nl_family_id_cb(struct nlmsghdr *nh, void *arg)
1578 {
1579
1580 struct nlattr *tail = RTE_PTR_ADD(nh, nh->nlmsg_len);
1581 struct nlattr *nla = RTE_PTR_ADD(nh, NLMSG_ALIGN(sizeof(*nh)) +
1582 NLMSG_ALIGN(sizeof(struct genlmsghdr)));
1583
1584 for (; nla->nla_len && nla < tail;
1585 nla = RTE_PTR_ADD(nla, NLMSG_ALIGN(nla->nla_len))) {
1586 if (nla->nla_type == CTRL_ATTR_FAMILY_ID) {
1587 *(uint16_t *)arg = *(uint16_t *)(nla + 1);
1588 return 0;
1589 }
1590 }
1591 return -EINVAL;
1592 }
1593
1594 #define MLX5_NL_MAX_ATTR_SIZE 100
1595 /**
1596 * Get generic netlink family ID.
1597 *
1598 * @param[in] nlsk_fd
1599 * Netlink socket file descriptor.
1600 * @param[in] name
1601 * The family name.
1602 *
1603 * @return
1604 * ID >= 0 on success and @p enable is updated, a negative errno value
1605 * otherwise and rte_errno is set.
1606 */
1607 static int
mlx5_nl_generic_family_id_get(int nlsk_fd,const char * name)1608 mlx5_nl_generic_family_id_get(int nlsk_fd, const char *name)
1609 {
1610 struct nlmsghdr *nlh;
1611 struct genlmsghdr *genl;
1612 uint32_t sn = MLX5_NL_SN_GENERATE;
1613 int name_size = strlen(name) + 1;
1614 int ret;
1615 uint16_t id = -1;
1616 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1617 NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1618 NLMSG_ALIGN(sizeof(struct nlattr)) +
1619 NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE)];
1620
1621 memset(buf, 0, sizeof(buf));
1622 nlh = (struct nlmsghdr *)buf;
1623 nlh->nlmsg_len = sizeof(struct nlmsghdr);
1624 nlh->nlmsg_type = GENL_ID_CTRL;
1625 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1626 genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1627 nlh->nlmsg_len += sizeof(struct genlmsghdr);
1628 genl->cmd = CTRL_CMD_GETFAMILY;
1629 genl->version = 1;
1630 nl_attr_put(nlh, CTRL_ATTR_FAMILY_NAME, name, name_size);
1631 ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1632 if (ret >= 0)
1633 ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_family_id_cb, &id);
1634 if (ret < 0) {
1635 DRV_LOG(DEBUG, "Failed to get Netlink %s family ID: %d.", name,
1636 ret);
1637 return ret;
1638 }
1639 DRV_LOG(DEBUG, "Netlink \"%s\" family ID is %u.", name, id);
1640 return (int)id;
1641 }
1642
1643 /**
1644 * Get Devlink family ID.
1645 *
1646 * @param[in] nlsk_fd
1647 * Netlink socket file descriptor.
1648 *
1649 * @return
1650 * ID >= 0 on success and @p enable is updated, a negative errno value
1651 * otherwise and rte_errno is set.
1652 */
1653
1654 int
mlx5_nl_devlink_family_id_get(int nlsk_fd)1655 mlx5_nl_devlink_family_id_get(int nlsk_fd)
1656 {
1657 return mlx5_nl_generic_family_id_get(nlsk_fd, DEVLINK_GENL_NAME);
1658 }
1659
1660 /**
1661 * Parse Netlink message to retrieve the ROCE enable status.
1662 *
1663 * @param nh
1664 * Pointer to Netlink Message Header.
1665 * @param arg
1666 * PMD data register with this callback.
1667 *
1668 * @return
1669 * 0 on success, a negative errno value otherwise and rte_errno is set.
1670 */
1671 static int
mlx5_nl_roce_cb(struct nlmsghdr * nh,void * arg)1672 mlx5_nl_roce_cb(struct nlmsghdr *nh, void *arg)
1673 {
1674
1675 int ret = -EINVAL;
1676 int *enable = arg;
1677 struct nlattr *tail = RTE_PTR_ADD(nh, nh->nlmsg_len);
1678 struct nlattr *nla = RTE_PTR_ADD(nh, NLMSG_ALIGN(sizeof(*nh)) +
1679 NLMSG_ALIGN(sizeof(struct genlmsghdr)));
1680
1681 while (nla->nla_len && nla < tail) {
1682 switch (nla->nla_type) {
1683 /* Expected nested attributes case. */
1684 case DEVLINK_ATTR_PARAM:
1685 case DEVLINK_ATTR_PARAM_VALUES_LIST:
1686 case DEVLINK_ATTR_PARAM_VALUE:
1687 ret = 0;
1688 nla += 1;
1689 break;
1690 case DEVLINK_ATTR_PARAM_VALUE_DATA:
1691 *enable = 1;
1692 return 0;
1693 default:
1694 nla = RTE_PTR_ADD(nla, NLMSG_ALIGN(nla->nla_len));
1695 }
1696 }
1697 *enable = 0;
1698 return ret;
1699 }
1700
1701 /**
1702 * Get ROCE enable status through Netlink.
1703 *
1704 * @param[in] nlsk_fd
1705 * Netlink socket file descriptor.
1706 * @param[in] family_id
1707 * the Devlink family ID.
1708 * @param pci_addr
1709 * The device PCI address.
1710 * @param[out] enable
1711 * Where to store the enable status.
1712 *
1713 * @return
1714 * 0 on success and @p enable is updated, a negative errno value otherwise
1715 * and rte_errno is set.
1716 */
1717 int
mlx5_nl_enable_roce_get(int nlsk_fd,int family_id,const char * pci_addr,int * enable)1718 mlx5_nl_enable_roce_get(int nlsk_fd, int family_id, const char *pci_addr,
1719 int *enable)
1720 {
1721 struct nlmsghdr *nlh;
1722 struct genlmsghdr *genl;
1723 uint32_t sn = MLX5_NL_SN_GENERATE;
1724 int ret;
1725 int cur_en = 0;
1726 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1727 NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1728 NLMSG_ALIGN(sizeof(struct nlattr)) * 4 +
1729 NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 4];
1730
1731 memset(buf, 0, sizeof(buf));
1732 nlh = (struct nlmsghdr *)buf;
1733 nlh->nlmsg_len = sizeof(struct nlmsghdr);
1734 nlh->nlmsg_type = family_id;
1735 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1736 genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1737 nlh->nlmsg_len += sizeof(struct genlmsghdr);
1738 genl->cmd = DEVLINK_CMD_PARAM_GET;
1739 genl->version = DEVLINK_GENL_VERSION;
1740 nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1741 nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1742 nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME, "enable_roce", 12);
1743 ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1744 if (ret >= 0)
1745 ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_roce_cb, &cur_en);
1746 if (ret < 0) {
1747 DRV_LOG(DEBUG, "Failed to get ROCE enable on device %s: %d.",
1748 pci_addr, ret);
1749 return ret;
1750 }
1751 *enable = cur_en;
1752 DRV_LOG(DEBUG, "ROCE is %sabled for device \"%s\".",
1753 cur_en ? "en" : "dis", pci_addr);
1754 return ret;
1755 }
1756
1757 /**
1758 * Reload mlx5 device kernel driver through Netlink.
1759 *
1760 * @param[in] nlsk_fd
1761 * Netlink socket file descriptor.
1762 * @param[in] family_id
1763 * the Devlink family ID.
1764 * @param pci_addr
1765 * The device PCI address.
1766 * @param[out] enable
1767 * The enable status to set.
1768 *
1769 * @return
1770 * 0 on success, a negative errno value otherwise and rte_errno is set.
1771 */
1772 static int
mlx5_nl_driver_reload(int nlsk_fd,int family_id,const char * pci_addr)1773 mlx5_nl_driver_reload(int nlsk_fd, int family_id, const char *pci_addr)
1774 {
1775 struct nlmsghdr *nlh;
1776 struct genlmsghdr *genl;
1777 uint32_t sn = MLX5_NL_SN_GENERATE;
1778 int ret;
1779 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1780 NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1781 NLMSG_ALIGN(sizeof(struct nlattr)) * 2 +
1782 NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 2];
1783
1784 memset(buf, 0, sizeof(buf));
1785 nlh = (struct nlmsghdr *)buf;
1786 nlh->nlmsg_len = sizeof(struct nlmsghdr);
1787 nlh->nlmsg_type = family_id;
1788 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1789 genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1790 nlh->nlmsg_len += sizeof(struct genlmsghdr);
1791 genl->cmd = DEVLINK_CMD_RELOAD;
1792 genl->version = DEVLINK_GENL_VERSION;
1793 nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1794 nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1795 ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1796 if (ret >= 0)
1797 ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
1798 if (ret < 0) {
1799 DRV_LOG(DEBUG, "Failed to reload %s device by Netlink - %d",
1800 pci_addr, ret);
1801 return ret;
1802 }
1803 DRV_LOG(DEBUG, "Device \"%s\" was reloaded by Netlink successfully.",
1804 pci_addr);
1805 return 0;
1806 }
1807
1808 /**
1809 * Set ROCE enable status through Netlink.
1810 *
1811 * @param[in] nlsk_fd
1812 * Netlink socket file descriptor.
1813 * @param[in] family_id
1814 * the Devlink family ID.
1815 * @param pci_addr
1816 * The device PCI address.
1817 * @param[out] enable
1818 * The enable status to set.
1819 *
1820 * @return
1821 * 0 on success, a negative errno value otherwise and rte_errno is set.
1822 */
1823 int
mlx5_nl_enable_roce_set(int nlsk_fd,int family_id,const char * pci_addr,int enable)1824 mlx5_nl_enable_roce_set(int nlsk_fd, int family_id, const char *pci_addr,
1825 int enable)
1826 {
1827 struct nlmsghdr *nlh;
1828 struct genlmsghdr *genl;
1829 uint32_t sn = MLX5_NL_SN_GENERATE;
1830 int ret;
1831 uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
1832 NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
1833 NLMSG_ALIGN(sizeof(struct nlattr)) * 6 +
1834 NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 6];
1835 uint8_t cmode = DEVLINK_PARAM_CMODE_DRIVERINIT;
1836 uint8_t ptype = NLA_FLAG;
1837 ;
1838
1839 memset(buf, 0, sizeof(buf));
1840 nlh = (struct nlmsghdr *)buf;
1841 nlh->nlmsg_len = sizeof(struct nlmsghdr);
1842 nlh->nlmsg_type = family_id;
1843 nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
1844 genl = (struct genlmsghdr *)nl_msg_tail(nlh);
1845 nlh->nlmsg_len += sizeof(struct genlmsghdr);
1846 genl->cmd = DEVLINK_CMD_PARAM_SET;
1847 genl->version = DEVLINK_GENL_VERSION;
1848 nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
1849 nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
1850 nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME, "enable_roce", 12);
1851 nl_attr_put(nlh, DEVLINK_ATTR_PARAM_VALUE_CMODE, &cmode, sizeof(cmode));
1852 nl_attr_put(nlh, DEVLINK_ATTR_PARAM_TYPE, &ptype, sizeof(ptype));
1853 if (enable)
1854 nl_attr_put(nlh, DEVLINK_ATTR_PARAM_VALUE_DATA, NULL, 0);
1855 ret = mlx5_nl_send(nlsk_fd, nlh, sn);
1856 if (ret >= 0)
1857 ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
1858 if (ret < 0) {
1859 DRV_LOG(DEBUG, "Failed to %sable ROCE for device %s by Netlink:"
1860 " %d.", enable ? "en" : "dis", pci_addr, ret);
1861 return ret;
1862 }
1863 DRV_LOG(DEBUG, "Device %s ROCE was %sabled by Netlink successfully.",
1864 pci_addr, enable ? "en" : "dis");
1865 /* Now, need to reload the driver. */
1866 return mlx5_nl_driver_reload(nlsk_fd, family_id, pci_addr);
1867 }
1868
/**
 * Try to parse a Netlink message as a link status update.
 *
 * Only RTM_NEWLINK, RTM_DELLINK, RTM_GETLINK and RTM_SETLINK messages are
 * accepted, and only if they are long enough to carry an ifinfomsg.
 *
 * @param hdr
 *   Netlink message header.
 * @param[out] ifindex
 *   Index of the updated interface, left untouched on failure.
 *
 * @return
 *   0 on success, negative on failure.
 */
int
mlx5_nl_parse_link_status_update(struct nlmsghdr *hdr, uint32_t *ifindex)
{
	struct ifinfomsg *info;

	switch (hdr->nlmsg_type) {
	case RTM_NEWLINK:
	case RTM_DELLINK:
	case RTM_GETLINK:
	case RTM_SETLINK:
		/* Do not read the link header out of a truncated message. */
		if (hdr->nlmsg_len < NLMSG_LENGTH(sizeof(*info)))
			break;
		info = NLMSG_DATA(hdr);
		*ifindex = info->ifi_index;
		return 0;
	default:
		break;
	}
	return -1;
}
1896
1897 /**
1898 * Read pending events from a Netlink socket.
1899 *
1900 * @param nlsk_fd
1901 * Netlink socket.
1902 * @param cb
1903 * Callback invoked for each of the events.
1904 * @param cb_arg
1905 * User data for the callback.
1906 *
1907 * @return
1908 * 0 on success, including the case when there are no events.
1909 * Negative on failure and rte_errno is set.
1910 */
1911 int
mlx5_nl_read_events(int nlsk_fd,mlx5_nl_event_cb * cb,void * cb_arg)1912 mlx5_nl_read_events(int nlsk_fd, mlx5_nl_event_cb *cb, void *cb_arg)
1913 {
1914 char buf[8192];
1915 struct sockaddr_nl addr;
1916 struct iovec iov = {
1917 .iov_base = buf,
1918 .iov_len = sizeof(buf),
1919 };
1920 struct msghdr msg = {
1921 .msg_name = &addr,
1922 .msg_namelen = sizeof(addr),
1923 .msg_iov = &iov,
1924 .msg_iovlen = 1,
1925 };
1926 struct nlmsghdr *hdr;
1927 ssize_t size;
1928
1929 while (1) {
1930 size = recvmsg(nlsk_fd, &msg, MSG_DONTWAIT);
1931 if (size < 0) {
1932 if (errno == EAGAIN)
1933 return 0;
1934 if (errno == EINTR)
1935 continue;
1936 DRV_LOG(DEBUG, "Failed to receive netlink message: %s",
1937 strerror(errno));
1938 rte_errno = errno;
1939 return -rte_errno;
1940 }
1941 hdr = (struct nlmsghdr *)buf;
1942 while (size >= (ssize_t)sizeof(*hdr)) {
1943 ssize_t msg_len = hdr->nlmsg_len;
1944 ssize_t data_len = msg_len - sizeof(*hdr);
1945 ssize_t aligned_len;
1946
1947 if (data_len < 0) {
1948 DRV_LOG(DEBUG, "Netlink message too short");
1949 rte_errno = EINVAL;
1950 return -rte_errno;
1951 }
1952 aligned_len = NLMSG_ALIGN(msg_len);
1953 if (aligned_len > size) {
1954 DRV_LOG(DEBUG, "Netlink message too long");
1955 rte_errno = EINVAL;
1956 return -rte_errno;
1957 }
1958 cb(hdr, cb_arg);
1959 hdr = RTE_PTR_ADD(hdr, aligned_len);
1960 size -= aligned_len;
1961 }
1962 }
1963 return 0;
1964 }
1965