/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2019 Mellanox Technologies, Ltd
 */
#include <unistd.h>
#include <net/if.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <netinet/in.h>

#include <rte_malloc.h>
#include <rte_log.h>
#include <rte_errno.h>
#include <rte_pci.h>
#include <rte_string_fns.h>
#include <rte_kvargs.h>

#include <mlx5_glue.h>
#include <mlx5_common.h>
#include <mlx5_common_pci.h>
#include <mlx5_devx_cmds.h>
#include <mlx5_prm.h>
#include <mlx5_nl.h>

#include "mlx5_vdpa_utils.h"
#include "mlx5_vdpa.h"

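/*
 * Vhost/virtio features the driver always advertises. Hardware-dependent
 * features (packed ring, TSO, checksum offloads, VERSION_1) are added on
 * top in mlx5_vdpa_get_vdpa_features() according to the HCA capabilities.
 */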
#define MLX5_VDPA_DEFAULT_FEATURES ((1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \
			    (1ULL << VIRTIO_F_ANY_LAYOUT) | \
			    (1ULL << VIRTIO_NET_F_MQ) | \
			    (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \
			    (1ULL << VIRTIO_F_ORDER_PLATFORM) | \
			    (1ULL << VHOST_F_LOG_ALL) | \
			    (1ULL << VIRTIO_NET_F_MTU))

#define MLX5_VDPA_PROTOCOL_FEATURES \
			    ((1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ) | \
			     (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD) | \
			     (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) | \
			     (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) | \
			     (1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
			     (1ULL << VHOST_USER_PROTOCOL_F_NET_MTU) | \
			     (1ULL << VHOST_USER_PROTOCOL_F_STATUS))

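/*
 * Polling budget shared by the MTU-sync loop and by the wait for the IB
 * device to reappear after ROCE is disabled: up to MLX5_VDPA_MAX_RETRIES
 * probes, MLX5_VDPA_USEC microseconds apart.
 */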
#define MLX5_VDPA_MAX_RETRIES 20
#define MLX5_VDPA_USEC 1000
#define MLX5_VDPA_DEFAULT_NO_TRAFFIC_TIME_S 2LLU

TAILQ_HEAD(mlx5_vdpa_privs, mlx5_vdpa_priv) priv_list =
					  TAILQ_HEAD_INITIALIZER(priv_list);
static pthread_mutex_t priv_list_lock = PTHREAD_MUTEX_INITIALIZER;

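/*
 * Find the driver private structure attached to a vhost vDPA device.
 * Returns NULL and sets rte_errno to EINVAL when the device is unknown.
 */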
static struct mlx5_vdpa_priv *
mlx5_vdpa_find_priv_resource_by_vdev(struct rte_vdpa_device *vdev)
{
	struct mlx5_vdpa_priv *priv;
	int found = 0;

	pthread_mutex_lock(&priv_list_lock);
	TAILQ_FOREACH(priv, &priv_list, next) {
		if (vdev == priv->vdev) {
			found = 1;
			break;
		}
	}
	pthread_mutex_unlock(&priv_list_lock);
	if (!found) {
		DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
		rte_errno = EINVAL;
		return NULL;
	}
	return priv;
}

static int
mlx5_vdpa_get_queue_num(struct rte_vdpa_device *vdev, uint32_t *queue_num)
{
	struct mlx5_vdpa_priv *priv =
		mlx5_vdpa_find_priv_resource_by_vdev(vdev);

	if (priv == NULL) {
		DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
		return -1;
	}
	*queue_num = priv->caps.max_num_virtio_queues;
	return 0;
}

static int
mlx5_vdpa_get_vdpa_features(struct rte_vdpa_device *vdev, uint64_t *features)
{
	struct mlx5_vdpa_priv *priv =
		mlx5_vdpa_find_priv_resource_by_vdev(vdev);

	if (priv == NULL) {
		DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
		return -1;
	}
	*features = MLX5_VDPA_DEFAULT_FEATURES;
	if (priv->caps.virtio_queue_type & (1 << MLX5_VIRTQ_TYPE_PACKED))
		*features |= (1ULL << VIRTIO_F_RING_PACKED);
	if (priv->caps.tso_ipv4)
		*features |= (1ULL << VIRTIO_NET_F_HOST_TSO4);
	if (priv->caps.tso_ipv6)
		*features |= (1ULL << VIRTIO_NET_F_HOST_TSO6);
	if (priv->caps.tx_csum)
		*features |= (1ULL << VIRTIO_NET_F_CSUM);
	if (priv->caps.rx_csum)
		*features |= (1ULL << VIRTIO_NET_F_GUEST_CSUM);
	if (priv->caps.virtio_version_1_0)
		*features |= (1ULL << VIRTIO_F_VERSION_1);
	return 0;
}

static int
mlx5_vdpa_get_protocol_features(struct rte_vdpa_device *vdev,
				uint64_t *features)
{
	struct mlx5_vdpa_priv *priv =
		mlx5_vdpa_find_priv_resource_by_vdev(vdev);

	if (priv == NULL) {
		DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
		return -1;
	}
	*features = MLX5_VDPA_PROTOCOL_FEATURES;
	return 0;
}

static int
mlx5_vdpa_set_vring_state(int vid, int vring, int state)
{
	struct rte_vdpa_device *vdev = rte_vhost_get_vdpa_device(vid);
	struct mlx5_vdpa_priv *priv =
		mlx5_vdpa_find_priv_resource_by_vdev(vdev);
	int ret;

	if (priv == NULL) {
		DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
		return -EINVAL;
	}
	if (vring >= (int)priv->caps.max_num_virtio_queues * 2) {
		DRV_LOG(ERR, "Too big vring id: %d.", vring);
		return -E2BIG;
	}
	pthread_mutex_lock(&priv->vq_config_lock);
	ret = mlx5_vdpa_virtq_enable(priv, vring, state);
	pthread_mutex_unlock(&priv->vq_config_lock);
	return ret;
}

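/*
 * Apply the features negotiated by vhost. When live migration requires
 * dirty-page logging (RTE_VHOST_NEED_LOG), pass the vhost log-base mapping
 * to the device and turn hardware dirty-bitmap logging on.
 */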
static int
mlx5_vdpa_features_set(int vid)
{
	struct rte_vdpa_device *vdev = rte_vhost_get_vdpa_device(vid);
	struct mlx5_vdpa_priv *priv =
		mlx5_vdpa_find_priv_resource_by_vdev(vdev);
	uint64_t log_base, log_size;
	uint64_t features;
	int ret;

	if (priv == NULL) {
		DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
		return -EINVAL;
	}
	ret = rte_vhost_get_negotiated_features(vid, &features);
	if (ret) {
		DRV_LOG(ERR, "Failed to get negotiated features.");
		return ret;
	}
	if (RTE_VHOST_NEED_LOG(features)) {
		ret = rte_vhost_get_log_base(vid, &log_base, &log_size);
		if (ret) {
			DRV_LOG(ERR, "Failed to get log base.");
			return ret;
		}
		ret = mlx5_vdpa_dirty_bitmap_set(priv, log_base, log_size);
		if (ret) {
			DRV_LOG(ERR, "Failed to set dirty bitmap.");
			return ret;
		}
		DRV_LOG(INFO, "mlx5 vdpa: enabling dirty logging...");
		ret = mlx5_vdpa_logging_enable(priv, 1);
		if (ret) {
			DRV_LOG(ERR, "Failed to enable dirty logging.");
			return ret;
		}
	}
	return 0;
}

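/*
 * Allocate a protection domain and resolve its PD number (pdn) through
 * Direct Verbs, to be referenced later by DevX object creation. Without
 * DV support in rdma-core this is compiled out and returns -ENOTSUP.
 */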
static int
mlx5_vdpa_pd_create(struct mlx5_vdpa_priv *priv)
{
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
	priv->pd = mlx5_glue->alloc_pd(priv->ctx);
	if (priv->pd == NULL) {
		DRV_LOG(ERR, "Failed to allocate PD.");
		return errno ? -errno : -ENOMEM;
	}
	struct mlx5dv_obj obj;
	struct mlx5dv_pd pd_info;
	int ret = 0;

	obj.pd.in = priv->pd;
	obj.pd.out = &pd_info;
	ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
	if (ret) {
		DRV_LOG(ERR, "Failed to get PD object info.");
		mlx5_glue->dealloc_pd(priv->pd);
		priv->pd = NULL;
		return -errno;
	}
	priv->pdn = pd_info.pdn;
	return 0;
#else
	(void)priv;
	DRV_LOG(ERR, "Cannot get pdn - no DV support.");
	return -ENOTSUP;
#endif /* HAVE_IBV_FLOW_DV_SUPPORT */
}

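/*
 * Best-effort sync of the kernel netdev MTU with the MTU negotiated by
 * vhost: read the current MTU over an ioctl socket and, while it differs,
 * keep requesting the vhost value, up to MLX5_VDPA_MAX_RETRIES attempts.
 */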
static int
mlx5_vdpa_mtu_set(struct mlx5_vdpa_priv *priv)
{
	struct ifreq request;
	uint16_t vhost_mtu = 0;
	uint16_t kern_mtu = 0;
	int ret = rte_vhost_get_mtu(priv->vid, &vhost_mtu);
	int sock;
	int retries = MLX5_VDPA_MAX_RETRIES;

	if (ret) {
		DRV_LOG(DEBUG, "Cannot get vhost MTU - %d.", ret);
		return ret;
	}
	if (!vhost_mtu) {
		DRV_LOG(DEBUG, "Vhost MTU is 0.");
		return ret;
	}
	ret = mlx5_get_ifname_sysfs(priv->ctx->device->ibdev_path,
				    request.ifr_name);
	if (ret) {
		DRV_LOG(DEBUG, "Cannot get kernel IF name - %d.", ret);
		return ret;
	}
	sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
	if (sock == -1) {
		DRV_LOG(DEBUG, "Cannot open IF socket.");
		return sock;
	}
	while (retries--) {
		ret = ioctl(sock, SIOCGIFMTU, &request);
		if (ret == -1)
			break;
		kern_mtu = request.ifr_mtu;
		DRV_LOG(DEBUG, "MTU: current %d requested %d.", (int)kern_mtu,
			(int)vhost_mtu);
		if (kern_mtu == vhost_mtu)
			break;
		request.ifr_mtu = vhost_mtu;
		ret = ioctl(sock, SIOCSIFMTU, &request);
		if (ret == -1)
			break;
		request.ifr_mtu = 0;
		usleep(MLX5_VDPA_USEC);
	}
	close(sock);
	return kern_mtu == vhost_mtu ? 0 : -1;
}

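/*
 * Release all per-connection resources in reverse order of configuration:
 * events, steering, virtqs, event QPs, memory registrations and the PD.
 * For a configured device, the live-migration log is synchronized first
 * (mlx5_vdpa_lm_log) so the final state reaches the vhost log.
 */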
static int
mlx5_vdpa_dev_close(int vid)
{
	struct rte_vdpa_device *vdev = rte_vhost_get_vdpa_device(vid);
	struct mlx5_vdpa_priv *priv =
		mlx5_vdpa_find_priv_resource_by_vdev(vdev);
	int ret = 0;

	if (priv == NULL) {
		DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
		return -1;
	}
	if (priv->configured)
		ret |= mlx5_vdpa_lm_log(priv);
	mlx5_vdpa_err_event_unset(priv);
	mlx5_vdpa_cqe_event_unset(priv);
	mlx5_vdpa_steer_unset(priv);
	mlx5_vdpa_virtqs_release(priv);
	mlx5_vdpa_event_qp_global_release(priv);
	mlx5_vdpa_mem_dereg(priv);
	if (priv->pd) {
		claim_zero(mlx5_glue->dealloc_pd(priv->pd));
		priv->pd = NULL;
	}
	priv->configured = 0;
	priv->vid = 0;
	DRV_LOG(INFO, "vDPA device %d was closed.", vid);
	return ret;
}

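/*
 * Configure the device for a new vhost connection: (re)close any previous
 * state, sync the MTU, then set up the PD, memory registration, error
 * events, virtqs, steering and CQE events. Any failure rolls everything
 * back through mlx5_vdpa_dev_close().
 */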
static int
mlx5_vdpa_dev_config(int vid)
{
	struct rte_vdpa_device *vdev = rte_vhost_get_vdpa_device(vid);
	struct mlx5_vdpa_priv *priv =
		mlx5_vdpa_find_priv_resource_by_vdev(vdev);

	if (priv == NULL) {
		DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
		return -EINVAL;
	}
	if (priv->configured && mlx5_vdpa_dev_close(vid)) {
		DRV_LOG(ERR, "Failed to reconfigure vid %d.", vid);
		return -1;
	}
	priv->vid = vid;
	if (mlx5_vdpa_mtu_set(priv))
		DRV_LOG(WARNING, "MTU cannot be set on device %s.",
			vdev->device->name);
	if (mlx5_vdpa_pd_create(priv) || mlx5_vdpa_mem_register(priv) ||
	    mlx5_vdpa_err_event_setup(priv) ||
	    mlx5_vdpa_virtqs_prepare(priv) || mlx5_vdpa_steer_setup(priv) ||
	    mlx5_vdpa_cqe_event_setup(priv)) {
		mlx5_vdpa_dev_close(vid);
		return -1;
	}
	priv->configured = 1;
	DRV_LOG(INFO, "vDPA device %d was configured.", vid);
	return 0;
}

static int
mlx5_vdpa_get_device_fd(int vid)
{
	struct rte_vdpa_device *vdev = rte_vhost_get_vdpa_device(vid);
	struct mlx5_vdpa_priv *priv =
		mlx5_vdpa_find_priv_resource_by_vdev(vdev);

	if (priv == NULL) {
		DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
		return -EINVAL;
	}
	return priv->ctx->cmd_fd;
}

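/*
 * Expose the device area (VAR) that vhost mmaps for queue notification
 * doorbells. A single VAR, allocated once at probe time, backs all the
 * queues, hence qid is unused here.
 */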
static int
mlx5_vdpa_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
{
	struct rte_vdpa_device *vdev = rte_vhost_get_vdpa_device(vid);
	struct mlx5_vdpa_priv *priv =
		mlx5_vdpa_find_priv_resource_by_vdev(vdev);

	RTE_SET_USED(qid);
	if (priv == NULL) {
		DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
		return -EINVAL;
	}
	if (!priv->var) {
		DRV_LOG(ERR, "VAR was not created for device %s, is the "
			"device configured?", vdev->device->name);
		return -EINVAL;
	}
	*offset = priv->var->mmap_off;
	*size = priv->var->length;
	return 0;
}

static int
mlx5_vdpa_get_stats_names(struct rte_vdpa_device *vdev,
			  struct rte_vdpa_stat_name *stats_names,
			  unsigned int size)
{
	static const char *mlx5_vdpa_stats_names[MLX5_VDPA_STATS_MAX] = {
		"received_descriptors",
		"completed_descriptors",
		"bad descriptor errors",
		"exceed max chain",
		"invalid buffer",
		"completion errors",
	};
	struct mlx5_vdpa_priv *priv =
		mlx5_vdpa_find_priv_resource_by_vdev(vdev);
	unsigned int i;

	if (priv == NULL) {
		DRV_LOG(ERR, "Invalid device: %s.", vdev->device->name);
		return -ENODEV;
	}
	if (!stats_names)
		return MLX5_VDPA_STATS_MAX;
	size = RTE_MIN(size, (unsigned int)MLX5_VDPA_STATS_MAX);
	for (i = 0; i < size; ++i)
		strlcpy(stats_names[i].name, mlx5_vdpa_stats_names[i],
			RTE_VDPA_STATS_NAME_SIZE);
	return size;
}

static int
mlx5_vdpa_get_stats(struct rte_vdpa_device *vdev, int qid,
		    struct rte_vdpa_stat *stats, unsigned int n)
{
	struct mlx5_vdpa_priv *priv =
		mlx5_vdpa_find_priv_resource_by_vdev(vdev);

	if (priv == NULL) {
		DRV_LOG(ERR, "Invalid device: %s.", vdev->device->name);
		return -ENODEV;
	}
	if (!priv->configured) {
		DRV_LOG(ERR, "Device %s was not configured.",
			vdev->device->name);
		return -ENODATA;
	}
	if (qid >= (int)priv->nr_virtqs) {
		DRV_LOG(ERR, "Too big vring id: %d for device %s.", qid,
			vdev->device->name);
		return -E2BIG;
	}
	if (!priv->caps.queue_counters_valid) {
		DRV_LOG(ERR, "Virtq statistics are not supported for device %s.",
			vdev->device->name);
		return -ENOTSUP;
	}
	return mlx5_vdpa_virtq_stats_get(priv, qid, stats, n);
}

static int
mlx5_vdpa_reset_stats(struct rte_vdpa_device *vdev, int qid)
{
	struct mlx5_vdpa_priv *priv =
		mlx5_vdpa_find_priv_resource_by_vdev(vdev);

	if (priv == NULL) {
		DRV_LOG(ERR, "Invalid device: %s.", vdev->device->name);
		return -ENODEV;
	}
	if (!priv->configured) {
		DRV_LOG(ERR, "Device %s was not configured.",
			vdev->device->name);
		return -ENODATA;
	}
	if (qid >= (int)priv->nr_virtqs) {
		DRV_LOG(ERR, "Too big vring id: %d for device %s.", qid,
			vdev->device->name);
		return -E2BIG;
	}
	if (!priv->caps.queue_counters_valid) {
		DRV_LOG(ERR, "Virtq statistics are not supported for device %s.",
			vdev->device->name);
		return -ENOTSUP;
	}
	return mlx5_vdpa_virtq_stats_reset(priv, qid);
}

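/* vDPA driver callbacks registered to the vhost library at probe time. */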
static struct rte_vdpa_dev_ops mlx5_vdpa_ops = {
	.get_queue_num = mlx5_vdpa_get_queue_num,
	.get_features = mlx5_vdpa_get_vdpa_features,
	.get_protocol_features = mlx5_vdpa_get_protocol_features,
	.dev_conf = mlx5_vdpa_dev_config,
	.dev_close = mlx5_vdpa_dev_close,
	.set_vring_state = mlx5_vdpa_set_vring_state,
	.set_features = mlx5_vdpa_features_set,
	.migration_done = NULL,
	.get_vfio_group_fd = NULL,
	.get_vfio_device_fd = mlx5_vdpa_get_device_fd,
	.get_notify_area = mlx5_vdpa_get_notify_area,
	.get_stats_names = mlx5_vdpa_get_stats_names,
	.get_stats = mlx5_vdpa_get_stats,
	.reset_stats = mlx5_vdpa_reset_stats,
};

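/*
 * Scan the verbs device list and return the IB device whose sysfs path
 * resolves to the given PCI address, or NULL with rte_errno set.
 */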
static struct ibv_device *
mlx5_vdpa_get_ib_device_match(struct rte_pci_addr *addr)
{
	int n;
	struct ibv_device **ibv_list = mlx5_glue->get_device_list(&n);
	struct ibv_device *ibv_match = NULL;

	if (!ibv_list) {
		rte_errno = ENOSYS;
		return NULL;
	}
	while (n-- > 0) {
		struct rte_pci_addr pci_addr;

		DRV_LOG(DEBUG, "Checking device \"%s\".", ibv_list[n]->name);
		if (mlx5_dev_to_pci_addr(ibv_list[n]->ibdev_path, &pci_addr))
			continue;
		if (rte_pci_addr_cmp(addr, &pci_addr))
			continue;
		ibv_match = ibv_list[n];
		break;
	}
	if (!ibv_match)
		rte_errno = ENOENT;
	mlx5_glue->free_device_list(ibv_list);
	return ibv_match;
}

/* Try to disable ROCE by Netlink/Devlink. */
static int
mlx5_vdpa_nl_roce_disable(const char *addr)
{
	int nlsk_fd = mlx5_nl_init(NETLINK_GENERIC);
	int devlink_id;
	int enable;
	int ret;

	if (nlsk_fd < 0)
		return nlsk_fd;
	devlink_id = mlx5_nl_devlink_family_id_get(nlsk_fd);
	if (devlink_id < 0) {
		ret = devlink_id;
		DRV_LOG(DEBUG, "Failed to get devlink id for ROCE operations "
			"by Netlink.");
		goto close;
	}
	ret = mlx5_nl_enable_roce_get(nlsk_fd, devlink_id, addr, &enable);
	if (ret) {
		DRV_LOG(DEBUG, "Failed to get ROCE enable by Netlink: %d.",
			ret);
		goto close;
	} else if (!enable) {
		DRV_LOG(INFO, "ROCE is already disabled (Netlink).");
		goto close;
	}
	ret = mlx5_nl_enable_roce_set(nlsk_fd, devlink_id, addr, 0);
	if (ret)
		DRV_LOG(DEBUG, "Failed to disable ROCE by Netlink: %d.", ret);
	else
		DRV_LOG(INFO, "ROCE was disabled by Netlink successfully.");
close:
	close(nlsk_fd);
	return ret;
}

/* Try to disable ROCE by sysfs. */
static int
mlx5_vdpa_sys_roce_disable(const char *addr)
{
	FILE *file_o;
	int enable;
	int ret;

	MKSTR(file_p, "/sys/bus/pci/devices/%s/roce_enable", addr);
	file_o = fopen(file_p, "rb");
	if (!file_o) {
		rte_errno = ENOTSUP;
		return -ENOTSUP;
	}
	ret = fscanf(file_o, "%d", &enable);
	if (ret != 1) {
		rte_errno = EINVAL;
		ret = -EINVAL;
		goto close;
	} else if (!enable) {
		ret = 0;
		DRV_LOG(INFO, "ROCE is already disabled (sysfs).");
		goto close;
	}
	fclose(file_o);
	file_o = fopen(file_p, "wb");
	if (!file_o) {
		rte_errno = ENOTSUP;
		return -ENOTSUP;
	}
	fprintf(file_o, "0\n");
	ret = 0;
close:
	if (ret)
		DRV_LOG(DEBUG, "Failed to disable ROCE by sysfs: %d.", ret);
	else
		DRV_LOG(INFO, "ROCE was disabled by sysfs successfully.");
	fclose(file_o);
	return ret;
}

static int
mlx5_vdpa_roce_disable(struct rte_pci_addr *addr, struct ibv_device **ibv)
{
	char addr_name[64] = {0};

	rte_pci_device_name(addr, addr_name, sizeof(addr_name));
	/* First try to disable ROCE by Netlink, then fall back to sysfs. */
	if (mlx5_vdpa_nl_roce_disable(addr_name) == 0 ||
	    mlx5_vdpa_sys_roce_disable(addr_name) == 0) {
		/*
		 * ROCE was disabled successfully, wait for the IB device to
		 * appear again after reload.
		 */
		int r;
		struct ibv_device *ibv_new;

		for (r = MLX5_VDPA_MAX_RETRIES; r; r--) {
			ibv_new = mlx5_vdpa_get_ib_device_match(addr);
			if (ibv_new) {
				*ibv = ibv_new;
				return 0;
			}
			usleep(MLX5_VDPA_USEC);
		}
		DRV_LOG(ERR, "Cannot match device %s after ROCE disable, "
			"%d retries exceeded.", addr_name,
			MLX5_VDPA_MAX_RETRIES);
		rte_errno = EAGAIN;
	}
	return -rte_errno;
}

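/* Parse a single "key=value" devargs pair into the private structure. */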
static int
mlx5_vdpa_args_check_handler(const char *key, const char *val, void *opaque)
{
	struct mlx5_vdpa_priv *priv = opaque;
	unsigned long tmp;

	if (strcmp(key, "class") == 0)
		return 0;
	errno = 0;
	tmp = strtoul(val, NULL, 0);
	if (errno) {
		DRV_LOG(WARNING, "%s: \"%s\" is an invalid integer.", key,
			val);
		return -errno;
	}
	if (strcmp(key, "event_mode") == 0) {
		if (tmp <= MLX5_VDPA_EVENT_MODE_ONLY_INTERRUPT)
			priv->event_mode = (int)tmp;
		else
			DRV_LOG(WARNING, "Invalid event_mode %s.", val);
	} else if (strcmp(key, "event_us") == 0) {
		priv->event_us = (uint32_t)tmp;
	} else if (strcmp(key, "no_traffic_time") == 0) {
		priv->no_traffic_time_s = (uint32_t)tmp;
	} else {
		DRV_LOG(WARNING, "Invalid key %s.", key);
	}
	return 0;
}

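/*
 * Fill the driver tunables from devargs and apply defaults: dynamic-timer
 * event mode, MLX5_VDPA_DEFAULT_NO_TRAFFIC_TIME_S for the no-traffic time
 * and a mode-dependent default for event_us. A hypothetical devargs
 * string combining all the supported keys could look like:
 *   class=vdpa,event_mode=1,event_us=500,no_traffic_time=5
 */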
static void
mlx5_vdpa_config_get(struct rte_devargs *devargs, struct mlx5_vdpa_priv *priv)
{
	struct rte_kvargs *kvlist;

	priv->event_mode = MLX5_VDPA_EVENT_MODE_DYNAMIC_TIMER;
	priv->event_us = 0;
	priv->no_traffic_time_s = MLX5_VDPA_DEFAULT_NO_TRAFFIC_TIME_S;
	if (devargs == NULL)
		return;
	kvlist = rte_kvargs_parse(devargs->args, NULL);
	if (kvlist == NULL)
		return;
	rte_kvargs_process(kvlist, NULL, mlx5_vdpa_args_check_handler, priv);
	rte_kvargs_free(kvlist);
	if (!priv->event_us) {
		if (priv->event_mode == MLX5_VDPA_EVENT_MODE_DYNAMIC_TIMER)
			priv->event_us = MLX5_VDPA_DEFAULT_TIMER_STEP_US;
		else if (priv->event_mode == MLX5_VDPA_EVENT_MODE_FIXED_TIMER)
			priv->event_us = MLX5_VDPA_DEFAULT_TIMER_DELAY_US;
	}
	DRV_LOG(DEBUG, "event mode is %d.", priv->event_mode);
	DRV_LOG(DEBUG, "event_us is %u us.", priv->event_us);
	DRV_LOG(DEBUG, "no traffic time is %u s.", priv->no_traffic_time_s);
}

/**
 * DPDK callback to register a mlx5 PCI device.
 *
 * This function spawns a vDPA device out of a given PCI device.
 *
 * @param[in] pci_drv
 *   PCI driver structure (mlx5_vdpa_driver).
 * @param[in] pci_dev
 *   PCI device information.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_vdpa_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
		    struct rte_pci_device *pci_dev __rte_unused)
{
	struct ibv_device *ibv;
	struct mlx5_vdpa_priv *priv = NULL;
	struct ibv_context *ctx = NULL;
	struct mlx5_hca_attr attr;
	int ret;

	ibv = mlx5_vdpa_get_ib_device_match(&pci_dev->addr);
	if (!ibv) {
		DRV_LOG(ERR, "No matching IB device for PCI slot "
			PCI_PRI_FMT ".", pci_dev->addr.domain,
			pci_dev->addr.bus, pci_dev->addr.devid,
			pci_dev->addr.function);
		return -rte_errno;
	} else {
		DRV_LOG(INFO, "PCI information matches for device \"%s\".",
			ibv->name);
	}
	if (mlx5_vdpa_roce_disable(&pci_dev->addr, &ibv) != 0) {
		DRV_LOG(WARNING, "Failed to disable ROCE for \"%s\".",
			ibv->name);
		return -rte_errno;
	}
	ctx = mlx5_glue->dv_open_device(ibv);
	if (!ctx) {
		DRV_LOG(ERR, "Failed to open IB device \"%s\".", ibv->name);
		rte_errno = ENODEV;
		return -rte_errno;
	}
	ret = mlx5_devx_cmd_query_hca_attr(ctx, &attr);
	if (ret) {
		DRV_LOG(ERR, "Unable to read HCA capabilities.");
		rte_errno = ENOTSUP;
		goto error;
	} else if (!attr.vdpa.valid || !attr.vdpa.max_num_virtio_queues) {
		DRV_LOG(ERR, "Not enough capabilities to support vdpa, maybe "
			"old FW/OFED version?");
		rte_errno = ENOTSUP;
		goto error;
	}
	if (!attr.vdpa.queue_counters_valid)
		DRV_LOG(DEBUG, "No capability to support virtq statistics.");
	priv = rte_zmalloc("mlx5 vDPA device private", sizeof(*priv) +
			   sizeof(struct mlx5_vdpa_virtq) *
			   attr.vdpa.max_num_virtio_queues * 2,
			   RTE_CACHE_LINE_SIZE);
	if (!priv) {
		DRV_LOG(ERR, "Failed to allocate private memory.");
		rte_errno = ENOMEM;
		goto error;
	}
	priv->caps = attr.vdpa;
	priv->log_max_rqt_size = attr.log_max_rqt_size;
	priv->num_lag_ports = attr.num_lag_ports;
	if (attr.num_lag_ports == 0)
		priv->num_lag_ports = 1;
	priv->ctx = ctx;
	priv->pci_dev = pci_dev;
	priv->var = mlx5_glue->dv_alloc_var(ctx, 0);
	if (!priv->var) {
		DRV_LOG(ERR, "Failed to allocate VAR %u.", errno);
		goto error;
	}
	priv->vdev = rte_vdpa_register_device(&pci_dev->device,
					      &mlx5_vdpa_ops);
	if (priv->vdev == NULL) {
		DRV_LOG(ERR, "Failed to register vDPA device.");
		rte_errno = rte_errno ? rte_errno : EINVAL;
		goto error;
	}
	mlx5_vdpa_config_get(pci_dev->device.devargs, priv);
	SLIST_INIT(&priv->mr_list);
	pthread_mutex_init(&priv->vq_config_lock, NULL);
	pthread_mutex_lock(&priv_list_lock);
	TAILQ_INSERT_TAIL(&priv_list, priv, next);
	pthread_mutex_unlock(&priv_list_lock);
	return 0;

error:
	if (priv) {
		if (priv->var)
			mlx5_glue->dv_free_var(priv->var);
		rte_free(priv);
	}
	if (ctx)
		mlx5_glue->close_device(ctx);
	return -rte_errno;
}

/**
 * DPDK callback to remove a PCI device.
 *
 * This function removes all vDPA devices belonging to a given PCI device.
 *
 * @param[in] pci_dev
 *   Pointer to the PCI device.
 *
 * @return
 *   0 on success, the function cannot fail.
 */
static int
mlx5_vdpa_pci_remove(struct rte_pci_device *pci_dev)
{
	struct mlx5_vdpa_priv *priv = NULL;
	int found = 0;

	pthread_mutex_lock(&priv_list_lock);
	TAILQ_FOREACH(priv, &priv_list, next) {
		if (!rte_pci_addr_cmp(&priv->pci_dev->addr, &pci_dev->addr)) {
			found = 1;
			break;
		}
	}
	if (found)
		TAILQ_REMOVE(&priv_list, priv, next);
	pthread_mutex_unlock(&priv_list_lock);
	if (found) {
		if (priv->configured)
			mlx5_vdpa_dev_close(priv->vid);
		if (priv->var) {
			mlx5_glue->dv_free_var(priv->var);
			priv->var = NULL;
		}
		mlx5_glue->close_device(priv->ctx);
		pthread_mutex_destroy(&priv->vq_config_lock);
		rte_free(priv);
	}
	return 0;
}

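/*
 * PCI IDs of the NICs this vDPA driver can probe: ConnectX-6 and newer,
 * including their VFs and BlueField variants.
 */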
static const struct rte_pci_id mlx5_vdpa_pci_id_map[] = {
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX6)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX6VF)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX6DX)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTXVF)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX6DXBF)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX7)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
			       PCI_DEVICE_ID_MELLANOX_CONNECTX7BF)
	},
	{
		.vendor_id = 0
	}
};

static struct mlx5_pci_driver mlx5_vdpa_driver = {
	.driver_class = MLX5_CLASS_VDPA,
	.pci_driver = {
		.driver = {
			.name = "mlx5_vdpa",
		},
		.id_table = mlx5_vdpa_pci_id_map,
		.probe = mlx5_vdpa_pci_probe,
		.remove = mlx5_vdpa_pci_remove,
		.drv_flags = 0,
	},
};

RTE_LOG_REGISTER(mlx5_vdpa_logtype, pmd.vdpa.mlx5, NOTICE)

/**
 * Driver initialization routine.
 */
RTE_INIT(rte_mlx5_vdpa_init)
{
	mlx5_common_init();
	if (mlx5_glue)
		mlx5_pci_driver_register(&mlx5_vdpa_driver);
}

RTE_PMD_EXPORT_NAME(net_mlx5_vdpa, __COUNTER__);
RTE_PMD_REGISTER_PCI_TABLE(net_mlx5_vdpa, mlx5_vdpa_pci_id_map);
RTE_PMD_REGISTER_KMOD_DEP(net_mlx5_vdpa, "* ib_uverbs & mlx5_core & mlx5_ib");