xref: /f-stack/dpdk/drivers/net/mlx4/mlx4.c (revision 2d9fd380)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2012 6WIND S.A.
3  * Copyright 2012 Mellanox Technologies, Ltd
4  */
5 
6 /**
7  * @file
8  * mlx4 driver initialization.
9  */
10 
11 #include <errno.h>
12 #include <inttypes.h>
13 #include <stddef.h>
14 #include <stdint.h>
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <sys/mman.h>
19 #include <unistd.h>
20 #ifdef RTE_IBVERBS_LINK_DLOPEN
21 #include <dlfcn.h>
22 #endif
23 
24 /* Verbs headers do not support -pedantic. */
25 #ifdef PEDANTIC
26 #pragma GCC diagnostic ignored "-Wpedantic"
27 #endif
28 #include <infiniband/verbs.h>
29 #ifdef PEDANTIC
30 #pragma GCC diagnostic error "-Wpedantic"
31 #endif
32 
33 #include <rte_common.h>
34 #include <rte_dev.h>
35 #include <rte_errno.h>
36 #include <rte_ethdev_driver.h>
37 #include <rte_ethdev_pci.h>
38 #include <rte_ether.h>
39 #include <rte_flow.h>
40 #include <rte_interrupts.h>
41 #include <rte_kvargs.h>
42 #include <rte_malloc.h>
43 #include <rte_mbuf.h>
44 
45 #include "mlx4.h"
46 #include "mlx4_glue.h"
47 #include "mlx4_flow.h"
48 #include "mlx4_mr.h"
49 #include "mlx4_rxtx.h"
50 #include "mlx4_utils.h"
51 
#ifdef MLX4_GLUE
/*
 * Table of Verbs entry points. This file reaches the Verbs library only
 * through this pointer (mlx4_glue->open_device(), mlx4_glue->alloc_pd(), ...).
 * NOTE(review): it is not assigned in the visible part of this file; confirm
 * where it gets set before the first probe.
 */
const struct mlx4_glue *mlx4_glue;
#endif

/*
 * Name of the memzone backing mlx4_shared_data: reserved under this name by
 * the primary process and looked up by secondary processes.
 */
static const char *MZ_MLX4_PMD_SHARED_DATA = "mlx4_pmd_shared_data";

/* Shared memory between primary and secondary processes. */
struct mlx4_shared_data *mlx4_shared_data;

/* Spinlock for mlx4_shared_data allocation. */
static rte_spinlock_t mlx4_shared_data_lock = RTE_SPINLOCK_INITIALIZER;

/* Process local data for secondary processes. */
static struct mlx4_local_data mlx4_local_data;
66 
/** Configuration structure for device arguments. */
struct mlx4_conf {
	struct {
		uint32_t present; /**< Bit-field for existing ports. */
		uint32_t enabled; /**< Bit-field for user-enabled ports. */
	} ports;
	int mr_ext_memseg_en;
	/**< Whether memseg should be extended for MR creation. */
};
76 
/*
 * Available parameters list.
 *
 * NULL-terminated; passed to rte_kvargs_parse() as the set of accepted keys
 * and iterated by mlx4_args() to process each key in turn.
 */
const char *pmd_mlx4_init_params[] = {
	MLX4_PMD_PORT_KVARG,
	MLX4_MR_EXT_MEMSEG_EN_KVARG,
	NULL,
};

/* Forward declaration: mlx4_dev_start() calls it on its error path. */
static int mlx4_dev_stop(struct rte_eth_dev *dev);
85 
86 /**
87  * Initialize shared data between primary and secondary process.
88  *
89  * A memzone is reserved by primary process and secondary processes attach to
90  * the memzone.
91  *
92  * @return
93  *   0 on success, a negative errno value otherwise and rte_errno is set.
94  */
95 static int
mlx4_init_shared_data(void)96 mlx4_init_shared_data(void)
97 {
98 	const struct rte_memzone *mz;
99 	int ret = 0;
100 
101 	rte_spinlock_lock(&mlx4_shared_data_lock);
102 	if (mlx4_shared_data == NULL) {
103 		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
104 			/* Allocate shared memory. */
105 			mz = rte_memzone_reserve(MZ_MLX4_PMD_SHARED_DATA,
106 						 sizeof(*mlx4_shared_data),
107 						 SOCKET_ID_ANY, 0);
108 			if (mz == NULL) {
109 				ERROR("Cannot allocate mlx4 shared data\n");
110 				ret = -rte_errno;
111 				goto error;
112 			}
113 			mlx4_shared_data = mz->addr;
114 			memset(mlx4_shared_data, 0, sizeof(*mlx4_shared_data));
115 			rte_spinlock_init(&mlx4_shared_data->lock);
116 		} else {
117 			/* Lookup allocated shared memory. */
118 			mz = rte_memzone_lookup(MZ_MLX4_PMD_SHARED_DATA);
119 			if (mz == NULL) {
120 				ERROR("Cannot attach mlx4 shared data\n");
121 				ret = -rte_errno;
122 				goto error;
123 			}
124 			mlx4_shared_data = mz->addr;
125 			memset(&mlx4_local_data, 0, sizeof(mlx4_local_data));
126 		}
127 	}
128 error:
129 	rte_spinlock_unlock(&mlx4_shared_data_lock);
130 	return ret;
131 }
132 
133 #ifdef HAVE_IBV_MLX4_BUF_ALLOCATORS
134 /**
135  * Verbs callback to allocate a memory. This function should allocate the space
136  * according to the size provided residing inside a huge page.
137  * Please note that all allocation must respect the alignment from libmlx4
138  * (i.e. currently sysconf(_SC_PAGESIZE)).
139  *
140  * @param[in] size
141  *   The size in bytes of the memory to allocate.
142  * @param[in] data
143  *   A pointer to the callback data.
144  *
145  * @return
146  *   Allocated buffer, NULL otherwise and rte_errno is set.
147  */
148 static void *
mlx4_alloc_verbs_buf(size_t size,void * data)149 mlx4_alloc_verbs_buf(size_t size, void *data)
150 {
151 	struct mlx4_priv *priv = data;
152 	void *ret;
153 	size_t alignment = sysconf(_SC_PAGESIZE);
154 	unsigned int socket = SOCKET_ID_ANY;
155 
156 	if (priv->verbs_alloc_ctx.type == MLX4_VERBS_ALLOC_TYPE_TX_QUEUE) {
157 		const struct txq *txq = priv->verbs_alloc_ctx.obj;
158 
159 		socket = txq->socket;
160 	} else if (priv->verbs_alloc_ctx.type ==
161 		   MLX4_VERBS_ALLOC_TYPE_RX_QUEUE) {
162 		const struct rxq *rxq = priv->verbs_alloc_ctx.obj;
163 
164 		socket = rxq->socket;
165 	}
166 	MLX4_ASSERT(data != NULL);
167 	ret = rte_malloc_socket(__func__, size, alignment, socket);
168 	if (!ret && size)
169 		rte_errno = ENOMEM;
170 	return ret;
171 }
172 
173 /**
174  * Verbs callback to free a memory.
175  *
176  * @param[in] ptr
177  *   A pointer to the memory to free.
178  * @param[in] data
179  *   A pointer to the callback data.
180  */
181 static void
mlx4_free_verbs_buf(void * ptr,void * data __rte_unused)182 mlx4_free_verbs_buf(void *ptr, void *data __rte_unused)
183 {
184 	MLX4_ASSERT(data != NULL);
185 	rte_free(ptr);
186 }
187 #endif
188 
189 /**
190  * Initialize process private data structure.
191  *
192  * @param dev
193  *   Pointer to Ethernet device structure.
194  *
195  * @return
196  *   0 on success, a negative errno value otherwise and rte_errno is set.
197  */
198 static int
mlx4_proc_priv_init(struct rte_eth_dev * dev)199 mlx4_proc_priv_init(struct rte_eth_dev *dev)
200 {
201 	struct mlx4_proc_priv *ppriv;
202 	size_t ppriv_size;
203 
204 	/*
205 	 * UAR register table follows the process private structure. BlueFlame
206 	 * registers for Tx queues are stored in the table.
207 	 */
208 	ppriv_size = sizeof(struct mlx4_proc_priv) +
209 		     dev->data->nb_tx_queues * sizeof(void *);
210 	ppriv = rte_malloc_socket("mlx4_proc_priv", ppriv_size,
211 				  RTE_CACHE_LINE_SIZE, dev->device->numa_node);
212 	if (!ppriv) {
213 		rte_errno = ENOMEM;
214 		return -rte_errno;
215 	}
216 	ppriv->uar_table_sz = ppriv_size;
217 	dev->process_private = ppriv;
218 	return 0;
219 }
220 
221 /**
222  * Un-initialize process private data structure.
223  *
224  * @param dev
225  *   Pointer to Ethernet device structure.
226  */
227 static void
mlx4_proc_priv_uninit(struct rte_eth_dev * dev)228 mlx4_proc_priv_uninit(struct rte_eth_dev *dev)
229 {
230 	if (!dev->process_private)
231 		return;
232 	rte_free(dev->process_private);
233 	dev->process_private = NULL;
234 }
235 
236 /**
237  * DPDK callback for Ethernet device configuration.
238  *
239  * @param dev
240  *   Pointer to Ethernet device structure.
241  *
242  * @return
243  *   0 on success, negative errno value otherwise and rte_errno is set.
244  */
245 static int
mlx4_dev_configure(struct rte_eth_dev * dev)246 mlx4_dev_configure(struct rte_eth_dev *dev)
247 {
248 	struct mlx4_priv *priv = dev->data->dev_private;
249 	struct rte_flow_error error;
250 	int ret;
251 
252 	/* Prepare internal flow rules. */
253 	ret = mlx4_flow_sync(priv, &error);
254 	if (ret) {
255 		ERROR("cannot set up internal flow rules (code %d, \"%s\"),"
256 		      " flow error type %d, cause %p, message: %s",
257 		      -ret, strerror(-ret), error.type, error.cause,
258 		      error.message ? error.message : "(unspecified)");
259 		goto exit;
260 	}
261 	ret = mlx4_intr_install(priv);
262 	if (ret) {
263 		ERROR("%p: interrupt handler installation failed",
264 		      (void *)dev);
265 		goto exit;
266 	}
267 	ret = mlx4_proc_priv_init(dev);
268 	if (ret) {
269 		ERROR("%p: process private data allocation failed",
270 		      (void *)dev);
271 		goto exit;
272 	}
273 exit:
274 	return ret;
275 }
276 
277 /**
278  * DPDK callback to start the device.
279  *
280  * Simulate device start by initializing common RSS resources and attaching
281  * all configured flows.
282  *
283  * @param dev
284  *   Pointer to Ethernet device structure.
285  *
286  * @return
287  *   0 on success, negative errno value otherwise and rte_errno is set.
288  */
289 static int
mlx4_dev_start(struct rte_eth_dev * dev)290 mlx4_dev_start(struct rte_eth_dev *dev)
291 {
292 	struct mlx4_priv *priv = dev->data->dev_private;
293 	struct rte_flow_error error;
294 	int ret;
295 
296 	if (priv->started)
297 		return 0;
298 	DEBUG("%p: attaching configured flows to all RX queues", (void *)dev);
299 	priv->started = 1;
300 	ret = mlx4_rss_init(priv);
301 	if (ret) {
302 		ERROR("%p: cannot initialize RSS resources: %s",
303 		      (void *)dev, strerror(-ret));
304 		goto err;
305 	}
306 #ifdef RTE_LIBRTE_MLX4_DEBUG
307 	mlx4_mr_dump_dev(dev);
308 #endif
309 	ret = mlx4_rxq_intr_enable(priv);
310 	if (ret) {
311 		ERROR("%p: interrupt handler installation failed",
312 		     (void *)dev);
313 		goto err;
314 	}
315 	ret = mlx4_flow_sync(priv, &error);
316 	if (ret) {
317 		ERROR("%p: cannot attach flow rules (code %d, \"%s\"),"
318 		      " flow error type %d, cause %p, message: %s",
319 		      (void *)dev,
320 		      -ret, strerror(-ret), error.type, error.cause,
321 		      error.message ? error.message : "(unspecified)");
322 		goto err;
323 	}
324 	rte_wmb();
325 	dev->tx_pkt_burst = mlx4_tx_burst;
326 	dev->rx_pkt_burst = mlx4_rx_burst;
327 	/* Enable datapath on secondary process. */
328 	mlx4_mp_req_start_rxtx(dev);
329 	return 0;
330 err:
331 	mlx4_dev_stop(dev);
332 	return ret;
333 }
334 
335 /**
336  * DPDK callback to stop the device.
337  *
338  * Simulate device stop by detaching all configured flows.
339  *
340  * @param dev
341  *   Pointer to Ethernet device structure.
342  */
343 static int
mlx4_dev_stop(struct rte_eth_dev * dev)344 mlx4_dev_stop(struct rte_eth_dev *dev)
345 {
346 	struct mlx4_priv *priv = dev->data->dev_private;
347 
348 	if (!priv->started)
349 		return 0;
350 	DEBUG("%p: detaching flows from all RX queues", (void *)dev);
351 	priv->started = 0;
352 	dev->tx_pkt_burst = mlx4_tx_burst_removed;
353 	dev->rx_pkt_burst = mlx4_rx_burst_removed;
354 	rte_wmb();
355 	/* Disable datapath on secondary process. */
356 	mlx4_mp_req_stop_rxtx(dev);
357 	mlx4_flow_sync(priv, NULL);
358 	mlx4_rxq_intr_disable(priv);
359 	mlx4_rss_deinit(priv);
360 
361 	return 0;
362 }
363 
364 /**
365  * DPDK callback to close the device.
366  *
367  * Destroy all queues and objects, free memory.
368  *
369  * @param dev
370  *   Pointer to Ethernet device structure.
371  */
372 static int
mlx4_dev_close(struct rte_eth_dev * dev)373 mlx4_dev_close(struct rte_eth_dev *dev)
374 {
375 	struct mlx4_priv *priv = dev->data->dev_private;
376 	unsigned int i;
377 
378 	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
379 		return 0;
380 	DEBUG("%p: closing device \"%s\"",
381 	      (void *)dev,
382 	      ((priv->ctx != NULL) ? priv->ctx->device->name : ""));
383 	dev->rx_pkt_burst = mlx4_rx_burst_removed;
384 	dev->tx_pkt_burst = mlx4_tx_burst_removed;
385 	rte_wmb();
386 	/* Disable datapath on secondary process. */
387 	mlx4_mp_req_stop_rxtx(dev);
388 	mlx4_flow_clean(priv);
389 	mlx4_rss_deinit(priv);
390 	for (i = 0; i != dev->data->nb_rx_queues; ++i)
391 		mlx4_rx_queue_release(dev->data->rx_queues[i]);
392 	for (i = 0; i != dev->data->nb_tx_queues; ++i)
393 		mlx4_tx_queue_release(dev->data->tx_queues[i]);
394 	mlx4_proc_priv_uninit(dev);
395 	mlx4_mr_release(dev);
396 	if (priv->pd != NULL) {
397 		MLX4_ASSERT(priv->ctx != NULL);
398 		claim_zero(mlx4_glue->dealloc_pd(priv->pd));
399 		claim_zero(mlx4_glue->close_device(priv->ctx));
400 	} else
401 		MLX4_ASSERT(priv->ctx == NULL);
402 	mlx4_intr_uninstall(priv);
403 	memset(priv, 0, sizeof(*priv));
404 	/* mac_addrs must not be freed because part of dev_private */
405 	dev->data->mac_addrs = NULL;
406 	return 0;
407 }
408 
409 static const struct eth_dev_ops mlx4_dev_ops = {
410 	.dev_configure = mlx4_dev_configure,
411 	.dev_start = mlx4_dev_start,
412 	.dev_stop = mlx4_dev_stop,
413 	.dev_set_link_down = mlx4_dev_set_link_down,
414 	.dev_set_link_up = mlx4_dev_set_link_up,
415 	.dev_close = mlx4_dev_close,
416 	.link_update = mlx4_link_update,
417 	.promiscuous_enable = mlx4_promiscuous_enable,
418 	.promiscuous_disable = mlx4_promiscuous_disable,
419 	.allmulticast_enable = mlx4_allmulticast_enable,
420 	.allmulticast_disable = mlx4_allmulticast_disable,
421 	.mac_addr_remove = mlx4_mac_addr_remove,
422 	.mac_addr_add = mlx4_mac_addr_add,
423 	.mac_addr_set = mlx4_mac_addr_set,
424 	.set_mc_addr_list = mlx4_set_mc_addr_list,
425 	.stats_get = mlx4_stats_get,
426 	.stats_reset = mlx4_stats_reset,
427 	.fw_version_get = mlx4_fw_version_get,
428 	.dev_infos_get = mlx4_dev_infos_get,
429 	.dev_supported_ptypes_get = mlx4_dev_supported_ptypes_get,
430 	.vlan_filter_set = mlx4_vlan_filter_set,
431 	.rx_queue_setup = mlx4_rx_queue_setup,
432 	.tx_queue_setup = mlx4_tx_queue_setup,
433 	.rx_queue_release = mlx4_rx_queue_release,
434 	.tx_queue_release = mlx4_tx_queue_release,
435 	.flow_ctrl_get = mlx4_flow_ctrl_get,
436 	.flow_ctrl_set = mlx4_flow_ctrl_set,
437 	.mtu_set = mlx4_mtu_set,
438 	.filter_ctrl = mlx4_filter_ctrl,
439 	.rx_queue_intr_enable = mlx4_rx_intr_enable,
440 	.rx_queue_intr_disable = mlx4_rx_intr_disable,
441 	.is_removed = mlx4_is_removed,
442 };
443 
444 /* Available operations from secondary process. */
445 static const struct eth_dev_ops mlx4_dev_sec_ops = {
446 	.stats_get = mlx4_stats_get,
447 	.stats_reset = mlx4_stats_reset,
448 	.fw_version_get = mlx4_fw_version_get,
449 	.dev_infos_get = mlx4_dev_infos_get,
450 };
451 
452 /**
453  * Get PCI information from struct ibv_device.
454  *
455  * @param device
456  *   Pointer to Ethernet device structure.
457  * @param[out] pci_addr
458  *   PCI bus address output buffer.
459  *
460  * @return
461  *   0 on success, negative errno value otherwise and rte_errno is set.
462  */
463 static int
mlx4_ibv_device_to_pci_addr(const struct ibv_device * device,struct rte_pci_addr * pci_addr)464 mlx4_ibv_device_to_pci_addr(const struct ibv_device *device,
465 			    struct rte_pci_addr *pci_addr)
466 {
467 	FILE *file;
468 	char line[32];
469 	MKSTR(path, "%s/device/uevent", device->ibdev_path);
470 
471 	file = fopen(path, "rb");
472 	if (file == NULL) {
473 		rte_errno = errno;
474 		return -rte_errno;
475 	}
476 	while (fgets(line, sizeof(line), file) == line) {
477 		size_t len = strlen(line);
478 		int ret;
479 
480 		/* Truncate long lines. */
481 		if (len == (sizeof(line) - 1))
482 			while (line[(len - 1)] != '\n') {
483 				ret = fgetc(file);
484 				if (ret == EOF)
485 					break;
486 				line[(len - 1)] = ret;
487 			}
488 		/* Extract information. */
489 		if (sscanf(line,
490 			   "PCI_SLOT_NAME="
491 			   "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
492 			   &pci_addr->domain,
493 			   &pci_addr->bus,
494 			   &pci_addr->devid,
495 			   &pci_addr->function) == 4) {
496 			break;
497 		}
498 	}
499 	fclose(file);
500 	return 0;
501 }
502 
503 /**
504  * Verify and store value for device argument.
505  *
506  * @param[in] key
507  *   Key argument to verify.
508  * @param[in] val
509  *   Value associated with key.
510  * @param[in, out] conf
511  *   Shared configuration data.
512  *
513  * @return
514  *   0 on success, negative errno value otherwise and rte_errno is set.
515  */
516 static int
mlx4_arg_parse(const char * key,const char * val,struct mlx4_conf * conf)517 mlx4_arg_parse(const char *key, const char *val, struct mlx4_conf *conf)
518 {
519 	unsigned long tmp;
520 
521 	errno = 0;
522 	tmp = strtoul(val, NULL, 0);
523 	if (errno) {
524 		rte_errno = errno;
525 		WARN("%s: \"%s\" is not a valid integer", key, val);
526 		return -rte_errno;
527 	}
528 	if (strcmp(MLX4_PMD_PORT_KVARG, key) == 0) {
529 		uint32_t ports = rte_log2_u32(conf->ports.present + 1);
530 
531 		if (tmp >= ports) {
532 			ERROR("port index %lu outside range [0,%" PRIu32 ")",
533 			      tmp, ports);
534 			return -EINVAL;
535 		}
536 		if (!(conf->ports.present & (1 << tmp))) {
537 			rte_errno = EINVAL;
538 			ERROR("invalid port index %lu", tmp);
539 			return -rte_errno;
540 		}
541 		conf->ports.enabled |= 1 << tmp;
542 	} else if (strcmp(MLX4_MR_EXT_MEMSEG_EN_KVARG, key) == 0) {
543 		conf->mr_ext_memseg_en = !!tmp;
544 	} else {
545 		rte_errno = EINVAL;
546 		WARN("%s: unknown parameter", key);
547 		return -rte_errno;
548 	}
549 	return 0;
550 }
551 
552 /**
553  * Parse device parameters.
554  *
555  * @param devargs
556  *   Device arguments structure.
557  *
558  * @return
559  *   0 on success, negative errno value otherwise and rte_errno is set.
560  */
561 static int
mlx4_args(struct rte_devargs * devargs,struct mlx4_conf * conf)562 mlx4_args(struct rte_devargs *devargs, struct mlx4_conf *conf)
563 {
564 	struct rte_kvargs *kvlist;
565 	unsigned int arg_count;
566 	int ret = 0;
567 	int i;
568 
569 	if (devargs == NULL)
570 		return 0;
571 	kvlist = rte_kvargs_parse(devargs->args, pmd_mlx4_init_params);
572 	if (kvlist == NULL) {
573 		rte_errno = EINVAL;
574 		ERROR("failed to parse kvargs");
575 		return -rte_errno;
576 	}
577 	/* Process parameters. */
578 	for (i = 0; pmd_mlx4_init_params[i]; ++i) {
579 		arg_count = rte_kvargs_count(kvlist, pmd_mlx4_init_params[i]);
580 		while (arg_count-- > 0) {
581 			ret = rte_kvargs_process(kvlist,
582 						 pmd_mlx4_init_params[i],
583 						 (int (*)(const char *,
584 							  const char *,
585 							  void *))
586 						 mlx4_arg_parse,
587 						 conf);
588 			if (ret != 0)
589 				goto free_kvlist;
590 		}
591 	}
592 free_kvlist:
593 	rte_kvargs_free(kvlist);
594 	return ret;
595 }
596 
597 /**
598  * Interpret RSS capabilities reported by device.
599  *
600  * This function returns the set of usable Verbs RSS hash fields, kernel
601  * quirks taken into account.
602  *
603  * @param ctx
604  *   Verbs context.
605  * @param pd
606  *   Verbs protection domain.
607  * @param device_attr_ex
608  *   Extended device attributes to interpret.
609  *
610  * @return
611  *   Usable RSS hash fields mask in Verbs format.
612  */
613 static uint64_t
mlx4_hw_rss_sup(struct ibv_context * ctx,struct ibv_pd * pd,struct ibv_device_attr_ex * device_attr_ex)614 mlx4_hw_rss_sup(struct ibv_context *ctx, struct ibv_pd *pd,
615 		struct ibv_device_attr_ex *device_attr_ex)
616 {
617 	uint64_t hw_rss_sup = device_attr_ex->rss_caps.rx_hash_fields_mask;
618 	struct ibv_cq *cq = NULL;
619 	struct ibv_wq *wq = NULL;
620 	struct ibv_rwq_ind_table *ind = NULL;
621 	struct ibv_qp *qp = NULL;
622 
623 	if (!hw_rss_sup) {
624 		WARN("no RSS capabilities reported; disabling support for UDP"
625 		     " RSS and inner VXLAN RSS");
626 		return IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4 |
627 			IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6 |
628 			IBV_RX_HASH_SRC_PORT_TCP | IBV_RX_HASH_DST_PORT_TCP;
629 	}
630 	if (!(hw_rss_sup & IBV_RX_HASH_INNER))
631 		return hw_rss_sup;
632 	/*
633 	 * Although reported as supported, missing code in some Linux
634 	 * versions (v4.15, v4.16) prevents the creation of hash QPs with
635 	 * inner capability.
636 	 *
637 	 * There is no choice but to attempt to instantiate a temporary RSS
638 	 * context in order to confirm its support.
639 	 */
640 	cq = mlx4_glue->create_cq(ctx, 1, NULL, NULL, 0);
641 	wq = cq ? mlx4_glue->create_wq
642 		(ctx,
643 		 &(struct ibv_wq_init_attr){
644 			.wq_type = IBV_WQT_RQ,
645 			.max_wr = 1,
646 			.max_sge = 1,
647 			.pd = pd,
648 			.cq = cq,
649 		 }) : NULL;
650 	ind = wq ? mlx4_glue->create_rwq_ind_table
651 		(ctx,
652 		 &(struct ibv_rwq_ind_table_init_attr){
653 			.log_ind_tbl_size = 0,
654 			.ind_tbl = &wq,
655 			.comp_mask = 0,
656 		 }) : NULL;
657 	qp = ind ? mlx4_glue->create_qp_ex
658 		(ctx,
659 		 &(struct ibv_qp_init_attr_ex){
660 			.comp_mask =
661 				(IBV_QP_INIT_ATTR_PD |
662 				 IBV_QP_INIT_ATTR_RX_HASH |
663 				 IBV_QP_INIT_ATTR_IND_TABLE),
664 			.qp_type = IBV_QPT_RAW_PACKET,
665 			.pd = pd,
666 			.rwq_ind_tbl = ind,
667 			.rx_hash_conf = {
668 				.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
669 				.rx_hash_key_len = MLX4_RSS_HASH_KEY_SIZE,
670 				.rx_hash_key = mlx4_rss_hash_key_default,
671 				.rx_hash_fields_mask = hw_rss_sup,
672 			},
673 		 }) : NULL;
674 	if (!qp) {
675 		WARN("disabling unusable inner RSS capability due to kernel"
676 		     " quirk");
677 		hw_rss_sup &= ~IBV_RX_HASH_INNER;
678 	} else {
679 		claim_zero(mlx4_glue->destroy_qp(qp));
680 	}
681 	if (ind)
682 		claim_zero(mlx4_glue->destroy_rwq_ind_table(ind));
683 	if (wq)
684 		claim_zero(mlx4_glue->destroy_wq(wq));
685 	if (cq)
686 		claim_zero(mlx4_glue->destroy_cq(cq));
687 	return hw_rss_sup;
688 }
689 
690 static struct rte_pci_driver mlx4_driver;
691 
692 /**
693  * PMD global initialization.
694  *
695  * Independent from individual device, this function initializes global
696  * per-PMD data structures distinguishing primary and secondary processes.
697  * Hence, each initialization is called once per a process.
698  *
699  * @return
700  *   0 on success, a negative errno value otherwise and rte_errno is set.
701  */
702 static int
mlx4_init_once(void)703 mlx4_init_once(void)
704 {
705 	struct mlx4_shared_data *sd;
706 	struct mlx4_local_data *ld = &mlx4_local_data;
707 	int ret = 0;
708 
709 	if (mlx4_init_shared_data())
710 		return -rte_errno;
711 	sd = mlx4_shared_data;
712 	MLX4_ASSERT(sd);
713 	rte_spinlock_lock(&sd->lock);
714 	switch (rte_eal_process_type()) {
715 	case RTE_PROC_PRIMARY:
716 		if (sd->init_done)
717 			break;
718 		LIST_INIT(&sd->mem_event_cb_list);
719 		rte_rwlock_init(&sd->mem_event_rwlock);
720 		rte_mem_event_callback_register("MLX4_MEM_EVENT_CB",
721 						mlx4_mr_mem_event_cb, NULL);
722 		ret = mlx4_mp_init_primary();
723 		if (ret)
724 			goto out;
725 		sd->init_done = 1;
726 		break;
727 	case RTE_PROC_SECONDARY:
728 		if (ld->init_done)
729 			break;
730 		ret = mlx4_mp_init_secondary();
731 		if (ret)
732 			goto out;
733 		++sd->secondary_cnt;
734 		ld->init_done = 1;
735 		break;
736 	default:
737 		break;
738 	}
739 out:
740 	rte_spinlock_unlock(&sd->lock);
741 	return ret;
742 }
743 
744 /**
745  * DPDK callback to register a PCI device.
746  *
747  * This function creates an Ethernet device for each port of a given
748  * PCI device.
749  *
750  * @param[in] pci_drv
751  *   PCI driver structure (mlx4_driver).
752  * @param[in] pci_dev
753  *   PCI device information.
754  *
755  * @return
756  *   0 on success, negative errno value otherwise and rte_errno is set.
757  */
758 static int
mlx4_pci_probe(struct rte_pci_driver * pci_drv,struct rte_pci_device * pci_dev)759 mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
760 {
761 	struct ibv_device **list;
762 	struct ibv_device *ibv_dev;
763 	int err = 0;
764 	struct ibv_context *attr_ctx = NULL;
765 	struct ibv_device_attr device_attr;
766 	struct ibv_device_attr_ex device_attr_ex;
767 	struct mlx4_conf conf = {
768 		.ports.present = 0,
769 		.mr_ext_memseg_en = 1,
770 	};
771 	unsigned int vf;
772 	int i;
773 	char ifname[IF_NAMESIZE];
774 
775 	(void)pci_drv;
776 	err = mlx4_init_once();
777 	if (err) {
778 		ERROR("unable to init PMD global data: %s",
779 		      strerror(rte_errno));
780 		return -rte_errno;
781 	}
782 	MLX4_ASSERT(pci_drv == &mlx4_driver);
783 	list = mlx4_glue->get_device_list(&i);
784 	if (list == NULL) {
785 		rte_errno = errno;
786 		MLX4_ASSERT(rte_errno);
787 		if (rte_errno == ENOSYS)
788 			ERROR("cannot list devices, is ib_uverbs loaded?");
789 		return -rte_errno;
790 	}
791 	MLX4_ASSERT(i >= 0);
792 	/*
793 	 * For each listed device, check related sysfs entry against
794 	 * the provided PCI ID.
795 	 */
796 	while (i != 0) {
797 		struct rte_pci_addr pci_addr;
798 
799 		--i;
800 		DEBUG("checking device \"%s\"", list[i]->name);
801 		if (mlx4_ibv_device_to_pci_addr(list[i], &pci_addr))
802 			continue;
803 		if ((pci_dev->addr.domain != pci_addr.domain) ||
804 		    (pci_dev->addr.bus != pci_addr.bus) ||
805 		    (pci_dev->addr.devid != pci_addr.devid) ||
806 		    (pci_dev->addr.function != pci_addr.function))
807 			continue;
808 		vf = (pci_dev->id.device_id ==
809 		      PCI_DEVICE_ID_MELLANOX_CONNECTX3VF);
810 		INFO("PCI information matches, using device \"%s\" (VF: %s)",
811 		     list[i]->name, (vf ? "true" : "false"));
812 		attr_ctx = mlx4_glue->open_device(list[i]);
813 		err = errno;
814 		break;
815 	}
816 	if (attr_ctx == NULL) {
817 		mlx4_glue->free_device_list(list);
818 		switch (err) {
819 		case 0:
820 			rte_errno = ENODEV;
821 			ERROR("cannot access device, is mlx4_ib loaded?");
822 			return -rte_errno;
823 		case EINVAL:
824 			rte_errno = EINVAL;
825 			ERROR("cannot use device, are drivers up to date?");
826 			return -rte_errno;
827 		}
828 		MLX4_ASSERT(err > 0);
829 		rte_errno = err;
830 		return -rte_errno;
831 	}
832 	ibv_dev = list[i];
833 	DEBUG("device opened");
834 	if (mlx4_glue->query_device(attr_ctx, &device_attr)) {
835 		err = ENODEV;
836 		goto error;
837 	}
838 	INFO("%u port(s) detected", device_attr.phys_port_cnt);
839 	conf.ports.present |= (UINT64_C(1) << device_attr.phys_port_cnt) - 1;
840 	if (mlx4_args(pci_dev->device.devargs, &conf)) {
841 		ERROR("failed to process device arguments");
842 		err = EINVAL;
843 		goto error;
844 	}
845 	/* Use all ports when none are defined */
846 	if (!conf.ports.enabled)
847 		conf.ports.enabled = conf.ports.present;
848 	/* Retrieve extended device attributes. */
849 	if (mlx4_glue->query_device_ex(attr_ctx, NULL, &device_attr_ex)) {
850 		err = ENODEV;
851 		goto error;
852 	}
853 	MLX4_ASSERT(device_attr.max_sge >= MLX4_MAX_SGE);
854 	for (i = 0; i < device_attr.phys_port_cnt; i++) {
855 		uint32_t port = i + 1; /* ports are indexed from one */
856 		struct ibv_context *ctx = NULL;
857 		struct ibv_port_attr port_attr;
858 		struct ibv_pd *pd = NULL;
859 		struct mlx4_priv *priv = NULL;
860 		struct rte_eth_dev *eth_dev = NULL;
861 		struct rte_ether_addr mac;
862 		char name[RTE_ETH_NAME_MAX_LEN];
863 
864 		/* If port is not enabled, skip. */
865 		if (!(conf.ports.enabled & (1 << i)))
866 			continue;
867 		DEBUG("using port %u", port);
868 		ctx = mlx4_glue->open_device(ibv_dev);
869 		if (ctx == NULL) {
870 			err = ENODEV;
871 			goto port_error;
872 		}
873 		snprintf(name, sizeof(name), "%s port %u",
874 			 mlx4_glue->get_device_name(ibv_dev), port);
875 		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
876 			eth_dev = rte_eth_dev_attach_secondary(name);
877 			if (eth_dev == NULL) {
878 				ERROR("can not attach rte ethdev");
879 				rte_errno = ENOMEM;
880 				err = rte_errno;
881 				goto error;
882 			}
883 			priv = eth_dev->data->dev_private;
884 			if (!priv->verbs_alloc_ctx.enabled) {
885 				ERROR("secondary process is not supported"
886 				      " due to lack of external allocator"
887 				      " from Verbs");
888 				rte_errno = ENOTSUP;
889 				err = rte_errno;
890 				goto error;
891 			}
892 			eth_dev->device = &pci_dev->device;
893 			eth_dev->dev_ops = &mlx4_dev_sec_ops;
894 			err = mlx4_proc_priv_init(eth_dev);
895 			if (err)
896 				goto error;
897 			/* Receive command fd from primary process. */
898 			err = mlx4_mp_req_verbs_cmd_fd(eth_dev);
899 			if (err < 0) {
900 				err = rte_errno;
901 				goto error;
902 			}
903 			/* Remap UAR for Tx queues. */
904 			err = mlx4_tx_uar_init_secondary(eth_dev, err);
905 			if (err) {
906 				err = rte_errno;
907 				goto error;
908 			}
909 			/*
910 			 * Ethdev pointer is still required as input since
911 			 * the primary device is not accessible from the
912 			 * secondary process.
913 			 */
914 			eth_dev->tx_pkt_burst = mlx4_tx_burst;
915 			eth_dev->rx_pkt_burst = mlx4_rx_burst;
916 			claim_zero(mlx4_glue->close_device(ctx));
917 			rte_eth_copy_pci_info(eth_dev, pci_dev);
918 			rte_eth_dev_probing_finish(eth_dev);
919 			continue;
920 		}
921 		/* Check port status. */
922 		err = mlx4_glue->query_port(ctx, port, &port_attr);
923 		if (err) {
924 			err = ENODEV;
925 			ERROR("port query failed: %s", strerror(err));
926 			goto port_error;
927 		}
928 		if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
929 			err = ENOTSUP;
930 			ERROR("port %d is not configured in Ethernet mode",
931 			      port);
932 			goto port_error;
933 		}
934 		if (port_attr.state != IBV_PORT_ACTIVE)
935 			DEBUG("port %d is not active: \"%s\" (%d)",
936 			      port, mlx4_glue->port_state_str(port_attr.state),
937 			      port_attr.state);
938 		/* Make asynchronous FD non-blocking to handle interrupts. */
939 		err = mlx4_fd_set_non_blocking(ctx->async_fd);
940 		if (err) {
941 			ERROR("cannot make asynchronous FD non-blocking: %s",
942 			      strerror(err));
943 			goto port_error;
944 		}
945 		/* Allocate protection domain. */
946 		pd = mlx4_glue->alloc_pd(ctx);
947 		if (pd == NULL) {
948 			err = ENOMEM;
949 			ERROR("PD allocation failure");
950 			goto port_error;
951 		}
952 		/* from rte_ethdev.c */
953 		priv = rte_zmalloc("ethdev private structure",
954 				   sizeof(*priv),
955 				   RTE_CACHE_LINE_SIZE);
956 		if (priv == NULL) {
957 			err = ENOMEM;
958 			ERROR("priv allocation failure");
959 			goto port_error;
960 		}
961 		priv->ctx = ctx;
962 		priv->device_attr = device_attr;
963 		priv->port = port;
964 		priv->pd = pd;
965 		priv->mtu = RTE_ETHER_MTU;
966 		priv->vf = vf;
967 		priv->hw_csum =	!!(device_attr.device_cap_flags &
968 				   IBV_DEVICE_RAW_IP_CSUM);
969 		DEBUG("checksum offloading is %ssupported",
970 		      (priv->hw_csum ? "" : "not "));
971 		/* Only ConnectX-3 Pro supports tunneling. */
972 		priv->hw_csum_l2tun =
973 			priv->hw_csum &&
974 			(device_attr.vendor_part_id ==
975 			 PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO);
976 		DEBUG("L2 tunnel checksum offloads are %ssupported",
977 		      priv->hw_csum_l2tun ? "" : "not ");
978 		priv->hw_rss_sup = mlx4_hw_rss_sup(priv->ctx, priv->pd,
979 						   &device_attr_ex);
980 		DEBUG("supported RSS hash fields mask: %016" PRIx64,
981 		      priv->hw_rss_sup);
982 		priv->hw_rss_max_qps =
983 			device_attr_ex.rss_caps.max_rwq_indirection_table_size;
984 		DEBUG("MAX RSS queues %d", priv->hw_rss_max_qps);
985 		priv->hw_fcs_strip = !!(device_attr_ex.raw_packet_caps &
986 					IBV_RAW_PACKET_CAP_SCATTER_FCS);
987 		DEBUG("FCS stripping toggling is %ssupported",
988 		      priv->hw_fcs_strip ? "" : "not ");
989 		priv->tso =
990 			((device_attr_ex.tso_caps.max_tso > 0) &&
991 			 (device_attr_ex.tso_caps.supported_qpts &
992 			  (1 << IBV_QPT_RAW_PACKET)));
993 		if (priv->tso)
994 			priv->tso_max_payload_sz =
995 					device_attr_ex.tso_caps.max_tso;
996 		DEBUG("TSO is %ssupported",
997 		      priv->tso ? "" : "not ");
998 		priv->mr_ext_memseg_en = conf.mr_ext_memseg_en;
999 		/* Configure the first MAC address by default. */
1000 		err = mlx4_get_mac(priv, &mac.addr_bytes);
1001 		if (err) {
1002 			ERROR("cannot get MAC address, is mlx4_en loaded?"
1003 			      " (error: %s)", strerror(err));
1004 			goto port_error;
1005 		}
1006 		INFO("port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
1007 		     priv->port,
1008 		     mac.addr_bytes[0], mac.addr_bytes[1],
1009 		     mac.addr_bytes[2], mac.addr_bytes[3],
1010 		     mac.addr_bytes[4], mac.addr_bytes[5]);
1011 		/* Register MAC address. */
1012 		priv->mac[0] = mac;
1013 
1014 		if (mlx4_get_ifname(priv, &ifname) == 0) {
1015 			DEBUG("port %u ifname is \"%s\"",
1016 			      priv->port, ifname);
1017 			priv->if_index = if_nametoindex(ifname);
1018 		} else {
1019 			DEBUG("port %u ifname is unknown", priv->port);
1020 		}
1021 
1022 		/* Get actual MTU if possible. */
1023 		mlx4_mtu_get(priv, &priv->mtu);
1024 		DEBUG("port %u MTU is %u", priv->port, priv->mtu);
1025 		eth_dev = rte_eth_dev_allocate(name);
1026 		if (eth_dev == NULL) {
1027 			err = ENOMEM;
1028 			ERROR("can not allocate rte ethdev");
1029 			goto port_error;
1030 		}
1031 		eth_dev->data->dev_private = priv;
1032 		eth_dev->data->mac_addrs = priv->mac;
1033 		eth_dev->device = &pci_dev->device;
1034 		rte_eth_copy_pci_info(eth_dev, pci_dev);
1035 		eth_dev->data->dev_flags |= RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
1036 		/* Initialize local interrupt handle for current port. */
1037 		memset(&priv->intr_handle, 0, sizeof(struct rte_intr_handle));
1038 		priv->intr_handle.fd = -1;
1039 		priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
1040 		/*
1041 		 * Override ethdev interrupt handle pointer with private
1042 		 * handle instead of that of the parent PCI device used by
1043 		 * default. This prevents it from being shared between all
1044 		 * ports of the same PCI device since each of them is
1045 		 * associated with its own Verbs context.
1046 		 *
1047 		 * Rx interrupts in particular require this as the PMD has
1048 		 * no control over the registration of queue interrupts
1049 		 * besides setting up eth_dev->intr_handle, the rest is
1050 		 * handled by rte_intr_rx_ctl().
1051 		 */
1052 		eth_dev->intr_handle = &priv->intr_handle;
1053 		priv->dev_data = eth_dev->data;
1054 		eth_dev->dev_ops = &mlx4_dev_ops;
1055 #ifdef HAVE_IBV_MLX4_BUF_ALLOCATORS
1056 		/* Hint libmlx4 to use PMD allocator for data plane resources */
1057 		err = mlx4_glue->dv_set_context_attr
1058 			(ctx, MLX4DV_SET_CTX_ATTR_BUF_ALLOCATORS,
1059 			 (void *)((uintptr_t)&(struct mlx4dv_ctx_allocators){
1060 				 .alloc = &mlx4_alloc_verbs_buf,
1061 				 .free = &mlx4_free_verbs_buf,
1062 				 .data = priv,
1063 			}));
1064 		if (err)
1065 			WARN("Verbs external allocator is not supported");
1066 		else
1067 			priv->verbs_alloc_ctx.enabled = 1;
1068 #endif
1069 		/* Bring Ethernet device up. */
1070 		DEBUG("forcing Ethernet interface up");
1071 		mlx4_dev_set_link_up(eth_dev);
1072 		/* Update link status once if waiting for LSC. */
1073 		if (eth_dev->data->dev_flags & RTE_ETH_DEV_INTR_LSC)
1074 			mlx4_link_update(eth_dev, 0);
1075 		/*
1076 		 * Once the device is added to the list of memory event
1077 		 * callback, its global MR cache table cannot be expanded
1078 		 * on the fly because of deadlock. If it overflows, lookup
1079 		 * should be done by searching MR list linearly, which is slow.
1080 		 */
1081 		err = mlx4_mr_btree_init(&priv->mr.cache,
1082 					 MLX4_MR_BTREE_CACHE_N * 2,
1083 					 eth_dev->device->numa_node);
1084 		if (err) {
1085 			/* rte_errno is already set. */
1086 			goto port_error;
1087 		}
1088 		/* Add device to memory callback list. */
1089 		rte_rwlock_write_lock(&mlx4_shared_data->mem_event_rwlock);
1090 		LIST_INSERT_HEAD(&mlx4_shared_data->mem_event_cb_list,
1091 				 priv, mem_event_cb);
1092 		rte_rwlock_write_unlock(&mlx4_shared_data->mem_event_rwlock);
1093 		rte_eth_dev_probing_finish(eth_dev);
1094 		continue;
1095 port_error:
1096 		rte_free(priv);
1097 		if (eth_dev != NULL)
1098 			eth_dev->data->dev_private = NULL;
1099 		if (pd)
1100 			claim_zero(mlx4_glue->dealloc_pd(pd));
1101 		if (ctx)
1102 			claim_zero(mlx4_glue->close_device(ctx));
1103 		if (eth_dev != NULL) {
1104 			/* mac_addrs must not be freed because part of dev_private */
1105 			eth_dev->data->mac_addrs = NULL;
1106 			rte_eth_dev_release_port(eth_dev);
1107 		}
1108 		break;
1109 	}
1110 	/*
1111 	 * XXX if something went wrong in the loop above, there is a resource
1112 	 * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as
1113 	 * long as the dpdk does not provide a way to deallocate an ethdev and a
1114 	 * way to enumerate the registered ethdevs to free the previous ones.
1115 	 */
1116 error:
1117 	if (attr_ctx)
1118 		claim_zero(mlx4_glue->close_device(attr_ctx));
1119 	if (list)
1120 		mlx4_glue->free_device_list(list);
1121 	if (err)
1122 		rte_errno = err;
1123 	return -err;
1124 }
1125 
1126 static const struct rte_pci_id mlx4_pci_id_map[] = {
1127 	{
1128 		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1129 			       PCI_DEVICE_ID_MELLANOX_CONNECTX3)
1130 	},
1131 	{
1132 		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1133 			       PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO)
1134 	},
1135 	{
1136 		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
1137 			       PCI_DEVICE_ID_MELLANOX_CONNECTX3VF)
1138 	},
1139 	{
1140 		.vendor_id = 0
1141 	}
1142 };
1143 
1144 static struct rte_pci_driver mlx4_driver = {
1145 	.driver = {
1146 		.name = MLX4_DRIVER_NAME
1147 	},
1148 	.id_table = mlx4_pci_id_map,
1149 	.probe = mlx4_pci_probe,
1150 	.drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV,
1151 };
1152 
1153 #ifdef RTE_IBVERBS_LINK_DLOPEN
1154 
1155 /**
1156  * Suffix RTE_EAL_PMD_PATH with "-glue".
1157  *
1158  * This function performs a sanity check on RTE_EAL_PMD_PATH before
1159  * suffixing its last component.
1160  *
1161  * @param buf[out]
1162  *   Output buffer, should be large enough otherwise NULL is returned.
1163  * @param size
1164  *   Size of @p out.
1165  *
1166  * @return
1167  *   Pointer to @p buf or @p NULL in case suffix cannot be appended.
1168  */
1169 static char *
mlx4_glue_path(char * buf,size_t size)1170 mlx4_glue_path(char *buf, size_t size)
1171 {
1172 	static const char *const bad[] = { "/", ".", "..", NULL };
1173 	const char *path = RTE_EAL_PMD_PATH;
1174 	size_t len = strlen(path);
1175 	size_t off;
1176 	int i;
1177 
1178 	while (len && path[len - 1] == '/')
1179 		--len;
1180 	for (off = len; off && path[off - 1] != '/'; --off)
1181 		;
1182 	for (i = 0; bad[i]; ++i)
1183 		if (!strncmp(path + off, bad[i], (int)(len - off)))
1184 			goto error;
1185 	i = snprintf(buf, size, "%.*s-glue", (int)len, path);
1186 	if (i == -1 || (size_t)i >= size)
1187 		goto error;
1188 	return buf;
1189 error:
1190 	ERROR("unable to append \"-glue\" to last component of"
1191 	      " RTE_EAL_PMD_PATH (\"" RTE_EAL_PMD_PATH "\"),"
1192 	      " please re-configure DPDK");
1193 	return NULL;
1194 }
1195 
1196 /**
1197  * Initialization routine for run-time dependency on rdma-core.
1198  */
1199 static int
mlx4_glue_init(void)1200 mlx4_glue_init(void)
1201 {
1202 	char glue_path[sizeof(RTE_EAL_PMD_PATH) - 1 + sizeof("-glue")];
1203 	const char *path[] = {
1204 		/*
1205 		 * A basic security check is necessary before trusting
1206 		 * MLX4_GLUE_PATH, which may override RTE_EAL_PMD_PATH.
1207 		 */
1208 		(geteuid() == getuid() && getegid() == getgid() ?
1209 		 getenv("MLX4_GLUE_PATH") : NULL),
1210 		/*
1211 		 * When RTE_EAL_PMD_PATH is set, use its glue-suffixed
1212 		 * variant, otherwise let dlopen() look up libraries on its
1213 		 * own.
1214 		 */
1215 		(*RTE_EAL_PMD_PATH ?
1216 		 mlx4_glue_path(glue_path, sizeof(glue_path)) : ""),
1217 	};
1218 	unsigned int i = 0;
1219 	void *handle = NULL;
1220 	void **sym;
1221 	const char *dlmsg;
1222 
1223 	while (!handle && i != RTE_DIM(path)) {
1224 		const char *end;
1225 		size_t len;
1226 		int ret;
1227 
1228 		if (!path[i]) {
1229 			++i;
1230 			continue;
1231 		}
1232 		end = strpbrk(path[i], ":;");
1233 		if (!end)
1234 			end = path[i] + strlen(path[i]);
1235 		len = end - path[i];
1236 		ret = 0;
1237 		do {
1238 			char name[ret + 1];
1239 
1240 			ret = snprintf(name, sizeof(name), "%.*s%s" MLX4_GLUE,
1241 				       (int)len, path[i],
1242 				       (!len || *(end - 1) == '/') ? "" : "/");
1243 			if (ret == -1)
1244 				break;
1245 			if (sizeof(name) != (size_t)ret + 1)
1246 				continue;
1247 			DEBUG("looking for rdma-core glue as \"%s\"", name);
1248 			handle = dlopen(name, RTLD_LAZY);
1249 			break;
1250 		} while (1);
1251 		path[i] = end + 1;
1252 		if (!*end)
1253 			++i;
1254 	}
1255 	if (!handle) {
1256 		rte_errno = EINVAL;
1257 		dlmsg = dlerror();
1258 		if (dlmsg)
1259 			WARN("cannot load glue library: %s", dlmsg);
1260 		goto glue_error;
1261 	}
1262 	sym = dlsym(handle, "mlx4_glue");
1263 	if (!sym || !*sym) {
1264 		rte_errno = EINVAL;
1265 		dlmsg = dlerror();
1266 		if (dlmsg)
1267 			ERROR("cannot resolve glue symbol: %s", dlmsg);
1268 		goto glue_error;
1269 	}
1270 	mlx4_glue = *sym;
1271 	return 0;
1272 glue_error:
1273 	if (handle)
1274 		dlclose(handle);
1275 	WARN("cannot initialize PMD due to missing run-time"
1276 	     " dependency on rdma-core libraries (libibverbs,"
1277 	     " libmlx4)");
1278 	return -rte_errno;
1279 }
1280 
1281 #endif
1282 
1283 /* Initialize driver log type. */
1284 RTE_LOG_REGISTER(mlx4_logtype, pmd.net.mlx4, NOTICE)
1285 
1286 /**
1287  * Driver initialization routine.
1288  */
RTE_INIT(rte_mlx4_pmd_init)1289 RTE_INIT(rte_mlx4_pmd_init)
1290 {
1291 	/*
1292 	 * MLX4_DEVICE_FATAL_CLEANUP tells ibv_destroy functions we
1293 	 * want to get success errno value in case of calling them
1294 	 * when the device was removed.
1295 	 */
1296 	setenv("MLX4_DEVICE_FATAL_CLEANUP", "1", 1);
1297 	/*
1298 	 * RDMAV_HUGEPAGES_SAFE tells ibv_fork_init() we intend to use
1299 	 * huge pages. Calling ibv_fork_init() during init allows
1300 	 * applications to use fork() safely for purposes other than
1301 	 * using this PMD, which is not supported in forked processes.
1302 	 */
1303 	setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
1304 #ifdef RTE_IBVERBS_LINK_DLOPEN
1305 	if (mlx4_glue_init())
1306 		return;
1307 	MLX4_ASSERT(mlx4_glue);
1308 #endif
1309 #ifdef RTE_LIBRTE_MLX4_DEBUG
1310 	/* Glue structure must not contain any NULL pointers. */
1311 	{
1312 		unsigned int i;
1313 
1314 		for (i = 0; i != sizeof(*mlx4_glue) / sizeof(void *); ++i)
1315 			MLX4_ASSERT(((const void *const *)mlx4_glue)[i]);
1316 	}
1317 #endif
1318 	if (strcmp(mlx4_glue->version, MLX4_GLUE_VERSION)) {
1319 		ERROR("rdma-core glue \"%s\" mismatch: \"%s\" is required",
1320 		      mlx4_glue->version, MLX4_GLUE_VERSION);
1321 		return;
1322 	}
1323 	mlx4_glue->fork_init();
1324 	rte_pci_register(&mlx4_driver);
1325 }
1326 
1327 RTE_PMD_EXPORT_NAME(net_mlx4, __COUNTER__);
1328 RTE_PMD_REGISTER_PCI_TABLE(net_mlx4, mlx4_pci_id_map);
1329 RTE_PMD_REGISTER_KMOD_DEP(net_mlx4,
1330 	"* ib_uverbs & mlx4_en & mlx4_core & mlx4_ib");
1331