/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019-2020 Intel Corporation.
 */
#include <unistd.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <poll.h>
#include <netinet/in.h>
#include <net/if.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <linux/if_ether.h>
#include <linux/if_xdp.h>
#include <linux/if_link.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>
#include "af_xdp_deps.h"
#include <bpf/xsk.h>

#include <rte_ethdev.h>
#include <rte_ethdev_driver.h>
#include <rte_ethdev_vdev.h>
#include <rte_kvargs.h>
#include <rte_bus_vdev.h>
#include <rte_string_fns.h>
#include <rte_branch_prediction.h>
#include <rte_common.h>
#include <rte_dev.h>
#include <rte_eal.h>
#include <rte_ether.h>
#include <rte_lcore.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memzone.h>
#include <rte_mempool.h>
#include <rte_mbuf.h>
#include <rte_malloc.h>
#include <rte_ring.h>
#include <rte_spinlock.h>

#include "compat.h"

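/*
 * Fallback definitions for toolchains whose kernel UAPI headers predate
 * AF_XDP; the values below match the constants defined by newer kernels.
 */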
#ifndef SOL_XDP
#define SOL_XDP 283
#endif

#ifndef AF_XDP
#define AF_XDP 44
#endif

#ifndef PF_XDP
#define PF_XDP AF_XDP
#endif

RTE_LOG_REGISTER(af_xdp_logtype, pmd.net.af_xdp, NOTICE);

#define AF_XDP_LOG(level, fmt, args...)			\
	rte_log(RTE_LOG_ ## level, af_xdp_logtype,	\
		"%s(): " fmt, __func__, ##args)

#define ETH_AF_XDP_FRAME_SIZE		2048
#define ETH_AF_XDP_NUM_BUFFERS		4096
#define ETH_AF_XDP_DFLT_NUM_DESCS	XSK_RING_CONS__DEFAULT_NUM_DESCS
#define ETH_AF_XDP_DFLT_START_QUEUE_IDX	0
#define ETH_AF_XDP_DFLT_QUEUE_COUNT	1

#define ETH_AF_XDP_RX_BATCH_SIZE	32
#define ETH_AF_XDP_TX_BATCH_SIZE	32


struct xsk_umem_info {
	struct xsk_umem *umem;
	struct rte_ring *buf_ring;
	const struct rte_memzone *mz;
	struct rte_mempool *mb_pool;
	void *buffer;
	uint8_t refcnt;
	uint32_t max_xsks;
};

struct rx_stats {
	uint64_t rx_pkts;
	uint64_t rx_bytes;
	uint64_t rx_dropped;
};

struct pkt_rx_queue {
	struct xsk_ring_cons rx;
	struct xsk_umem_info *umem;
	struct xsk_socket *xsk;
	struct rte_mempool *mb_pool;

	struct rx_stats stats;

	struct xsk_ring_prod fq;
	struct xsk_ring_cons cq;

	struct pkt_tx_queue *pair;
	struct pollfd fds[1];
	int xsk_queue_idx;
};

struct tx_stats {
	uint64_t tx_pkts;
	uint64_t tx_bytes;
	uint64_t tx_dropped;
};

struct pkt_tx_queue {
	struct xsk_ring_prod tx;
	struct xsk_umem_info *umem;

	struct tx_stats stats;

	struct pkt_rx_queue *pair;
	int xsk_queue_idx;
};

struct pmd_internals {
	int if_index;
	char if_name[IFNAMSIZ];
	int start_queue_idx;
	int queue_cnt;
	int max_queue_cnt;
	int combined_queue_cnt;
	bool shared_umem;
	char prog_path[PATH_MAX];
	bool custom_prog_configured;

	struct rte_ether_addr eth_addr;

	struct pkt_rx_queue *rx_queues;
	struct pkt_tx_queue *tx_queues;
};

#define ETH_AF_XDP_IFACE_ARG			"iface"
#define ETH_AF_XDP_START_QUEUE_ARG		"start_queue"
#define ETH_AF_XDP_QUEUE_COUNT_ARG		"queue_count"
#define ETH_AF_XDP_SHARED_UMEM_ARG		"shared_umem"
#define ETH_AF_XDP_PROG_ARG			"xdp_prog"

static const char * const valid_arguments[] = {
	ETH_AF_XDP_IFACE_ARG,
	ETH_AF_XDP_START_QUEUE_ARG,
	ETH_AF_XDP_QUEUE_COUNT_ARG,
	ETH_AF_XDP_SHARED_UMEM_ARG,
	ETH_AF_XDP_PROG_ARG,
	NULL
};

static const struct rte_eth_link pmd_link = {
	.link_speed = ETH_SPEED_NUM_10G,
	.link_duplex = ETH_LINK_FULL_DUPLEX,
	.link_status = ETH_LINK_DOWN,
	.link_autoneg = ETH_LINK_AUTONEG
};

/* List which tracks PMDs to facilitate sharing UMEMs across them. */
struct internal_list {
	TAILQ_ENTRY(internal_list) next;
	struct rte_eth_dev *eth_dev;
};

TAILQ_HEAD(internal_list_head, internal_list);
static struct internal_list_head internal_list =
	TAILQ_HEAD_INITIALIZER(internal_list);

static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;

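/*
 * Fill queue (fq) replenishment. Two variants are compiled depending on
 * whether the kernel/libbpf support unaligned UMEM chunks: the zero-copy
 * variant posts the addresses of mbufs carved directly out of the
 * UMEM-backed mempool, while the copy variant recycles fixed-size frame
 * addresses from the driver-private buf_ring.
 */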
#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
static inline int
reserve_fill_queue_zc(struct xsk_umem_info *umem, uint16_t reserve_size,
		      struct rte_mbuf **bufs, struct xsk_ring_prod *fq)
{
	uint32_t idx;
	uint16_t i;

	if (unlikely(!xsk_ring_prod__reserve(fq, reserve_size, &idx))) {
		for (i = 0; i < reserve_size; i++)
			rte_pktmbuf_free(bufs[i]);
		AF_XDP_LOG(DEBUG, "Failed to reserve enough fq descs.\n");
		return -1;
	}

	for (i = 0; i < reserve_size; i++) {
		__u64 *fq_addr;
		uint64_t addr;

		fq_addr = xsk_ring_prod__fill_addr(fq, idx++);
		addr = (uint64_t)bufs[i] - (uint64_t)umem->buffer -
				umem->mb_pool->header_size;
		*fq_addr = addr;
	}

	xsk_ring_prod__submit(fq, reserve_size);

	return 0;
}
#else
static inline int
reserve_fill_queue_cp(struct xsk_umem_info *umem, uint16_t reserve_size,
		      struct rte_mbuf **bufs __rte_unused,
		      struct xsk_ring_prod *fq)
{
	void *addrs[reserve_size];
	uint32_t idx;
	uint16_t i;

	if (rte_ring_dequeue_bulk(umem->buf_ring, addrs, reserve_size, NULL)
		    != reserve_size) {
		AF_XDP_LOG(DEBUG, "Failed to get enough buffers for fq.\n");
		return -1;
	}

	if (unlikely(!xsk_ring_prod__reserve(fq, reserve_size, &idx))) {
		AF_XDP_LOG(DEBUG, "Failed to reserve enough fq descs.\n");
		rte_ring_enqueue_bulk(umem->buf_ring, addrs,
				reserve_size, NULL);
		return -1;
	}

	for (i = 0; i < reserve_size; i++) {
		__u64 *fq_addr;

		fq_addr = xsk_ring_prod__fill_addr(fq, idx++);
		*fq_addr = (uint64_t)addrs[i];
	}

	xsk_ring_prod__submit(fq, reserve_size);

	return 0;
}
#endif

static inline int
reserve_fill_queue(struct xsk_umem_info *umem, uint16_t reserve_size,
		   struct rte_mbuf **bufs, struct xsk_ring_prod *fq)
{
#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
	return reserve_fill_queue_zc(umem, reserve_size, bufs, fq);
#else
	return reserve_fill_queue_cp(umem, reserve_size, bufs, fq);
#endif
}

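/*
 * Receive paths. With XDP_UMEM_UNALIGNED_CHUNK_FLAG the UMEM overlays the
 * mbuf mempool, so received descriptors translate directly back into mbufs
 * (zero-copy); otherwise each frame is copied from the UMEM memzone into a
 * freshly allocated mbuf and its address is returned to the buffer ring.
 */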
#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
static uint16_t
af_xdp_rx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	struct pkt_rx_queue *rxq = queue;
	struct xsk_ring_cons *rx = &rxq->rx;
	struct xsk_ring_prod *fq = &rxq->fq;
	struct xsk_umem_info *umem = rxq->umem;
	uint32_t idx_rx = 0;
	unsigned long rx_bytes = 0;
	int rcvd, i;
	struct rte_mbuf *fq_bufs[ETH_AF_XDP_RX_BATCH_SIZE];

	/* allocate bufs for fill queue replenishment after rx */
	if (rte_pktmbuf_alloc_bulk(umem->mb_pool, fq_bufs, nb_pkts)) {
		AF_XDP_LOG(DEBUG,
			"Failed to get enough buffers for fq.\n");
		return 0;
	}

	rcvd = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);

	if (rcvd == 0) {
#if defined(XDP_USE_NEED_WAKEUP)
		if (xsk_ring_prod__needs_wakeup(fq))
			(void)poll(rxq->fds, 1, 1000);
#endif

		goto out;
	}

	for (i = 0; i < rcvd; i++) {
		const struct xdp_desc *desc;
		uint64_t addr;
		uint32_t len;
		uint64_t offset;

		desc = xsk_ring_cons__rx_desc(rx, idx_rx++);
		addr = desc->addr;
		len = desc->len;

		offset = xsk_umem__extract_offset(addr);
		addr = xsk_umem__extract_addr(addr);

		bufs[i] = (struct rte_mbuf *)
				xsk_umem__get_data(umem->buffer, addr +
					umem->mb_pool->header_size);
		bufs[i]->data_off = offset - sizeof(struct rte_mbuf) -
			rte_pktmbuf_priv_size(umem->mb_pool) -
			umem->mb_pool->header_size;

		rte_pktmbuf_pkt_len(bufs[i]) = len;
		rte_pktmbuf_data_len(bufs[i]) = len;
		rx_bytes += len;
	}

	xsk_ring_cons__release(rx, rcvd);

	(void)reserve_fill_queue(umem, rcvd, fq_bufs, fq);

	/* statistics */
	rxq->stats.rx_pkts += rcvd;
	rxq->stats.rx_bytes += rx_bytes;

out:
	if (rcvd != nb_pkts)
		rte_mempool_put_bulk(umem->mb_pool, (void **)&fq_bufs[rcvd],
				     nb_pkts - rcvd);

	return rcvd;
}
#else
static uint16_t
af_xdp_rx_cp(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	struct pkt_rx_queue *rxq = queue;
	struct xsk_ring_cons *rx = &rxq->rx;
	struct xsk_umem_info *umem = rxq->umem;
	struct xsk_ring_prod *fq = &rxq->fq;
	uint32_t idx_rx = 0;
	unsigned long rx_bytes = 0;
	int rcvd, i;
	uint32_t free_thresh = fq->size >> 1;
	struct rte_mbuf *mbufs[ETH_AF_XDP_RX_BATCH_SIZE];

	if (xsk_prod_nb_free(fq, free_thresh) >= free_thresh)
		(void)reserve_fill_queue(umem, ETH_AF_XDP_RX_BATCH_SIZE,
					 NULL, fq);

	if (unlikely(rte_pktmbuf_alloc_bulk(rxq->mb_pool, mbufs, nb_pkts) != 0))
		return 0;

	rcvd = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);
	if (rcvd == 0) {
#if defined(XDP_USE_NEED_WAKEUP)
		if (xsk_ring_prod__needs_wakeup(fq))
			(void)poll(rxq->fds, 1, 1000);
#endif

		goto out;
	}

	for (i = 0; i < rcvd; i++) {
		const struct xdp_desc *desc;
		uint64_t addr;
		uint32_t len;
		void *pkt;

		desc = xsk_ring_cons__rx_desc(rx, idx_rx++);
		addr = desc->addr;
		len = desc->len;
		pkt = xsk_umem__get_data(rxq->umem->mz->addr, addr);

		rte_memcpy(rte_pktmbuf_mtod(mbufs[i], void *), pkt, len);
		rte_ring_enqueue(umem->buf_ring, (void *)addr);
		rte_pktmbuf_pkt_len(mbufs[i]) = len;
		rte_pktmbuf_data_len(mbufs[i]) = len;
		rx_bytes += len;
		bufs[i] = mbufs[i];
	}

	xsk_ring_cons__release(rx, rcvd);

	/* statistics */
	rxq->stats.rx_pkts += rcvd;
	rxq->stats.rx_bytes += rx_bytes;

out:
	if (rcvd != nb_pkts)
		rte_mempool_put_bulk(rxq->mb_pool, (void **)&mbufs[rcvd],
				     nb_pkts - rcvd);

	return rcvd;
}
#endif

static uint16_t
eth_af_xdp_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	nb_pkts = RTE_MIN(nb_pkts, ETH_AF_XDP_RX_BATCH_SIZE);

#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
	return af_xdp_rx_zc(queue, bufs, nb_pkts);
#else
	return af_xdp_rx_cp(queue, bufs, nb_pkts);
#endif
}

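/*
 * Drain up to 'size' completed Tx descriptors from the completion queue (cq)
 * and release their buffers: freed back to the mempool in zero-copy mode,
 * re-enqueued on the buffer ring in copy mode.
 */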
static void
pull_umem_cq(struct xsk_umem_info *umem, int size, struct xsk_ring_cons *cq)
{
	size_t i, n;
	uint32_t idx_cq = 0;

	n = xsk_ring_cons__peek(cq, size, &idx_cq);

	for (i = 0; i < n; i++) {
		uint64_t addr;
		addr = *xsk_ring_cons__comp_addr(cq, idx_cq++);
#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
		addr = xsk_umem__extract_addr(addr);
		rte_pktmbuf_free((struct rte_mbuf *)
					xsk_umem__get_data(umem->buffer,
					addr + umem->mb_pool->header_size));
#else
		rte_ring_enqueue(umem->buf_ring, (void *)addr);
#endif
	}

	xsk_ring_cons__release(cq, n);
}

static void
kick_tx(struct pkt_tx_queue *txq, struct xsk_ring_cons *cq)
{
	struct xsk_umem_info *umem = txq->umem;

	pull_umem_cq(umem, XSK_RING_CONS__DEFAULT_NUM_DESCS, cq);

#if defined(XDP_USE_NEED_WAKEUP)
	if (xsk_ring_prod__needs_wakeup(&txq->tx))
#endif
		while (send(xsk_socket__fd(txq->pair->xsk), NULL,
			    0, MSG_DONTWAIT) < 0) {
			/* something unexpected */
			if (errno != EBUSY && errno != EAGAIN && errno != EINTR)
				break;

			/* pull from completion queue to leave more space */
			if (errno == EAGAIN)
				pull_umem_cq(umem,
					     XSK_RING_CONS__DEFAULT_NUM_DESCS,
					     cq);
		}
}

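/*
 * Transmit paths. In zero-copy mode, mbufs already belonging to the
 * UMEM-backed mempool are described in place and foreign mbufs are first
 * copied into a local UMEM mbuf; in copy mode every packet is copied into a
 * frame taken from the buffer ring. kick_tx() notifies the kernel via send()
 * so that queued descriptors are actually transmitted.
 */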
#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
static uint16_t
af_xdp_tx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	struct pkt_tx_queue *txq = queue;
	struct xsk_umem_info *umem = txq->umem;
	struct rte_mbuf *mbuf;
	unsigned long tx_bytes = 0;
	int i;
	uint32_t idx_tx;
	uint16_t count = 0;
	struct xdp_desc *desc;
	uint64_t addr, offset;
	struct xsk_ring_cons *cq = &txq->pair->cq;
	uint32_t free_thresh = cq->size >> 1;

	if (xsk_cons_nb_avail(cq, free_thresh) >= free_thresh)
		pull_umem_cq(umem, XSK_RING_CONS__DEFAULT_NUM_DESCS, cq);

	for (i = 0; i < nb_pkts; i++) {
		mbuf = bufs[i];

		if (mbuf->pool == umem->mb_pool) {
			if (!xsk_ring_prod__reserve(&txq->tx, 1, &idx_tx)) {
				kick_tx(txq, cq);
				if (!xsk_ring_prod__reserve(&txq->tx, 1,
							    &idx_tx))
					goto out;
			}
			desc = xsk_ring_prod__tx_desc(&txq->tx, idx_tx);
			desc->len = mbuf->pkt_len;
			addr = (uint64_t)mbuf - (uint64_t)umem->buffer -
					umem->mb_pool->header_size;
			offset = rte_pktmbuf_mtod(mbuf, uint64_t) -
					(uint64_t)mbuf +
					umem->mb_pool->header_size;
			offset = offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT;
			desc->addr = addr | offset;
			count++;
		} else {
			struct rte_mbuf *local_mbuf =
					rte_pktmbuf_alloc(umem->mb_pool);
			void *pkt;

			if (local_mbuf == NULL)
				goto out;

			if (!xsk_ring_prod__reserve(&txq->tx, 1, &idx_tx)) {
				rte_pktmbuf_free(local_mbuf);
				kick_tx(txq, cq);
				goto out;
			}

			desc = xsk_ring_prod__tx_desc(&txq->tx, idx_tx);
			desc->len = mbuf->pkt_len;

			addr = (uint64_t)local_mbuf - (uint64_t)umem->buffer -
					umem->mb_pool->header_size;
			offset = rte_pktmbuf_mtod(local_mbuf, uint64_t) -
					(uint64_t)local_mbuf +
					umem->mb_pool->header_size;
			pkt = xsk_umem__get_data(umem->buffer, addr + offset);
			offset = offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT;
			desc->addr = addr | offset;
			rte_memcpy(pkt, rte_pktmbuf_mtod(mbuf, void *),
					desc->len);
			rte_pktmbuf_free(mbuf);
			count++;
		}

		/* use desc->len: the copy path above may already have freed mbuf */
		tx_bytes += desc->len;
	}

	kick_tx(txq, cq);

out:
	xsk_ring_prod__submit(&txq->tx, count);

	txq->stats.tx_pkts += count;
	txq->stats.tx_bytes += tx_bytes;
	txq->stats.tx_dropped += nb_pkts - count;

	return count;
}
#else
static uint16_t
af_xdp_tx_cp(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	struct pkt_tx_queue *txq = queue;
	struct xsk_umem_info *umem = txq->umem;
	struct rte_mbuf *mbuf;
	void *addrs[ETH_AF_XDP_TX_BATCH_SIZE];
	unsigned long tx_bytes = 0;
	int i;
	uint32_t idx_tx;
	struct xsk_ring_cons *cq = &txq->pair->cq;

	nb_pkts = RTE_MIN(nb_pkts, ETH_AF_XDP_TX_BATCH_SIZE);

	pull_umem_cq(umem, nb_pkts, cq);

	nb_pkts = rte_ring_dequeue_bulk(umem->buf_ring, addrs,
					nb_pkts, NULL);
	if (nb_pkts == 0)
		return 0;

	if (xsk_ring_prod__reserve(&txq->tx, nb_pkts, &idx_tx) != nb_pkts) {
		kick_tx(txq, cq);
		rte_ring_enqueue_bulk(umem->buf_ring, addrs, nb_pkts, NULL);
		return 0;
	}

	for (i = 0; i < nb_pkts; i++) {
		struct xdp_desc *desc;
		void *pkt;

		desc = xsk_ring_prod__tx_desc(&txq->tx, idx_tx + i);
		mbuf = bufs[i];
		desc->len = mbuf->pkt_len;

		desc->addr = (uint64_t)addrs[i];
		pkt = xsk_umem__get_data(umem->mz->addr,
					 desc->addr);
		rte_memcpy(pkt, rte_pktmbuf_mtod(mbuf, void *), desc->len);
		tx_bytes += mbuf->pkt_len;
		rte_pktmbuf_free(mbuf);
	}

	xsk_ring_prod__submit(&txq->tx, nb_pkts);

	kick_tx(txq, cq);

	txq->stats.tx_pkts += nb_pkts;
	txq->stats.tx_bytes += tx_bytes;

	return nb_pkts;
}
#endif

static uint16_t
eth_af_xdp_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
	return af_xdp_tx_zc(queue, bufs, nb_pkts);
#else
	return af_xdp_tx_cp(queue, bufs, nb_pkts);
#endif
}

static int
eth_dev_start(struct rte_eth_dev *dev)
{
	dev->data->dev_link.link_status = ETH_LINK_UP;

	return 0;
}

/* This function gets called when the current port gets stopped. */
static int
eth_dev_stop(struct rte_eth_dev *dev)
{
	dev->data->dev_link.link_status = ETH_LINK_DOWN;
	return 0;
}

/* Find ethdev in list */
static inline struct internal_list *
find_internal_resource(struct pmd_internals *port_int)
{
	int found = 0;
	struct internal_list *list = NULL;

	if (port_int == NULL)
		return NULL;

	pthread_mutex_lock(&internal_list_lock);

	TAILQ_FOREACH(list, &internal_list, next) {
		struct pmd_internals *list_int =
				list->eth_dev->data->dev_private;
		if (list_int == port_int) {
			found = 1;
			break;
		}
	}

	pthread_mutex_unlock(&internal_list_lock);

	if (!found)
		return NULL;

	return list;
}

/* Check if the netdev,qid context already exists */
static inline bool
ctx_exists(struct pkt_rx_queue *rxq, const char *ifname,
		struct pkt_rx_queue *list_rxq, const char *list_ifname)
{
	bool exists = false;

	if (rxq->xsk_queue_idx == list_rxq->xsk_queue_idx &&
			!strncmp(ifname, list_ifname, IFNAMSIZ)) {
		AF_XDP_LOG(ERR, "ctx %s,%i already exists, cannot share umem\n",
					ifname, rxq->xsk_queue_idx);
		exists = true;
	}

	return exists;
}

/* Get a pointer to an existing UMEM which overlays the rxq's mb_pool */
static inline int
get_shared_umem(struct pkt_rx_queue *rxq, const char *ifname,
			struct xsk_umem_info **umem)
{
	struct internal_list *list;
	struct pmd_internals *internals;
	int i = 0, ret = 0;
	struct rte_mempool *mb_pool = rxq->mb_pool;

	if (mb_pool == NULL)
		return ret;

	pthread_mutex_lock(&internal_list_lock);

	TAILQ_FOREACH(list, &internal_list, next) {
		internals = list->eth_dev->data->dev_private;
		for (i = 0; i < internals->queue_cnt; i++) {
			struct pkt_rx_queue *list_rxq =
						&internals->rx_queues[i];
			if (rxq == list_rxq)
				continue;
			if (mb_pool == internals->rx_queues[i].mb_pool) {
				if (ctx_exists(rxq, ifname, list_rxq,
						internals->if_name)) {
					ret = -1;
					goto out;
				}
				if (__atomic_load_n(
					&internals->rx_queues[i].umem->refcnt,
							__ATOMIC_ACQUIRE)) {
					*umem = internals->rx_queues[i].umem;
					goto out;
				}
			}
		}
	}

out:
	pthread_mutex_unlock(&internal_list_lock);

	return ret;
}

static int
eth_dev_configure(struct rte_eth_dev *dev)
{
	struct pmd_internals *internal = dev->data->dev_private;

	/* rx/tx must be paired */
	if (dev->data->nb_rx_queues != dev->data->nb_tx_queues)
		return -EINVAL;

	if (internal->shared_umem) {
		struct internal_list *list = NULL;
		const char *name = dev->device->name;

		/* Ensure PMD is not already inserted into the list */
		list = find_internal_resource(internal);
		if (list)
			return 0;

		list = rte_zmalloc_socket(name, sizeof(*list), 0,
					dev->device->numa_node);
		if (list == NULL)
			return -1;

		list->eth_dev = dev;
		pthread_mutex_lock(&internal_list_lock);
		TAILQ_INSERT_TAIL(&internal_list, list, next);
		pthread_mutex_unlock(&internal_list_lock);
	}

	return 0;
}

static int
eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
{
	struct pmd_internals *internals = dev->data->dev_private;

	dev_info->if_index = internals->if_index;
	dev_info->max_mac_addrs = 1;
	dev_info->max_rx_pktlen = ETH_FRAME_LEN;
	dev_info->max_rx_queues = internals->queue_cnt;
	dev_info->max_tx_queues = internals->queue_cnt;

	dev_info->min_mtu = RTE_ETHER_MIN_MTU;
#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
	dev_info->max_mtu = getpagesize() -
				sizeof(struct rte_mempool_objhdr) -
				sizeof(struct rte_mbuf) -
				RTE_PKTMBUF_HEADROOM - XDP_PACKET_HEADROOM;
#else
	dev_info->max_mtu = ETH_AF_XDP_FRAME_SIZE - XDP_PACKET_HEADROOM;
#endif

	dev_info->default_rxportconf.nb_queues = 1;
	dev_info->default_txportconf.nb_queues = 1;
	dev_info->default_rxportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;
	dev_info->default_txportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;

	return 0;
}

static int
eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
{
	struct pmd_internals *internals = dev->data->dev_private;
	struct xdp_statistics xdp_stats;
	struct pkt_rx_queue *rxq;
	struct pkt_tx_queue *txq;
	socklen_t optlen;
	int i, ret;

	for (i = 0; i < dev->data->nb_rx_queues; i++) {
		optlen = sizeof(struct xdp_statistics);
		rxq = &internals->rx_queues[i];
		txq = rxq->pair;
		stats->q_ipackets[i] = rxq->stats.rx_pkts;
		stats->q_ibytes[i] = rxq->stats.rx_bytes;

		stats->q_opackets[i] = txq->stats.tx_pkts;
		stats->q_obytes[i] = txq->stats.tx_bytes;

		stats->ipackets += stats->q_ipackets[i];
		stats->ibytes += stats->q_ibytes[i];
		stats->imissed += rxq->stats.rx_dropped;
		stats->oerrors += txq->stats.tx_dropped;
		ret = getsockopt(xsk_socket__fd(rxq->xsk), SOL_XDP,
				XDP_STATISTICS, &xdp_stats, &optlen);
		if (ret != 0) {
			AF_XDP_LOG(ERR, "getsockopt() failed for XDP_STATISTICS.\n");
			return -1;
		}
		stats->imissed += xdp_stats.rx_dropped;

		stats->opackets += stats->q_opackets[i];
		stats->obytes += stats->q_obytes[i];
	}

	return 0;
}

static int
eth_stats_reset(struct rte_eth_dev *dev)
{
	struct pmd_internals *internals = dev->data->dev_private;
	int i;

	for (i = 0; i < internals->queue_cnt; i++) {
		memset(&internals->rx_queues[i].stats, 0,
					sizeof(struct rx_stats));
		memset(&internals->tx_queues[i].stats, 0,
					sizeof(struct tx_stats));
	}

	return 0;
}

static void
remove_xdp_program(struct pmd_internals *internals)
{
	uint32_t curr_prog_id = 0;

	if (bpf_get_link_xdp_id(internals->if_index, &curr_prog_id,
				XDP_FLAGS_UPDATE_IF_NOEXIST)) {
		AF_XDP_LOG(ERR, "bpf_get_link_xdp_id failed\n");
		return;
	}
	bpf_set_link_xdp_fd(internals->if_index, -1,
			XDP_FLAGS_UPDATE_IF_NOEXIST);
}

static void
xdp_umem_destroy(struct xsk_umem_info *umem)
{
#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
	umem->mb_pool = NULL;
#else
	rte_memzone_free(umem->mz);
	umem->mz = NULL;

	rte_ring_free(umem->buf_ring);
	umem->buf_ring = NULL;
#endif

	rte_free(umem);
	umem = NULL;
}

static int
eth_dev_close(struct rte_eth_dev *dev)
{
	struct pmd_internals *internals = dev->data->dev_private;
	struct pkt_rx_queue *rxq;
	int i;

	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	AF_XDP_LOG(INFO, "Closing AF_XDP ethdev on numa socket %u\n",
		rte_socket_id());

	for (i = 0; i < internals->queue_cnt; i++) {
		rxq = &internals->rx_queues[i];
		if (rxq->umem == NULL)
			break;
		xsk_socket__delete(rxq->xsk);

		if (__atomic_sub_fetch(&rxq->umem->refcnt, 1, __ATOMIC_ACQUIRE)
				== 0) {
			(void)xsk_umem__delete(rxq->umem->umem);
			xdp_umem_destroy(rxq->umem);
		}

		/* free pkt_tx_queue */
		rte_free(rxq->pair);
		rte_free(rxq);
	}

	/*
	 * The MAC address is not allocated dynamically; set it to NULL so that
	 * rte_eth_dev_release_port does not attempt to free it.
	 */
	dev->data->mac_addrs = NULL;

	remove_xdp_program(internals);

	if (internals->shared_umem) {
		struct internal_list *list;

		/* Remove ethdev from list used to track and share UMEMs */
		list = find_internal_resource(internals);
		if (list) {
			pthread_mutex_lock(&internal_list_lock);
			TAILQ_REMOVE(&internal_list, list, next);
			pthread_mutex_unlock(&internal_list_lock);
			rte_free(list);
		}
	}

	return 0;
}

static void
eth_queue_release(void *q __rte_unused)
{
}

static int
eth_link_update(struct rte_eth_dev *dev __rte_unused,
		int wait_to_complete __rte_unused)
{
	return 0;
}

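/*
 * UMEM setup. In zero-copy mode the UMEM is registered directly on top of
 * the rx queue's mbuf mempool (and may be shared between sockets when the
 * shared_umem devarg is set); in copy mode a dedicated IOVA-contiguous
 * memzone of ETH_AF_XDP_NUM_BUFFERS frames plus a buffer ring is created
 * per queue instead.
 */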
#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
static inline uintptr_t get_base_addr(struct rte_mempool *mp, uint64_t *align)
{
	struct rte_mempool_memhdr *memhdr;
	uintptr_t memhdr_addr, aligned_addr;

	memhdr = STAILQ_FIRST(&mp->mem_list);
	memhdr_addr = (uintptr_t)memhdr->addr;
	aligned_addr = memhdr_addr & ~(getpagesize() - 1);
	*align = memhdr_addr - aligned_addr;

	return aligned_addr;
}

static struct
xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
				  struct pkt_rx_queue *rxq)
{
	struct xsk_umem_info *umem = NULL;
	int ret;
	struct xsk_umem_config usr_config = {
		.fill_size = ETH_AF_XDP_DFLT_NUM_DESCS * 2,
		.comp_size = ETH_AF_XDP_DFLT_NUM_DESCS,
		.flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG};
	void *base_addr = NULL;
	struct rte_mempool *mb_pool = rxq->mb_pool;
	uint64_t umem_size, align = 0;

	if (internals->shared_umem) {
		if (get_shared_umem(rxq, internals->if_name, &umem) < 0)
			return NULL;

		if (umem != NULL &&
			__atomic_load_n(&umem->refcnt, __ATOMIC_ACQUIRE) <
					umem->max_xsks) {
			AF_XDP_LOG(INFO, "%s,qid%i sharing UMEM\n",
					internals->if_name, rxq->xsk_queue_idx);
			__atomic_fetch_add(&umem->refcnt, 1, __ATOMIC_ACQUIRE);
		}
	}

	if (umem == NULL) {
		usr_config.frame_size =
			rte_mempool_calc_obj_size(mb_pool->elt_size,
						  mb_pool->flags, NULL);
		usr_config.frame_headroom = mb_pool->header_size +
						sizeof(struct rte_mbuf) +
						rte_pktmbuf_priv_size(mb_pool) +
						RTE_PKTMBUF_HEADROOM;

		umem = rte_zmalloc_socket("umem", sizeof(*umem), 0,
					  rte_socket_id());
		if (umem == NULL) {
			AF_XDP_LOG(ERR, "Failed to allocate umem info");
			return NULL;
		}

		umem->mb_pool = mb_pool;
		base_addr = (void *)get_base_addr(mb_pool, &align);
		umem_size = (uint64_t)mb_pool->populated_size *
				(uint64_t)usr_config.frame_size +
				align;

		ret = xsk_umem__create(&umem->umem, base_addr, umem_size,
				&rxq->fq, &rxq->cq, &usr_config);
		if (ret) {
			AF_XDP_LOG(ERR, "Failed to create umem");
			goto err;
		}
		umem->buffer = base_addr;

		if (internals->shared_umem) {
			umem->max_xsks = mb_pool->populated_size /
						ETH_AF_XDP_NUM_BUFFERS;
			AF_XDP_LOG(INFO, "Max xsks for UMEM %s: %u\n",
						mb_pool->name, umem->max_xsks);
		}

		__atomic_store_n(&umem->refcnt, 1, __ATOMIC_RELEASE);
	}

#else
static struct
xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
				  struct pkt_rx_queue *rxq)
{
	struct xsk_umem_info *umem;
	const struct rte_memzone *mz;
	struct xsk_umem_config usr_config = {
		.fill_size = ETH_AF_XDP_DFLT_NUM_DESCS,
		.comp_size = ETH_AF_XDP_DFLT_NUM_DESCS,
		.frame_size = ETH_AF_XDP_FRAME_SIZE,
		.frame_headroom = 0 };
	char ring_name[RTE_RING_NAMESIZE];
	char mz_name[RTE_MEMZONE_NAMESIZE];
	int ret;
	uint64_t i;

	umem = rte_zmalloc_socket("umem", sizeof(*umem), 0, rte_socket_id());
	if (umem == NULL) {
		AF_XDP_LOG(ERR, "Failed to allocate umem info");
		return NULL;
	}

	snprintf(ring_name, sizeof(ring_name), "af_xdp_ring_%s_%u",
		       internals->if_name, rxq->xsk_queue_idx);
	umem->buf_ring = rte_ring_create(ring_name,
					 ETH_AF_XDP_NUM_BUFFERS,
					 rte_socket_id(),
					 0x0);
	if (umem->buf_ring == NULL) {
		AF_XDP_LOG(ERR, "Failed to create rte_ring\n");
		goto err;
	}

	for (i = 0; i < ETH_AF_XDP_NUM_BUFFERS; i++)
		rte_ring_enqueue(umem->buf_ring,
				 (void *)(i * ETH_AF_XDP_FRAME_SIZE));

	snprintf(mz_name, sizeof(mz_name), "af_xdp_umem_%s_%u",
		       internals->if_name, rxq->xsk_queue_idx);
	mz = rte_memzone_reserve_aligned(mz_name,
			ETH_AF_XDP_NUM_BUFFERS * ETH_AF_XDP_FRAME_SIZE,
			rte_socket_id(), RTE_MEMZONE_IOVA_CONTIG,
			getpagesize());
	if (mz == NULL) {
		AF_XDP_LOG(ERR, "Failed to reserve memzone for af_xdp umem.\n");
		goto err;
	}

	ret = xsk_umem__create(&umem->umem, mz->addr,
			       ETH_AF_XDP_NUM_BUFFERS * ETH_AF_XDP_FRAME_SIZE,
			       &rxq->fq, &rxq->cq,
			       &usr_config);

	if (ret) {
		AF_XDP_LOG(ERR, "Failed to create umem");
		goto err;
	}
	umem->mz = mz;

#endif
	return umem;

err:
	xdp_umem_destroy(umem);
	return NULL;
}

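/*
 * Load a user-supplied XDP program (xdp_prog devarg) and attach it to the
 * interface. The program must contain an "xsks_map" map so that libbpf can
 * insert the created sockets and traffic can be redirected to them.
 */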
static int
load_custom_xdp_prog(const char *prog_path, int if_index)
{
	int ret, prog_fd = -1;
	struct bpf_object *obj;
	struct bpf_map *map;

	ret = bpf_prog_load(prog_path, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
	if (ret) {
		AF_XDP_LOG(ERR, "Failed to load program %s\n", prog_path);
		return ret;
	}

	/*
	 * The loaded program must provision for a map of xsks, such that some
	 * traffic can be redirected to userspace. When the xsk is created,
	 * libbpf inserts it into the map.
	 */
	map = bpf_object__find_map_by_name(obj, "xsks_map");
	if (!map) {
		AF_XDP_LOG(ERR, "Failed to find xsks_map in %s\n", prog_path);
		return -1;
	}

	/* Link the program with the given network device */
	ret = bpf_set_link_xdp_fd(if_index, prog_fd,
					XDP_FLAGS_UPDATE_IF_NOEXIST);
	if (ret) {
		AF_XDP_LOG(ERR, "Failed to set prog fd %d on interface\n",
				prog_fd);
		return -1;
	}

	AF_XDP_LOG(INFO, "Successfully loaded XDP program %s with fd %d\n",
				prog_path, prog_fd);

	return 0;
}

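/*
 * Create the UMEM and AF_XDP socket for one rx/tx queue pair, load a custom
 * XDP program if one was supplied, and pre-populate the fill queue.
 */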
static int
xsk_configure(struct pmd_internals *internals, struct pkt_rx_queue *rxq,
	      int ring_size)
{
	struct xsk_socket_config cfg;
	struct pkt_tx_queue *txq = rxq->pair;
	int ret = 0;
	int reserve_size = ETH_AF_XDP_DFLT_NUM_DESCS;
	struct rte_mbuf *fq_bufs[reserve_size];

	rxq->umem = xdp_umem_configure(internals, rxq);
	if (rxq->umem == NULL)
		return -ENOMEM;
	txq->umem = rxq->umem;

	cfg.rx_size = ring_size;
	cfg.tx_size = ring_size;
	cfg.libbpf_flags = 0;
	cfg.xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
	cfg.bind_flags = 0;

#if defined(XDP_USE_NEED_WAKEUP)
	cfg.bind_flags |= XDP_USE_NEED_WAKEUP;
#endif

	if (strnlen(internals->prog_path, PATH_MAX) &&
				!internals->custom_prog_configured) {
		ret = load_custom_xdp_prog(internals->prog_path,
					   internals->if_index);
		if (ret) {
			AF_XDP_LOG(ERR, "Failed to load custom XDP program %s\n",
					internals->prog_path);
			goto err;
		}
		internals->custom_prog_configured = 1;
	}

	if (internals->shared_umem)
		ret = create_shared_socket(&rxq->xsk, internals->if_name,
				rxq->xsk_queue_idx, rxq->umem->umem, &rxq->rx,
				&txq->tx, &rxq->fq, &rxq->cq, &cfg);
	else
		ret = xsk_socket__create(&rxq->xsk, internals->if_name,
				rxq->xsk_queue_idx, rxq->umem->umem, &rxq->rx,
				&txq->tx, &cfg);

	if (ret) {
		AF_XDP_LOG(ERR, "Failed to create xsk socket.\n");
		goto err;
	}

#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
	if (rte_pktmbuf_alloc_bulk(rxq->umem->mb_pool, fq_bufs, reserve_size)) {
		AF_XDP_LOG(DEBUG, "Failed to get enough buffers for fq.\n");
		goto err;
	}
#endif
	ret = reserve_fill_queue(rxq->umem, reserve_size, fq_bufs, &rxq->fq);
	if (ret) {
		xsk_socket__delete(rxq->xsk);
		AF_XDP_LOG(ERR, "Failed to reserve fill queue.\n");
		goto err;
	}

	return 0;

err:
	if (__atomic_sub_fetch(&rxq->umem->refcnt, 1, __ATOMIC_ACQUIRE) == 0)
		xdp_umem_destroy(rxq->umem);

	return ret;
}

static int
eth_rx_queue_setup(struct rte_eth_dev *dev,
		   uint16_t rx_queue_id,
		   uint16_t nb_rx_desc,
		   unsigned int socket_id __rte_unused,
		   const struct rte_eth_rxconf *rx_conf __rte_unused,
		   struct rte_mempool *mb_pool)
{
	struct pmd_internals *internals = dev->data->dev_private;
	struct pkt_rx_queue *rxq;
	int ret;

	rxq = &internals->rx_queues[rx_queue_id];

	AF_XDP_LOG(INFO, "Set up rx queue, rx queue id: %d, xsk queue id: %d\n",
		   rx_queue_id, rxq->xsk_queue_idx);

#ifndef XDP_UMEM_UNALIGNED_CHUNK_FLAG
	uint32_t buf_size, data_size;

	/* Now get the space available for data in the mbuf */
	buf_size = rte_pktmbuf_data_room_size(mb_pool) -
		RTE_PKTMBUF_HEADROOM;
	data_size = ETH_AF_XDP_FRAME_SIZE;

	if (data_size > buf_size) {
		AF_XDP_LOG(ERR, "%s: %d bytes will not fit in mbuf (%d bytes)\n",
			dev->device->name, data_size, buf_size);
		ret = -ENOMEM;
		goto err;
	}
#endif

	rxq->mb_pool = mb_pool;

	if (xsk_configure(internals, rxq, nb_rx_desc)) {
		AF_XDP_LOG(ERR, "Failed to configure xdp socket\n");
		ret = -EINVAL;
		goto err;
	}

	rxq->fds[0].fd = xsk_socket__fd(rxq->xsk);
	rxq->fds[0].events = POLLIN;

	dev->data->rx_queues[rx_queue_id] = rxq;
	return 0;

err:
	return ret;
}

static int
eth_tx_queue_setup(struct rte_eth_dev *dev,
		   uint16_t tx_queue_id,
		   uint16_t nb_tx_desc __rte_unused,
		   unsigned int socket_id __rte_unused,
		   const struct rte_eth_txconf *tx_conf __rte_unused)
{
	struct pmd_internals *internals = dev->data->dev_private;
	struct pkt_tx_queue *txq;

	txq = &internals->tx_queues[tx_queue_id];

	dev->data->tx_queues[tx_queue_id] = txq;
	return 0;
}

static int
eth_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
{
	struct pmd_internals *internals = dev->data->dev_private;
	struct ifreq ifr = { .ifr_mtu = mtu };
	int ret;
	int s;

	s = socket(PF_INET, SOCK_DGRAM, 0);
	if (s < 0)
		return -EINVAL;

	strlcpy(ifr.ifr_name, internals->if_name, IFNAMSIZ);
	ret = ioctl(s, SIOCSIFMTU, &ifr);
	close(s);

	return (ret < 0) ? -errno : 0;
}

static int
eth_dev_change_flags(char *if_name, uint32_t flags, uint32_t mask)
{
	struct ifreq ifr;
	int ret = 0;
	int s;

	s = socket(PF_INET, SOCK_DGRAM, 0);
	if (s < 0)
		return -errno;

	strlcpy(ifr.ifr_name, if_name, IFNAMSIZ);
	if (ioctl(s, SIOCGIFFLAGS, &ifr) < 0) {
		ret = -errno;
		goto out;
	}
	ifr.ifr_flags &= mask;
	ifr.ifr_flags |= flags;
	if (ioctl(s, SIOCSIFFLAGS, &ifr) < 0) {
		ret = -errno;
		goto out;
	}
out:
	close(s);
	return ret;
}

static int
eth_dev_promiscuous_enable(struct rte_eth_dev *dev)
{
	struct pmd_internals *internals = dev->data->dev_private;

	return eth_dev_change_flags(internals->if_name, IFF_PROMISC, ~0);
}

static int
eth_dev_promiscuous_disable(struct rte_eth_dev *dev)
{
	struct pmd_internals *internals = dev->data->dev_private;

	return eth_dev_change_flags(internals->if_name, 0, ~IFF_PROMISC);
}

static const struct eth_dev_ops ops = {
	.dev_start = eth_dev_start,
	.dev_stop = eth_dev_stop,
	.dev_close = eth_dev_close,
	.dev_configure = eth_dev_configure,
	.dev_infos_get = eth_dev_info,
	.mtu_set = eth_dev_mtu_set,
	.promiscuous_enable = eth_dev_promiscuous_enable,
	.promiscuous_disable = eth_dev_promiscuous_disable,
	.rx_queue_setup = eth_rx_queue_setup,
	.tx_queue_setup = eth_tx_queue_setup,
	.rx_queue_release = eth_queue_release,
	.tx_queue_release = eth_queue_release,
	.link_update = eth_link_update,
	.stats_get = eth_stats_get,
	.stats_reset = eth_stats_reset,
};

/** parse integer from integer argument */
static int
parse_integer_arg(const char *key __rte_unused,
		  const char *value, void *extra_args)
{
	int *i = (int *)extra_args;
	char *end;

	*i = strtol(value, &end, 10);
	if (*i < 0) {
		AF_XDP_LOG(ERR, "Argument has to be positive.\n");
		return -EINVAL;
	}

	return 0;
}

/** parse name argument */
static int
parse_name_arg(const char *key __rte_unused,
	       const char *value, void *extra_args)
{
	char *name = extra_args;

	if (strnlen(value, IFNAMSIZ) > IFNAMSIZ - 1) {
		AF_XDP_LOG(ERR, "Invalid name %s, should be less than %u bytes.\n",
			   value, IFNAMSIZ);
		return -EINVAL;
	}

	strlcpy(name, value, IFNAMSIZ);

	return 0;
}

/** parse xdp prog argument */
static int
parse_prog_arg(const char *key __rte_unused,
	       const char *value, void *extra_args)
{
	char *path = extra_args;

	if (strnlen(value, PATH_MAX) == PATH_MAX) {
		AF_XDP_LOG(ERR, "Invalid path %s, should be less than %u bytes.\n",
			   value, PATH_MAX);
		return -EINVAL;
	}

	if (access(value, F_OK) != 0) {
		AF_XDP_LOG(ERR, "Error accessing %s: %s\n",
			   value, strerror(errno));
		return -EINVAL;
	}

	strlcpy(path, value, PATH_MAX);

	return 0;
}

static int
xdp_get_channels_info(const char *if_name, int *max_queues,
				int *combined_queues)
{
	struct ethtool_channels channels;
	struct ifreq ifr;
	int fd, ret;

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0)
		return -1;

	channels.cmd = ETHTOOL_GCHANNELS;
	ifr.ifr_data = (void *)&channels;
	strlcpy(ifr.ifr_name, if_name, IFNAMSIZ);
	ret = ioctl(fd, SIOCETHTOOL, &ifr);
	if (ret) {
		if (errno == EOPNOTSUPP) {
			ret = 0;
		} else {
			ret = -errno;
			goto out;
		}
	}

	if (channels.max_combined == 0 || errno == EOPNOTSUPP) {
		/* If the device says it has no channels, then all traffic
		 * is sent to a single stream, so max queues = 1.
		 */
		*max_queues = 1;
		*combined_queues = 1;
	} else {
		*max_queues = channels.max_combined;
		*combined_queues = channels.combined_count;
	}

 out:
	close(fd);
	return ret;
}

static int
parse_parameters(struct rte_kvargs *kvlist, char *if_name, int *start_queue,
			int *queue_cnt, int *shared_umem, char *prog_path)
{
	int ret;

	ret = rte_kvargs_process(kvlist, ETH_AF_XDP_IFACE_ARG,
				 &parse_name_arg, if_name);
	if (ret < 0)
		goto free_kvlist;

	ret = rte_kvargs_process(kvlist, ETH_AF_XDP_START_QUEUE_ARG,
				 &parse_integer_arg, start_queue);
	if (ret < 0)
		goto free_kvlist;

	ret = rte_kvargs_process(kvlist, ETH_AF_XDP_QUEUE_COUNT_ARG,
				 &parse_integer_arg, queue_cnt);
	if (ret < 0 || *queue_cnt <= 0) {
		ret = -EINVAL;
		goto free_kvlist;
	}

	ret = rte_kvargs_process(kvlist, ETH_AF_XDP_SHARED_UMEM_ARG,
				&parse_integer_arg, shared_umem);
	if (ret < 0)
		goto free_kvlist;

	ret = rte_kvargs_process(kvlist, ETH_AF_XDP_PROG_ARG,
				 &parse_prog_arg, prog_path);
	if (ret < 0)
		goto free_kvlist;

free_kvlist:
	rte_kvargs_free(kvlist);
	return ret;
}

static int
get_iface_info(const char *if_name,
	       struct rte_ether_addr *eth_addr,
	       int *if_index)
{
	struct ifreq ifr;
	int sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);

	if (sock < 0)
		return -1;

	strlcpy(ifr.ifr_name, if_name, IFNAMSIZ);
	if (ioctl(sock, SIOCGIFINDEX, &ifr))
		goto error;

	*if_index = ifr.ifr_ifindex;

	if (ioctl(sock, SIOCGIFHWADDR, &ifr))
		goto error;

	rte_memcpy(eth_addr, ifr.ifr_hwaddr.sa_data, RTE_ETHER_ADDR_LEN);

	close(sock);
	return 0;

error:
	close(sock);
	return -1;
}

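/*
 * Allocate and populate pmd_internals and the ethdev for a probed vdev:
 * validate the requested queue count against the interface's channel
 * configuration, pair up rx/tx queues and fetch the MAC address and ifindex.
 */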
static struct rte_eth_dev *
init_internals(struct rte_vdev_device *dev, const char *if_name,
		int start_queue_idx, int queue_cnt, int shared_umem,
		const char *prog_path)
{
	const char *name = rte_vdev_device_name(dev);
	const unsigned int numa_node = dev->device.numa_node;
	struct pmd_internals *internals;
	struct rte_eth_dev *eth_dev;
	int ret;
	int i;

	internals = rte_zmalloc_socket(name, sizeof(*internals), 0, numa_node);
	if (internals == NULL)
		return NULL;

	internals->start_queue_idx = start_queue_idx;
	internals->queue_cnt = queue_cnt;
	strlcpy(internals->if_name, if_name, IFNAMSIZ);
	strlcpy(internals->prog_path, prog_path, PATH_MAX);
	internals->custom_prog_configured = 0;

#ifndef ETH_AF_XDP_SHARED_UMEM
	if (shared_umem) {
		AF_XDP_LOG(ERR, "Shared UMEM feature not available. "
				"Check kernel and libbpf version\n");
		goto err_free_internals;
	}
#endif
	internals->shared_umem = shared_umem;

	if (xdp_get_channels_info(if_name, &internals->max_queue_cnt,
				  &internals->combined_queue_cnt)) {
		AF_XDP_LOG(ERR, "Failed to get channel info of interface: %s\n",
				if_name);
		goto err_free_internals;
	}

	if (queue_cnt > internals->combined_queue_cnt) {
		AF_XDP_LOG(ERR, "Specified queue count %d is larger than combined queue count %d.\n",
				queue_cnt, internals->combined_queue_cnt);
		goto err_free_internals;
	}

	internals->rx_queues = rte_zmalloc_socket(NULL,
					sizeof(struct pkt_rx_queue) * queue_cnt,
					0, numa_node);
	if (internals->rx_queues == NULL) {
		AF_XDP_LOG(ERR, "Failed to allocate memory for rx queues.\n");
		goto err_free_internals;
	}

	internals->tx_queues = rte_zmalloc_socket(NULL,
					sizeof(struct pkt_tx_queue) * queue_cnt,
					0, numa_node);
	if (internals->tx_queues == NULL) {
		AF_XDP_LOG(ERR, "Failed to allocate memory for tx queues.\n");
		goto err_free_rx;
	}
	for (i = 0; i < queue_cnt; i++) {
		internals->tx_queues[i].pair = &internals->rx_queues[i];
		internals->rx_queues[i].pair = &internals->tx_queues[i];
		internals->rx_queues[i].xsk_queue_idx = start_queue_idx + i;
		internals->tx_queues[i].xsk_queue_idx = start_queue_idx + i;
	}

	ret = get_iface_info(if_name, &internals->eth_addr,
			     &internals->if_index);
	if (ret)
		goto err_free_tx;

	eth_dev = rte_eth_vdev_allocate(dev, 0);
	if (eth_dev == NULL)
		goto err_free_tx;

	eth_dev->data->dev_private = internals;
	eth_dev->data->dev_link = pmd_link;
	eth_dev->data->mac_addrs = &internals->eth_addr;
	eth_dev->data->dev_flags |= RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
	eth_dev->dev_ops = &ops;
	eth_dev->rx_pkt_burst = eth_af_xdp_rx;
	eth_dev->tx_pkt_burst = eth_af_xdp_tx;

#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
	AF_XDP_LOG(INFO, "Zero copy between umem and mbuf enabled.\n");
#endif

	return eth_dev;

err_free_tx:
	rte_free(internals->tx_queues);
err_free_rx:
	rte_free(internals->rx_queues);
err_free_internals:
	rte_free(internals);
	return NULL;
}

static int
rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
{
	struct rte_kvargs *kvlist;
	char if_name[IFNAMSIZ] = {'\0'};
	int xsk_start_queue_idx = ETH_AF_XDP_DFLT_START_QUEUE_IDX;
	int xsk_queue_cnt = ETH_AF_XDP_DFLT_QUEUE_COUNT;
	int shared_umem = 0;
	char prog_path[PATH_MAX] = {'\0'};
	struct rte_eth_dev *eth_dev = NULL;
	const char *name;

	AF_XDP_LOG(INFO, "Initializing pmd_af_xdp for %s\n",
		rte_vdev_device_name(dev));

	name = rte_vdev_device_name(dev);
	if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
		strlen(rte_vdev_device_args(dev)) == 0) {
		eth_dev = rte_eth_dev_attach_secondary(name);
		if (eth_dev == NULL) {
			AF_XDP_LOG(ERR, "Failed to probe %s\n", name);
			return -EINVAL;
		}
		eth_dev->dev_ops = &ops;
		rte_eth_dev_probing_finish(eth_dev);
		return 0;
	}

	kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
	if (kvlist == NULL) {
		AF_XDP_LOG(ERR, "Invalid kvargs key\n");
		return -EINVAL;
	}

	if (dev->device.numa_node == SOCKET_ID_ANY)
		dev->device.numa_node = rte_socket_id();

	if (parse_parameters(kvlist, if_name, &xsk_start_queue_idx,
			     &xsk_queue_cnt, &shared_umem, prog_path) < 0) {
		AF_XDP_LOG(ERR, "Invalid kvargs value\n");
		return -EINVAL;
	}

	if (strlen(if_name) == 0) {
		AF_XDP_LOG(ERR, "Network interface must be specified\n");
		return -EINVAL;
	}

	eth_dev = init_internals(dev, if_name, xsk_start_queue_idx,
					xsk_queue_cnt, shared_umem, prog_path);
	if (eth_dev == NULL) {
		AF_XDP_LOG(ERR, "Failed to init internals\n");
		return -1;
	}

	rte_eth_dev_probing_finish(eth_dev);

	return 0;
}

static int
rte_pmd_af_xdp_remove(struct rte_vdev_device *dev)
{
	struct rte_eth_dev *eth_dev = NULL;

	AF_XDP_LOG(INFO, "Removing AF_XDP ethdev on numa socket %u\n",
		rte_socket_id());

	if (dev == NULL)
		return -1;

	/* find the ethdev entry */
	eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
	if (eth_dev == NULL)
		return 0;

	eth_dev_close(eth_dev);
	rte_eth_dev_release_port(eth_dev);

	return 0;
}

static struct rte_vdev_driver pmd_af_xdp_drv = {
	.probe = rte_pmd_af_xdp_probe,
	.remove = rte_pmd_af_xdp_remove,
};

RTE_PMD_REGISTER_VDEV(net_af_xdp, pmd_af_xdp_drv);
RTE_PMD_REGISTER_PARAM_STRING(net_af_xdp,
			      "iface=<string> "
			      "start_queue=<int> "
			      "queue_count=<int> "
			      "shared_umem=<int> "
			      "xdp_prog=<string> ");
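
/*
 * Example usage (illustrative only; the interface name is a placeholder and
 * any application accepting EAL --vdev arguments works the same way):
 *
 *   dpdk-testpmd --vdev net_af_xdp,iface=eth0,start_queue=0,queue_count=1
 */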