xref: /dpdk/drivers/net/vhost/rte_eth_vhost.c (revision 29fd052d)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2016 IGEL Co., Ltd.
3  * Copyright(c) 2016-2018 Intel Corporation
4  */
5 #include <unistd.h>
6 #include <pthread.h>
7 #include <stdbool.h>
8 #include <sys/epoll.h>
#include <errno.h>	/* errno check in open_int() */
#include <limits.h>	/* PATH_MAX, USHRT_MAX */
#include <string.h>	/* memset(), strcmp(), strcpy() */
9 
10 #include <rte_mbuf.h>
11 #include <ethdev_driver.h>
12 #include <ethdev_vdev.h>
13 #include <rte_malloc.h>
14 #include <rte_memcpy.h>
15 #include <rte_bus_vdev.h>
16 #include <rte_kvargs.h>
17 #include <rte_vhost.h>
18 #include <rte_spinlock.h>
19 
20 #include "rte_eth_vhost.h"
21 
22 RTE_LOG_REGISTER_DEFAULT(vhost_logtype, NOTICE);
23 
24 #define VHOST_LOG(level, ...) \
25 	rte_log(RTE_LOG_ ## level, vhost_logtype, __VA_ARGS__)
26 
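/* Per queue pair, vring 0 is the guest RX ring and vring 1 the guest TX ring. */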
27 enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
28 
29 #define ETH_VHOST_IFACE_ARG		"iface"
30 #define ETH_VHOST_QUEUES_ARG		"queues"
31 #define ETH_VHOST_CLIENT_ARG		"client"
32 #define ETH_VHOST_IOMMU_SUPPORT		"iommu-support"
33 #define ETH_VHOST_POSTCOPY_SUPPORT	"postcopy-support"
34 #define ETH_VHOST_VIRTIO_NET_F_HOST_TSO "tso"
35 #define ETH_VHOST_LINEAR_BUF  "linear-buffer"
36 #define ETH_VHOST_EXT_BUF  "ext-buffer"
37 #define VHOST_MAX_PKT_BURST 32
38 
39 static const char *valid_arguments[] = {
40 	ETH_VHOST_IFACE_ARG,
41 	ETH_VHOST_QUEUES_ARG,
42 	ETH_VHOST_CLIENT_ARG,
43 	ETH_VHOST_IOMMU_SUPPORT,
44 	ETH_VHOST_POSTCOPY_SUPPORT,
45 	ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
46 	ETH_VHOST_LINEAR_BUF,
47 	ETH_VHOST_EXT_BUF,
48 	NULL
49 };
50 
51 static struct rte_ether_addr base_eth_addr = {
52 	.addr_bytes = {
53 		0x56 /* V */,
54 		0x48 /* H */,
55 		0x4F /* O */,
56 		0x53 /* S */,
57 		0x54 /* T */,
58 		0x00
59 	}
60 };
61 
62 enum vhost_xstats_pkts {
63 	VHOST_UNDERSIZE_PKT = 0,
64 	VHOST_64_PKT,
65 	VHOST_65_TO_127_PKT,
66 	VHOST_128_TO_255_PKT,
67 	VHOST_256_TO_511_PKT,
68 	VHOST_512_TO_1023_PKT,
69 	VHOST_1024_TO_1522_PKT,
70 	VHOST_1523_TO_MAX_PKT,
71 	VHOST_BROADCAST_PKT,
72 	VHOST_MULTICAST_PKT,
73 	VHOST_UNICAST_PKT,
74 	VHOST_PKT,
75 	VHOST_BYTE,
76 	VHOST_MISSED_PKT,
77 	VHOST_ERRORS_PKT,
78 	VHOST_ERRORS_FRAGMENTED,
79 	VHOST_ERRORS_JABBER,
80 	VHOST_UNKNOWN_PROTOCOL,
81 	VHOST_XSTATS_MAX,
82 };
83 
84 struct vhost_stats {
85 	uint64_t pkts;
86 	uint64_t bytes;
87 	uint64_t missed_pkts;
88 	uint64_t xstats[VHOST_XSTATS_MAX];
89 };
90 
91 struct vhost_queue {
92 	int vid;
93 	rte_atomic32_t allow_queuing;
94 	rte_atomic32_t while_queuing;
95 	struct pmd_internal *internal;
96 	struct rte_mempool *mb_pool;
97 	uint16_t port;
98 	uint16_t virtqueue_id;
99 	struct vhost_stats stats;
100 	int intr_enable;
101 	rte_spinlock_t intr_lock;
102 };
103 
104 struct pmd_internal {
105 	rte_atomic32_t dev_attached;
106 	char *iface_name;
107 	uint64_t flags;
108 	uint64_t disable_flags;
109 	uint16_t max_queues;
110 	int vid;
111 	rte_atomic32_t started;
112 	uint8_t vlan_strip;
113 };
114 
115 struct internal_list {
116 	TAILQ_ENTRY(internal_list) next;
117 	struct rte_eth_dev *eth_dev;
118 };
119 
120 TAILQ_HEAD(internal_list_head, internal_list);
121 static struct internal_list_head internal_list =
122 	TAILQ_HEAD_INITIALIZER(internal_list);
123 
124 static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
125 
126 static struct rte_eth_link pmd_link = {
127 		.link_speed = 10000,
128 		.link_duplex = RTE_ETH_LINK_FULL_DUPLEX,
129 		.link_status = RTE_ETH_LINK_DOWN
130 };
131 
132 struct rte_vhost_vring_state {
133 	rte_spinlock_t lock;
134 
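	/*
	 * cur[] holds the latest per-vring enable state reported by vhost;
	 * seen[] tracks what has already been consumed through
	 * rte_eth_vhost_get_queue_event(). Two vrings (RX/TX) per queue pair.
	 */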
135 	bool cur[RTE_MAX_QUEUES_PER_PORT * 2];
136 	bool seen[RTE_MAX_QUEUES_PER_PORT * 2];
137 	unsigned int index;
138 	unsigned int max_vring;
139 };
140 
141 static struct rte_vhost_vring_state *vring_states[RTE_MAX_ETHPORTS];
142 
143 #define VHOST_XSTATS_NAME_SIZE 64
144 
145 struct vhost_xstats_name_off {
146 	char name[VHOST_XSTATS_NAME_SIZE];
147 	uint64_t offset;
148 };
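/* offset is relative to a struct vhost_queue; vhost_dev_xstats_get() adds it
 * to the queue pointer and reads the uint64_t counter stored there.
 */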
149 
150 /* [rx]_ is prepended to the name string here */
151 static const struct vhost_xstats_name_off vhost_rxport_stat_strings[] = {
152 	{"good_packets",
153 	 offsetof(struct vhost_queue, stats.xstats[VHOST_PKT])},
154 	{"total_bytes",
155 	 offsetof(struct vhost_queue, stats.xstats[VHOST_BYTE])},
156 	{"missed_pkts",
157 	 offsetof(struct vhost_queue, stats.xstats[VHOST_MISSED_PKT])},
158 	{"broadcast_packets",
159 	 offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
160 	{"multicast_packets",
161 	 offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
162 	{"unicast_packets",
163 	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
164 	{"undersize_packets",
165 	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
166 	{"size_64_packets",
167 	 offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
168 	{"size_65_to_127_packets",
169 	 offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
170 	{"size_128_to_255_packets",
171 	 offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
172 	{"size_256_to_511_packets",
173 	 offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
174 	{"size_512_to_1023_packets",
175 	 offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
176 	{"size_1024_to_1522_packets",
177 	 offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
178 	{"size_1523_to_max_packets",
179 	 offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
180 	{"errors_with_bad_CRC",
181 	 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
182 	{"fragmented_errors",
183 	 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_FRAGMENTED])},
184 	{"jabber_errors",
185 	 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_JABBER])},
186 	{"unknown_protos_packets",
187 	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNKNOWN_PROTOCOL])},
188 };
189 
190 /* [tx]_ is prepended to the name string here */
191 static const struct vhost_xstats_name_off vhost_txport_stat_strings[] = {
192 	{"good_packets",
193 	 offsetof(struct vhost_queue, stats.xstats[VHOST_PKT])},
194 	{"total_bytes",
195 	 offsetof(struct vhost_queue, stats.xstats[VHOST_BYTE])},
196 	{"missed_pkts",
197 	 offsetof(struct vhost_queue, stats.xstats[VHOST_MISSED_PKT])},
198 	{"broadcast_packets",
199 	 offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
200 	{"multicast_packets",
201 	 offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
202 	{"unicast_packets",
203 	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
204 	{"undersize_packets",
205 	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
206 	{"size_64_packets",
207 	 offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
208 	{"size_65_to_127_packets",
209 	 offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
210 	{"size_128_to_255_packets",
211 	 offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
212 	{"size_256_to_511_packets",
213 	 offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
214 	{"size_512_to_1023_packets",
215 	 offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
216 	{"size_1024_to_1522_packets",
217 	 offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
218 	{"size_1523_to_max_packets",
219 	 offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
220 	{"errors_with_bad_CRC",
221 	 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
222 };
223 
224 #define VHOST_NB_XSTATS_RXPORT (sizeof(vhost_rxport_stat_strings) / \
225 				sizeof(vhost_rxport_stat_strings[0]))
226 
227 #define VHOST_NB_XSTATS_TXPORT (sizeof(vhost_txport_stat_strings) / \
228 				sizeof(vhost_txport_stat_strings[0]))
229 
230 static int
231 vhost_dev_xstats_reset(struct rte_eth_dev *dev)
232 {
233 	struct vhost_queue *vq = NULL;
234 	unsigned int i = 0;
235 
236 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
237 		vq = dev->data->rx_queues[i];
238 		if (!vq)
239 			continue;
240 		memset(&vq->stats, 0, sizeof(vq->stats));
241 	}
242 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
243 		vq = dev->data->tx_queues[i];
244 		if (!vq)
245 			continue;
246 		memset(&vq->stats, 0, sizeof(vq->stats));
247 	}
248 
249 	return 0;
250 }
251 
252 static int
253 vhost_dev_xstats_get_names(struct rte_eth_dev *dev __rte_unused,
254 			   struct rte_eth_xstat_name *xstats_names,
255 			   unsigned int limit __rte_unused)
256 {
257 	unsigned int t = 0;
258 	int count = 0;
259 	int nstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;
260 
261 	if (!xstats_names)
262 		return nstats;
263 	for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
264 		snprintf(xstats_names[count].name,
265 			 sizeof(xstats_names[count].name),
266 			 "rx_%s", vhost_rxport_stat_strings[t].name);
267 		count++;
268 	}
269 	for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
270 		snprintf(xstats_names[count].name,
271 			 sizeof(xstats_names[count].name),
272 			 "tx_%s", vhost_txport_stat_strings[t].name);
273 		count++;
274 	}
275 	return count;
276 }
277 
278 static int
279 vhost_dev_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *xstats,
280 		     unsigned int n)
281 {
282 	unsigned int i;
283 	unsigned int t;
284 	unsigned int count = 0;
285 	struct vhost_queue *vq = NULL;
286 	unsigned int nxstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;
287 
288 	if (n < nxstats)
289 		return nxstats;
290 
291 	for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
292 		xstats[count].value = 0;
293 		for (i = 0; i < dev->data->nb_rx_queues; i++) {
294 			vq = dev->data->rx_queues[i];
295 			if (!vq)
296 				continue;
297 			xstats[count].value +=
298 				*(uint64_t *)(((char *)vq)
299 				+ vhost_rxport_stat_strings[t].offset);
300 		}
301 		xstats[count].id = count;
302 		count++;
303 	}
304 	for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
305 		xstats[count].value = 0;
306 		for (i = 0; i < dev->data->nb_tx_queues; i++) {
307 			vq = dev->data->tx_queues[i];
308 			if (!vq)
309 				continue;
310 			xstats[count].value +=
311 				*(uint64_t *)(((char *)vq)
312 				+ vhost_txport_stat_strings[t].offset);
313 		}
314 		xstats[count].id = count;
315 		count++;
316 	}
317 	return count;
318 }
319 
320 static inline void
321 vhost_count_xcast_packets(struct vhost_queue *vq,
322 				struct rte_mbuf *mbuf)
323 {
324 	struct rte_ether_addr *ea = NULL;
325 	struct vhost_stats *pstats = &vq->stats;
326 
327 	ea = rte_pktmbuf_mtod(mbuf, struct rte_ether_addr *);
328 	if (rte_is_multicast_ether_addr(ea)) {
329 		if (rte_is_broadcast_ether_addr(ea))
330 			pstats->xstats[VHOST_BROADCAST_PKT]++;
331 		else
332 			pstats->xstats[VHOST_MULTICAST_PKT]++;
333 	} else {
334 		pstats->xstats[VHOST_UNICAST_PKT]++;
335 	}
336 }
337 
338 static __rte_always_inline void
339 vhost_update_single_packet_xstats(struct vhost_queue *vq, struct rte_mbuf *buf)
340 {
341 	uint32_t pkt_len = 0;
342 	uint64_t index;
343 	struct vhost_stats *pstats = &vq->stats;
344 
345 	pstats->xstats[VHOST_PKT]++;
346 	pkt_len = buf->pkt_len;
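	/*
	 * For 64 < pkt_len < 1024 the size bucket follows from the most
	 * significant bit: 32 - clz(pkt_len) - 5 maps 65..127 to
	 * VHOST_65_TO_127_PKT, 128..255 to VHOST_128_TO_255_PKT, and so on
	 * up to VHOST_512_TO_1023_PKT.
	 */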
347 	if (pkt_len == 64) {
348 		pstats->xstats[VHOST_64_PKT]++;
349 	} else if (pkt_len > 64 && pkt_len < 1024) {
350 		index = (sizeof(pkt_len) * 8)
351 			- __builtin_clz(pkt_len) - 5;
352 		pstats->xstats[index]++;
353 	} else {
354 		if (pkt_len < 64)
355 			pstats->xstats[VHOST_UNDERSIZE_PKT]++;
356 		else if (pkt_len <= 1522)
357 			pstats->xstats[VHOST_1024_TO_1522_PKT]++;
358 		else
359 			pstats->xstats[VHOST_1523_TO_MAX_PKT]++;
360 	}
361 	vhost_count_xcast_packets(vq, buf);
362 }
363 
364 static uint16_t
365 eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
366 {
367 	struct vhost_queue *r = q;
368 	uint16_t i, nb_rx = 0;
369 	uint16_t nb_receive = nb_bufs;
370 
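	/*
	 * allow_queuing/while_queuing form a lightweight handshake with
	 * update_queuing_status(): publish that this burst function is
	 * running, then re-check allow_queuing to close the race against a
	 * concurrent stop or device destruction.
	 */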
371 	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
372 		return 0;
373 
374 	rte_atomic32_set(&r->while_queuing, 1);
375 
376 	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
377 		goto out;
378 
379 	/* Dequeue packets from guest TX queue */
380 	while (nb_receive) {
381 		uint16_t nb_pkts;
382 		uint16_t num = (uint16_t)RTE_MIN(nb_receive,
383 						 VHOST_MAX_PKT_BURST);
384 
385 		nb_pkts = rte_vhost_dequeue_burst(r->vid, r->virtqueue_id,
386 						  r->mb_pool, &bufs[nb_rx],
387 						  num);
388 
389 		nb_rx += nb_pkts;
390 		nb_receive -= nb_pkts;
391 		if (nb_pkts < num)
392 			break;
393 	}
394 
395 	r->stats.pkts += nb_rx;
396 
397 	for (i = 0; likely(i < nb_rx); i++) {
398 		bufs[i]->port = r->port;
399 		bufs[i]->vlan_tci = 0;
400 
401 		if (r->internal->vlan_strip)
402 			rte_vlan_strip(bufs[i]);
403 
404 		r->stats.bytes += bufs[i]->pkt_len;
405 		r->stats.xstats[VHOST_BYTE] += bufs[i]->pkt_len;
406 
407 		vhost_update_single_packet_xstats(r, bufs[i]);
408 	}
409 
410 out:
411 	rte_atomic32_set(&r->while_queuing, 0);
412 
413 	return nb_rx;
414 }
415 
416 static uint16_t
417 eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
418 {
419 	struct vhost_queue *r = q;
420 	uint16_t i, nb_tx = 0;
421 	uint16_t nb_send = 0;
422 	uint64_t nb_bytes = 0;
423 	uint64_t nb_missed = 0;
424 
425 	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
426 		return 0;
427 
428 	rte_atomic32_set(&r->while_queuing, 1);
429 
430 	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
431 		goto out;
432 
433 	for (i = 0; i < nb_bufs; i++) {
434 		struct rte_mbuf *m = bufs[i];
435 
436 		/* Do VLAN tag insertion */
437 		if (m->ol_flags & RTE_MBUF_F_TX_VLAN) {
438 			int error = rte_vlan_insert(&m);
439 			if (unlikely(error)) {
440 				rte_pktmbuf_free(m);
441 				continue;
442 			}
443 		}
444 
445 		bufs[nb_send] = m;
446 		++nb_send;
447 	}
448 
449 	/* Enqueue packets to guest RX queue */
450 	while (nb_send) {
451 		uint16_t nb_pkts;
452 		uint16_t num = (uint16_t)RTE_MIN(nb_send,
453 						 VHOST_MAX_PKT_BURST);
454 
455 		nb_pkts = rte_vhost_enqueue_burst(r->vid, r->virtqueue_id,
456 						  &bufs[nb_tx], num);
457 
458 		nb_tx += nb_pkts;
459 		nb_send -= nb_pkts;
460 		if (nb_pkts < num)
461 			break;
462 	}
463 
464 	for (i = 0; likely(i < nb_tx); i++) {
465 		nb_bytes += bufs[i]->pkt_len;
466 		vhost_update_single_packet_xstats(r, bufs[i]);
467 	}
468 
469 	nb_missed = nb_bufs - nb_tx;
470 
471 	r->stats.pkts += nb_tx;
472 	r->stats.bytes += nb_bytes;
473 	r->stats.missed_pkts += nb_missed;
474 
475 	r->stats.xstats[VHOST_BYTE] += nb_bytes;
476 	r->stats.xstats[VHOST_MISSED_PKT] += nb_missed;
477 	r->stats.xstats[VHOST_UNICAST_PKT] += nb_missed;
478 
479 	/* According to RFC 2863, the ifHCOutUcastPkts, ifHCOutMulticastPkts
480 	 * and ifHCOutBroadcastPkts counters also include packets that were
481 	 * not transmitted successfully, so count the missed packets here too.
482 	 */
483 	for (i = nb_tx; i < nb_bufs; i++)
484 		vhost_count_xcast_packets(r, bufs[i]);
485 
486 	for (i = 0; likely(i < nb_tx); i++)
487 		rte_pktmbuf_free(bufs[i]);
488 out:
489 	rte_atomic32_set(&r->while_queuing, 0);
490 
491 	return nb_tx;
492 }
493 
494 static inline struct internal_list *
495 find_internal_resource(char *ifname)
496 {
497 	int found = 0;
498 	struct internal_list *list;
499 	struct pmd_internal *internal;
500 
501 	if (ifname == NULL)
502 		return NULL;
503 
504 	pthread_mutex_lock(&internal_list_lock);
505 
506 	TAILQ_FOREACH(list, &internal_list, next) {
507 		internal = list->eth_dev->data->dev_private;
508 		if (!strcmp(internal->iface_name, ifname)) {
509 			found = 1;
510 			break;
511 		}
512 	}
513 
514 	pthread_mutex_unlock(&internal_list_lock);
515 
516 	if (!found)
517 		return NULL;
518 
519 	return list;
520 }
521 
522 static int
523 eth_vhost_update_intr(struct rte_eth_dev *eth_dev, uint16_t rxq_idx)
524 {
525 	struct rte_intr_handle *handle = eth_dev->intr_handle;
526 	struct rte_epoll_event rev, *elist;
527 	int epfd, ret;
528 
529 	if (handle == NULL)
530 		return 0;
531 
532 	elist = rte_intr_elist_index_get(handle, rxq_idx);
533 	if (rte_intr_efds_index_get(handle, rxq_idx) == elist->fd)
534 		return 0;
535 
536 	VHOST_LOG(INFO, "kickfd for rxq-%d was changed, updating handler.\n",
537 			rxq_idx);
538 
539 	if (elist->fd != -1)
540 		VHOST_LOG(ERR, "Unexpected previous kickfd value (Got %d, expected -1).\n",
541 			elist->fd);
542 
543 	/*
544 	 * First remove invalid epoll event, and then install
545 	 * the new one. May be solved with a proper API in the
546 	 * future.
547 	 */
548 	epfd = elist->epfd;
549 	rev = *elist;
550 	ret = rte_epoll_ctl(epfd, EPOLL_CTL_DEL, rev.fd,
551 			elist);
552 	if (ret) {
553 		VHOST_LOG(ERR, "Failed to delete epoll event.\n");
554 		return ret;
555 	}
556 
557 	rev.fd = rte_intr_efds_index_get(handle, rxq_idx);
558 	if (rte_intr_elist_index_set(handle, rxq_idx, rev))
559 		return -rte_errno;
560 
561 	elist = rte_intr_elist_index_get(handle, rxq_idx);
562 	ret = rte_epoll_ctl(epfd, EPOLL_CTL_ADD, rev.fd, elist);
563 	if (ret) {
564 		VHOST_LOG(ERR, "Failed to add epoll event.\n");
565 		return ret;
566 	}
567 
568 	return 0;
569 }
570 
571 static int
572 eth_rxq_intr_enable(struct rte_eth_dev *dev, uint16_t qid)
573 {
574 	struct rte_vhost_vring vring;
575 	struct vhost_queue *vq;
576 	int old_intr_enable, ret = 0;
577 
578 	vq = dev->data->rx_queues[qid];
579 	if (!vq) {
580 		VHOST_LOG(ERR, "rxq%d is not setup yet\n", qid);
581 		return -1;
582 	}
583 
584 	rte_spinlock_lock(&vq->intr_lock);
585 	old_intr_enable = vq->intr_enable;
586 	vq->intr_enable = 1;
587 	ret = eth_vhost_update_intr(dev, qid);
588 	rte_spinlock_unlock(&vq->intr_lock);
589 
590 	if (ret < 0) {
591 		VHOST_LOG(ERR, "Failed to update rxq%d's intr\n", qid);
592 		vq->intr_enable = old_intr_enable;
593 		return ret;
594 	}
595 
596 	ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
597 	if (ret < 0) {
598 		VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);
599 		return ret;
600 	}
601 	VHOST_LOG(INFO, "Enable interrupt for rxq%d\n", qid);
602 	rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 1);
603 	rte_wmb();
604 
605 	return ret;
606 }
607 
608 static int
609 eth_rxq_intr_disable(struct rte_eth_dev *dev, uint16_t qid)
610 {
611 	struct rte_vhost_vring vring;
612 	struct vhost_queue *vq;
613 	int ret = 0;
614 
615 	vq = dev->data->rx_queues[qid];
616 	if (!vq) {
617 		VHOST_LOG(ERR, "rxq%d is not setup yet\n", qid);
618 		return -1;
619 	}
620 
621 	ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
622 	if (ret < 0) {
623 		VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);
624 		return ret;
625 	}
626 	VHOST_LOG(INFO, "Disable interrupt for rxq%d\n", qid);
627 	rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 0);
628 	rte_wmb();
629 
630 	vq->intr_enable = 0;
631 
632 	return 0;
633 }
634 
635 static void
636 eth_vhost_uninstall_intr(struct rte_eth_dev *dev)
637 {
638 	struct rte_intr_handle *intr_handle = dev->intr_handle;
639 
640 	if (intr_handle != NULL) {
641 		rte_intr_vec_list_free(intr_handle);
642 		rte_intr_instance_free(intr_handle);
643 	}
644 	dev->intr_handle = NULL;
645 }
646 
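/*
 * Build an RTE_INTR_HANDLE_VDEV interrupt handle whose event fds are the
 * vhost vring kick fds, so the ethdev RX interrupt API can wait on them.
 */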
647 static int
648 eth_vhost_install_intr(struct rte_eth_dev *dev)
649 {
650 	struct rte_vhost_vring vring;
651 	struct vhost_queue *vq;
652 	int nb_rxq = dev->data->nb_rx_queues;
653 	int i;
654 	int ret;
655 
656 	/* uninstall first if we are reconnecting */
657 	if (dev->intr_handle != NULL)
658 		eth_vhost_uninstall_intr(dev);
659 
660 	dev->intr_handle = rte_intr_instance_alloc(RTE_INTR_INSTANCE_F_PRIVATE);
661 	if (dev->intr_handle == NULL) {
662 		VHOST_LOG(ERR, "Failed to allocate intr_handle\n");
663 		return -ENOMEM;
664 	}
665 	if (rte_intr_efd_counter_size_set(dev->intr_handle, sizeof(uint64_t)))
666 		return -rte_errno;
667 
668 	if (rte_intr_vec_list_alloc(dev->intr_handle, NULL, nb_rxq)) {
669 		VHOST_LOG(ERR,
670 			"Failed to allocate memory for interrupt vector\n");
671 		rte_intr_instance_free(dev->intr_handle);
672 		return -ENOMEM;
673 	}
674 
676 	VHOST_LOG(INFO, "Prepare intr vec\n");
677 	for (i = 0; i < nb_rxq; i++) {
678 		if (rte_intr_vec_list_index_set(dev->intr_handle, i, RTE_INTR_VEC_RXTX_OFFSET + i))
679 			return -rte_errno;
680 		if (rte_intr_efds_index_set(dev->intr_handle, i, -1))
681 			return -rte_errno;
682 		vq = dev->data->rx_queues[i];
683 		if (!vq) {
684 			VHOST_LOG(INFO, "rxq-%d not setup yet, skip!\n", i);
685 			continue;
686 		}
687 
688 		ret = rte_vhost_get_vhost_vring(vq->vid, (i << 1) + 1, &vring);
689 		if (ret < 0) {
690 			VHOST_LOG(INFO,
691 				"Failed to get rxq-%d's vring, skip!\n", i);
692 			continue;
693 		}
694 
695 		if (vring.kickfd < 0) {
696 			VHOST_LOG(INFO,
697 				"rxq-%d's kickfd is invalid, skip!\n", i);
698 			continue;
699 		}
700 
701 		if (rte_intr_efds_index_set(dev->intr_handle, i, vring.kickfd))
702 			continue;
703 		VHOST_LOG(INFO, "Installed intr vec for rxq-%d\n", i);
704 	}
705 
706 	if (rte_intr_nb_efd_set(dev->intr_handle, nb_rxq))
707 		return -rte_errno;
708 
709 	if (rte_intr_max_intr_set(dev->intr_handle, nb_rxq + 1))
710 		return -rte_errno;
711 
712 	if (rte_intr_type_set(dev->intr_handle, RTE_INTR_HANDLE_VDEV))
713 		return -rte_errno;
714 
715 	return 0;
716 }
717 
718 static void
719 update_queuing_status(struct rte_eth_dev *dev)
720 {
721 	struct pmd_internal *internal = dev->data->dev_private;
722 	struct vhost_queue *vq;
723 	unsigned int i;
724 	int allow_queuing = 1;
725 
726 	if (!dev->data->rx_queues || !dev->data->tx_queues)
727 		return;
728 
729 	if (rte_atomic32_read(&internal->started) == 0 ||
730 	    rte_atomic32_read(&internal->dev_attached) == 0)
731 		allow_queuing = 0;
732 
733 	/* Wait until rx/tx_pkt_burst stops accessing vhost device */
734 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
735 		vq = dev->data->rx_queues[i];
736 		if (vq == NULL)
737 			continue;
738 		rte_atomic32_set(&vq->allow_queuing, allow_queuing);
739 		while (rte_atomic32_read(&vq->while_queuing))
740 			rte_pause();
741 	}
742 
743 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
744 		vq = dev->data->tx_queues[i];
745 		if (vq == NULL)
746 			continue;
747 		rte_atomic32_set(&vq->allow_queuing, allow_queuing);
748 		while (rte_atomic32_read(&vq->while_queuing))
749 			rte_pause();
750 	}
751 }
752 
753 static void
754 queue_setup(struct rte_eth_dev *eth_dev, struct pmd_internal *internal)
755 {
756 	struct vhost_queue *vq;
757 	int i;
758 
759 	for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
760 		vq = eth_dev->data->rx_queues[i];
761 		if (!vq)
762 			continue;
763 		vq->vid = internal->vid;
764 		vq->internal = internal;
765 		vq->port = eth_dev->data->port_id;
766 	}
767 	for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
768 		vq = eth_dev->data->tx_queues[i];
769 		if (!vq)
770 			continue;
771 		vq->vid = internal->vid;
772 		vq->internal = internal;
773 		vq->port = eth_dev->data->port_id;
774 	}
775 }
776 
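/* vhost library callback: a virtio-net frontend has connected on the socket. */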
777 static int
778 new_device(int vid)
779 {
780 	struct rte_eth_dev *eth_dev;
781 	struct internal_list *list;
782 	struct pmd_internal *internal;
783 	struct rte_eth_conf *dev_conf;
784 	unsigned i;
785 	char ifname[PATH_MAX];
786 #ifdef RTE_LIBRTE_VHOST_NUMA
787 	int newnode;
788 #endif
789 
790 	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
791 	list = find_internal_resource(ifname);
792 	if (list == NULL) {
793 		VHOST_LOG(ERR, "Invalid device name: %s\n", ifname);
794 		return -1;
795 	}
796 
797 	eth_dev = list->eth_dev;
798 	internal = eth_dev->data->dev_private;
799 	dev_conf = &eth_dev->data->dev_conf;
800 
801 #ifdef RTE_LIBRTE_VHOST_NUMA
802 	newnode = rte_vhost_get_numa_node(vid);
803 	if (newnode >= 0)
804 		eth_dev->data->numa_node = newnode;
805 #endif
806 
807 	internal->vid = vid;
808 	if (rte_atomic32_read(&internal->started) == 1) {
809 		queue_setup(eth_dev, internal);
810 
811 		if (dev_conf->intr_conf.rxq) {
812 			if (eth_vhost_install_intr(eth_dev) < 0) {
813 				VHOST_LOG(ERR,
814 					"Failed to install interrupt handler.\n");
815 				return -1;
816 			}
817 		}
818 	} else {
819 		VHOST_LOG(INFO, "RX/TX queues do not exist yet\n");
820 	}
821 
822 	for (i = 0; i < rte_vhost_get_vring_num(vid); i++)
823 		rte_vhost_enable_guest_notification(vid, i, 0);
824 
825 	rte_vhost_get_mtu(vid, &eth_dev->data->mtu);
826 
827 	eth_dev->data->dev_link.link_status = RTE_ETH_LINK_UP;
828 
829 	rte_atomic32_set(&internal->dev_attached, 1);
830 	update_queuing_status(eth_dev);
831 
832 	VHOST_LOG(INFO, "Vhost device %d created\n", vid);
833 
834 	rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
835 
836 	return 0;
837 }
838 
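/* vhost library callback: the virtio-net frontend has disconnected. */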
839 static void
840 destroy_device(int vid)
841 {
842 	struct rte_eth_dev *eth_dev;
843 	struct pmd_internal *internal;
844 	struct vhost_queue *vq;
845 	struct internal_list *list;
846 	char ifname[PATH_MAX];
847 	unsigned i;
848 	struct rte_vhost_vring_state *state;
849 
850 	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
851 	list = find_internal_resource(ifname);
852 	if (list == NULL) {
853 		VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
854 		return;
855 	}
856 	eth_dev = list->eth_dev;
857 	internal = eth_dev->data->dev_private;
858 
859 	rte_atomic32_set(&internal->dev_attached, 0);
860 	update_queuing_status(eth_dev);
861 
862 	eth_dev->data->dev_link.link_status = RTE_ETH_LINK_DOWN;
863 
864 	if (eth_dev->data->rx_queues && eth_dev->data->tx_queues) {
865 		for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
866 			vq = eth_dev->data->rx_queues[i];
867 			if (!vq)
868 				continue;
869 			vq->vid = -1;
870 		}
871 		for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
872 			vq = eth_dev->data->tx_queues[i];
873 			if (!vq)
874 				continue;
875 			vq->vid = -1;
876 		}
877 	}
878 
879 	state = vring_states[eth_dev->data->port_id];
880 	rte_spinlock_lock(&state->lock);
881 	for (i = 0; i <= state->max_vring; i++) {
882 		state->cur[i] = false;
883 		state->seen[i] = false;
884 	}
885 	state->max_vring = 0;
886 	rte_spinlock_unlock(&state->lock);
887 
888 	VHOST_LOG(INFO, "Vhost device %d destroyed\n", vid);
889 	eth_vhost_uninstall_intr(eth_dev);
890 
891 	rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
892 }
893 
894 static int
895 vring_conf_update(int vid, struct rte_eth_dev *eth_dev, uint16_t vring_id)
896 {
897 	struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;
898 	struct pmd_internal *internal = eth_dev->data->dev_private;
899 	struct vhost_queue *vq;
900 	struct rte_vhost_vring vring;
901 	int rx_idx = vring_id % 2 ? (vring_id - 1) >> 1 : -1;
902 	int ret = 0;
903 
904 	/*
905 	 * The vring kickfd may be changed after the new device notification.
906 	 * Update it when the vring state is updated.
907 	 */
908 	if (rx_idx >= 0 && rx_idx < eth_dev->data->nb_rx_queues &&
909 	    rte_atomic32_read(&internal->dev_attached) &&
910 	    rte_atomic32_read(&internal->started) &&
911 	    dev_conf->intr_conf.rxq) {
912 		ret = rte_vhost_get_vhost_vring(vid, vring_id, &vring);
913 		if (ret) {
914 			VHOST_LOG(ERR, "Failed to get vring %d information.\n",
915 					vring_id);
916 			return ret;
917 		}
918 
919 		if (rte_intr_efds_index_set(eth_dev->intr_handle, rx_idx,
920 						   vring.kickfd))
921 			return -rte_errno;
922 
923 		vq = eth_dev->data->rx_queues[rx_idx];
924 		if (!vq) {
925 			VHOST_LOG(ERR, "rxq%d is not setup yet\n", rx_idx);
926 			return -1;
927 		}
928 
929 		rte_spinlock_lock(&vq->intr_lock);
930 		if (vq->intr_enable)
931 			ret = eth_vhost_update_intr(eth_dev, rx_idx);
932 		rte_spinlock_unlock(&vq->intr_lock);
933 	}
934 
935 	return ret;
936 }
937 
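/* vhost library callback: the frontend enabled or disabled a vring. */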
938 static int
939 vring_state_changed(int vid, uint16_t vring, int enable)
940 {
941 	struct rte_vhost_vring_state *state;
942 	struct rte_eth_dev *eth_dev;
943 	struct internal_list *list;
944 	char ifname[PATH_MAX];
945 
946 	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
947 	list = find_internal_resource(ifname);
948 	if (list == NULL) {
949 		VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
950 		return -1;
951 	}
952 
953 	eth_dev = list->eth_dev;
954 	/* won't be NULL */
955 	state = vring_states[eth_dev->data->port_id];
956 
957 	if (enable && vring_conf_update(vid, eth_dev, vring))
958 		VHOST_LOG(INFO, "Failed to update vring-%d configuration.\n",
959 			  (int)vring);
960 
961 	rte_spinlock_lock(&state->lock);
962 	if (state->cur[vring] == enable) {
963 		rte_spinlock_unlock(&state->lock);
964 		return 0;
965 	}
966 	state->cur[vring] = enable;
967 	state->max_vring = RTE_MAX(vring, state->max_vring);
968 	rte_spinlock_unlock(&state->lock);
969 
970 	VHOST_LOG(INFO, "vring%u is %s\n",
971 			vring, enable ? "enabled" : "disabled");
972 
973 	rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_QUEUE_STATE, NULL);
974 
975 	return 0;
976 }
977 
978 static struct rte_vhost_device_ops vhost_ops = {
979 	.new_device          = new_device,
980 	.destroy_device      = destroy_device,
981 	.vring_state_changed = vring_state_changed,
982 };
983 
984 static int
985 vhost_driver_setup(struct rte_eth_dev *eth_dev)
986 {
987 	struct pmd_internal *internal = eth_dev->data->dev_private;
988 	struct internal_list *list = NULL;
989 	struct rte_vhost_vring_state *vring_state = NULL;
990 	unsigned int numa_node = eth_dev->device->numa_node;
991 	const char *name = eth_dev->device->name;
992 
993 	/* Don't try to setup again if it has already been done. */
994 	list = find_internal_resource(internal->iface_name);
995 	if (list)
996 		return 0;
997 
998 	list = rte_zmalloc_socket(name, sizeof(*list), 0, numa_node);
999 	if (list == NULL)
1000 		return -1;
1001 
1002 	vring_state = rte_zmalloc_socket(name, sizeof(*vring_state),
1003 					 0, numa_node);
1004 	if (vring_state == NULL)
1005 		goto free_list;
1006 
1007 	list->eth_dev = eth_dev;
1008 	pthread_mutex_lock(&internal_list_lock);
1009 	TAILQ_INSERT_TAIL(&internal_list, list, next);
1010 	pthread_mutex_unlock(&internal_list_lock);
1011 
1012 	rte_spinlock_init(&vring_state->lock);
1013 	vring_states[eth_dev->data->port_id] = vring_state;
1014 
1015 	if (rte_vhost_driver_register(internal->iface_name, internal->flags))
1016 		goto list_remove;
1017 
1018 	if (internal->disable_flags) {
1019 		if (rte_vhost_driver_disable_features(internal->iface_name,
1020 						      internal->disable_flags))
1021 			goto drv_unreg;
1022 	}
1023 
1024 	if (rte_vhost_driver_callback_register(internal->iface_name,
1025 					       &vhost_ops) < 0) {
1026 		VHOST_LOG(ERR, "Can't register callbacks\n");
1027 		goto drv_unreg;
1028 	}
1029 
1030 	if (rte_vhost_driver_start(internal->iface_name) < 0) {
1031 		VHOST_LOG(ERR, "Failed to start driver for %s\n",
1032 			  internal->iface_name);
1033 		goto drv_unreg;
1034 	}
1035 
1036 	return 0;
1037 
1038 drv_unreg:
1039 	rte_vhost_driver_unregister(internal->iface_name);
1040 list_remove:
1041 	vring_states[eth_dev->data->port_id] = NULL;
1042 	pthread_mutex_lock(&internal_list_lock);
1043 	TAILQ_REMOVE(&internal_list, list, next);
1044 	pthread_mutex_unlock(&internal_list_lock);
1045 	rte_free(vring_state);
1046 free_list:
1047 	rte_free(list);
1048 
1049 	return -1;
1050 }
1051 
1052 int
1053 rte_eth_vhost_get_queue_event(uint16_t port_id,
1054 		struct rte_eth_vhost_queue_event *event)
1055 {
1056 	struct rte_vhost_vring_state *state;
1057 	unsigned int i;
1058 	int idx;
1059 
1060 	if (port_id >= RTE_MAX_ETHPORTS) {
1061 		VHOST_LOG(ERR, "Invalid port id\n");
1062 		return -1;
1063 	}
1064 
1065 	state = vring_states[port_id];
1066 	if (!state) {
1067 		VHOST_LOG(ERR, "Unused port\n");
1068 		return -1;
1069 	}
1070 
1071 	rte_spinlock_lock(&state->lock);
1072 	for (i = 0; i <= state->max_vring; i++) {
1073 		idx = state->index++ % (state->max_vring + 1);
1074 
1075 		if (state->cur[idx] != state->seen[idx]) {
1076 			state->seen[idx] = state->cur[idx];
1077 			event->queue_id = idx / 2;
1078 			event->rx = idx & 1;
1079 			event->enable = state->cur[idx];
1080 			rte_spinlock_unlock(&state->lock);
1081 			return 0;
1082 		}
1083 	}
1084 	rte_spinlock_unlock(&state->lock);
1085 
1086 	return -1;
1087 }
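
/*
 * Illustrative usage sketch (not part of the driver): after an
 * RTE_ETH_EVENT_QUEUE_STATE callback, an application typically drains all
 * pending events; handle() below is a hypothetical stand-in:
 *
 *	struct rte_eth_vhost_queue_event event;
 *
 *	while (rte_eth_vhost_get_queue_event(port_id, &event) == 0)
 *		handle(event.queue_id, event.rx, event.enable);
 */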
1088 
1089 int
1090 rte_eth_vhost_get_vid_from_port_id(uint16_t port_id)
1091 {
1092 	struct internal_list *list;
1093 	struct rte_eth_dev *eth_dev;
1094 	struct vhost_queue *vq;
1095 	int vid = -1;
1096 
1097 	if (!rte_eth_dev_is_valid_port(port_id))
1098 		return -1;
1099 
1100 	pthread_mutex_lock(&internal_list_lock);
1101 
1102 	TAILQ_FOREACH(list, &internal_list, next) {
1103 		eth_dev = list->eth_dev;
1104 		if (eth_dev->data->port_id == port_id) {
1105 			vq = eth_dev->data->rx_queues[0];
1106 			if (vq) {
1107 				vid = vq->vid;
1108 			}
1109 			break;
1110 		}
1111 	}
1112 
1113 	pthread_mutex_unlock(&internal_list_lock);
1114 
1115 	return vid;
1116 }
1117 
1118 static int
1119 eth_dev_configure(struct rte_eth_dev *dev)
1120 {
1121 	struct pmd_internal *internal = dev->data->dev_private;
1122 	const struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
1123 
1124 	/* NOTE: the same process has to operate a vhost interface
1125 	 * from beginning to end (from eth_dev configure to eth_dev close).
1126 	 * It is the user's responsibility at the moment.
1127 	 */
1128 	if (vhost_driver_setup(dev) < 0)
1129 		return -1;
1130 
1131 	internal->vlan_strip = !!(rxmode->offloads & RTE_ETH_RX_OFFLOAD_VLAN_STRIP);
1132 
1133 	return 0;
1134 }
1135 
1136 static int
1137 eth_dev_start(struct rte_eth_dev *eth_dev)
1138 {
1139 	struct pmd_internal *internal = eth_dev->data->dev_private;
1140 	struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;
1141 
1142 	queue_setup(eth_dev, internal);
1143 
1144 	if (rte_atomic32_read(&internal->dev_attached) == 1) {
1145 		if (dev_conf->intr_conf.rxq) {
1146 			if (eth_vhost_install_intr(eth_dev) < 0) {
1147 				VHOST_LOG(ERR,
1148 					"Failed to install interrupt handler.\n");
1149 				return -1;
1150 			}
1151 		}
1152 	}
1153 
1154 	rte_atomic32_set(&internal->started, 1);
1155 	update_queuing_status(eth_dev);
1156 
1157 	return 0;
1158 }
1159 
1160 static int
1161 eth_dev_stop(struct rte_eth_dev *dev)
1162 {
1163 	struct pmd_internal *internal = dev->data->dev_private;
1164 
1165 	dev->data->dev_started = 0;
1166 	rte_atomic32_set(&internal->started, 0);
1167 	update_queuing_status(dev);
1168 
1169 	return 0;
1170 }
1171 
1172 static int
1173 eth_dev_close(struct rte_eth_dev *dev)
1174 {
1175 	struct pmd_internal *internal;
1176 	struct internal_list *list;
1177 	unsigned int i;
	int ret;
1178 
1179 	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1180 		return 0;
1181 
1182 	internal = dev->data->dev_private;
1183 	if (!internal)
1184 		return 0;
1185 
1186 	ret = eth_dev_stop(dev);
1187 
1188 	list = find_internal_resource(internal->iface_name);
1189 	if (list) {
1190 		rte_vhost_driver_unregister(internal->iface_name);
1191 		pthread_mutex_lock(&internal_list_lock);
1192 		TAILQ_REMOVE(&internal_list, list, next);
1193 		pthread_mutex_unlock(&internal_list_lock);
1194 		rte_free(list);
1195 	}
1196 
1197 	if (dev->data->rx_queues)
1198 		for (i = 0; i < dev->data->nb_rx_queues; i++)
1199 			rte_free(dev->data->rx_queues[i]);
1200 
1201 	if (dev->data->tx_queues)
1202 		for (i = 0; i < dev->data->nb_tx_queues; i++)
1203 			rte_free(dev->data->tx_queues[i]);
1204 
1205 	rte_free(internal->iface_name);
1206 	rte_free(internal);
1207 
1208 	dev->data->dev_private = NULL;
1209 
1210 	rte_free(vring_states[dev->data->port_id]);
1211 	vring_states[dev->data->port_id] = NULL;
1212 
1213 	return ret;
1214 }
1215 
1216 static int
1217 eth_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
1218 		   uint16_t nb_rx_desc __rte_unused,
1219 		   unsigned int socket_id,
1220 		   const struct rte_eth_rxconf *rx_conf __rte_unused,
1221 		   struct rte_mempool *mb_pool)
1222 {
1223 	struct vhost_queue *vq;
1224 
1225 	vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1226 			RTE_CACHE_LINE_SIZE, socket_id);
1227 	if (vq == NULL) {
1228 		VHOST_LOG(ERR, "Failed to allocate memory for rx queue\n");
1229 		return -ENOMEM;
1230 	}
1231 
1232 	vq->mb_pool = mb_pool;
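	/*
	 * Host RX queue N drains the guest TX virtqueue, i.e. the odd vring
	 * ids: rx_queue_id * VIRTIO_QNUM + VIRTIO_TXQ.
	 */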
1233 	vq->virtqueue_id = rx_queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
1234 	rte_spinlock_init(&vq->intr_lock);
1235 	dev->data->rx_queues[rx_queue_id] = vq;
1236 
1237 	return 0;
1238 }
1239 
1240 static int
1241 eth_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
1242 		   uint16_t nb_tx_desc __rte_unused,
1243 		   unsigned int socket_id,
1244 		   const struct rte_eth_txconf *tx_conf __rte_unused)
1245 {
1246 	struct vhost_queue *vq;
1247 
1248 	vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1249 			RTE_CACHE_LINE_SIZE, socket_id);
1250 	if (vq == NULL) {
1251 		VHOST_LOG(ERR, "Failed to allocate memory for tx queue\n");
1252 		return -ENOMEM;
1253 	}
1254 
1255 	vq->virtqueue_id = tx_queue_id * VIRTIO_QNUM + VIRTIO_RXQ;
1256 	rte_spinlock_init(&vq->intr_lock);
1257 	dev->data->tx_queues[tx_queue_id] = vq;
1258 
1259 	return 0;
1260 }
1261 
1262 static int
1263 eth_dev_info(struct rte_eth_dev *dev,
1264 	     struct rte_eth_dev_info *dev_info)
1265 {
1266 	struct pmd_internal *internal;
1267 
1268 	internal = dev->data->dev_private;
1269 	if (internal == NULL) {
1270 		VHOST_LOG(ERR, "Invalid device specified\n");
1271 		return -ENODEV;
1272 	}
1273 
1274 	dev_info->max_mac_addrs = 1;
1275 	dev_info->max_rx_pktlen = (uint32_t)-1;
1276 	dev_info->max_rx_queues = internal->max_queues;
1277 	dev_info->max_tx_queues = internal->max_queues;
1278 	dev_info->min_rx_bufsize = 0;
1279 
1280 	dev_info->tx_offload_capa = RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
1281 				RTE_ETH_TX_OFFLOAD_VLAN_INSERT;
1282 	dev_info->rx_offload_capa = RTE_ETH_RX_OFFLOAD_VLAN_STRIP;
1283 
1284 	return 0;
1285 }
1286 
1287 static int
1288 eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
1289 {
1290 	unsigned i;
1291 	uint64_t rx_total = 0, tx_total = 0;
1292 	uint64_t rx_total_bytes = 0, tx_total_bytes = 0;
1293 	struct vhost_queue *vq;
1294 
1295 	for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1296 			i < dev->data->nb_rx_queues; i++) {
1297 		if (dev->data->rx_queues[i] == NULL)
1298 			continue;
1299 		vq = dev->data->rx_queues[i];
1300 		stats->q_ipackets[i] = vq->stats.pkts;
1301 		rx_total += stats->q_ipackets[i];
1302 
1303 		stats->q_ibytes[i] = vq->stats.bytes;
1304 		rx_total_bytes += stats->q_ibytes[i];
1305 	}
1306 
1307 	for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1308 			i < dev->data->nb_tx_queues; i++) {
1309 		if (dev->data->tx_queues[i] == NULL)
1310 			continue;
1311 		vq = dev->data->tx_queues[i];
1312 		stats->q_opackets[i] = vq->stats.pkts;
1313 		tx_total += stats->q_opackets[i];
1314 
1315 		stats->q_obytes[i] = vq->stats.bytes;
1316 		tx_total_bytes += stats->q_obytes[i];
1317 	}
1318 
1319 	stats->ipackets = rx_total;
1320 	stats->opackets = tx_total;
1321 	stats->ibytes = rx_total_bytes;
1322 	stats->obytes = tx_total_bytes;
1323 
1324 	return 0;
1325 }
1326 
1327 static int
1328 eth_stats_reset(struct rte_eth_dev *dev)
1329 {
1330 	struct vhost_queue *vq;
1331 	unsigned i;
1332 
1333 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
1334 		if (dev->data->rx_queues[i] == NULL)
1335 			continue;
1336 		vq = dev->data->rx_queues[i];
1337 		vq->stats.pkts = 0;
1338 		vq->stats.bytes = 0;
1339 	}
1340 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
1341 		if (dev->data->tx_queues[i] == NULL)
1342 			continue;
1343 		vq = dev->data->tx_queues[i];
1344 		vq->stats.pkts = 0;
1345 		vq->stats.bytes = 0;
1346 		vq->stats.missed_pkts = 0;
1347 	}
1348 
1349 	return 0;
1350 }
1351 
1352 static void
1353 eth_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
1354 {
1355 	rte_free(dev->data->rx_queues[qid]);
1356 }
1357 
1358 static void
1359 eth_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
1360 {
1361 	rte_free(dev->data->tx_queues[qid]);
1362 }
1363 
1364 static int
1365 eth_tx_done_cleanup(void *txq __rte_unused, uint32_t free_cnt __rte_unused)
1366 {
1367 	/*
1368 	 * vhost does not hold on to mbufs: eth_vhost_tx() copies the packet
1369 	 * data and frees the mbuf, so there is nothing to clean up.
1370 	 */
1371 	return 0;
1372 }
1373 
1374 static int
1375 eth_link_update(struct rte_eth_dev *dev __rte_unused,
1376 		int wait_to_complete __rte_unused)
1377 {
1378 	return 0;
1379 }
1380 
1381 static uint32_t
1382 eth_rx_queue_count(void *rx_queue)
1383 {
1384 	struct vhost_queue *vq;
1385 
1386 	vq = rx_queue;
1387 	if (vq == NULL)
1388 		return 0;
1389 
1390 	return rte_vhost_rx_queue_count(vq->vid, vq->virtqueue_id);
1391 }
1392 
1393 #define CLB_VAL_IDX 0
1394 #define CLB_MSK_IDX 1
1395 #define CLB_MATCH_IDX 2
1396 static int
1397 vhost_monitor_callback(const uint64_t value,
1398 		const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ])
1399 {
1400 	const uint64_t m = opaque[CLB_MSK_IDX];
1401 	const uint64_t v = opaque[CLB_VAL_IDX];
1402 	const uint64_t c = opaque[CLB_MATCH_IDX];
1403 
1404 	if (c)
1405 		return (value & m) == v ? -1 : 0;
1406 	else
1407 		return (value & m) == v ? 0 : -1;
1408 }
1409 
1410 static int
1411 vhost_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
1412 {
1413 	struct vhost_queue *vq = rx_queue;
1414 	struct rte_vhost_power_monitor_cond vhost_pmc;
1415 	int ret;
1416 	if (vq == NULL)
1417 		return -EINVAL;
1418 	ret = rte_vhost_get_monitor_addr(vq->vid, vq->virtqueue_id,
1419 			&vhost_pmc);
1420 	if (ret < 0)
1421 		return -EINVAL;
1422 	pmc->addr = vhost_pmc.addr;
1423 	pmc->opaque[CLB_VAL_IDX] = vhost_pmc.val;
1424 	pmc->opaque[CLB_MSK_IDX] = vhost_pmc.mask;
1425 	pmc->opaque[CLB_MATCH_IDX] = vhost_pmc.match;
1426 	pmc->size = vhost_pmc.size;
1427 	pmc->fn = vhost_monitor_callback;
1428 
1429 	return 0;
1430 }
1431 
1432 static const struct eth_dev_ops ops = {
1433 	.dev_start = eth_dev_start,
1434 	.dev_stop = eth_dev_stop,
1435 	.dev_close = eth_dev_close,
1436 	.dev_configure = eth_dev_configure,
1437 	.dev_infos_get = eth_dev_info,
1438 	.rx_queue_setup = eth_rx_queue_setup,
1439 	.tx_queue_setup = eth_tx_queue_setup,
1440 	.rx_queue_release = eth_rx_queue_release,
1441 	.tx_queue_release = eth_tx_queue_release,
1442 	.tx_done_cleanup = eth_tx_done_cleanup,
1443 	.link_update = eth_link_update,
1444 	.stats_get = eth_stats_get,
1445 	.stats_reset = eth_stats_reset,
1446 	.xstats_reset = vhost_dev_xstats_reset,
1447 	.xstats_get = vhost_dev_xstats_get,
1448 	.xstats_get_names = vhost_dev_xstats_get_names,
1449 	.rx_queue_intr_enable = eth_rxq_intr_enable,
1450 	.rx_queue_intr_disable = eth_rxq_intr_disable,
1451 	.get_monitor_addr = vhost_get_monitor_addr,
1452 };
1453 
1454 static int
1455 eth_dev_vhost_create(struct rte_vdev_device *dev, char *iface_name,
1456 	uint16_t queues, const unsigned int numa_node, uint64_t flags,
1457 	uint64_t disable_flags)
1458 {
1459 	const char *name = rte_vdev_device_name(dev);
1460 	struct rte_eth_dev_data *data;
1461 	struct pmd_internal *internal = NULL;
1462 	struct rte_eth_dev *eth_dev = NULL;
1463 	struct rte_ether_addr *eth_addr = NULL;
1464 
1465 	VHOST_LOG(INFO, "Creating VHOST-USER backend on numa socket %u\n",
1466 		numa_node);
1467 
1468 	/* reserve an ethdev entry */
1469 	eth_dev = rte_eth_vdev_allocate(dev, sizeof(*internal));
1470 	if (eth_dev == NULL)
1471 		goto error;
1472 	data = eth_dev->data;
1473 
1474 	eth_addr = rte_zmalloc_socket(name, sizeof(*eth_addr), 0, numa_node);
1475 	if (eth_addr == NULL)
1476 		goto error;
1477 	data->mac_addrs = eth_addr;
1478 	*eth_addr = base_eth_addr;
1479 	eth_addr->addr_bytes[5] = eth_dev->data->port_id;
1480 
1481 	/* now put it all together
1482 	 * - store queue data in internal,
1483 	 * - point eth_dev_data to internals
1484 	 * - and point eth_dev structure to new eth_dev_data structure
1485 	 */
1486 	internal = eth_dev->data->dev_private;
1487 	internal->iface_name = rte_malloc_socket(name, strlen(iface_name) + 1,
1488 						 0, numa_node);
1489 	if (internal->iface_name == NULL)
1490 		goto error;
1491 	strcpy(internal->iface_name, iface_name);
1492 
1493 	data->nb_rx_queues = queues;
1494 	data->nb_tx_queues = queues;
1495 	internal->max_queues = queues;
1496 	internal->vid = -1;
1497 	internal->flags = flags;
1498 	internal->disable_flags = disable_flags;
1499 	data->dev_link = pmd_link;
1500 	data->dev_flags = RTE_ETH_DEV_INTR_LSC |
1501 				RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
1502 	data->promiscuous = 1;
1503 	data->all_multicast = 1;
1504 
1505 	eth_dev->dev_ops = &ops;
1506 	eth_dev->rx_queue_count = eth_rx_queue_count;
1507 
1508 	/* finally assign rx and tx ops */
1509 	eth_dev->rx_pkt_burst = eth_vhost_rx;
1510 	eth_dev->tx_pkt_burst = eth_vhost_tx;
1511 
1512 	rte_eth_dev_probing_finish(eth_dev);
1513 	return 0;
1514 
1515 error:
1516 	if (internal)
1517 		rte_free(internal->iface_name);
1518 	rte_eth_dev_release_port(eth_dev);
1519 
1520 	return -1;
1521 }
1522 
1523 static inline int
1524 open_iface(const char *key __rte_unused, const char *value, void *extra_args)
1525 {
1526 	const char **iface_name = extra_args;
1527 
1528 	if (value == NULL)
1529 		return -1;
1530 
1531 	*iface_name = value;
1532 
1533 	return 0;
1534 }
1535 
1536 static inline int
1537 open_int(const char *key __rte_unused, const char *value, void *extra_args)
1538 {
1539 	uint16_t *n = extra_args;
1540 
1541 	if (value == NULL || extra_args == NULL)
1542 		return -EINVAL;
1543 
1544 	errno = 0;
	*n = (uint16_t)strtoul(value, NULL, 0);
1545 	if (*n == USHRT_MAX && errno == ERANGE)
1546 		return -1;
1547 
1548 	return 0;
1549 }
1550 
1551 static int
1552 rte_pmd_vhost_probe(struct rte_vdev_device *dev)
1553 {
1554 	struct rte_kvargs *kvlist = NULL;
1555 	int ret = 0;
1556 	char *iface_name;
1557 	uint16_t queues;
1558 	uint64_t flags = 0;
1559 	uint64_t disable_flags = 0;
1560 	int client_mode = 0;
1561 	int iommu_support = 0;
1562 	int postcopy_support = 0;
1563 	int tso = 0;
1564 	int linear_buf = 0;
1565 	int ext_buf = 0;
1566 	struct rte_eth_dev *eth_dev;
1567 	const char *name = rte_vdev_device_name(dev);
1568 
1569 	VHOST_LOG(INFO, "Initializing pmd_vhost for %s\n", name);
1570 
1571 	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1572 		eth_dev = rte_eth_dev_attach_secondary(name);
1573 		if (!eth_dev) {
1574 			VHOST_LOG(ERR, "Failed to probe %s\n", name);
1575 			return -1;
1576 		}
1577 		eth_dev->rx_pkt_burst = eth_vhost_rx;
1578 		eth_dev->tx_pkt_burst = eth_vhost_tx;
1579 		eth_dev->dev_ops = &ops;
1580 		if (dev->device.numa_node == SOCKET_ID_ANY)
1581 			dev->device.numa_node = rte_socket_id();
1582 		eth_dev->device = &dev->device;
1583 		rte_eth_dev_probing_finish(eth_dev);
1584 		return 0;
1585 	}
1586 
1587 	kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
1588 	if (kvlist == NULL)
1589 		return -1;
1590 
1591 	if (rte_kvargs_count(kvlist, ETH_VHOST_IFACE_ARG) == 1) {
1592 		ret = rte_kvargs_process(kvlist, ETH_VHOST_IFACE_ARG,
1593 					 &open_iface, &iface_name);
1594 		if (ret < 0)
1595 			goto out_free;
1596 	} else {
1597 		ret = -1;
1598 		goto out_free;
1599 	}
1600 
1601 	if (rte_kvargs_count(kvlist, ETH_VHOST_QUEUES_ARG) == 1) {
1602 		ret = rte_kvargs_process(kvlist, ETH_VHOST_QUEUES_ARG,
1603 					 &open_int, &queues);
1604 		if (ret < 0 || queues > RTE_MAX_QUEUES_PER_PORT)
1605 			goto out_free;
1606 
1607 	} else
1608 		queues = 1;
1609 
1610 	if (rte_kvargs_count(kvlist, ETH_VHOST_CLIENT_ARG) == 1) {
1611 		ret = rte_kvargs_process(kvlist, ETH_VHOST_CLIENT_ARG,
1612 					 &open_int, &client_mode);
1613 		if (ret < 0)
1614 			goto out_free;
1615 
1616 		if (client_mode)
1617 			flags |= RTE_VHOST_USER_CLIENT;
1618 	}
1619 
1620 	if (rte_kvargs_count(kvlist, ETH_VHOST_IOMMU_SUPPORT) == 1) {
1621 		ret = rte_kvargs_process(kvlist, ETH_VHOST_IOMMU_SUPPORT,
1622 					 &open_int, &iommu_support);
1623 		if (ret < 0)
1624 			goto out_free;
1625 
1626 		if (iommu_support)
1627 			flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
1628 	}
1629 
1630 	if (rte_kvargs_count(kvlist, ETH_VHOST_POSTCOPY_SUPPORT) == 1) {
1631 		ret = rte_kvargs_process(kvlist, ETH_VHOST_POSTCOPY_SUPPORT,
1632 					 &open_int, &postcopy_support);
1633 		if (ret < 0)
1634 			goto out_free;
1635 
1636 		if (postcopy_support)
1637 			flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT;
1638 	}
1639 
1640 	if (rte_kvargs_count(kvlist, ETH_VHOST_VIRTIO_NET_F_HOST_TSO) == 1) {
1641 		ret = rte_kvargs_process(kvlist,
1642 				ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
1643 				&open_int, &tso);
1644 		if (ret < 0)
1645 			goto out_free;
1646 
1647 		if (tso == 0) {
1648 			disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO4);
1649 			disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO6);
1650 		}
1651 	}
1652 
1653 	if (rte_kvargs_count(kvlist, ETH_VHOST_LINEAR_BUF) == 1) {
1654 		ret = rte_kvargs_process(kvlist,
1655 				ETH_VHOST_LINEAR_BUF,
1656 				&open_int, &linear_buf);
1657 		if (ret < 0)
1658 			goto out_free;
1659 
1660 		if (linear_buf == 1)
1661 			flags |= RTE_VHOST_USER_LINEARBUF_SUPPORT;
1662 	}
1663 
1664 	if (rte_kvargs_count(kvlist, ETH_VHOST_EXT_BUF) == 1) {
1665 		ret = rte_kvargs_process(kvlist,
1666 				ETH_VHOST_EXT_BUF,
1667 				&open_int, &ext_buf);
1668 		if (ret < 0)
1669 			goto out_free;
1670 
1671 		if (ext_buf == 1)
1672 			flags |= RTE_VHOST_USER_EXTBUF_SUPPORT;
1673 	}
1674 
1675 	if (dev->device.numa_node == SOCKET_ID_ANY)
1676 		dev->device.numa_node = rte_socket_id();
1677 
1678 	ret = eth_dev_vhost_create(dev, iface_name, queues,
1679 				   dev->device.numa_node, flags, disable_flags);
1680 	if (ret == -1)
1681 		VHOST_LOG(ERR, "Failed to create %s\n", name);
1682 
1683 out_free:
1684 	rte_kvargs_free(kvlist);
1685 	return ret;
1686 }
1687 
1688 static int
1689 rte_pmd_vhost_remove(struct rte_vdev_device *dev)
1690 {
1691 	const char *name;
1692 	struct rte_eth_dev *eth_dev = NULL;
1693 
1694 	name = rte_vdev_device_name(dev);
1695 	VHOST_LOG(INFO, "Un-Initializing pmd_vhost for %s\n", name);
1696 
1697 	/* find an ethdev entry */
1698 	eth_dev = rte_eth_dev_allocated(name);
1699 	if (eth_dev == NULL)
1700 		return 0;
1701 
1702 	eth_dev_close(eth_dev);
1703 	rte_eth_dev_release_port(eth_dev);
1704 
1705 	return 0;
1706 }
1707 
1708 static struct rte_vdev_driver pmd_vhost_drv = {
1709 	.probe = rte_pmd_vhost_probe,
1710 	.remove = rte_pmd_vhost_remove,
1711 };
1712 
1713 RTE_PMD_REGISTER_VDEV(net_vhost, pmd_vhost_drv);
1714 RTE_PMD_REGISTER_ALIAS(net_vhost, eth_vhost);
1715 RTE_PMD_REGISTER_PARAM_STRING(net_vhost,
1716 	"iface=<ifc> "
1717 	"queues=<int> "
1718 	"client=<0|1> "
1719 	"iommu-support=<0|1> "
1720 	"postcopy-support=<0|1> "
1721 	"tso=<0|1> "
1722 	"linear-buffer=<0|1> "
1723 	"ext-buffer=<0|1>");
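
/*
 * Example (illustrative): attaching a vhost-user port from the testpmd
 * command line:
 *   dpdk-testpmd --vdev 'net_vhost0,iface=/tmp/sock0,queues=1,client=0' ...
 */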
1724