xref: /f-stack/dpdk/examples/vhost/main.c (revision 2d9fd380)
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2017 Intel Corporation
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_vhost.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_pause.h>

#include "ioat.h"
#include "main.h"

#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

#define MBUF_CACHE_SIZE	128
#define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE

#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4	/* Number of retries on RX. */

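/* 0x2600 = 9728 bytes: room for a ~9K jumbo frame plus Ethernet and VLAN headers. */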
#define JUMBO_FRAME_MAX_SIZE    0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX			1
#define DEVICE_SAFE_REMOVE	2

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
static int mergeable;

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

static int client_mode;

static int builtin_net_driver;

static int async_vhost_driver;

static char dma_type[MAX_LONG_OPT_SZ];

/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Socket file paths. Can be set by user */
static char *socket_files;
static int nb_sockets;

/* Empty VMDq configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		/*
		 * VLAN strip is necessary for 1G NICs such as the I350;
		 * it fixes a bug where IPv4 forwarding in the guest could
		 * not forward packets from one virtio dev to another.
		 */
		.offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
			     DEV_TX_OFFLOAD_TCP_CKSUM |
			     DEV_TX_OFFLOAD_VLAN_INSERT |
			     DEV_TX_OFFLOAD_MULTI_SEGS |
			     DEV_TX_OFFLOAD_TCP_TSO),
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};


static unsigned lcore_ids[RTE_MAX_LCORE];
static uint16_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

static struct vhost_dev_tailq_list vhost_dev_list =
	TAILQ_HEAD_INITIALIZER(vhost_dev_list);

static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

#define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
				 / US_PER_S * BURST_TX_DRAIN_US)
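/*
 * Worked example: with a 2.0 GHz TSC, MBUF_TABLE_DRAIN_TSC =
 * ceil(2e9 / 1e6) * 100 = 200,000 cycles, i.e. the per-lcore TX table
 * is force-drained roughly every 100 us even under light traffic.
 */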
#define VLAN_HLEN       4

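/*
 * Dispatch to the DMA backend selected by --dma-type; only the "ioat"
 * backend exists in this example. "value" is the raw --dmas argument,
 * e.g. (per the DPDK vhost sample guide; PCI addresses are illustrative):
 *     --dma-type ioat --dmas [txd0@00:04.0,txd1@00:04.1]
 * which assigns one ioat channel to the Tx queue of vhost port 0 and 1.
 */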
static inline int
open_dma(const char *value)
{
	if (strncmp(dma_type, "ioat", 4) == 0)
		return open_ioat(value);

	return -1;
}

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

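	/*
	 * Example: with num_devices == 2 the loop above yields
	 * pool_map = {{1000, 0x1}, {1001, 0x2}}, i.e. VLAN 1000 maps to
	 * VMDq pool 0 and VLAN 1001 to pool 1 (IDs come from vlan_tags[]).
	 */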
	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}

/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter.
 */
static inline int
port_init(uint16_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info is used to validate the pool number specified on the command line. */
	retval = rte_eth_dev_info_get(port, &dev_info);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Error during getting device (port %u) info: %s\n",
			port, strerror(-retval));

		return retval;
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* Configure the number of supported virtio devices based on VMDQ limits. */
	num_devices = dev_info.max_vmdq_pools;

	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;

	tx_rings = (uint16_t)rte_lcore_count();

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base  = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);
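	/*
	 * Illustration with hypothetical dev_info values: max_rx_queues = 130,
	 * vmdq_queue_num = 128 and max_vmdq_pools = 64 give num_pf_queues = 2,
	 * queues_per_pool = 2 and num_vmdq_queues = 64 * 2 = 128.
	 */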

	if (!rte_eth_dev_is_valid_port(port))
		return -1;

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
		port_conf.txmode.offloads |=
			DEV_TX_OFFLOAD_MBUF_FAST_FREE;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
			port, strerror(-retval));
		return retval;
	}

	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
		&tx_ring_size);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
			"for port %u: %s.\n", port, strerror(-retval));
		return retval;
	}
	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
			"for Rx queues on port %u.\n", port);
		return -1;
	}

	/* Setup the queues. */
	rxconf->offloads = port_conf.rxmode.offloads;
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						mbuf_pool);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup rx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}
	txconf->offloads = port_conf.txmode.offloads;
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup tx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
			port, strerror(-retval));
		return retval;
	}

	if (promiscuous) {
		retval = rte_eth_promiscuous_enable(port);
		if (retval != 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to enable promiscuous mode on port %u: %s\n",
				port, rte_strerror(-retval));
			return retval;
		}
	}

	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Failed to get MAC address on port %u: %s\n",
			port, rte_strerror(-retval));
		return retval;
	}

	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
			port,
			vmdq_ports_eth_addr[port].addr_bytes[0],
			vmdq_ports_eth_addr[port].addr_bytes[1],
			vmdq_ports_eth_addr[port].addr_bytes[2],
			vmdq_ports_eth_addr[port].addr_bytes[3],
			vmdq_ports_eth_addr[port].addr_bytes[4],
			vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}

/*
 * Set socket file path.
 */
static int
us_vhost_parse_socket_path(const char *q_arg)
{
	char *old;

	/* Reject paths that are too long to store. */
	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
		return -1;

	old = socket_files;
	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
	if (socket_files == NULL) {
		free(old);
		return -1;
	}

	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
	nb_sockets++;

	return 0;
}
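
/*
 * Note: socket_files is a flat array of fixed-size slots, so the path
 * for socket i is read back from socket_files + i * PATH_MAX.
 */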

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return 0;

	return pm;

}
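
/* Example: "-p 0x3" selects ports 0 and 1. Malformed input returns 0,
 * which the caller rejects as an invalid portmask. */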

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;

}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"		--vm2vm [0|1|2]\n"
	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"		--socket-file <path>\n"
	"		--nb-devices ND\n"
	"		-p PORTMASK: Set mask for ports to be used by application\n"
	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only if rx retries are enabled\n"
	"		--rx-retry-num [0-N]: the number of retries on rx. Takes effect only if rx retries are enabled\n"
	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"		--socket-file: The path of the socket file.\n"
	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
	"		--tso [0|1] disable/enable TCP segment offload.\n"
	"		--client register a vhost-user socket as client mode.\n"
	"		--dma-type register dma type for your vhost async driver. For example \"ioat\" for now.\n"
	"		--dmas register dma channel for specific vhost device.\n",
	       prgname);
}
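
/*
 * A typical invocation (core list, portmask and socket path are
 * illustrative, not mandated by the code):
 *     ./dpdk-vhost -l 0-3 -n 4 -- -p 0x1 --socket-file /tmp/sock0 --client
 */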

/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{"vm2vm", required_argument, NULL, 0},
		{"rx-retry", required_argument, NULL, 0},
		{"rx-retry-delay", required_argument, NULL, 0},
		{"rx-retry-num", required_argument, NULL, 0},
		{"mergeable", required_argument, NULL, 0},
		{"stats", required_argument, NULL, 0},
		{"socket-file", required_argument, NULL, 0},
		{"tx-csum", required_argument, NULL, 0},
		{"tso", required_argument, NULL, 0},
		{"client", no_argument, &client_mode, 1},
		{"builtin-net-driver", no_argument, &builtin_net_driver, 1},
		{"dma-type", required_argument, NULL, 0},
		{"dmas", required_argument, NULL, 0},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				ETH_VMDQ_ACCEPT_BROADCAST |
				ETH_VMDQ_ACCEPT_MULTICAST;

			break;

		case 0:
			/* Enable/disable vm2vm comms. */
			if (!strncmp(long_option[option_index].name, "vm2vm",
				MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for "
						"vm2vm [0|1|2]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					vm2vm_mode = (vm2vm_type)ret;
				}
			}

			/* Enable/disable retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_retry = ret;
				}
			}

			/* Enable/disable TX checksum offload. */
			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					enable_tx_csum = ret;
			}

			/* Enable/disable TSO offload. */
			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					enable_tso = ret;
			}

			/* Specify the retry delay time (in microseconds) on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_delay_time = ret;
				}
			}

			/* Specify the number of retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_retry_num = ret;
				}
			}

			/* Enable/disable RX mergeable buffers. */
			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					mergeable = !!ret;
					if (ret) {
						vmdq_conf_default.rxmode.offloads |=
							DEV_RX_OFFLOAD_JUMBO_FRAME;
						vmdq_conf_default.rxmode.max_rx_pkt_len
							= JUMBO_FRAME_MAX_SIZE;
					}
				}
			}

			/* Enable/disable stats. */
			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for stats [0..N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_stats = ret;
				}
			}

			/* Set socket file path. */
			if (!strncmp(long_option[option_index].name,
						"socket-file", MAX_LONG_OPT_SZ)) {
				if (us_vhost_parse_socket_path(optarg) == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for socket name (max %d characters)\n",
					PATH_MAX);
					us_vhost_usage(prgname);
					return -1;
				}
			}

			if (!strncmp(long_option[option_index].name,
						"dma-type", MAX_LONG_OPT_SZ)) {
				if (strlen(optarg) >= MAX_LONG_OPT_SZ) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Wrong DMA type\n");
					us_vhost_usage(prgname);
					return -1;
				}
				strcpy(dma_type, optarg);
			}

			if (!strncmp(long_option[option_index].name,
						"dmas", MAX_LONG_OPT_SZ)) {
				if (open_dma(optarg) == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Wrong DMA args\n");
					us_vhost_usage(prgname);
					return -1;
				}
				async_vhost_driver = 1;
			}

			break;

			/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	return 0;
}

/*
 * Update the global variable num_ports and the ports[] array according
 * to the number of ports in the system, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (!rte_eth_dev_is_valid_port(ports[portid])) {
			RTE_LOG(INFO, VHOST_PORT,
				"\nSpecified port ID(%u) is not valid\n",
				ports[portid]);
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

static __rte_always_inline struct vhost_dev *
find_vhost_dev(struct rte_ether_addr *mac)
{
	struct vhost_dev *vdev;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->ready == DEVICE_RX &&
		    rte_is_same_ether_addr(mac, &vdev->mac_address))
			return vdev;
	}

	return NULL;
}

/*
 * This function learns the MAC address of the device and registers this along with a
 * vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	if (find_vhost_dev(&pkt_hdr->s_addr)) {
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) device is using a registered MAC!\n",
			vdev->vid);
		return -1;
	}

	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[vdev->vid];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA,
		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
		vdev->vid,
		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
				(uint32_t)vdev->vid + vmdq_pool_base);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) failed to add device MAC address to VMDQ\n",
			vdev->vid);

	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* Clear MAC and VLAN settings. */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < 6; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers. */
		rx_count = rte_eth_rx_burst(ports[0],
					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

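/*
 * Forward a single packet to a local vhost device's Rx ring. Behavior
 * sketch: the builtin net driver enqueues via vs_enqueue_pkts(); the
 * async driver submits the copy and then polls until it completes, so
 * VM2VM forwarding is effectively synchronous here; otherwise the packet
 * goes through plain rte_vhost_enqueue_burst().
 */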
static __rte_always_inline void
virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
	    struct rte_mbuf *m)
{
	uint16_t ret;
	struct rte_mbuf *m_cpl[1];

	if (builtin_net_driver) {
		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
	} else if (async_vhost_driver) {
		ret = rte_vhost_submit_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ,
						&m, 1);

		if (likely(ret))
			dst_vdev->nr_async_pkts++;

		while (likely(dst_vdev->nr_async_pkts)) {
			if (rte_vhost_poll_enqueue_completed(dst_vdev->vid,
					VIRTIO_RXQ, m_cpl, 1))
				dst_vdev->nr_async_pkts--;
		}
	} else {
		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
	}

	if (enable_stats) {
		rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
		rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
		src_vdev->stats.tx_total++;
		src_vdev->stats.tx += ret;
	}
}

/*
 * Check if the packet destination MAC address is for a local device. If so then put
 * the packet on that device's RX queue. If not then return.
 */
static __rte_always_inline int
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	struct vhost_dev *dst_vdev;

	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
	if (!dst_vdev)
		return -1;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return 0;
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is local\n", dst_vdev->vid);

	if (unlikely(dst_vdev->remove)) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) device is marked for removal\n", dst_vdev->vid);
		return 0;
	}

	virtio_xmit(dst_vdev, vdev, m);
	return 0;
}

/*
 * Check if the destination MAC of a packet is one local VM,
 * and get its vlan tag, and offset if it is.
 */
static __rte_always_inline int
find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct vhost_dev *dst_vdev;
	struct rte_ether_hdr *pkt_hdr =
		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
	if (!dst_vdev)
		return 0;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return -1;
	}

	/*
	 * HW vlan strip shortens the packet by the length of the VLAN tag,
	 * so the packet length must be restored by adding it back.
	 */
	*offset  = VLAN_HLEN;
	*vlan_tag = vlan_tags[vdev->vid];

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
		vdev->vid, dst_vdev->vid, *vlan_tag);

	return 0;
}

static uint16_t
get_psd_sum(void *l3_hdr, uint64_t ol_flags)
{
	if (ol_flags & PKT_TX_IPV4)
		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
	else /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
}

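/*
 * Prepare a TSO packet for NIC offload: zero the IPv4 header checksum
 * (hardware recomputes it) and seed the TCP checksum with the
 * pseudo-header sum, as DEV_TX_OFFLOAD_TCP_TSO expects. This assumes
 * m->l2_len/l3_len were already populated when the packet was dequeued.
 */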
static void virtio_tx_offload(struct rte_mbuf *m)
{
	void *l3_hdr;
	struct rte_ipv4_hdr *ipv4_hdr = NULL;
	struct rte_tcp_hdr *tcp_hdr = NULL;
	struct rte_ether_hdr *eth_hdr =
		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	l3_hdr = (char *)eth_hdr + m->l2_len;

	if (m->ol_flags & PKT_TX_IPV4) {
		ipv4_hdr = l3_hdr;
		ipv4_hdr->hdr_checksum = 0;
		m->ol_flags |= PKT_TX_IP_CKSUM;
	}

	tcp_hdr = (struct rte_tcp_hdr *)((char *)l3_hdr + m->l3_len);
	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
}

static inline void
free_pkts(struct rte_mbuf **pkts, uint16_t n)
{
	while (n--)
		rte_pktmbuf_free(pkts[n]);
}

static __rte_always_inline void
do_drain_mbuf_table(struct mbuf_table *tx_q)
{
	uint16_t count;

	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
				 tx_q->m_table, tx_q->len);
	if (unlikely(count < tx_q->len))
		free_pkts(&tx_q->m_table[count], tx_q->len - count);

	tx_q->len = 0;
}

/*
 * This function routes the TX packet to the correct interface. This
 * may be a local device or the physical port.
 */
static __rte_always_inline void
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	unsigned offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct rte_ether_hdr *nh;


	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
		struct vhost_dev *vdev2;

		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
			if (vdev2 != vdev)
				virtio_xmit(vdev2, vdev, m);
		}
		goto queue2nic;
	}

	/* Check if destination is a local VM. */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
		rte_pktmbuf_free(m);
		return;
	}

	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
		if (unlikely(find_local_dest(vdev, m, &offset,
					     &vlan_tag) != 0)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is external\n", vdev->vid);

queue2nic:

	/* Add packet to the port tx queue. */
	tx_q = &lcore_tx_queue[lcore_id];

	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
		/* Guest has inserted the vlan tag. */
		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
		if ((vm2vm_mode == VM2VM_HARDWARE) &&
			(vh->vlan_tci != vlan_tag_be))
			vh->vlan_tci = vlan_tag_be;
	} else {
		m->ol_flags |= PKT_TX_VLAN_PKT;

		/*
		 * Find the right seg to adjust the data len when offset is
		 * bigger than tail room size.
		 */
		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
			if (likely(offset <= rte_pktmbuf_tailroom(m)))
				m->data_len += offset;
			else {
				struct rte_mbuf *seg = m;

				while ((seg->next != NULL) &&
					(offset > rte_pktmbuf_tailroom(seg)))
					seg = seg->next;

				seg->data_len += offset;
			}
			m->pkt_len += offset;
		}

		m->vlan_tci = vlan_tag;
	}

	if (m->ol_flags & PKT_TX_TCP_SEG)
		virtio_tx_offload(m);

	tx_q->m_table[tx_q->len++] = m;
	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(tx_q->len == MAX_PKT_BURST))
		do_drain_mbuf_table(tx_q);
}


static __rte_always_inline void
drain_mbuf_table(struct mbuf_table *tx_q)
{
	static uint64_t prev_tsc;
	uint64_t cur_tsc;

	if (tx_q->len == 0)
		return;

	cur_tsc = rte_rdtsc();
	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
		prev_tsc = cur_tsc;

		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"TX queue drained after timeout with burst size %u\n",
			tx_q->len);
		do_drain_mbuf_table(tx_q);
	}
}

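/*
 * Reap completed async copies on (vid, qid): packets whose DMA transfer
 * has finished are returned by rte_vhost_poll_enqueue_completed(), the
 * outstanding counter is decremented, and the source mbufs are freed.
 */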
static __rte_always_inline void
complete_async_pkts(struct vhost_dev *vdev, uint16_t qid)
{
	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
	uint16_t complete_count;

	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
						qid, p_cpl, MAX_PKT_BURST);
	vdev->nr_async_pkts -= complete_count;
	if (complete_count)
		free_pkts(p_cpl, complete_count);
}

static __rte_always_inline void
drain_eth_rx(struct vhost_dev *vdev)
{
	uint16_t rx_count, enqueue_count;
	struct rte_mbuf *pkts[MAX_PKT_BURST];

	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
				    pkts, MAX_PKT_BURST);

	while (likely(vdev->nr_async_pkts))
		complete_async_pkts(vdev, VIRTIO_RXQ);

	if (!rx_count)
		return;

	/*
	 * When "enable_retry" is set, wait and retry when there are not
	 * enough free slots in the queue to hold @rx_count packets,
	 * to diminish packet loss.
	 */
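	/*
	 * With the defaults this waits at most BURST_RX_RETRIES *
	 * BURST_RX_WAIT_US = 4 * 15 = 60 us per burst before giving up.
	 */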
1109a9643ea8Slogwang 	if (enable_retry &&
1110a9643ea8Slogwang 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1111a9643ea8Slogwang 			VIRTIO_RXQ))) {
1112a9643ea8Slogwang 		uint32_t retry;
1113a9643ea8Slogwang 
1114a9643ea8Slogwang 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1115a9643ea8Slogwang 			rte_delay_us(burst_rx_delay_time);
1116a9643ea8Slogwang 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1117a9643ea8Slogwang 					VIRTIO_RXQ))
1118a9643ea8Slogwang 				break;
1119a9643ea8Slogwang 		}
1120a9643ea8Slogwang 	}
1121a9643ea8Slogwang 
11222bfe3f2eSlogwang 	if (builtin_net_driver) {
11232bfe3f2eSlogwang 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
11242bfe3f2eSlogwang 						pkts, rx_count);
1125*2d9fd380Sjfb8856606 	} else if (async_vhost_driver) {
1126*2d9fd380Sjfb8856606 		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1127*2d9fd380Sjfb8856606 					VIRTIO_RXQ, pkts, rx_count);
1128*2d9fd380Sjfb8856606 		vdev->nr_async_pkts += enqueue_count;
11292bfe3f2eSlogwang 	} else {
1130a9643ea8Slogwang 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1131a9643ea8Slogwang 						pkts, rx_count);
11322bfe3f2eSlogwang 	}
1133*2d9fd380Sjfb8856606 
1134a9643ea8Slogwang 	if (enable_stats) {
1135a9643ea8Slogwang 		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1136a9643ea8Slogwang 		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1137a9643ea8Slogwang 	}
1138a9643ea8Slogwang 
1139*2d9fd380Sjfb8856606 	if (!async_vhost_driver)
1140a9643ea8Slogwang 		free_pkts(pkts, rx_count);
1141a9643ea8Slogwang }
1142a9643ea8Slogwang 
11432bfe3f2eSlogwang static __rte_always_inline void
drain_virtio_tx(struct vhost_dev * vdev)1144a9643ea8Slogwang drain_virtio_tx(struct vhost_dev *vdev)
1145a9643ea8Slogwang {
1146a9643ea8Slogwang 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1147a9643ea8Slogwang 	uint16_t count;
1148a9643ea8Slogwang 	uint16_t i;
1149a9643ea8Slogwang 
11502bfe3f2eSlogwang 	if (builtin_net_driver) {
11512bfe3f2eSlogwang 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1152a9643ea8Slogwang 					pkts, MAX_PKT_BURST);
11532bfe3f2eSlogwang 	} else {
11542bfe3f2eSlogwang 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
11552bfe3f2eSlogwang 					mbuf_pool, pkts, MAX_PKT_BURST);
11562bfe3f2eSlogwang 	}
1157a9643ea8Slogwang 
1158a9643ea8Slogwang 	/* setup VMDq for the first packet */
1159a9643ea8Slogwang 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1160a9643ea8Slogwang 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1161a9643ea8Slogwang 			free_pkts(pkts, count);
1162a9643ea8Slogwang 	}
1163a9643ea8Slogwang 
1164a9643ea8Slogwang 	for (i = 0; i < count; ++i)
1165a9643ea8Slogwang 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1166a9643ea8Slogwang }
1167a9643ea8Slogwang 
1168a9643ea8Slogwang /*
1169a9643ea8Slogwang  * Main function of vhost-switch. It basically does:
1170a9643ea8Slogwang  *
1171a9643ea8Slogwang  * for each vhost device {
1172a9643ea8Slogwang  *    - drain_eth_rx()
1173a9643ea8Slogwang  *
1174a9643ea8Slogwang  *      Which drains the host eth Rx queue linked to the vhost device,
1175a9643ea8Slogwang  *      and deliver all of them to guest virito Rx ring associated with
1176a9643ea8Slogwang  *      this vhost device.
1177a9643ea8Slogwang  *
1178a9643ea8Slogwang  *    - drain_virtio_tx()
1179a9643ea8Slogwang  *
1180a9643ea8Slogwang  *      Which drains the guest virtio Tx queue and deliver all of them
1181a9643ea8Slogwang  *      to the target, which could be another vhost device, or the
1182a9643ea8Slogwang  *      physical eth dev. The route is done in function "virtio_tx_route".
1183a9643ea8Slogwang  * }
1184a9643ea8Slogwang  */
1185a9643ea8Slogwang static int
switch_worker(void * arg __rte_unused)1186a9643ea8Slogwang switch_worker(void *arg __rte_unused)
1187a9643ea8Slogwang {
1188a9643ea8Slogwang 	unsigned i;
1189a9643ea8Slogwang 	unsigned lcore_id = rte_lcore_id();
1190a9643ea8Slogwang 	struct vhost_dev *vdev;
1191a9643ea8Slogwang 	struct mbuf_table *tx_q;
1192a9643ea8Slogwang 
1193a9643ea8Slogwang 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1194a9643ea8Slogwang 
1195a9643ea8Slogwang 	tx_q = &lcore_tx_queue[lcore_id];
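	/* Find this lcore's slot in lcore_ids[]; that slot index is used
	 * as the TX queue id for this core.
	 */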
1196a9643ea8Slogwang 	for (i = 0; i < rte_lcore_count(); i++) {
1197a9643ea8Slogwang 		if (lcore_ids[i] == lcore_id) {
1198a9643ea8Slogwang 			tx_q->txq_id = i;
1199a9643ea8Slogwang 			break;
1200a9643ea8Slogwang 		}
1201a9643ea8Slogwang 	}
1202a9643ea8Slogwang 
1203a9643ea8Slogwang 	while (1) {
1204a9643ea8Slogwang 		drain_mbuf_table(tx_q);
1205a9643ea8Slogwang 
1206a9643ea8Slogwang 		/*
1207a9643ea8Slogwang 		 * If requested, inform the configuration core that we have
1208a9643ea8Slogwang 		 * exited the linked list and that no devices are in use.
1209a9643ea8Slogwang 		 */
1210a9643ea8Slogwang 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1211a9643ea8Slogwang 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1212a9643ea8Slogwang 
1213a9643ea8Slogwang 		/*
1214a9643ea8Slogwang 		 * Process vhost devices
1215a9643ea8Slogwang 		 */
1216a9643ea8Slogwang 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1217a9643ea8Slogwang 			      lcore_vdev_entry) {
1218a9643ea8Slogwang 			if (unlikely(vdev->remove)) {
1219a9643ea8Slogwang 				unlink_vmdq(vdev);
1220a9643ea8Slogwang 				vdev->ready = DEVICE_SAFE_REMOVE;
1221a9643ea8Slogwang 				continue;
1222a9643ea8Slogwang 			}
1223a9643ea8Slogwang 
1224a9643ea8Slogwang 			if (likely(vdev->ready == DEVICE_RX))
1225a9643ea8Slogwang 				drain_eth_rx(vdev);
1226a9643ea8Slogwang 
1227a9643ea8Slogwang 			if (likely(!vdev->remove))
1228a9643ea8Slogwang 				drain_virtio_tx(vdev);
1229a9643ea8Slogwang 		}
1230a9643ea8Slogwang 	}
1231a9643ea8Slogwang 
1232a9643ea8Slogwang 	return 0;
1233a9643ea8Slogwang }
1234a9643ea8Slogwang 
1235a9643ea8Slogwang /*
1236a9643ea8Slogwang  * Remove a device from the specific data core linked list and from the
1237a9643ea8Slogwang  * main linked list. Synchronization occurs through the use of the
1238a9643ea8Slogwang  * lcore dev_removal_flag. The device is made volatile here to avoid
1239a9643ea8Slogwang  * reordering of dev->remove=1, which can cause an infinite rte_pause() loop.
1240a9643ea8Slogwang  */
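/*
 * A sketch of that removal handshake (control side below in
 * destroy_device(); worker side in switch_worker() above):
 *
 *   control thread                       worker lcore
 *   --------------                       ------------
 *   vdev->remove = 1
 *   wait: ready == DEVICE_SAFE_REMOVE    sees remove, unlink_vmdq(),
 *                                        ready = DEVICE_SAFE_REMOVE
 *   unlink vdev from the lists
 *   flag = REQUEST_DEV_REMOVAL           sees the request,
 *   wait: flag == ACK_DEV_REMOVAL        flag = ACK_DEV_REMOVAL
 *   rte_free(vdev)
 */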
1241a9643ea8Slogwang static void
1242a9643ea8Slogwang destroy_device(int vid)
1243a9643ea8Slogwang {
1244a9643ea8Slogwang 	struct vhost_dev *vdev = NULL;
1245a9643ea8Slogwang 	int lcore;
1246a9643ea8Slogwang 
1247a9643ea8Slogwang 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1248a9643ea8Slogwang 		if (vdev->vid == vid)
1249a9643ea8Slogwang 			break;
1250a9643ea8Slogwang 	}
1251a9643ea8Slogwang 	if (!vdev)
1252a9643ea8Slogwang 		return;
1253a9643ea8Slogwang 	/* Set the remove flag. */
1254a9643ea8Slogwang 	vdev->remove = 1;
1255a9643ea8Slogwang 	while (vdev->ready != DEVICE_SAFE_REMOVE) {
1256a9643ea8Slogwang 		rte_pause();
1257a9643ea8Slogwang 	}
1258a9643ea8Slogwang 
12592bfe3f2eSlogwang 	if (builtin_net_driver)
12602bfe3f2eSlogwang 		vs_vhost_net_remove(vdev);
12612bfe3f2eSlogwang 
1262a9643ea8Slogwang 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1263a9643ea8Slogwang 		     lcore_vdev_entry);
1264a9643ea8Slogwang 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1265a9643ea8Slogwang 
1266a9643ea8Slogwang 
1267a9643ea8Slogwang 	/* Set the dev_removal_flag on each lcore. */
1268*2d9fd380Sjfb8856606 	RTE_LCORE_FOREACH_WORKER(lcore)
1269a9643ea8Slogwang 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1270a9643ea8Slogwang 
1271a9643ea8Slogwang 	/*
1272a9643ea8Slogwang 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1273a9643ea8Slogwang 	 * we can be sure that they can no longer access the device removed
1274a9643ea8Slogwang 	 * from the linked lists and that the devices are no longer in use.
1275a9643ea8Slogwang 	 */
1276*2d9fd380Sjfb8856606 	RTE_LCORE_FOREACH_WORKER(lcore) {
1277a9643ea8Slogwang 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1278a9643ea8Slogwang 			rte_pause();
1279a9643ea8Slogwang 	}
1280a9643ea8Slogwang 
1281a9643ea8Slogwang 	lcore_info[vdev->coreid].device_num--;
1282a9643ea8Slogwang 
1283a9643ea8Slogwang 	RTE_LOG(INFO, VHOST_DATA,
1284a9643ea8Slogwang 		"(%d) device has been removed from data core\n",
1285a9643ea8Slogwang 		vdev->vid);
1286a9643ea8Slogwang 
1287*2d9fd380Sjfb8856606 	if (async_vhost_driver)
1288*2d9fd380Sjfb8856606 		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1289*2d9fd380Sjfb8856606 
1290a9643ea8Slogwang 	rte_free(vdev);
1291a9643ea8Slogwang }
1292a9643ea8Slogwang 
1293a9643ea8Slogwang /*
1294a9643ea8Slogwang  * A new device is added to a data core. First the device is added to the main linked list
12951646932aSjfb8856606  * and then allocated to a specific data core.
1296a9643ea8Slogwang  */
1297a9643ea8Slogwang static int
1298a9643ea8Slogwang new_device(int vid)
1299a9643ea8Slogwang {
1300a9643ea8Slogwang 	int lcore, core_add = 0;
1301a9643ea8Slogwang 	uint32_t device_num_min = num_devices;
1302a9643ea8Slogwang 	struct vhost_dev *vdev;
1303a9643ea8Slogwang 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1304a9643ea8Slogwang 	if (vdev == NULL) {
1305a9643ea8Slogwang 		RTE_LOG(INFO, VHOST_DATA,
1306a9643ea8Slogwang 			"(%d) couldn't allocate memory for vhost dev\n",
1307a9643ea8Slogwang 			vid);
1308a9643ea8Slogwang 		return -1;
1309a9643ea8Slogwang 	}
1310a9643ea8Slogwang 	vdev->vid = vid;
1311a9643ea8Slogwang 
13122bfe3f2eSlogwang 	if (builtin_net_driver)
13132bfe3f2eSlogwang 		vs_vhost_net_setup(vdev);
13142bfe3f2eSlogwang 
1315a9643ea8Slogwang 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
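	/* Each vhost device owns one VMDq pool; its RX queue index is
	 * derived from the VMDq queue base plus a per-device stride.
	 */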
1316a9643ea8Slogwang 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1317a9643ea8Slogwang 
1318a9643ea8Slogwang 	/* Reset the ready flag. */
1319a9643ea8Slogwang 	vdev->ready = DEVICE_MAC_LEARNING;
1320a9643ea8Slogwang 	vdev->remove = 0;
1321a9643ea8Slogwang 
1322a9643ea8Slogwang 	/* Find a suitable lcore to add the device. */
1323*2d9fd380Sjfb8856606 	RTE_LCORE_FOREACH_WORKER(lcore) {
1324a9643ea8Slogwang 		if (lcore_info[lcore].device_num < device_num_min) {
1325a9643ea8Slogwang 			device_num_min = lcore_info[lcore].device_num;
1326a9643ea8Slogwang 			core_add = lcore;
1327a9643ea8Slogwang 		}
1328a9643ea8Slogwang 	}
1329a9643ea8Slogwang 	vdev->coreid = core_add;
1330a9643ea8Slogwang 
1331a9643ea8Slogwang 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1332a9643ea8Slogwang 			  lcore_vdev_entry);
1333a9643ea8Slogwang 	lcore_info[vdev->coreid].device_num++;
1334a9643ea8Slogwang 
1335a9643ea8Slogwang 	/* Disable notifications. */
1336a9643ea8Slogwang 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1337a9643ea8Slogwang 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1338a9643ea8Slogwang 
1339a9643ea8Slogwang 	RTE_LOG(INFO, VHOST_DATA,
1340a9643ea8Slogwang 		"(%d) device has been added to data core %d\n",
1341a9643ea8Slogwang 		vid, vdev->coreid);
1342a9643ea8Slogwang 
1343*2d9fd380Sjfb8856606 	if (async_vhost_driver) {
1344*2d9fd380Sjfb8856606 		struct rte_vhost_async_features f;
1345*2d9fd380Sjfb8856606 		struct rte_vhost_async_channel_ops channel_ops;
1346*2d9fd380Sjfb8856606 		if (strncmp(dma_type, "ioat", 4) == 0) {
1347*2d9fd380Sjfb8856606 			channel_ops.transfer_data = ioat_transfer_data_cb;
1348*2d9fd380Sjfb8856606 			channel_ops.check_completed_copies =
1349*2d9fd380Sjfb8856606 				ioat_check_completed_copies_cb;
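			/* Assumed semantics of the async features set
			 * below: completions are reported in submission
			 * order, and copies shorter than the threshold
			 * are done by the CPU rather than offloaded to
			 * the DMA engine.
			 */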
1350*2d9fd380Sjfb8856606 			f.async_inorder = 1;
1351*2d9fd380Sjfb8856606 			f.async_threshold = 256;
1352*2d9fd380Sjfb8856606 			return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
1353*2d9fd380Sjfb8856606 				f.intval, &channel_ops);
1354*2d9fd380Sjfb8856606 		}
1355*2d9fd380Sjfb8856606 	}
1356*2d9fd380Sjfb8856606 
1357a9643ea8Slogwang 	return 0;
1358a9643ea8Slogwang }
1359a9643ea8Slogwang 
1360a9643ea8Slogwang /*
1361a9643ea8Slogwang  * These callbacks allow devices to be added to the data core when
1362a9643ea8Slogwang  * configuration has been fully completed.
1363a9643ea8Slogwang  */
13642bfe3f2eSlogwang static const struct vhost_device_ops virtio_net_device_ops =
1365a9643ea8Slogwang {
1366a9643ea8Slogwang 	.new_device =  new_device,
1367a9643ea8Slogwang 	.destroy_device = destroy_device,
1368a9643ea8Slogwang };
1369a9643ea8Slogwang 
1370a9643ea8Slogwang /*
1371a9643ea8Slogwang  * This thread wakes up periodically to print statistics if the user
1372a9643ea8Slogwang  * has enabled them.
1373a9643ea8Slogwang  */
1374d30ea906Sjfb8856606 static void *
1375d30ea906Sjfb8856606 print_stats(__rte_unused void *arg)
1376a9643ea8Slogwang {
1377a9643ea8Slogwang 	struct vhost_dev *vdev;
1378a9643ea8Slogwang 	uint64_t tx_dropped, rx_dropped;
1379a9643ea8Slogwang 	uint64_t tx, tx_total, rx, rx_total;
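	/* ANSI escape sequences: ESC [2J clears the screen and
	 * ESC [1;1H moves the cursor to the top-left corner.
	 */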
1380a9643ea8Slogwang 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1381a9643ea8Slogwang 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1382a9643ea8Slogwang 
1383a9643ea8Slogwang 	while (1) {
1384a9643ea8Slogwang 		sleep(enable_stats);
1385a9643ea8Slogwang 
1386a9643ea8Slogwang 		/* Clear screen and move to top left */
1387a9643ea8Slogwang 		printf("%s%s\n", clr, top_left);
1388a9643ea8Slogwang 		printf("Device statistics =================================\n");
1389a9643ea8Slogwang 
1390a9643ea8Slogwang 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1391a9643ea8Slogwang 			tx_total   = vdev->stats.tx_total;
1392a9643ea8Slogwang 			tx         = vdev->stats.tx;
1393a9643ea8Slogwang 			tx_dropped = tx_total - tx;
1394a9643ea8Slogwang 
1395a9643ea8Slogwang 			rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1396a9643ea8Slogwang 			rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1397a9643ea8Slogwang 			rx_dropped = rx_total - rx;
1398a9643ea8Slogwang 
1399a9643ea8Slogwang 			printf("Statistics for device %d\n"
1400a9643ea8Slogwang 				"-----------------------\n"
1401a9643ea8Slogwang 				"TX total:              %" PRIu64 "\n"
1402a9643ea8Slogwang 				"TX dropped:            %" PRIu64 "\n"
1403a9643ea8Slogwang 				"TX successful:         %" PRIu64 "\n"
1404a9643ea8Slogwang 				"RX total:              %" PRIu64 "\n"
1405a9643ea8Slogwang 				"RX dropped:            %" PRIu64 "\n"
1406a9643ea8Slogwang 				"RX successful:         %" PRIu64 "\n",
1407a9643ea8Slogwang 				vdev->vid,
1408a9643ea8Slogwang 				tx_total, tx_dropped, tx,
1409a9643ea8Slogwang 				rx_total, rx_dropped, rx);
1410a9643ea8Slogwang 		}
1411a9643ea8Slogwang 
1412a9643ea8Slogwang 		printf("===================================================\n");
14130c6bd470Sfengbojiang 
14140c6bd470Sfengbojiang 		fflush(stdout);
1415a9643ea8Slogwang 	}
1416d30ea906Sjfb8856606 
1417d30ea906Sjfb8856606 	return NULL;
1418a9643ea8Slogwang }
1419a9643ea8Slogwang 
14202bfe3f2eSlogwang static void
14212bfe3f2eSlogwang unregister_drivers(int socket_num)
14222bfe3f2eSlogwang {
14232bfe3f2eSlogwang 	int i, ret;
14242bfe3f2eSlogwang 
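	/* socket_files is one flat buffer: nb_sockets paths, each in
	 * its own PATH_MAX-sized slot.
	 */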
14252bfe3f2eSlogwang 	for (i = 0; i < socket_num; i++) {
14262bfe3f2eSlogwang 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
14272bfe3f2eSlogwang 		if (ret != 0)
14282bfe3f2eSlogwang 			RTE_LOG(ERR, VHOST_CONFIG,
14292bfe3f2eSlogwang 				"Failed to unregister vhost driver for %s.\n",
14302bfe3f2eSlogwang 				socket_files + i * PATH_MAX);
14312bfe3f2eSlogwang 	}
14322bfe3f2eSlogwang }
14332bfe3f2eSlogwang 
1434a9643ea8Slogwang /* When we receive an INT signal, unregister the vhost driver */
1435a9643ea8Slogwang static void
1436a9643ea8Slogwang sigint_handler(__rte_unused int signum)
1437a9643ea8Slogwang {
1438a9643ea8Slogwang 	/* Unregister vhost driver. */
14392bfe3f2eSlogwang 	unregister_drivers(nb_sockets);
14402bfe3f2eSlogwang 
1441a9643ea8Slogwang 	exit(0);
1442a9643ea8Slogwang }
1443a9643ea8Slogwang 
1444a9643ea8Slogwang /*
1445a9643ea8Slogwang  * While creating an mbuf pool, one key thing is to figure out how
1446a9643ea8Slogwang  * many mbuf entries are enough for our use. Here are some
1447a9643ea8Slogwang  * guidelines:
1448a9643ea8Slogwang  *
1449a9643ea8Slogwang  * - Each rx queue reserves @nr_rx_desc mbufs at the queue setup stage.
1450a9643ea8Slogwang  *
1451a9643ea8Slogwang  * - For each switch core (a CPU core that does the packet switching),
1452a9643ea8Slogwang  *   we also need to reserve some mbufs for receiving packets from the
1453a9643ea8Slogwang  *   virtio Tx queue. How many are enough depends on the usage; it is
1454a9643ea8Slogwang  *   normally a simple calculation like the following:
1455a9643ea8Slogwang  *
1456a9643ea8Slogwang  *       MAX_PKT_BURST * max packet size / mbuf size
1457a9643ea8Slogwang  *
1458a9643ea8Slogwang  *   So we definitely need to allocate more mbufs when TSO is enabled.
1459a9643ea8Slogwang  *
1460a9643ea8Slogwang  * - Similarly, each switch core should reserve @nr_rx_desc mbufs for
1461a9643ea8Slogwang  *   receiving packets from the physical NIC device.
1462a9643ea8Slogwang  *
1463a9643ea8Slogwang  * - We also need to make sure that, for each switch core, we have
1464a9643ea8Slogwang  *   allocated enough mbufs to fill up the mbuf cache.
1465a9643ea8Slogwang  */
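/*
 * A worked example of the sizing above (a sketch only; it assumes a
 * mergeable run with MTU 9000, the 2176-byte default MBUF_DATA_SIZE,
 * the default 128-byte headroom and a burst size of 32):
 *
 *     per-core need = (9000 + 2176) * 32 / (2176 - 128)  = 174 mbufs
 *                   + 1024 rx descriptors                = 1198 mbufs
 *
 *     pool size for 1 port and 4 switch cores
 *                   = 128 queues * 1024 descriptors + 4 * 1198
 *                   = 135864 mbufs
 */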
1466a9643ea8Slogwang static void
1467a9643ea8Slogwang create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1468a9643ea8Slogwang 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1469a9643ea8Slogwang {
1470a9643ea8Slogwang 	uint32_t nr_mbufs;
1471a9643ea8Slogwang 	uint32_t nr_mbufs_per_core;
1472a9643ea8Slogwang 	uint32_t mtu = 1500;
1473a9643ea8Slogwang 
1474a9643ea8Slogwang 	if (mergeable)
1475a9643ea8Slogwang 		mtu = 9000;
1476a9643ea8Slogwang 	if (enable_tso)
1477a9643ea8Slogwang 		mtu = 64 * 1024;
1478a9643ea8Slogwang 
1479a9643ea8Slogwang 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
14802bfe3f2eSlogwang 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1481a9643ea8Slogwang 	nr_mbufs_per_core += nr_rx_desc;
1482a9643ea8Slogwang 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1483a9643ea8Slogwang 
1484a9643ea8Slogwang 	nr_mbufs  = nr_queues * nr_rx_desc;
1485a9643ea8Slogwang 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1486a9643ea8Slogwang 	nr_mbufs *= nr_port;
1487a9643ea8Slogwang 
1488a9643ea8Slogwang 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1489a9643ea8Slogwang 					    nr_mbuf_cache, 0, mbuf_size,
1490a9643ea8Slogwang 					    rte_socket_id());
1491a9643ea8Slogwang 	if (mbuf_pool == NULL)
1492a9643ea8Slogwang 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1493a9643ea8Slogwang }
1494a9643ea8Slogwang 
1495a9643ea8Slogwang /*
14962bfe3f2eSlogwang  * Main function: does the initialisation and calls the per-lcore functions.
1497a9643ea8Slogwang  */
1498a9643ea8Slogwang int
1499a9643ea8Slogwang main(int argc, char *argv[])
1500a9643ea8Slogwang {
1501a9643ea8Slogwang 	unsigned lcore_id, core_id = 0;
1502a9643ea8Slogwang 	unsigned nb_ports, valid_num_ports;
15032bfe3f2eSlogwang 	int ret, i;
15042bfe3f2eSlogwang 	uint16_t portid;
1505a9643ea8Slogwang 	static pthread_t tid;
1506a9643ea8Slogwang 	uint64_t flags = 0;
1507a9643ea8Slogwang 
1508a9643ea8Slogwang 	signal(SIGINT, sigint_handler);
1509a9643ea8Slogwang 
1510a9643ea8Slogwang 	/* init EAL */
1511a9643ea8Slogwang 	ret = rte_eal_init(argc, argv);
1512a9643ea8Slogwang 	if (ret < 0)
1513a9643ea8Slogwang 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
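	/* rte_eal_init() returns the number of arguments it consumed;
	 * skip past them so that only application options remain.
	 */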
1514a9643ea8Slogwang 	argc -= ret;
1515a9643ea8Slogwang 	argv += ret;
1516a9643ea8Slogwang 
1517a9643ea8Slogwang 	/* parse app arguments */
1518a9643ea8Slogwang 	ret = us_vhost_parse_args(argc, argv);
1519a9643ea8Slogwang 	if (ret < 0)
1520a9643ea8Slogwang 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1521a9643ea8Slogwang 
15222bfe3f2eSlogwang 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1523a9643ea8Slogwang 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1524a9643ea8Slogwang 
1525a9643ea8Slogwang 		if (rte_lcore_is_enabled(lcore_id))
1526a9643ea8Slogwang 			lcore_ids[core_id++] = lcore_id;
15272bfe3f2eSlogwang 	}
1528a9643ea8Slogwang 
1529a9643ea8Slogwang 	if (rte_lcore_count() > RTE_MAX_LCORE)
1530a9643ea8Slogwang 		rte_exit(EXIT_FAILURE, "Not enough cores\n");
1531a9643ea8Slogwang 
1532a9643ea8Slogwang 	/* Get the number of physical ports. */
1533d30ea906Sjfb8856606 	nb_ports = rte_eth_dev_count_avail();
1534a9643ea8Slogwang 
1535a9643ea8Slogwang 	/*
1536a9643ea8Slogwang 	 * Update the global var NUM_PORTS and the global array PORTS,
1537a9643ea8Slogwang 	 * and get the value of VALID_NUM_PORTS from the number of system ports.
1538a9643ea8Slogwang 	 */
1539a9643ea8Slogwang 	valid_num_ports = check_ports_num(nb_ports);
1540a9643ea8Slogwang 
1541a9643ea8Slogwang 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1542a9643ea8Slogwang 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1543a9643ea8Slogwang 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1544a9643ea8Slogwang 		return -1;
1545a9643ea8Slogwang 	}
1546a9643ea8Slogwang 
1547a9643ea8Slogwang 	/*
1548a9643ea8Slogwang 	 * FIXME: here we are trying to allocate mbufs big enough for
1549a9643ea8Slogwang 	 * @MAX_QUEUES, but the truth is we're never going to use that
1550a9643ea8Slogwang 	 * many queues here. We probably should only do allocation for
1551a9643ea8Slogwang 	 * those queues we are going to use.
1552a9643ea8Slogwang 	 */
1553a9643ea8Slogwang 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1554a9643ea8Slogwang 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1555a9643ea8Slogwang 
1556a9643ea8Slogwang 	if (vm2vm_mode == VM2VM_HARDWARE) {
1557a9643ea8Slogwang 		/* Enable VT loopback to let the hardware L2 switch do the forwarding. */
1558a9643ea8Slogwang 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1559a9643ea8Slogwang 		RTE_LOG(DEBUG, VHOST_CONFIG,
1560a9643ea8Slogwang 			"Enable loop back for L2 switch in vmdq.\n");
1561a9643ea8Slogwang 	}
1562a9643ea8Slogwang 
1563a9643ea8Slogwang 	/* initialize all ports */
1564d30ea906Sjfb8856606 	RTE_ETH_FOREACH_DEV(portid) {
1565a9643ea8Slogwang 		/* skip ports that are not enabled */
1566a9643ea8Slogwang 		if ((enabled_port_mask & (1 << portid)) == 0) {
1567a9643ea8Slogwang 			RTE_LOG(INFO, VHOST_PORT,
1568a9643ea8Slogwang 				"Skipping disabled port %d\n", portid);
1569a9643ea8Slogwang 			continue;
1570a9643ea8Slogwang 		}
1571a9643ea8Slogwang 		if (port_init(portid) != 0)
1572a9643ea8Slogwang 			rte_exit(EXIT_FAILURE,
1573a9643ea8Slogwang 				"Cannot initialize network ports\n");
1574a9643ea8Slogwang 	}
1575a9643ea8Slogwang 
1576a9643ea8Slogwang 	/* Enable stats if the user option is set. */
1577a9643ea8Slogwang 	if (enable_stats) {
1578d30ea906Sjfb8856606 		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1579d30ea906Sjfb8856606 					print_stats, NULL);
1580d30ea906Sjfb8856606 		if (ret < 0)
1581a9643ea8Slogwang 			rte_exit(EXIT_FAILURE,
1582a9643ea8Slogwang 				"Cannot create print-stats thread\n");
1583a9643ea8Slogwang 	}
1584a9643ea8Slogwang 
1585a9643ea8Slogwang 	/* Launch all data cores. */
1586*2d9fd380Sjfb8856606 	RTE_LCORE_FOREACH_WORKER(lcore_id)
1587a9643ea8Slogwang 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1588a9643ea8Slogwang 
1589a9643ea8Slogwang 	if (client_mode)
1590a9643ea8Slogwang 		flags |= RTE_VHOST_USER_CLIENT;
1591a9643ea8Slogwang 
15922bfe3f2eSlogwang 	/* Register vhost user driver to handle vhost messages. */
15932bfe3f2eSlogwang 	for (i = 0; i < nb_sockets; i++) {
15942bfe3f2eSlogwang 		char *file = socket_files + i * PATH_MAX;
1595*2d9fd380Sjfb8856606 		if (async_vhost_driver)
1596*2d9fd380Sjfb8856606 			flags = flags | RTE_VHOST_USER_ASYNC_COPY;
1597*2d9fd380Sjfb8856606 
15982bfe3f2eSlogwang 		ret = rte_vhost_driver_register(file, flags);
15992bfe3f2eSlogwang 		if (ret != 0) {
16002bfe3f2eSlogwang 			unregister_drivers(i);
16012bfe3f2eSlogwang 			rte_exit(EXIT_FAILURE,
16022bfe3f2eSlogwang 				"vhost driver register failure.\n");
16032bfe3f2eSlogwang 		}
1604a9643ea8Slogwang 
16052bfe3f2eSlogwang 		if (builtin_net_driver)
16062bfe3f2eSlogwang 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
16072bfe3f2eSlogwang 
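		/* Disabling a feature here removes it from the advertised
		 * set, so the guest can never negotiate it.
		 */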
16082bfe3f2eSlogwang 		if (mergeable == 0) {
16092bfe3f2eSlogwang 			rte_vhost_driver_disable_features(file,
16102bfe3f2eSlogwang 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
16112bfe3f2eSlogwang 		}
16122bfe3f2eSlogwang 
16132bfe3f2eSlogwang 		if (enable_tx_csum == 0) {
16142bfe3f2eSlogwang 			rte_vhost_driver_disable_features(file,
16152bfe3f2eSlogwang 				1ULL << VIRTIO_NET_F_CSUM);
16162bfe3f2eSlogwang 		}
16172bfe3f2eSlogwang 
16182bfe3f2eSlogwang 		if (enable_tso == 0) {
16192bfe3f2eSlogwang 			rte_vhost_driver_disable_features(file,
16202bfe3f2eSlogwang 				1ULL << VIRTIO_NET_F_HOST_TSO4);
16212bfe3f2eSlogwang 			rte_vhost_driver_disable_features(file,
16222bfe3f2eSlogwang 				1ULL << VIRTIO_NET_F_HOST_TSO6);
16232bfe3f2eSlogwang 			rte_vhost_driver_disable_features(file,
16242bfe3f2eSlogwang 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
16252bfe3f2eSlogwang 			rte_vhost_driver_disable_features(file,
16262bfe3f2eSlogwang 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
16272bfe3f2eSlogwang 		}
16282bfe3f2eSlogwang 
16292bfe3f2eSlogwang 		if (promiscuous) {
16302bfe3f2eSlogwang 			rte_vhost_driver_enable_features(file,
16312bfe3f2eSlogwang 				1ULL << VIRTIO_NET_F_CTRL_RX);
16322bfe3f2eSlogwang 		}
16332bfe3f2eSlogwang 
16342bfe3f2eSlogwang 		ret = rte_vhost_driver_callback_register(file,
16352bfe3f2eSlogwang 			&virtio_net_device_ops);
16362bfe3f2eSlogwang 		if (ret != 0) {
16372bfe3f2eSlogwang 			rte_exit(EXIT_FAILURE,
16382bfe3f2eSlogwang 				"failed to register vhost driver callbacks.\n");
16392bfe3f2eSlogwang 		}
16402bfe3f2eSlogwang 
16412bfe3f2eSlogwang 		if (rte_vhost_driver_start(file) < 0) {
16422bfe3f2eSlogwang 			rte_exit(EXIT_FAILURE,
16432bfe3f2eSlogwang 				"failed to start vhost driver.\n");
16442bfe3f2eSlogwang 		}
16452bfe3f2eSlogwang 	}
16462bfe3f2eSlogwang 
1647*2d9fd380Sjfb8856606 	RTE_LCORE_FOREACH_WORKER(lcore_id)
16482bfe3f2eSlogwang 		rte_eal_wait_lcore(lcore_id);
16492bfe3f2eSlogwang 
1650a9643ea8Slogwang 	return 0;
1651a9643ea8Slogwang 
1652a9643ea8Slogwang }