xref: /f-stack/dpdk/examples/vhost/main.c (revision 61467f3e)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <arpa/inet.h>
35 #include <getopt.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
40 #include <signal.h>
41 #include <stdint.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
44 #include <unistd.h>
45 
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
49 #include <rte_log.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_vhost.h>
53 #include <rte_ip.h>
54 #include <rte_tcp.h>
55 #include <rte_pause.h>
56 
57 #include "main.h"
58 
59 #ifndef MAX_QUEUES
60 #define MAX_QUEUES 128
61 #endif
62 
63 /* the maximum number of external ports supported */
64 #define MAX_SUP_PORTS 1
65 
66 #define MBUF_CACHE_SIZE	128
67 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
68 
69 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
70 
71 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
72 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
73 
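/* 0x2600 is 9728 bytes: room for a 9000-byte jumbo MTU plus Ethernet/VLAN headers. */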
74 #define JUMBO_FRAME_MAX_SIZE    0x2600
75 
76 /* State of virtio device. */
77 #define DEVICE_MAC_LEARNING 0
78 #define DEVICE_RX			1
79 #define DEVICE_SAFE_REMOVE	2
80 
81 /* Configurable number of RX/TX ring descriptors */
82 #define RTE_TEST_RX_DESC_DEFAULT 1024
83 #define RTE_TEST_TX_DESC_DEFAULT 512
84 
85 #define INVALID_PORT_ID 0xFF
86 
87 /* Max number of devices. Limited by vmdq. */
88 #define MAX_DEVICES 64
89 
90 /* Size of buffers used for snprintfs. */
91 #define MAX_PRINT_BUFF 6072
92 
93 /* Maximum long option length for option parsing. */
94 #define MAX_LONG_OPT_SZ 64
95 
96 /* mask of enabled ports */
97 static uint32_t enabled_port_mask = 0;
98 
99 /* Promiscuous mode */
100 static uint32_t promiscuous;
101 
102 /* Number of devices/queues to support. */
103 static uint32_t num_queues = 0;
104 static uint32_t num_devices;
105 
106 static struct rte_mempool *mbuf_pool;
107 static int mergeable;
108 
109 /* Enable VM2VM communications. If this is disabled then the MAC address comparison is skipped. */
110 typedef enum {
111 	VM2VM_DISABLED = 0,
112 	VM2VM_SOFTWARE = 1,
113 	VM2VM_HARDWARE = 2,
114 	VM2VM_LAST
115 } vm2vm_type;
116 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
117 
118 /* Enable stats. */
119 static uint32_t enable_stats = 0;
120 /* Enable retries on RX. */
121 static uint32_t enable_retry = 1;
122 
123 /* Disable TX checksum offload */
124 static uint32_t enable_tx_csum;
125 
126 /* Disable TSO offload */
127 static uint32_t enable_tso;
128 
129 static int client_mode;
130 static int dequeue_zero_copy;
131 
132 static int builtin_net_driver;
133 
134 /* Specify timeout (in useconds) between retries on RX. */
135 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
136 /* Specify the number of retries on RX. */
137 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
138 
139 /* Socket file paths. Can be set by user */
140 static char *socket_files;
141 static int nb_sockets;
142 
143 /* Empty vmdq configuration structure. Filled in programmatically. */
144 static struct rte_eth_conf vmdq_conf_default = {
145 	.rxmode = {
146 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
147 		.split_hdr_size = 0,
148 		.header_split   = 0, /**< Header Split disabled */
149 		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
150 		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
151 		/*
152 		 * VLAN strip is necessary for 1G NICs such as the I350;
153 		 * it fixes a bug where IPv4 forwarding in the guest cannot
154 		 * forward packets from one virtio dev to another virtio dev.
155 		 */
156 		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
157 		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
158 		.hw_strip_crc   = 1, /**< CRC stripped by hardware */
159 	},
160 
161 	.txmode = {
162 		.mq_mode = ETH_MQ_TX_NONE,
163 	},
164 	.rx_adv_conf = {
165 		/*
166 		 * should be overridden separately in code with
167 		 * appropriate values
168 		 */
169 		.vmdq_rx_conf = {
170 			.nb_queue_pools = ETH_8_POOLS,
171 			.enable_default_pool = 0,
172 			.default_pool = 0,
173 			.nb_pool_maps = 0,
174 			.pool_map = {{0, 0},},
175 		},
176 	},
177 };
178 
179 static unsigned lcore_ids[RTE_MAX_LCORE];
180 static uint16_t ports[RTE_MAX_ETHPORTS];
181 static unsigned num_ports = 0; /**< The number of ports specified in command line */
182 static uint16_t num_pf_queues, num_vmdq_queues;
183 static uint16_t vmdq_pool_base, vmdq_queue_base;
184 static uint16_t queues_per_pool;
185 
186 const uint16_t vlan_tags[] = {
187 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
188 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
189 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
190 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
191 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
192 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
193 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
194 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
195 };
196 
197 /* ethernet addresses of ports */
198 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
199 
200 static struct vhost_dev_tailq_list vhost_dev_list =
201 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
202 
203 static struct lcore_info lcore_info[RTE_MAX_LCORE];
204 
205 /* Used for queueing bursts of TX packets. */
206 struct mbuf_table {
207 	unsigned len;
208 	unsigned txq_id;
209 	struct rte_mbuf *m_table[MAX_PKT_BURST];
210 };
211 
212 /* TX queue for each data core. */
213 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
214 
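/*
 * MBUF_TABLE_DRAIN_TSC converts BURST_TX_DRAIN_US into TSC cycles:
 * ceil(tsc_hz / US_PER_S) * BURST_TX_DRAIN_US. For example, with a
 * 2 GHz TSC this is 2000 * 100 = 200000 cycles, i.e. ~100 us.
 */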
215 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
216 				 / US_PER_S * BURST_TX_DRAIN_US)
217 #define VLAN_HLEN       4
218 
219 /*
220  * Builds up the correct configuration for VMDQ VLAN pool map
221  * according to the pool & queue limits.
222  */
223 static inline int
224 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
225 {
226 	struct rte_eth_vmdq_rx_conf conf;
227 	struct rte_eth_vmdq_rx_conf *def_conf =
228 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
229 	unsigned i;
230 
231 	memset(&conf, 0, sizeof(conf));
232 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
233 	conf.nb_pool_maps = num_devices;
234 	conf.enable_loop_back = def_conf->enable_loop_back;
235 	conf.rx_mode = def_conf->rx_mode;
236 
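	/*
	 * Give each VMDQ pool its own VLAN tag: pool i accepts vlan_tags[i].
	 * E.g. with num_devices == 2 this produces
	 *   pool_map[0] = { .vlan_id = 1000, .pools = 0x1 }
	 *   pool_map[1] = { .vlan_id = 1001, .pools = 0x2 }
	 */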
237 	for (i = 0; i < conf.nb_pool_maps; i++) {
238 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
239 		conf.pool_map[i].pools = (1UL << i);
240 	}
241 
242 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
243 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
244 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
245 	return 0;
246 }
247 
248 /*
249  * Validate the device number against the max pool number obtained from
250  * dev_info. If the device number is invalid, print an error message and
251  * return -1. Each device must have its own pool.
252  */
253 static inline int
254 validate_num_devices(uint32_t max_nb_devices)
255 {
256 	if (num_devices > max_nb_devices) {
257 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
258 		return -1;
259 	}
260 	return 0;
261 }
262 
263 /*
264  * Initialises a given port using global settings and with the rx buffers
265  * coming from the mbuf_pool passed as parameter
266  */
267 static inline int
268 port_init(uint16_t port)
269 {
270 	struct rte_eth_dev_info dev_info;
271 	struct rte_eth_conf port_conf;
272 	struct rte_eth_rxconf *rxconf;
273 	struct rte_eth_txconf *txconf;
274 	int16_t rx_rings, tx_rings;
275 	uint16_t rx_ring_size, tx_ring_size;
276 	int retval;
277 	uint16_t q;
278 
279 	/* The max pool number from dev_info is used to validate the pool number specified on the command line. */
280 	rte_eth_dev_info_get(port, &dev_info);
281 
282 	rxconf = &dev_info.default_rxconf;
283 	txconf = &dev_info.default_txconf;
284 	rxconf->rx_drop_en = 1;
285 
286 	/* Enable vlan offload */
287 	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
288 
289 	/* Configure the number of supported virtio devices based on VMDQ limits. */
290 	num_devices = dev_info.max_vmdq_pools;
291 
292 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
293 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
294 
295 	/*
296 	 * When dequeue zero copy is enabled, guest Tx used vring will be
297 	 * updated only when corresponding mbuf is freed. Thus, the nb_tx_desc
298 	 * (tx_ring_size here) must be small enough so that the driver will
299 	 * hit the free threshold easily and free mbufs timely. Otherwise,
300 	 * guest Tx vring would be starved.
301 	 */
302 	if (dequeue_zero_copy)
303 		tx_ring_size = 64;
304 
305 	tx_rings = (uint16_t)rte_lcore_count();
306 
307 	retval = validate_num_devices(MAX_DEVICES);
308 	if (retval < 0)
309 		return retval;
310 
311 	/* Get port configuration. */
312 	retval = get_eth_conf(&port_conf, num_devices);
313 	if (retval < 0)
314 		return retval;
315 	/* NIC queues are divided into pf queues and vmdq queues.  */
316 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
317 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
318 	num_vmdq_queues = num_devices * queues_per_pool;
319 	num_queues = num_pf_queues + num_vmdq_queues;
320 	vmdq_queue_base = dev_info.vmdq_queue_base;
321 	vmdq_pool_base  = dev_info.vmdq_pool_base;
322 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
323 		num_pf_queues, num_devices, queues_per_pool);
324 
325 	if (port >= rte_eth_dev_count())
		return -1;
326 
327 	rx_rings = (uint16_t)dev_info.max_rx_queues;
328 	/* Configure ethernet device. */
329 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
330 	if (retval != 0) {
331 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
332 			port, strerror(-retval));
333 		return retval;
334 	}
335 
336 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
337 		&tx_ring_size);
338 	if (retval != 0) {
339 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
340 			"for port %u: %s.\n", port, strerror(-retval));
341 		return retval;
342 	}
343 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
344 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
345 			"for Rx queues on port %u.\n", port);
346 		return -1;
347 	}
348 
349 	/* Setup the queues. */
350 	for (q = 0; q < rx_rings; q++) {
351 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
352 						rte_eth_dev_socket_id(port),
353 						rxconf,
354 						mbuf_pool);
355 		if (retval < 0) {
356 			RTE_LOG(ERR, VHOST_PORT,
357 				"Failed to setup rx queue %u of port %u: %s.\n",
358 				q, port, strerror(-retval));
359 			return retval;
360 		}
361 	}
362 	for (q = 0; q < tx_rings; q++) {
363 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
364 						rte_eth_dev_socket_id(port),
365 						txconf);
366 		if (retval < 0) {
367 			RTE_LOG(ERR, VHOST_PORT,
368 				"Failed to setup tx queue %u of port %u: %s.\n",
369 				q, port, strerror(-retval));
370 			return retval;
371 		}
372 	}
373 
374 	/* Start the device. */
375 	retval  = rte_eth_dev_start(port);
376 	if (retval < 0) {
377 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
378 			port, strerror(-retval));
379 		return retval;
380 	}
381 
382 	if (promiscuous)
383 		rte_eth_promiscuous_enable(port);
384 
385 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
386 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
387 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
388 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
389 			port,
390 			vmdq_ports_eth_addr[port].addr_bytes[0],
391 			vmdq_ports_eth_addr[port].addr_bytes[1],
392 			vmdq_ports_eth_addr[port].addr_bytes[2],
393 			vmdq_ports_eth_addr[port].addr_bytes[3],
394 			vmdq_ports_eth_addr[port].addr_bytes[4],
395 			vmdq_ports_eth_addr[port].addr_bytes[5]);
396 
397 	return 0;
398 }
399 
400 /*
401  * Set socket file path.
402  */
403 static int
404 us_vhost_parse_socket_path(const char *q_arg)
405 {
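	/*
	 * socket_files is a flat buffer of nb_sockets fixed-size slots of
	 * PATH_MAX bytes each; slot i holds the path of the i-th socket.
	 */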
406 	/* Reject paths that do not fit in a PATH_MAX-sized slot. */
407 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
408 		return -1;
409 
410 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
	if (socket_files == NULL)
		return -1;
411 	snprintf(socket_files + nb_sockets * PATH_MAX, PATH_MAX, "%s", q_arg);
412 	nb_sockets++;
413 
414 	return 0;
415 }
416 
417 /*
418  * Parse the portmask provided at run time.
419  */
420 static int
421 parse_portmask(const char *portmask)
422 {
423 	char *end = NULL;
424 	unsigned long pm;
425 
426 	errno = 0;
427 
428 	/* parse hexadecimal string */
429 	pm = strtoul(portmask, &end, 16);
430 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
431 		return -1;
432 
433 	if (pm == 0)
434 		return -1;
435 
436 	return pm;
437 
438 }
439 
440 /*
441  * Parse num options at run time.
442  */
443 static int
444 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
445 {
446 	char *end = NULL;
447 	unsigned long num;
448 
449 	errno = 0;
450 
451 	/* parse unsigned int string */
452 	num = strtoul(q_arg, &end, 10);
453 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
454 		return -1;
455 
456 	if (num > max_valid_value)
457 		return -1;
458 
459 	return num;
460 
461 }
462 
463 /*
464  * Display usage
465  */
466 static void
467 us_vhost_usage(const char *prgname)
468 {
469 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
470 	"		--vm2vm [0|1|2]\n"
471 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
472 	"		--socket-file <path>\n"
473 	"		--nb-devices ND\n"
474 	"		-p PORTMASK: Set mask for ports to be used by application\n"
475 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
476 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
477 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Takes effect only if retries on rx are enabled\n"
478 	"		--rx-retry-num [0-N]: the number of retries on rx. Takes effect only if retries on rx are enabled\n"
479 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
480 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
481 	"		--socket-file: The path of the socket file.\n"
482 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
483 	"		--tso [0|1] disable/enable TCP segmentation offload.\n"
484 	"		--client register a vhost-user socket as client mode.\n"
485 	"		--dequeue-zero-copy enables dequeue zero copy\n",
486 	       prgname);
487 }
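/*
 * Illustrative invocation (binary name, cores and port numbers are just
 * examples):
 *
 *   ./vhost-switch -l 1-3 -n 4 -- -p 0x1 \
 *       --socket-file /tmp/sock0 --stats 1 --mergeable 1
 *
 * EAL options come before "--"; the options documented above come after.
 */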
488 
489 /*
490  * Parse the arguments given in the command line of the application.
491  */
492 static int
493 us_vhost_parse_args(int argc, char **argv)
494 {
495 	int opt, ret;
496 	int option_index;
497 	unsigned i;
498 	const char *prgname = argv[0];
499 	static struct option long_option[] = {
500 		{"vm2vm", required_argument, NULL, 0},
501 		{"rx-retry", required_argument, NULL, 0},
502 		{"rx-retry-delay", required_argument, NULL, 0},
503 		{"rx-retry-num", required_argument, NULL, 0},
504 		{"mergeable", required_argument, NULL, 0},
505 		{"stats", required_argument, NULL, 0},
506 		{"socket-file", required_argument, NULL, 0},
507 		{"tx-csum", required_argument, NULL, 0},
508 		{"tso", required_argument, NULL, 0},
509 		{"client", no_argument, &client_mode, 1},
510 		{"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
511 		{"builtin-net-driver", no_argument, &builtin_net_driver, 1},
512 		{NULL, 0, 0, 0},
513 	};
514 
515 	/* Parse command line */
516 	while ((opt = getopt_long(argc, argv, "p:P",
517 			long_option, &option_index)) != EOF) {
518 		switch (opt) {
519 		/* Portmask */
520 		case 'p':
521 			ret = parse_portmask(optarg);
522 			if (ret <= 0) {
523 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
524 				us_vhost_usage(prgname);
525 				return -1;
526 			}
			enabled_port_mask = ret;
527 			break;
528 
529 		case 'P':
530 			promiscuous = 1;
531 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
532 				ETH_VMDQ_ACCEPT_BROADCAST |
533 				ETH_VMDQ_ACCEPT_MULTICAST;
534 
535 			break;
536 
537 		case 0:
538 			/* Enable/disable vm2vm comms. */
539 			if (!strncmp(long_option[option_index].name, "vm2vm",
540 				MAX_LONG_OPT_SZ)) {
541 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
542 				if (ret == -1) {
543 					RTE_LOG(INFO, VHOST_CONFIG,
544 						"Invalid argument for "
545 						"vm2vm [0|1|2]\n");
546 					us_vhost_usage(prgname);
547 					return -1;
548 				} else {
549 					vm2vm_mode = (vm2vm_type)ret;
550 				}
551 			}
552 
553 			/* Enable/disable retries on RX. */
554 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
555 				ret = parse_num_opt(optarg, 1);
556 				if (ret == -1) {
557 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
558 					us_vhost_usage(prgname);
559 					return -1;
560 				} else {
561 					enable_retry = ret;
562 				}
563 			}
564 
565 			/* Enable/disable TX checksum offload. */
566 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
567 				ret = parse_num_opt(optarg, 1);
568 				if (ret == -1) {
569 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
570 					us_vhost_usage(prgname);
571 					return -1;
572 				} else
573 					enable_tx_csum = ret;
574 			}
575 
576 			/* Enable/disable TSO offload. */
577 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
578 				ret = parse_num_opt(optarg, 1);
579 				if (ret == -1) {
580 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
581 					us_vhost_usage(prgname);
582 					return -1;
583 				} else
584 					enable_tso = ret;
585 			}
586 
587 			/* Specify the retries delay time (in useconds) on RX. */
588 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
589 				ret = parse_num_opt(optarg, INT32_MAX);
590 				if (ret == -1) {
591 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
592 					us_vhost_usage(prgname);
593 					return -1;
594 				} else {
595 					burst_rx_delay_time = ret;
596 				}
597 			}
598 
599 			/* Specify the retries number on RX. */
600 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
601 				ret = parse_num_opt(optarg, INT32_MAX);
602 				if (ret == -1) {
603 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
604 					us_vhost_usage(prgname);
605 					return -1;
606 				} else {
607 					burst_rx_retry_num = ret;
608 				}
609 			}
610 
611 			/* Enable/disable RX mergeable buffers. */
612 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
613 				ret = parse_num_opt(optarg, 1);
614 				if (ret == -1) {
615 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
616 					us_vhost_usage(prgname);
617 					return -1;
618 				} else {
619 					mergeable = !!ret;
620 					if (ret) {
621 						vmdq_conf_default.rxmode.jumbo_frame = 1;
622 						vmdq_conf_default.rxmode.max_rx_pkt_len
623 							= JUMBO_FRAME_MAX_SIZE;
624 					}
625 				}
626 			}
627 
628 			/* Enable/disable stats. */
629 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
630 				ret = parse_num_opt(optarg, INT32_MAX);
631 				if (ret == -1) {
632 					RTE_LOG(INFO, VHOST_CONFIG,
633 						"Invalid argument for stats [0..N]\n");
634 					us_vhost_usage(prgname);
635 					return -1;
636 				} else {
637 					enable_stats = ret;
638 				}
639 			}
640 
641 			/* Set socket file path. */
642 			if (!strncmp(long_option[option_index].name,
643 						"socket-file", MAX_LONG_OPT_SZ)) {
644 				if (us_vhost_parse_socket_path(optarg) == -1) {
645 					RTE_LOG(INFO, VHOST_CONFIG,
646 					"Invalid argument for socket name (Max %d characters)\n",
647 					PATH_MAX);
648 					us_vhost_usage(prgname);
649 					return -1;
650 				}
651 			}
652 
653 			break;
654 
655 			/* Invalid option - print options. */
656 		default:
657 			us_vhost_usage(prgname);
658 			return -1;
659 		}
660 	}
661 
662 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
663 		if (enabled_port_mask & (1 << i))
664 			ports[num_ports++] = i;
665 	}
666 
667 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
668 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
669 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
670 		return -1;
671 	}
672 
673 	return 0;
674 }
675 
676 /*
677  * Update the global variable num_ports and the array ports according to the
678  * number of system ports, and return the number of valid ports.
679  */
680 static unsigned check_ports_num(unsigned nb_ports)
681 {
682 	unsigned valid_num_ports = num_ports;
683 	unsigned portid;
684 
685 	if (num_ports > nb_ports) {
686 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
687 			num_ports, nb_ports);
688 		num_ports = nb_ports;
689 	}
690 
691 	for (portid = 0; portid < num_ports; portid++) {
692 		if (ports[portid] >= nb_ports) {
693 			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
694 				ports[portid], (nb_ports - 1));
695 			ports[portid] = INVALID_PORT_ID;
696 			valid_num_ports--;
697 		}
698 	}
699 	return valid_num_ports;
700 }
701 
702 static __rte_always_inline struct vhost_dev *
703 find_vhost_dev(struct ether_addr *mac)
704 {
705 	struct vhost_dev *vdev;
706 
707 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
708 		if (vdev->ready == DEVICE_RX &&
709 		    is_same_ether_addr(mac, &vdev->mac_address))
710 			return vdev;
711 	}
712 
713 	return NULL;
714 }
715 
716 /*
717  * This function learns the MAC address of the device and registers it, along with a
718  * vlan tag, with VMDQ.
719  */
720 static int
721 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
722 {
723 	struct ether_hdr *pkt_hdr;
724 	int i, ret;
725 
726 	/* Learn MAC address of guest device from packet */
727 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
728 
729 	if (find_vhost_dev(&pkt_hdr->s_addr)) {
730 		RTE_LOG(ERR, VHOST_DATA,
731 			"(%d) device is using a registered MAC!\n",
732 			vdev->vid);
733 		return -1;
734 	}
735 
736 	for (i = 0; i < ETHER_ADDR_LEN; i++)
737 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
738 
739 	/* vlan_tag currently uses the device_id. */
740 	vdev->vlan_tag = vlan_tags[vdev->vid];
741 
742 	/* Print out VMDQ registration info. */
743 	RTE_LOG(INFO, VHOST_DATA,
744 		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
745 		vdev->vid,
746 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
747 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
748 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
749 		vdev->vlan_tag);
750 
751 	/* Register the MAC address. */
752 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
753 				(uint32_t)vdev->vid + vmdq_pool_base);
754 	if (ret)
755 		RTE_LOG(ERR, VHOST_DATA,
756 			"(%d) failed to add device MAC address to VMDQ\n",
757 			vdev->vid);
758 
759 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
760 
761 	/* Set device as ready for RX. */
762 	vdev->ready = DEVICE_RX;
763 
764 	return 0;
765 }
766 
767 /*
768  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
769  * queue before disabling RX on the device.
770  */
771 static inline void
772 unlink_vmdq(struct vhost_dev *vdev)
773 {
774 	unsigned i = 0;
775 	unsigned rx_count;
776 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
777 
778 	if (vdev->ready == DEVICE_RX) {
779 		/* Clear MAC and VLAN settings. */
780 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
781 		for (i = 0; i < 6; i++)
782 			vdev->mac_address.addr_bytes[i] = 0;
783 
784 		vdev->vlan_tag = 0;
785 
786 		/* Clear out the receive buffers. */
787 		rx_count = rte_eth_rx_burst(ports[0],
788 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
789 
790 		while (rx_count) {
791 			for (i = 0; i < rx_count; i++)
792 				rte_pktmbuf_free(pkts_burst[i]);
793 
794 			rx_count = rte_eth_rx_burst(ports[0],
795 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
796 		}
797 
798 		vdev->ready = DEVICE_MAC_LEARNING;
799 	}
800 }
801 
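/*
 * Enqueue a single mbuf into the destination device's RX virtqueue,
 * either through the builtin net driver or the vhost library, and
 * account it in the RX stats of the destination and the TX stats of
 * the source when stats are enabled.
 */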
802 static __rte_always_inline void
803 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
804 	    struct rte_mbuf *m)
805 {
806 	uint16_t ret;
807 
808 	if (builtin_net_driver) {
809 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
810 	} else {
811 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
812 	}
813 
814 	if (enable_stats) {
815 		rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
816 		rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
817 		src_vdev->stats.tx_total++;
818 		src_vdev->stats.tx += ret;
819 	}
820 }
821 
822 /*
823  * Check if the packet destination MAC address is for a local device. If so then put
824  * the packet on that device's RX queue. If not then return.
825  */
826 static __rte_always_inline int
827 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
828 {
829 	struct ether_hdr *pkt_hdr;
830 	struct vhost_dev *dst_vdev;
831 
832 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
833 
834 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
835 	if (!dst_vdev)
836 		return -1;
837 
838 	if (vdev->vid == dst_vdev->vid) {
839 		RTE_LOG_DP(DEBUG, VHOST_DATA,
840 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
841 			vdev->vid);
842 		return 0;
843 	}
844 
845 	RTE_LOG_DP(DEBUG, VHOST_DATA,
846 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
847 
848 	if (unlikely(dst_vdev->remove)) {
849 		RTE_LOG_DP(DEBUG, VHOST_DATA,
850 			"(%d) device is marked for removal\n", dst_vdev->vid);
851 		return 0;
852 	}
853 
854 	virtio_xmit(dst_vdev, vdev, m);
855 	return 0;
856 }
857 
858 /*
859  * Check if the destination MAC of a packet belongs to a local VM,
860  * and if so get its vlan tag and length offset.
861  */
862 static __rte_always_inline int
863 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
864 	uint32_t *offset, uint16_t *vlan_tag)
865 {
866 	struct vhost_dev *dst_vdev;
867 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
868 
869 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
870 	if (!dst_vdev)
871 		return 0;
872 
873 	if (vdev->vid == dst_vdev->vid) {
874 		RTE_LOG_DP(DEBUG, VHOST_DATA,
875 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
876 			vdev->vid);
877 		return -1;
878 	}
879 
880 	/*
881 	 * HW vlan strip will reduce the packet length by the length of
882 	 * the vlan tag, so the packet length needs to be restored by
883 	 * adding it back.
884 	 */
885 	*offset  = VLAN_HLEN;
886 	*vlan_tag = vlan_tags[vdev->vid];
887 
888 	RTE_LOG_DP(DEBUG, VHOST_DATA,
889 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
890 		vdev->vid, dst_vdev->vid, *vlan_tag);
891 
892 	return 0;
893 }
894 
895 static uint16_t
896 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
897 {
898 	if (ol_flags & PKT_TX_IPV4)
899 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
900 	else /* assume ethertype == ETHER_TYPE_IPv6 */
901 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
902 }
903 
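/*
 * Prepare a PKT_TX_TCP_SEG packet for hardware TSO: clear the IPv4
 * header checksum (recomputed by the NIC when PKT_TX_IP_CKSUM is set)
 * and seed the TCP checksum with the pseudo-header checksum, as the
 * mbuf offload API expects for TSO.
 */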
904 static void virtio_tx_offload(struct rte_mbuf *m)
905 {
906 	void *l3_hdr;
907 	struct ipv4_hdr *ipv4_hdr = NULL;
908 	struct tcp_hdr *tcp_hdr = NULL;
909 	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
910 
911 	l3_hdr = (char *)eth_hdr + m->l2_len;
912 
913 	if (m->ol_flags & PKT_TX_IPV4) {
914 		ipv4_hdr = l3_hdr;
915 		ipv4_hdr->hdr_checksum = 0;
916 		m->ol_flags |= PKT_TX_IP_CKSUM;
917 	}
918 
919 	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
920 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
921 }
922 
923 static inline void
924 free_pkts(struct rte_mbuf **pkts, uint16_t n)
925 {
926 	while (n--)
927 		rte_pktmbuf_free(pkts[n]);
928 }
929 
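/*
 * Flush the per-lcore TX staging table to the physical port; any mbufs
 * the PMD did not accept are freed (dropped) rather than retried.
 */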
930 static __rte_always_inline void
931 do_drain_mbuf_table(struct mbuf_table *tx_q)
932 {
933 	uint16_t count;
934 
935 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
936 				 tx_q->m_table, tx_q->len);
937 	if (unlikely(count < tx_q->len))
938 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
939 
940 	tx_q->len = 0;
941 }
942 
943 /*
944  * This function routes the TX packet to the correct interface. This
945  * may be a local device or the physical port.
946  */
947 static __rte_always_inline void
948 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
949 {
950 	struct mbuf_table *tx_q;
951 	unsigned offset = 0;
952 	const uint16_t lcore_id = rte_lcore_id();
953 	struct ether_hdr *nh;
954 
955 
956 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
957 	if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
958 		struct vhost_dev *vdev2;
959 
960 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
961 			if (vdev2 != vdev)
962 				virtio_xmit(vdev2, vdev, m);
963 		}
964 		goto queue2nic;
965 	}
966 
967 	/* Check if the destination is a local VM. */
968 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
969 		rte_pktmbuf_free(m);
970 		return;
971 	}
972 
973 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
974 		if (unlikely(find_local_dest(vdev, m, &offset,
975 					     &vlan_tag) != 0)) {
976 			rte_pktmbuf_free(m);
977 			return;
978 		}
979 	}
980 
981 	RTE_LOG_DP(DEBUG, VHOST_DATA,
982 		"(%d) TX: MAC address is external\n", vdev->vid);
983 
984 queue2nic:
985 
986 	/* Add packet to the port TX queue. */
987 	tx_q = &lcore_tx_queue[lcore_id];
988 
989 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
990 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
991 		/* Guest has inserted the vlan tag. */
992 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
993 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
994 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
995 			(vh->vlan_tci != vlan_tag_be))
996 			vh->vlan_tci = vlan_tag_be;
997 	} else {
998 		m->ol_flags |= PKT_TX_VLAN_PKT;
999 
1000 		/*
1001 		 * Find the right seg to adjust the data len when offset is
1002 		 * bigger than tail room size.
1003 		 */
1004 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1005 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
1006 				m->data_len += offset;
1007 			else {
1008 				struct rte_mbuf *seg = m;
1009 
1010 				while ((seg->next != NULL) &&
1011 					(offset > rte_pktmbuf_tailroom(seg)))
1012 					seg = seg->next;
1013 
1014 				seg->data_len += offset;
1015 			}
1016 			m->pkt_len += offset;
1017 		}
1018 
1019 		m->vlan_tci = vlan_tag;
1020 	}
1021 
1022 	if (m->ol_flags & PKT_TX_TCP_SEG)
1023 		virtio_tx_offload(m);
1024 
1025 	tx_q->m_table[tx_q->len++] = m;
1026 	if (enable_stats) {
1027 		vdev->stats.tx_total++;
1028 		vdev->stats.tx++;
1029 	}
1030 
1031 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1032 		do_drain_mbuf_table(tx_q);
1033 }
1034 
1035 
1036 static __rte_always_inline void
1037 drain_mbuf_table(struct mbuf_table *tx_q)
1038 {
1039 	static uint64_t prev_tsc;
1040 	uint64_t cur_tsc;
1041 
1042 	if (tx_q->len == 0)
1043 		return;
1044 
1045 	cur_tsc = rte_rdtsc();
1046 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1047 		prev_tsc = cur_tsc;
1048 
1049 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1050 			"TX queue drained after timeout with burst size %u\n",
1051 			tx_q->len);
1052 		do_drain_mbuf_table(tx_q);
1053 	}
1054 }
1055 
1056 static __rte_always_inline void
1057 drain_eth_rx(struct vhost_dev *vdev)
1058 {
1059 	uint16_t rx_count, enqueue_count;
1060 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1061 
1062 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1063 				    pkts, MAX_PKT_BURST);
1064 	if (!rx_count)
1065 		return;
1066 
1067 	/*
1068 	 * When "enable_retry" is set, here we wait and retry when there
1069 	 * are not enough free slots in the queue to hold @rx_count packets,
1070 	 * to diminish packet loss.
1071 	 */
1072 	if (enable_retry &&
1073 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1074 			VIRTIO_RXQ))) {
1075 		uint32_t retry;
1076 
1077 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1078 			rte_delay_us(burst_rx_delay_time);
1079 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1080 					VIRTIO_RXQ))
1081 				break;
1082 		}
1083 	}
1084 
1085 	if (builtin_net_driver) {
1086 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1087 						pkts, rx_count);
1088 	} else {
1089 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1090 						pkts, rx_count);
1091 	}
1092 	if (enable_stats) {
1093 		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1094 		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1095 	}
1096 
1097 	free_pkts(pkts, rx_count);
1098 }
1099 
1100 static __rte_always_inline void
1101 drain_virtio_tx(struct vhost_dev *vdev)
1102 {
1103 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1104 	uint16_t count;
1105 	uint16_t i;
1106 
1107 	if (builtin_net_driver) {
1108 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1109 					pkts, MAX_PKT_BURST);
1110 	} else {
1111 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1112 					mbuf_pool, pkts, MAX_PKT_BURST);
1113 	}
1114 
1115 	/* setup VMDq for the first packet */
1116 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1117 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1) {
1118 			free_pkts(pkts, count);
			/* The mbufs were freed; do not route them below. */
			return;
		}
1119 	}
1120 
1121 	for (i = 0; i < count; ++i)
1122 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1123 }
1124 
1125 /*
1126  * Main function of vhost-switch. It basically does:
1127  *
1128  * for each vhost device {
1129  *    - drain_eth_rx()
1130  *
1131  *      Which drains the host eth Rx queue linked to the vhost device,
1132  *      and delivers all of them to the guest virtio Rx ring associated with
1133  *      this vhost device.
1134  *
1135  *    - drain_virtio_tx()
1136  *
1137  *      Which drains the guest virtio Tx queue and delivers all of them
1138  *      to the target, which could be another vhost device, or the
1139  *      physical eth dev. The route is done in function "virtio_tx_route".
1140  * }
1141  */
1142 static int
1143 switch_worker(void *arg __rte_unused)
1144 {
1145 	unsigned i;
1146 	unsigned lcore_id = rte_lcore_id();
1147 	struct vhost_dev *vdev;
1148 	struct mbuf_table *tx_q;
1149 
1150 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1151 
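	/*
	 * Each enabled lcore owns one TX queue on the physical port; its
	 * queue id is the lcore's index in lcore_ids[], which matches the
	 * tx_rings = rte_lcore_count() setup done in port_init().
	 */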
1152 	tx_q = &lcore_tx_queue[lcore_id];
1153 	for (i = 0; i < rte_lcore_count(); i++) {
1154 		if (lcore_ids[i] == lcore_id) {
1155 			tx_q->txq_id = i;
1156 			break;
1157 		}
1158 	}
1159 
1160 	while (1) {
1161 		drain_mbuf_table(tx_q);
1162 
1163 		/*
1164 		 * If requested, inform the configuration core that we have
1165 		 * exited the linked list and that no devices are in use.
1166 		 */
1167 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1168 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1169 
1170 		/*
1171 		 * Process vhost devices
1172 		 */
1173 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1174 			      lcore_vdev_entry) {
1175 			if (unlikely(vdev->remove)) {
1176 				unlink_vmdq(vdev);
1177 				vdev->ready = DEVICE_SAFE_REMOVE;
1178 				continue;
1179 			}
1180 
1181 			if (likely(vdev->ready == DEVICE_RX))
1182 				drain_eth_rx(vdev);
1183 
1184 			if (likely(!vdev->remove))
1185 				drain_virtio_tx(vdev);
1186 		}
1187 	}
1188 
1189 	return 0;
1190 }
1191 
1192 /*
1193  * Remove a device from the specific data core linked list and from the
1194  * main linked list. Synchronization occurs through the use of the
1195  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1196  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1197  */
1198 static void
1199 destroy_device(int vid)
1200 {
1201 	struct vhost_dev *vdev = NULL;
1202 	int lcore;
1203 
1204 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1205 		if (vdev->vid == vid)
1206 			break;
1207 	}
1208 	if (!vdev)
1209 		return;
1210 	/* Set the remove flag. */
1211 	vdev->remove = 1;
1212 	while (vdev->ready != DEVICE_SAFE_REMOVE) {
1213 		rte_pause();
1214 	}
1215 
1216 	if (builtin_net_driver)
1217 		vs_vhost_net_remove(vdev);
1218 
1219 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1220 		     lcore_vdev_entry);
1221 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1222 
1223 
1224 	/* Set the dev_removal_flag on each lcore. */
1225 	RTE_LCORE_FOREACH_SLAVE(lcore)
1226 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1227 
1228 	/*
1229 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1230 	 * we can be sure that they can no longer access the device removed
1231 	 * from the linked lists and that the devices are no longer in use.
1232 	 */
1233 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1234 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1235 			rte_pause();
1236 	}
1237 
1238 	lcore_info[vdev->coreid].device_num--;
1239 
1240 	RTE_LOG(INFO, VHOST_DATA,
1241 		"(%d) device has been removed from data core\n",
1242 		vdev->vid);
1243 
1244 	rte_free(vdev);
1245 }
1246 
1247 /*
1248  * A new device is added to a data core. First the device is added to the main linked list
1249  * and then allocated to a specific data core.
1250  */
1251 static int
1252 new_device(int vid)
1253 {
1254 	int lcore, core_add = 0;
1255 	uint32_t device_num_min = num_devices;
1256 	struct vhost_dev *vdev;
1257 
1258 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1259 	if (vdev == NULL) {
1260 		RTE_LOG(INFO, VHOST_DATA,
1261 			"(%d) couldn't allocate memory for vhost dev\n",
1262 			vid);
1263 		return -1;
1264 	}
1265 	vdev->vid = vid;
1266 
1267 	if (builtin_net_driver)
1268 		vs_vhost_net_setup(vdev);
1269 
1270 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
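	/*
	 * Each vhost device is backed by its own VMDQ pool; its dedicated
	 * NIC RX queue is the first queue of pool 'vid'.
	 */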
1271 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1272 
1273 	/* Reset the ready flag. */
1274 	vdev->ready = DEVICE_MAC_LEARNING;
1275 	vdev->remove = 0;
1276 
1277 	/* Find a suitable lcore to add the device. */
1278 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1279 		if (lcore_info[lcore].device_num < device_num_min) {
1280 			device_num_min = lcore_info[lcore].device_num;
1281 			core_add = lcore;
1282 		}
1283 	}
1284 	vdev->coreid = core_add;
1285 
1286 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1287 			  lcore_vdev_entry);
1288 	lcore_info[vdev->coreid].device_num++;
1289 
1290 	/* Disable notifications. */
1291 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1292 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1293 
1294 	RTE_LOG(INFO, VHOST_DATA,
1295 		"(%d) device has been added to data core %d\n",
1296 		vid, vdev->coreid);
1297 
1298 	return 0;
1299 }
1300 
1301 /*
1302  * These callbacks allow devices to be added to the data core when configuration
1303  * has been fully completed.
1304  */
1305 static const struct vhost_device_ops virtio_net_device_ops =
1306 {
1307 	.new_device =  new_device,
1308 	.destroy_device = destroy_device,
1309 };
1310 
1311 /*
1312  * This is a thread that will wake up after a period to print stats if the user has
1313  * enabled them.
1314  */
1315 static void *
1316 print_stats(__rte_unused void *arg)
1317 {
1318 	struct vhost_dev *vdev;
1319 	uint64_t tx_dropped, rx_dropped;
1320 	uint64_t tx, tx_total, rx, rx_total;
1321 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1322 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1323 
1324 	while (1) {
1325 		sleep(enable_stats);
1326 
1327 		/* Clear screen and move to top left */
1328 		printf("%s%s\n", clr, top_left);
1329 		printf("Device statistics =================================\n");
1330 
1331 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1332 			tx_total   = vdev->stats.tx_total;
1333 			tx         = vdev->stats.tx;
1334 			tx_dropped = tx_total - tx;
1335 
1336 			rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1337 			rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1338 			rx_dropped = rx_total - rx;
1339 
1340 			printf("Statistics for device %d\n"
1341 				"-----------------------\n"
1342 				"TX total:              %" PRIu64 "\n"
1343 				"TX dropped:            %" PRIu64 "\n"
1344 				"TX successful:         %" PRIu64 "\n"
1345 				"RX total:              %" PRIu64 "\n"
1346 				"RX dropped:            %" PRIu64 "\n"
1347 				"RX successful:         %" PRIu64 "\n",
1348 				vdev->vid,
1349 				tx_total, tx_dropped, tx,
1350 				rx_total, rx_dropped, rx);
1351 		}
1352 
1353 		printf("===================================================\n");
1354 	}
	/* Not reached; satisfies the pthread start routine's return type. */
	return NULL;
1355 }
1356 
1357 static void
1358 unregister_drivers(int socket_num)
1359 {
1360 	int i, ret;
1361 
1362 	for (i = 0; i < socket_num; i++) {
1363 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1364 		if (ret != 0)
1365 			RTE_LOG(ERR, VHOST_CONFIG,
1366 				"Fail to unregister vhost driver for %s.\n",
1367 				socket_files + i * PATH_MAX);
1368 	}
1369 }
1370 
1371 /* When we receive an INT signal, unregister the vhost driver. */
1372 static void
1373 sigint_handler(__rte_unused int signum)
1374 {
1375 	/* Unregister vhost driver. */
1376 	unregister_drivers(nb_sockets);
1377 
1378 	exit(0);
1379 }
1380 
1381 /*
1382  * While creating an mbuf pool, one key thing is to figure out how
1383  * many mbuf entries are enough for our use. FYI, here are some
1384  * guidelines:
1385  *
1386  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1387  *
1388  * - For each switch core (a CPU core that does the packet switching), we
1389  *   also need to reserve some mbufs for receiving the packets from the virtio
1390  *   Tx queue. How many is enough depends on the usage. It's normally
1391  *   a simple calculation like following:
1392  *
1393  *       MAX_PKT_BURST * max packet size / mbuf size
1394  *
1395  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1396  *
1397  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1398  *   mbufs for receiving the packets from the physical NIC device.
1399  *
1400  * - We also need to make sure that, for each switch core, we have allocated
1401  *   enough mbufs to fill up the mbuf cache.
1402  */
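/*
 * A rough worked example, assuming MAX_PKT_BURST is 32 (defined in main.h)
 * and the defaults above (mbuf_size = 2176 with the default 128-byte
 * headroom, nr_rx_desc = 1024, no mergeable buffers or TSO, so mtu = 1500):
 *
 *   per switch core: (1500 + 2176) * 32 / (2176 - 128) + 1024 ~= 1081 mbufs
 *   total:           (nr_queues * 1024 + 1081 * nr_switch_core) * nr_port
 */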
1403 static void
1404 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1405 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1406 {
1407 	uint32_t nr_mbufs;
1408 	uint32_t nr_mbufs_per_core;
1409 	uint32_t mtu = 1500;
1410 
1411 	if (mergeable)
1412 		mtu = 9000;
1413 	if (enable_tso)
1414 		mtu = 64 * 1024;
1415 
1416 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1417 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1418 	nr_mbufs_per_core += nr_rx_desc;
1419 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1420 
1421 	nr_mbufs  = nr_queues * nr_rx_desc;
1422 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1423 	nr_mbufs *= nr_port;
1424 
1425 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1426 					    nr_mbuf_cache, 0, mbuf_size,
1427 					    rte_socket_id());
1428 	if (mbuf_pool == NULL)
1429 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1430 }
1431 
1432 /*
1433  * Main function, does initialisation and calls the per-lcore functions.
1434  */
1435 int
1436 main(int argc, char *argv[])
1437 {
1438 	unsigned lcore_id, core_id = 0;
1439 	unsigned nb_ports, valid_num_ports;
1440 	int ret, i;
1441 	uint16_t portid;
1442 	static pthread_t tid;
1443 	char thread_name[RTE_MAX_THREAD_NAME_LEN];
1444 	uint64_t flags = 0;
1445 
1446 	signal(SIGINT, sigint_handler);
1447 
1448 	/* init EAL */
1449 	ret = rte_eal_init(argc, argv);
1450 	if (ret < 0)
1451 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1452 	argc -= ret;
1453 	argv += ret;
1454 
1455 	/* parse app arguments */
1456 	ret = us_vhost_parse_args(argc, argv);
1457 	if (ret < 0)
1458 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1459 
1460 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1461 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1462 
1463 		if (rte_lcore_is_enabled(lcore_id))
1464 			lcore_ids[core_id++] = lcore_id;
1465 	}
1466 
1467 	if (rte_lcore_count() > RTE_MAX_LCORE)
1468 		rte_exit(EXIT_FAILURE, "Not enough cores\n");
1469 
1470 	/* Get the number of physical ports. */
1471 	nb_ports = rte_eth_dev_count();
1472 
1473 	/*
1474 	 * Update the global variable num_ports and the global array ports,
1475 	 * and get the number of valid ports according to the system port count.
1476 	 */
1477 	valid_num_ports = check_ports_num(nb_ports);
1478 
1479 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1480 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1481 			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1482 		return -1;
1483 	}
1484 
1485 	/*
1486 	 * FIXME: here we are trying to allocate mbufs big enough for
1487 	 * @MAX_QUEUES, but the truth is we're never going to use that
1488 	 * many queues here. We probably should only do allocation for
1489 	 * those queues we are going to use.
1490 	 */
1491 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1492 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1493 
1494 	if (vm2vm_mode == VM2VM_HARDWARE) {
1495 		/* Enable VT loop back to let L2 switch to do it. */
1496 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1497 		RTE_LOG(DEBUG, VHOST_CONFIG,
1498 			"Enable loop back for L2 switch in vmdq.\n");
1499 	}
1500 
1501 	/* initialize all ports */
1502 	for (portid = 0; portid < nb_ports; portid++) {
1503 		/* skip ports that are not enabled */
1504 		if ((enabled_port_mask & (1 << portid)) == 0) {
1505 			RTE_LOG(INFO, VHOST_PORT,
1506 				"Skipping disabled port %d\n", portid);
1507 			continue;
1508 		}
1509 		if (port_init(portid) != 0)
1510 			rte_exit(EXIT_FAILURE,
1511 				"Cannot initialize network ports\n");
1512 	}
1513 
1514 	/* Enable stats if the user option is set. */
1515 	if (enable_stats) {
1516 		ret = pthread_create(&tid, NULL, print_stats, NULL);
1517 		if (ret != 0)
1518 			rte_exit(EXIT_FAILURE,
1519 				"Cannot create print-stats thread\n");
1520 
1521 		/* Set thread_name for aid in debugging.  */
1522 		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
1523 		ret = rte_thread_setname(tid, thread_name);
1524 		if (ret != 0)
1525 			RTE_LOG(DEBUG, VHOST_CONFIG,
1526 				"Cannot set print-stats name\n");
1527 	}
1528 
1529 	/* Launch all data cores. */
1530 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1531 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1532 
1533 	if (client_mode)
1534 		flags |= RTE_VHOST_USER_CLIENT;
1535 
1536 	if (dequeue_zero_copy)
1537 		flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1538 
1539 	/* Register vhost user driver to handle vhost messages. */
1540 	for (i = 0; i < nb_sockets; i++) {
1541 		char *file = socket_files + i * PATH_MAX;
1542 		ret = rte_vhost_driver_register(file, flags);
1543 		if (ret != 0) {
1544 			unregister_drivers(i);
1545 			rte_exit(EXIT_FAILURE,
1546 				"vhost driver register failure.\n");
1547 		}
1548 
1549 		if (builtin_net_driver)
1550 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1551 
1552 		if (mergeable == 0) {
1553 			rte_vhost_driver_disable_features(file,
1554 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
1555 		}
1556 
1557 		if (enable_tx_csum == 0) {
1558 			rte_vhost_driver_disable_features(file,
1559 				1ULL << VIRTIO_NET_F_CSUM);
1560 		}
1561 
1562 		if (enable_tso == 0) {
1563 			rte_vhost_driver_disable_features(file,
1564 				1ULL << VIRTIO_NET_F_HOST_TSO4);
1565 			rte_vhost_driver_disable_features(file,
1566 				1ULL << VIRTIO_NET_F_HOST_TSO6);
1567 			rte_vhost_driver_disable_features(file,
1568 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
1569 			rte_vhost_driver_disable_features(file,
1570 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
1571 		}
1572 
1573 		if (promiscuous) {
1574 			rte_vhost_driver_enable_features(file,
1575 				1ULL << VIRTIO_NET_F_CTRL_RX);
1576 		}
1577 
1578 		ret = rte_vhost_driver_callback_register(file,
1579 			&virtio_net_device_ops);
1580 		if (ret != 0) {
1581 			rte_exit(EXIT_FAILURE,
1582 				"failed to register vhost driver callbacks.\n");
1583 		}
1584 
1585 		if (rte_vhost_driver_start(file) < 0) {
1586 			rte_exit(EXIT_FAILURE,
1587 				"failed to start vhost driver.\n");
1588 		}
1589 	}
1590 
1591 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1592 		rte_eal_wait_lcore(lcore_id);
1593 
1594 	return 0;
1595 
1596 }
1597