xref: /f-stack/dpdk/examples/vhost/main.c (revision e2391e5e)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2017 Intel Corporation
3  */
4 
5 #include <arpa/inet.h>
6 #include <getopt.h>
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
11 #include <signal.h>
12 #include <stdint.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
15 #include <unistd.h>
16 
17 #include <rte_atomic.h>
18 #include <rte_cycles.h>
19 #include <rte_ethdev.h>
20 #include <rte_log.h>
21 #include <rte_string_fns.h>
22 #include <rte_malloc.h>
23 #include <rte_vhost.h>
24 #include <rte_ip.h>
25 #include <rte_tcp.h>
26 #include <rte_pause.h>
27 
28 #include "main.h"
29 
30 #ifndef MAX_QUEUES
31 #define MAX_QUEUES 128
32 #endif
33 
34 /* the maximum number of external ports supported */
35 #define MAX_SUP_PORTS 1
36 
37 #define MBUF_CACHE_SIZE	128
38 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
39 
40 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
41 
42 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
43 #define BURST_RX_RETRIES 4		/* Number of retries on RX. */
44 
45 #define JUMBO_FRAME_MAX_SIZE    0x2600
46 
47 /* State of virtio device. */
48 #define DEVICE_MAC_LEARNING 0
49 #define DEVICE_RX			1
50 #define DEVICE_SAFE_REMOVE	2
51 
52 /* Configurable number of RX/TX ring descriptors */
53 #define RTE_TEST_RX_DESC_DEFAULT 1024
54 #define RTE_TEST_TX_DESC_DEFAULT 512
55 
56 #define INVALID_PORT_ID 0xFF
57 
58 /* Max number of devices. Limited by vmdq. */
59 #define MAX_DEVICES 64
60 
61 /* Maximum long option length for option parsing. */
62 #define MAX_LONG_OPT_SZ 64
63 
64 /* mask of enabled ports */
65 static uint32_t enabled_port_mask = 0;
66 
67 /* Promiscuous mode */
68 static uint32_t promiscuous;
69 
70 /* number of devices/queues to support*/
71 static uint32_t num_queues = 0;
72 static uint32_t num_devices;
73 
74 static struct rte_mempool *mbuf_pool;
75 static int mergeable;
76 
77 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
78 typedef enum {
79 	VM2VM_DISABLED = 0,
80 	VM2VM_SOFTWARE = 1,
81 	VM2VM_HARDWARE = 2,
82 	VM2VM_LAST
83 } vm2vm_type;
84 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
85 
86 /* Enable stats. */
87 static uint32_t enable_stats = 0;
88 /* Enable retries on RX. */
89 static uint32_t enable_retry = 1;
90 
91 /* Disable TX checksum offload */
92 static uint32_t enable_tx_csum;
93 
94 /* Disable TSO offload */
95 static uint32_t enable_tso;
96 
97 static int client_mode;
98 static int dequeue_zero_copy;
99 
100 static int builtin_net_driver;
101 
102 /* Specify timeout (in microseconds) between retries on RX. */
103 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
104 /* Specify the number of retries on RX. */
105 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
106 
107 /* Socket file paths. Can be set by user */
108 static char *socket_files;
109 static int nb_sockets;
110 
111 /* empty vmdq configuration structure. Filled in programmatically */
112 static struct rte_eth_conf vmdq_conf_default = {
113 	.rxmode = {
114 		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
115 		.split_hdr_size = 0,
116 		/*
117 		 * VLAN strip is necessary for 1G NICs such as the I350;
118 		 * it fixes a bug where IPv4 forwarding in the guest could not
119 		 * forward packets from one virtio dev to another virtio dev.
120 		 */
121 		.offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
122 	},
123 
124 	.txmode = {
125 		.mq_mode = ETH_MQ_TX_NONE,
126 		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
127 			     DEV_TX_OFFLOAD_TCP_CKSUM |
128 			     DEV_TX_OFFLOAD_VLAN_INSERT |
129 			     DEV_TX_OFFLOAD_MULTI_SEGS |
130 			     DEV_TX_OFFLOAD_TCP_TSO),
131 	},
132 	.rx_adv_conf = {
133 		/*
134 		 * should be overridden separately in code with
135 		 * appropriate values
136 		 */
137 		.vmdq_rx_conf = {
138 			.nb_queue_pools = ETH_8_POOLS,
139 			.enable_default_pool = 0,
140 			.default_pool = 0,
141 			.nb_pool_maps = 0,
142 			.pool_map = {{0, 0},},
143 		},
144 	},
145 };
146 
147 
148 static unsigned lcore_ids[RTE_MAX_LCORE];
149 static uint16_t ports[RTE_MAX_ETHPORTS];
150 static unsigned num_ports = 0; /**< The number of ports specified in command line */
151 static uint16_t num_pf_queues, num_vmdq_queues;
152 static uint16_t vmdq_pool_base, vmdq_queue_base;
153 static uint16_t queues_per_pool;
154 
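/* One VLAN tag per VMDQ pool: the vhost device with id N is assigned vlan_tags[N]. */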
155 const uint16_t vlan_tags[] = {
156 	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
157 	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
158 	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
159 	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
160 	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
161 	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
162 	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
163 	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
164 };
165 
166 /* ethernet addresses of ports */
167 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
168 
169 static struct vhost_dev_tailq_list vhost_dev_list =
170 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
171 
172 static struct lcore_info lcore_info[RTE_MAX_LCORE];
173 
174 /* Used for queueing bursts of TX packets. */
175 struct mbuf_table {
176 	unsigned len;
177 	unsigned txq_id;
178 	struct rte_mbuf *m_table[MAX_PKT_BURST];
179 };
180 
181 /* TX queue for each data core. */
182 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
183 
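/*
 * MBUF_TABLE_DRAIN_TSC below is the drain interval expressed in TSC cycles:
 * ceil(tsc_hz / US_PER_S) cycles per microsecond times BURST_TX_DRAIN_US.
 * As a rough example, assuming a 2 GHz TSC this evaluates to
 * 2000 * 100 = 200000 cycles, i.e. about 100 us.
 */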
184 #define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
185 				 / US_PER_S * BURST_TX_DRAIN_US)
186 #define VLAN_HLEN       4
187 
188 /*
189  * Builds up the correct configuration for VMDQ VLAN pool map
190  * according to the pool & queue limits.
191  */
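/*
 * For example, with num_devices == 2 the loop below yields
 *     pool_map[0] = { .vlan_id = 1000, .pools = 0x1 }
 *     pool_map[1] = { .vlan_id = 1001, .pools = 0x2 }
 * i.e. each VMDQ pool accepts exactly one VLAN tag from vlan_tags[].
 */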
192 static inline int
193 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
194 {
195 	struct rte_eth_vmdq_rx_conf conf;
196 	struct rte_eth_vmdq_rx_conf *def_conf =
197 		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
198 	unsigned i;
199 
200 	memset(&conf, 0, sizeof(conf));
201 	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
202 	conf.nb_pool_maps = num_devices;
203 	conf.enable_loop_back = def_conf->enable_loop_back;
204 	conf.rx_mode = def_conf->rx_mode;
205 
206 	for (i = 0; i < conf.nb_pool_maps; i++) {
207 		conf.pool_map[i].vlan_id = vlan_tags[ i ];
208 		conf.pool_map[i].pools = (1UL << i);
209 	}
210 
211 	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
212 	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
213 		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
214 	return 0;
215 }
216 
217 /*
218  * Validate the device number against the max pool number obtained from
219  * dev_info. If the device number is invalid, print an error message and
220  * return -1. Each device must have its own pool.
221  */
222 static inline int
223 validate_num_devices(uint32_t max_nb_devices)
224 {
225 	if (num_devices > max_nb_devices) {
226 		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
227 		return -1;
228 	}
229 	return 0;
230 }
231 
232 /*
233  * Initialises a given port using global settings and with the rx buffers
234  * coming from the mbuf_pool passed as parameter
235  */
236 static inline int
237 port_init(uint16_t port)
238 {
239 	struct rte_eth_dev_info dev_info;
240 	struct rte_eth_conf port_conf;
241 	struct rte_eth_rxconf *rxconf;
242 	struct rte_eth_txconf *txconf;
243 	int16_t rx_rings, tx_rings;
244 	uint16_t rx_ring_size, tx_ring_size;
245 	int retval;
246 	uint16_t q;
247 
248 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
249 	rte_eth_dev_info_get (port, &dev_info);
250 
251 	rxconf = &dev_info.default_rxconf;
252 	txconf = &dev_info.default_txconf;
253 	rxconf->rx_drop_en = 1;
254 
255 	/*configure the number of supported virtio devices based on VMDQ limits */
256 	num_devices = dev_info.max_vmdq_pools;
257 
258 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
259 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
260 
261 	/*
262 	 * When dequeue zero copy is enabled, guest Tx used vring will be
263 	 * updated only when corresponding mbuf is freed. Thus, the nb_tx_desc
264 	 * (tx_ring_size here) must be small enough so that the driver will
265 	 * hit the free threshold easily and free mbufs timely. Otherwise,
266 	 * guest Tx vring would be starved.
267 	 */
268 	if (dequeue_zero_copy)
269 		tx_ring_size = 64;
270 
271 	tx_rings = (uint16_t)rte_lcore_count();
272 
273 	retval = validate_num_devices(MAX_DEVICES);
274 	if (retval < 0)
275 		return retval;
276 
277 	/* Get port configuration. */
278 	retval = get_eth_conf(&port_conf, num_devices);
279 	if (retval < 0)
280 		return retval;
281 	/* NIC queues are divided into pf queues and vmdq queues.  */
282 	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
283 	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
284 	num_vmdq_queues = num_devices * queues_per_pool;
285 	num_queues = num_pf_queues + num_vmdq_queues;
286 	vmdq_queue_base = dev_info.vmdq_queue_base;
287 	vmdq_pool_base  = dev_info.vmdq_pool_base;
288 	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
289 		num_pf_queues, num_devices, queues_per_pool);
290 
291 	if (!rte_eth_dev_is_valid_port(port))
292 		return -1;
293 
294 	rx_rings = (uint16_t)dev_info.max_rx_queues;
295 	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
296 		port_conf.txmode.offloads |=
297 			DEV_TX_OFFLOAD_MBUF_FAST_FREE;
298 	/* Configure ethernet device. */
299 	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
300 	if (retval != 0) {
301 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
302 			port, strerror(-retval));
303 		return retval;
304 	}
305 
306 	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
307 		&tx_ring_size);
308 	if (retval != 0) {
309 		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
310 			"for port %u: %s.\n", port, strerror(-retval));
311 		return retval;
312 	}
313 	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
314 		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
315 			"for Rx queues on port %u.\n", port);
316 		return -1;
317 	}
318 
319 	/* Setup the queues. */
320 	rxconf->offloads = port_conf.rxmode.offloads;
321 	for (q = 0; q < rx_rings; q ++) {
322 		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
323 						rte_eth_dev_socket_id(port),
324 						rxconf,
325 						mbuf_pool);
326 		if (retval < 0) {
327 			RTE_LOG(ERR, VHOST_PORT,
328 				"Failed to setup rx queue %u of port %u: %s.\n",
329 				q, port, strerror(-retval));
330 			return retval;
331 		}
332 	}
333 	txconf->offloads = port_conf.txmode.offloads;
334 	for (q = 0; q < tx_rings; q ++) {
335 		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
336 						rte_eth_dev_socket_id(port),
337 						txconf);
338 		if (retval < 0) {
339 			RTE_LOG(ERR, VHOST_PORT,
340 				"Failed to setup tx queue %u of port %u: %s.\n",
341 				q, port, strerror(-retval));
342 			return retval;
343 		}
344 	}
345 
346 	/* Start the device. */
347 	retval  = rte_eth_dev_start(port);
348 	if (retval < 0) {
349 		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
350 			port, strerror(-retval));
351 		return retval;
352 	}
353 
354 	if (promiscuous)
355 		rte_eth_promiscuous_enable(port);
356 
357 	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
358 	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
359 	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
360 			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
361 			port,
362 			vmdq_ports_eth_addr[port].addr_bytes[0],
363 			vmdq_ports_eth_addr[port].addr_bytes[1],
364 			vmdq_ports_eth_addr[port].addr_bytes[2],
365 			vmdq_ports_eth_addr[port].addr_bytes[3],
366 			vmdq_ports_eth_addr[port].addr_bytes[4],
367 			vmdq_ports_eth_addr[port].addr_bytes[5]);
368 
369 	return 0;
370 }
371 
372 /*
373  * Set socket file path.
374  */
375 static int
376 us_vhost_parse_socket_path(const char *q_arg)
377 {
378 	char *old;
379 
380 	/* verify the path length is within PATH_MAX */
381 	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
382 		return -1;
383 
384 	old = socket_files;
385 	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
386 	if (socket_files == NULL) {
387 		free(old);
388 		return -1;
389 	}
390 
391 	snprintf(socket_files + nb_sockets * PATH_MAX, PATH_MAX, "%s", q_arg);
392 	nb_sockets++;
393 
394 	return 0;
395 }
396 
397 /*
398  * Parse the portmask provided at run time.
399  */
400 static int
401 parse_portmask(const char *portmask)
402 {
403 	char *end = NULL;
404 	unsigned long pm;
405 
406 	errno = 0;
407 
408 	/* parse hexadecimal string */
409 	pm = strtoul(portmask, &end, 16);
410 	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
411 		return -1;
412 
413 	if (pm == 0)
414 		return -1;
415 
416 	return pm;
417 
418 }
419 
420 /*
421  * Parse num options at run time.
422  */
423 static int
424 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
425 {
426 	char *end = NULL;
427 	unsigned long num;
428 
429 	errno = 0;
430 
431 	/* parse unsigned int string */
432 	num = strtoul(q_arg, &end, 10);
433 	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
434 		return -1;
435 
436 	if (num > max_valid_value)
437 		return -1;
438 
439 	return num;
440 
441 }
442 
443 /*
444  * Display usage
445  */
446 static void
447 us_vhost_usage(const char *prgname)
448 {
449 	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
450 	"		--vm2vm [0|1|2]\n"
451 	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
452 	"		--socket-file <path>\n"
453 	"		--nb-devices ND\n"
454 	"		-p PORTMASK: Set mask for ports to be used by application\n"
455 	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
456 	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Retries are attempted if the destination queue is full\n"
457 	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if retries on rx are enabled\n"
458 	"		--rx-retry-num [0-N]: the number of retries on rx. Only takes effect if retries on rx are enabled\n"
459 	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
460 	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
461 	"		--socket-file: The path of the socket file.\n"
462 	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
463 	"		--tso [0|1] disable/enable TCP segmentation offload (TSO).\n"
464 	"		--client register a vhost-user socket as client mode.\n"
465 	"		--dequeue-zero-copy enables dequeue zero copy\n",
466 	       prgname);
467 }
468 
469 /*
470  * Parse the arguments given in the command line of the application.
471  */
472 static int
473 us_vhost_parse_args(int argc, char **argv)
474 {
475 	int opt, ret;
476 	int option_index;
477 	unsigned i;
478 	const char *prgname = argv[0];
479 	static struct option long_option[] = {
480 		{"vm2vm", required_argument, NULL, 0},
481 		{"rx-retry", required_argument, NULL, 0},
482 		{"rx-retry-delay", required_argument, NULL, 0},
483 		{"rx-retry-num", required_argument, NULL, 0},
484 		{"mergeable", required_argument, NULL, 0},
485 		{"stats", required_argument, NULL, 0},
486 		{"socket-file", required_argument, NULL, 0},
487 		{"tx-csum", required_argument, NULL, 0},
488 		{"tso", required_argument, NULL, 0},
489 		{"client", no_argument, &client_mode, 1},
490 		{"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
491 		{"builtin-net-driver", no_argument, &builtin_net_driver, 1},
492 		{NULL, 0, 0, 0},
493 	};
494 
495 	/* Parse command line */
496 	while ((opt = getopt_long(argc, argv, "p:P",
497 			long_option, &option_index)) != EOF) {
498 		switch (opt) {
499 		/* Portmask */
500 		case 'p':
501 			enabled_port_mask = parse_portmask(optarg);
502 			if (enabled_port_mask == 0) {
503 				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
504 				us_vhost_usage(prgname);
505 				return -1;
506 			}
507 			break;
508 
509 		case 'P':
510 			promiscuous = 1;
511 			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
512 				ETH_VMDQ_ACCEPT_BROADCAST |
513 				ETH_VMDQ_ACCEPT_MULTICAST;
514 
515 			break;
516 
517 		case 0:
518 			/* Enable/disable vm2vm comms. */
519 			if (!strncmp(long_option[option_index].name, "vm2vm",
520 				MAX_LONG_OPT_SZ)) {
521 				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
522 				if (ret == -1) {
523 					RTE_LOG(INFO, VHOST_CONFIG,
524 						"Invalid argument for "
525 						"vm2vm [0|1|2]\n");
526 					us_vhost_usage(prgname);
527 					return -1;
528 				} else {
529 					vm2vm_mode = (vm2vm_type)ret;
530 				}
531 			}
532 
533 			/* Enable/disable retries on RX. */
534 			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
535 				ret = parse_num_opt(optarg, 1);
536 				if (ret == -1) {
537 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
538 					us_vhost_usage(prgname);
539 					return -1;
540 				} else {
541 					enable_retry = ret;
542 				}
543 			}
544 
545 			/* Enable/disable TX checksum offload. */
546 			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
547 				ret = parse_num_opt(optarg, 1);
548 				if (ret == -1) {
549 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
550 					us_vhost_usage(prgname);
551 					return -1;
552 				} else
553 					enable_tx_csum = ret;
554 			}
555 
556 			/* Enable/disable TSO offload. */
557 			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
558 				ret = parse_num_opt(optarg, 1);
559 				if (ret == -1) {
560 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
561 					us_vhost_usage(prgname);
562 					return -1;
563 				} else
564 					enable_tso = ret;
565 			}
566 
567 			/* Specify the retry delay time (in microseconds) on RX. */
568 			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
569 				ret = parse_num_opt(optarg, INT32_MAX);
570 				if (ret == -1) {
571 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
572 					us_vhost_usage(prgname);
573 					return -1;
574 				} else {
575 					burst_rx_delay_time = ret;
576 				}
577 			}
578 
579 			/* Specify the number of retries on RX. */
580 			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
581 				ret = parse_num_opt(optarg, INT32_MAX);
582 				if (ret == -1) {
583 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
584 					us_vhost_usage(prgname);
585 					return -1;
586 				} else {
587 					burst_rx_retry_num = ret;
588 				}
589 			}
590 
591 			/* Enable/disable RX mergeable buffers. */
592 			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
593 				ret = parse_num_opt(optarg, 1);
594 				if (ret == -1) {
595 					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
596 					us_vhost_usage(prgname);
597 					return -1;
598 				} else {
599 					mergeable = !!ret;
600 					if (ret) {
601 						vmdq_conf_default.rxmode.offloads |=
602 							DEV_RX_OFFLOAD_JUMBO_FRAME;
603 						vmdq_conf_default.rxmode.max_rx_pkt_len
604 							= JUMBO_FRAME_MAX_SIZE;
605 					}
606 				}
607 			}
608 
609 			/* Enable/disable stats. */
610 			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
611 				ret = parse_num_opt(optarg, INT32_MAX);
612 				if (ret == -1) {
613 					RTE_LOG(INFO, VHOST_CONFIG,
614 						"Invalid argument for stats [0..N]\n");
615 					us_vhost_usage(prgname);
616 					return -1;
617 				} else {
618 					enable_stats = ret;
619 				}
620 			}
621 
622 			/* Set socket file path. */
623 			if (!strncmp(long_option[option_index].name,
624 						"socket-file", MAX_LONG_OPT_SZ)) {
625 				if (us_vhost_parse_socket_path(optarg) == -1) {
626 					RTE_LOG(INFO, VHOST_CONFIG,
627 					"Invalid argument for socket name (Max %d characters)\n",
628 					PATH_MAX);
629 					us_vhost_usage(prgname);
630 					return -1;
631 				}
632 			}
633 
634 			break;
635 
636 			/* Invalid option - print options. */
637 		default:
638 			us_vhost_usage(prgname);
639 			return -1;
640 		}
641 	}
642 
643 	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
644 		if (enabled_port_mask & (1 << i))
645 			ports[num_ports++] = i;
646 	}
647 
648 	if ((num_ports ==  0) || (num_ports > MAX_SUP_PORTS)) {
649 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
650 			" but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
651 		return -1;
652 	}
653 
654 	return 0;
655 }
656 
657 /*
658  * Update the global var NUM_PORTS and the array PORTS according to the
659  * number of system ports, and return the number of valid ports.
660  */
661 static unsigned check_ports_num(unsigned nb_ports)
662 {
663 	unsigned valid_num_ports = num_ports;
664 	unsigned portid;
665 
666 	if (num_ports > nb_ports) {
667 		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
668 			num_ports, nb_ports);
669 		num_ports = nb_ports;
670 	}
671 
672 	for (portid = 0; portid < num_ports; portid ++) {
673 		if (!rte_eth_dev_is_valid_port(ports[portid])) {
674 			RTE_LOG(INFO, VHOST_PORT,
675 				"\nSpecified port ID(%u) is not valid\n",
676 				ports[portid]);
677 			ports[portid] = INVALID_PORT_ID;
678 			valid_num_ports--;
679 		}
680 	}
681 	return valid_num_ports;
682 }
683 
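/*
 * Look up a ready (DEVICE_RX) vhost device by MAC address; returns NULL
 * if no local device owns that MAC.
 */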
684 static __rte_always_inline struct vhost_dev *
685 find_vhost_dev(struct ether_addr *mac)
686 {
687 	struct vhost_dev *vdev;
688 
689 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
690 		if (vdev->ready == DEVICE_RX &&
691 		    is_same_ether_addr(mac, &vdev->mac_address))
692 			return vdev;
693 	}
694 
695 	return NULL;
696 }
697 
698 /*
699  * This function learns the MAC address of the device and registers this along with a
700  * vlan tag to a VMDQ.
701  */
702 static int
703 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
704 {
705 	struct ether_hdr *pkt_hdr;
706 	int i, ret;
707 
708 	/* Learn MAC address of guest device from packet */
709 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
710 
711 	if (find_vhost_dev(&pkt_hdr->s_addr)) {
712 		RTE_LOG(ERR, VHOST_DATA,
713 			"(%d) device is using a registered MAC!\n",
714 			vdev->vid);
715 		return -1;
716 	}
717 
718 	for (i = 0; i < ETHER_ADDR_LEN; i++)
719 		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
720 
721 	/* vlan_tag currently uses the device_id. */
722 	vdev->vlan_tag = vlan_tags[vdev->vid];
723 
724 	/* Print out VMDQ registration info. */
725 	RTE_LOG(INFO, VHOST_DATA,
726 		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
727 		vdev->vid,
728 		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
729 		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
730 		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
731 		vdev->vlan_tag);
732 
733 	/* Register the MAC address. */
734 	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
735 				(uint32_t)vdev->vid + vmdq_pool_base);
736 	if (ret)
737 		RTE_LOG(ERR, VHOST_DATA,
738 			"(%d) failed to add device MAC address to VMDQ\n",
739 			vdev->vid);
740 
741 	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
742 
743 	/* Set device as ready for RX. */
744 	vdev->ready = DEVICE_RX;
745 
746 	return 0;
747 }
748 
749 /*
750  * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
751  * queue before disabling RX on the device.
752  */
753 static inline void
754 unlink_vmdq(struct vhost_dev *vdev)
755 {
756 	unsigned i = 0;
757 	unsigned rx_count;
758 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
759 
760 	if (vdev->ready == DEVICE_RX) {
761 		/*clear MAC and VLAN settings*/
762 		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
763 		for (i = 0; i < 6; i++)
764 			vdev->mac_address.addr_bytes[i] = 0;
765 
766 		vdev->vlan_tag = 0;
767 
768 		/*Clear out the receive buffers*/
769 		rx_count = rte_eth_rx_burst(ports[0],
770 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
771 
772 		while (rx_count) {
773 			for (i = 0; i < rx_count; i++)
774 				rte_pktmbuf_free(pkts_burst[i]);
775 
776 			rx_count = rte_eth_rx_burst(ports[0],
777 					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
778 		}
779 
780 		vdev->ready = DEVICE_MAC_LEARNING;
781 	}
782 }
783 
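/*
 * Enqueue a single packet into the Rx ring of the destination vhost
 * device (the VM2VM software switching path) and, if stats are enabled,
 * account for it in the destination's Rx stats and the source's Tx stats.
 */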
784 static __rte_always_inline void
785 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
786 	    struct rte_mbuf *m)
787 {
788 	uint16_t ret;
789 
790 	if (builtin_net_driver) {
791 		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
792 	} else {
793 		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
794 	}
795 
796 	if (enable_stats) {
797 		rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
798 		rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
799 		src_vdev->stats.tx_total++;
800 		src_vdev->stats.tx += ret;
801 	}
802 }
803 
804 /*
805  * Check if the packet destination MAC address is for a local device. If so then put
806  * the packet on that device's RX queue. If not then return.
807  */
808 static __rte_always_inline int
809 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
810 {
811 	struct ether_hdr *pkt_hdr;
812 	struct vhost_dev *dst_vdev;
813 
814 	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
815 
816 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
817 	if (!dst_vdev)
818 		return -1;
819 
820 	if (vdev->vid == dst_vdev->vid) {
821 		RTE_LOG_DP(DEBUG, VHOST_DATA,
822 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
823 			vdev->vid);
824 		return 0;
825 	}
826 
827 	RTE_LOG_DP(DEBUG, VHOST_DATA,
828 		"(%d) TX: MAC address is local\n", dst_vdev->vid);
829 
830 	if (unlikely(dst_vdev->remove)) {
831 		RTE_LOG_DP(DEBUG, VHOST_DATA,
832 			"(%d) device is marked for removal\n", dst_vdev->vid);
833 		return 0;
834 	}
835 
836 	virtio_xmit(dst_vdev, vdev, m);
837 	return 0;
838 }
839 
840 /*
841  * Check if the destination MAC of a packet belongs to a local VM;
842  * if it does, get its vlan tag and the offset.
843  */
844 static __rte_always_inline int
845 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
846 	uint32_t *offset, uint16_t *vlan_tag)
847 {
848 	struct vhost_dev *dst_vdev;
849 	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
850 
851 	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
852 	if (!dst_vdev)
853 		return 0;
854 
855 	if (vdev->vid == dst_vdev->vid) {
856 		RTE_LOG_DP(DEBUG, VHOST_DATA,
857 			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
858 			vdev->vid);
859 		return -1;
860 	}
861 
862 	/*
863 	 * HW vlan strip reduces the packet length by the
864 	 * length of the vlan tag, so restore the packet
865 	 * length by adding it back.
866 	 */
867 	*offset  = VLAN_HLEN;
868 	*vlan_tag = vlan_tags[vdev->vid];
869 
870 	RTE_LOG_DP(DEBUG, VHOST_DATA,
871 		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
872 		vdev->vid, dst_vdev->vid, *vlan_tag);
873 
874 	return 0;
875 }
876 
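/*
 * Compute the L4 pseudo-header checksum that is placed in the TCP
 * checksum field when checksum/TSO completion is offloaded to the NIC.
 */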
877 static uint16_t
878 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
879 {
880 	if (ol_flags & PKT_TX_IPV4)
881 		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
882 	else /* assume ethertype == ETHER_TYPE_IPv6 */
883 		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
884 }
885 
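/*
 * Prepare a TSO-flagged TCP packet for transmission on the physical port:
 * request IPv4 header checksum offload (for IPv4 packets) and seed the
 * TCP checksum field with the pseudo-header checksum expected by the NIC.
 */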
886 static void virtio_tx_offload(struct rte_mbuf *m)
887 {
888 	void *l3_hdr;
889 	struct ipv4_hdr *ipv4_hdr = NULL;
890 	struct tcp_hdr *tcp_hdr = NULL;
891 	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
892 
893 	l3_hdr = (char *)eth_hdr + m->l2_len;
894 
895 	if (m->ol_flags & PKT_TX_IPV4) {
896 		ipv4_hdr = l3_hdr;
897 		ipv4_hdr->hdr_checksum = 0;
898 		m->ol_flags |= PKT_TX_IP_CKSUM;
899 	}
900 
901 	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
902 	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
903 }
904 
905 static inline void
906 free_pkts(struct rte_mbuf **pkts, uint16_t n)
907 {
908 	while (n--)
909 		rte_pktmbuf_free(pkts[n]);
910 }
911 
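/*
 * Flush the per-core TX mbuf table to the physical port; any packets the
 * NIC did not accept are freed, and the table is reset.
 */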
912 static __rte_always_inline void
913 do_drain_mbuf_table(struct mbuf_table *tx_q)
914 {
915 	uint16_t count;
916 
917 	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
918 				 tx_q->m_table, tx_q->len);
919 	if (unlikely(count < tx_q->len))
920 		free_pkts(&tx_q->m_table[count], tx_q->len - count);
921 
922 	tx_q->len = 0;
923 }
924 
925 /*
926  * This function routes the TX packet to the correct interface. This
927  * may be a local device or the physical port.
928  */
929 static __rte_always_inline void
930 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
931 {
932 	struct mbuf_table *tx_q;
933 	unsigned offset = 0;
934 	const uint16_t lcore_id = rte_lcore_id();
935 	struct ether_hdr *nh;
936 
937 
938 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
939 	if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
940 		struct vhost_dev *vdev2;
941 
942 		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
943 			if (vdev2 != vdev)
944 				virtio_xmit(vdev2, vdev, m);
945 		}
946 		goto queue2nic;
947 	}
948 
949 	/*check if destination is local VM*/
950 	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
951 		rte_pktmbuf_free(m);
952 		return;
953 	}
954 
955 	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
956 		if (unlikely(find_local_dest(vdev, m, &offset,
957 					     &vlan_tag) != 0)) {
958 			rte_pktmbuf_free(m);
959 			return;
960 		}
961 	}
962 
963 	RTE_LOG_DP(DEBUG, VHOST_DATA,
964 		"(%d) TX: MAC address is external\n", vdev->vid);
965 
966 queue2nic:
967 
968 	/*Add packet to the port tx queue*/
969 	tx_q = &lcore_tx_queue[lcore_id];
970 
971 	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
972 	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
973 		/* Guest has inserted the vlan tag. */
974 		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
975 		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
976 		if ((vm2vm_mode == VM2VM_HARDWARE) &&
977 			(vh->vlan_tci != vlan_tag_be))
978 			vh->vlan_tci = vlan_tag_be;
979 	} else {
980 		m->ol_flags |= PKT_TX_VLAN_PKT;
981 
982 		/*
983 		 * Find the right seg to adjust the data len when offset is
984 		 * bigger than tail room size.
985 		 */
986 		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
987 			if (likely(offset <= rte_pktmbuf_tailroom(m)))
988 				m->data_len += offset;
989 			else {
990 				struct rte_mbuf *seg = m;
991 
992 				while ((seg->next != NULL) &&
993 					(offset > rte_pktmbuf_tailroom(seg)))
994 					seg = seg->next;
995 
996 				seg->data_len += offset;
997 			}
998 			m->pkt_len += offset;
999 		}
1000 
1001 		m->vlan_tci = vlan_tag;
1002 	}
1003 
1004 	if (m->ol_flags & PKT_TX_TCP_SEG)
1005 		virtio_tx_offload(m);
1006 
1007 	tx_q->m_table[tx_q->len++] = m;
1008 	if (enable_stats) {
1009 		vdev->stats.tx_total++;
1010 		vdev->stats.tx++;
1011 	}
1012 
1013 	if (unlikely(tx_q->len == MAX_PKT_BURST))
1014 		do_drain_mbuf_table(tx_q);
1015 }
1016 
1017 
1018 static __rte_always_inline void
1019 drain_mbuf_table(struct mbuf_table *tx_q)
1020 {
1021 	static uint64_t prev_tsc;
1022 	uint64_t cur_tsc;
1023 
1024 	if (tx_q->len == 0)
1025 		return;
1026 
1027 	cur_tsc = rte_rdtsc();
1028 	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1029 		prev_tsc = cur_tsc;
1030 
1031 		RTE_LOG_DP(DEBUG, VHOST_DATA,
1032 			"TX queue drained after timeout with burst size %u\n",
1033 			tx_q->len);
1034 		do_drain_mbuf_table(tx_q);
1035 	}
1036 }
1037 
1038 static __rte_always_inline void
1039 drain_eth_rx(struct vhost_dev *vdev)
1040 {
1041 	uint16_t rx_count, enqueue_count;
1042 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1043 
1044 	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1045 				    pkts, MAX_PKT_BURST);
1046 	if (!rx_count)
1047 		return;
1048 
1049 	/*
1050 	 * When "enable_retry" is set, here we wait and retry when there
1051 	 * are not enough free slots in the queue to hold @rx_count packets,
1052 	 * to diminish packet loss.
1053 	 */
1054 	if (enable_retry &&
1055 	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1056 			VIRTIO_RXQ))) {
1057 		uint32_t retry;
1058 
1059 		for (retry = 0; retry < burst_rx_retry_num; retry++) {
1060 			rte_delay_us(burst_rx_delay_time);
1061 			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1062 					VIRTIO_RXQ))
1063 				break;
1064 		}
1065 	}
1066 
1067 	if (builtin_net_driver) {
1068 		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1069 						pkts, rx_count);
1070 	} else {
1071 		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1072 						pkts, rx_count);
1073 	}
1074 	if (enable_stats) {
1075 		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
1076 		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
1077 	}
1078 
1079 	free_pkts(pkts, rx_count);
1080 }
1081 
1082 static __rte_always_inline void
1083 drain_virtio_tx(struct vhost_dev *vdev)
1084 {
1085 	struct rte_mbuf *pkts[MAX_PKT_BURST];
1086 	uint16_t count;
1087 	uint16_t i;
1088 
1089 	if (builtin_net_driver) {
1090 		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1091 					pkts, MAX_PKT_BURST);
1092 	} else {
1093 		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1094 					mbuf_pool, pkts, MAX_PKT_BURST);
1095 	}
1096 
1097 	/* setup VMDq for the first packet */
1098 	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1099 		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1100 			free_pkts(pkts, count);
1101 	}
1102 
1103 	for (i = 0; i < count; ++i)
1104 		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1105 }
1106 
1107 /*
1108  * Main function of vhost-switch. It basically does:
1109  *
1110  * for each vhost device {
1111  *    - drain_eth_rx()
1112  *
1113  *      Which drains the host eth Rx queue linked to the vhost device
1114  *      and delivers all packets to the guest virtio Rx ring associated
1115  *      with this vhost device.
1116  *
1117  *    - drain_virtio_tx()
1118  *
1119  *      Which drains the guest virtio Tx queue and delivers all packets
1120  *      to their target, which could be another vhost device or the
1121  *      physical eth dev. The routing is done in function "virtio_tx_route".
1122  * }
1123  */
1124 static int
1125 switch_worker(void *arg __rte_unused)
1126 {
1127 	unsigned i;
1128 	unsigned lcore_id = rte_lcore_id();
1129 	struct vhost_dev *vdev;
1130 	struct mbuf_table *tx_q;
1131 
1132 	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1133 
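	/*
	 * Use this core's index within the enabled-core list as its TX queue
	 * id, matching the one TX queue per lcore set up in port_init().
	 */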
1134 	tx_q = &lcore_tx_queue[lcore_id];
1135 	for (i = 0; i < rte_lcore_count(); i++) {
1136 		if (lcore_ids[i] == lcore_id) {
1137 			tx_q->txq_id = i;
1138 			break;
1139 		}
1140 	}
1141 
1142 	while(1) {
1143 		drain_mbuf_table(tx_q);
1144 
1145 		/*
1146 		 * Inform the configuration core that we have exited the
1147 		 * linked list and that no devices are in use if requested.
1148 		 */
1149 		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1150 			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1151 
1152 		/*
1153 		 * Process vhost devices
1154 		 */
1155 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1156 			      lcore_vdev_entry) {
1157 			if (unlikely(vdev->remove)) {
1158 				unlink_vmdq(vdev);
1159 				vdev->ready = DEVICE_SAFE_REMOVE;
1160 				continue;
1161 			}
1162 
1163 			if (likely(vdev->ready == DEVICE_RX))
1164 				drain_eth_rx(vdev);
1165 
1166 			if (likely(!vdev->remove))
1167 				drain_virtio_tx(vdev);
1168 		}
1169 	}
1170 
1171 	return 0;
1172 }
1173 
1174 /*
1175  * Remove a device from the specific data core linked list and from the
1176  * main linked list. Synchronization occurs through the use of the
1177  * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1178  * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1179  */
1180 static void
1181 destroy_device(int vid)
1182 {
1183 	struct vhost_dev *vdev = NULL;
1184 	int lcore;
1185 
1186 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1187 		if (vdev->vid == vid)
1188 			break;
1189 	}
1190 	if (!vdev)
1191 		return;
1192 	/*set the remove flag. */
1193 	vdev->remove = 1;
1194 	while(vdev->ready != DEVICE_SAFE_REMOVE) {
1195 		rte_pause();
1196 	}
1197 
1198 	if (builtin_net_driver)
1199 		vs_vhost_net_remove(vdev);
1200 
1201 	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1202 		     lcore_vdev_entry);
1203 	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1204 
1205 
1206 	/* Set the dev_removal_flag on each lcore. */
1207 	RTE_LCORE_FOREACH_SLAVE(lcore)
1208 		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1209 
1210 	/*
1211 	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1212 	 * we can be sure that they can no longer access the device removed
1213 	 * from the linked lists and that the devices are no longer in use.
1214 	 */
1215 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1216 		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1217 			rte_pause();
1218 	}
1219 
1220 	lcore_info[vdev->coreid].device_num--;
1221 
1222 	RTE_LOG(INFO, VHOST_DATA,
1223 		"(%d) device has been removed from data core\n",
1224 		vdev->vid);
1225 
1226 	rte_free(vdev);
1227 }
1228 
1229 /*
1230  * A new device is added to a data core. First the device is added to the main linked list
1231  * and then allocated to a specific data core.
1232  */
1233 static int
1234 new_device(int vid)
1235 {
1236 	int lcore, core_add = 0;
1237 	uint32_t device_num_min = num_devices;
1238 	struct vhost_dev *vdev;
1239 
1240 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1241 	if (vdev == NULL) {
1242 		RTE_LOG(INFO, VHOST_DATA,
1243 			"(%d) couldn't allocate memory for vhost dev\n",
1244 			vid);
1245 		return -1;
1246 	}
1247 	vdev->vid = vid;
1248 
1249 	if (builtin_net_driver)
1250 		vs_vhost_net_setup(vdev);
1251 
1252 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
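	/*
	 * Each device reads from the first hardware queue of its own VMDQ
	 * pool: queue index = vmdq_queue_base + vid * queues_per_pool.
	 */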
1253 	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1254 
1255 	/*reset ready flag*/
1256 	vdev->ready = DEVICE_MAC_LEARNING;
1257 	vdev->remove = 0;
1258 
1259 	/* Find a suitable lcore to add the device. */
1260 	RTE_LCORE_FOREACH_SLAVE(lcore) {
1261 		if (lcore_info[lcore].device_num < device_num_min) {
1262 			device_num_min = lcore_info[lcore].device_num;
1263 			core_add = lcore;
1264 		}
1265 	}
1266 	vdev->coreid = core_add;
1267 
1268 	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1269 			  lcore_vdev_entry);
1270 	lcore_info[vdev->coreid].device_num++;
1271 
1272 	/* Disable notifications. */
1273 	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1274 	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1275 
1276 	RTE_LOG(INFO, VHOST_DATA,
1277 		"(%d) device has been added to data core %d\n",
1278 		vid, vdev->coreid);
1279 
1280 	return 0;
1281 }
1282 
1283 /*
1284  * These callbacks allow devices to be added to the data core when
1285  * configuration is fully complete.
1286  */
1287 static const struct vhost_device_ops virtio_net_device_ops =
1288 {
1289 	.new_device =  new_device,
1290 	.destroy_device = destroy_device,
1291 };
1292 
1293 /*
1294  * This is a thread that will wake up periodically to print stats if the user has
1295  * enabled them.
1296  */
1297 static void *
1298 print_stats(__rte_unused void *arg)
1299 {
1300 	struct vhost_dev *vdev;
1301 	uint64_t tx_dropped, rx_dropped;
1302 	uint64_t tx, tx_total, rx, rx_total;
1303 	const char clr[] = { 27, '[', '2', 'J', '\0' };
1304 	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1305 
1306 	while(1) {
1307 		sleep(enable_stats);
1308 
1309 		/* Clear screen and move to top left */
1310 		printf("%s%s\n", clr, top_left);
1311 		printf("Device statistics =================================\n");
1312 
1313 		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1314 			tx_total   = vdev->stats.tx_total;
1315 			tx         = vdev->stats.tx;
1316 			tx_dropped = tx_total - tx;
1317 
1318 			rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
1319 			rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
1320 			rx_dropped = rx_total - rx;
1321 
1322 			printf("Statistics for device %d\n"
1323 				"-----------------------\n"
1324 				"TX total:              %" PRIu64 "\n"
1325 				"TX dropped:            %" PRIu64 "\n"
1326 				"TX successful:         %" PRIu64 "\n"
1327 				"RX total:              %" PRIu64 "\n"
1328 				"RX dropped:            %" PRIu64 "\n"
1329 				"RX successful:         %" PRIu64 "\n",
1330 				vdev->vid,
1331 				tx_total, tx_dropped, tx,
1332 				rx_total, rx_dropped, rx);
1333 		}
1334 
1335 		printf("===================================================\n");
1336 	}
1337 
1338 	return NULL;
1339 }
1340 
1341 static void
1342 unregister_drivers(int socket_num)
1343 {
1344 	int i, ret;
1345 
1346 	for (i = 0; i < socket_num; i++) {
1347 		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1348 		if (ret != 0)
1349 			RTE_LOG(ERR, VHOST_CONFIG,
1350 				"Failed to unregister vhost driver for %s.\n",
1351 				socket_files + i * PATH_MAX);
1352 	}
1353 }
1354 
1355 /* When we receive an INT signal, unregister the vhost driver */
1356 static void
1357 sigint_handler(__rte_unused int signum)
1358 {
1359 	/* Unregister vhost driver. */
1360 	unregister_drivers(nb_sockets);
1361 
1362 	exit(0);
1363 }
1364 
1365 /*
1366  * While creating an mbuf pool, one key thing is to figure out how
1367  * many mbuf entries are enough for our use. FYI, here are some
1368  * guidelines:
1369  *
1370  * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1371  *
1372  * - For each switch core (a CPU core that does the packet switching),
1373  *   we also need to make some reservation for receiving the packets
1374  *   from the virtio Tx queue. How many are enough depends on the usage.
1375  *   It's normally a simple calculation like the following:
1376  *
1377  *       MAX_PKT_BURST * max packet size / mbuf size
1378  *
1379  *   So, we definitely need to allocate more mbufs when TSO is enabled.
1380  *
1381  * - Similarly, for each switching core, we should reserve @nr_rx_desc
1382  *   mbufs for receiving the packets from the physical NIC device.
1383  *
1384  * - We also need to make sure, for each switch core, we have allocated
1385  *   enough mbufs to fill up the mbuf cache.
1386  */
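/*
 * A rough, illustrative example: with the defaults (non-mergeable, no TSO,
 * so mtu = 1500 and nr_rx_desc = 1024), and assuming MAX_PKT_BURST is 32
 * and mbuf_size is RTE_MBUF_DEFAULT_BUF_SIZE (typically 2048 bytes of data
 * room plus 128 bytes of headroom, 2176 in total), the calculation below
 * gives roughly (1500 + 2176) * 32 / 2048 ~= 57 mbufs per switch core for
 * virtio Tx, plus nr_rx_desc more for receives from the physical NIC.
 */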
1387 static void
1388 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1389 	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1390 {
1391 	uint32_t nr_mbufs;
1392 	uint32_t nr_mbufs_per_core;
1393 	uint32_t mtu = 1500;
1394 
1395 	if (mergeable)
1396 		mtu = 9000;
1397 	if (enable_tso)
1398 		mtu = 64 * 1024;
1399 
1400 	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
1401 			(mbuf_size - RTE_PKTMBUF_HEADROOM);
1402 	nr_mbufs_per_core += nr_rx_desc;
1403 	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1404 
1405 	nr_mbufs  = nr_queues * nr_rx_desc;
1406 	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1407 	nr_mbufs *= nr_port;
1408 
1409 	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1410 					    nr_mbuf_cache, 0, mbuf_size,
1411 					    rte_socket_id());
1412 	if (mbuf_pool == NULL)
1413 		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1414 }
1415 
1416 /*
1417  * Main function, does initialisation and calls the per-lcore functions.
1418  */
1419 int
1420 main(int argc, char *argv[])
1421 {
1422 	unsigned lcore_id, core_id = 0;
1423 	unsigned nb_ports, valid_num_ports;
1424 	int ret, i;
1425 	uint16_t portid;
1426 	static pthread_t tid;
1427 	uint64_t flags = 0;
1428 
1429 	signal(SIGINT, sigint_handler);
1430 
1431 	/* init EAL */
1432 	ret = rte_eal_init(argc, argv);
1433 	if (ret < 0)
1434 		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1435 	argc -= ret;
1436 	argv += ret;
1437 
1438 	/* parse app arguments */
1439 	ret = us_vhost_parse_args(argc, argv);
1440 	if (ret < 0)
1441 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
1442 
1443 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1444 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1445 
1446 		if (rte_lcore_is_enabled(lcore_id))
1447 			lcore_ids[core_id++] = lcore_id;
1448 	}
1449 
1450 	if (rte_lcore_count() > RTE_MAX_LCORE)
1451 		rte_exit(EXIT_FAILURE,"Not enough cores\n");
1452 
1453 	/* Get the number of physical ports. */
1454 	nb_ports = rte_eth_dev_count_avail();
1455 
1456 	/*
1457 	 * Update the global var NUM_PORTS and the global array PORTS, and get
1458 	 * the value of VALID_NUM_PORTS according to the number of system ports.
1459 	 */
1460 	valid_num_ports = check_ports_num(nb_ports);
1461 
1462 	if ((valid_num_ports ==  0) || (valid_num_ports > MAX_SUP_PORTS)) {
1463 		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
1464 			" but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1465 		return -1;
1466 	}
1467 
1468 	/*
1469 	 * FIXME: here we are trying to allocate mbufs big enough for
1470 	 * @MAX_QUEUES, but the truth is we're never going to use that
1471 	 * many queues here. We probably should only do allocation for
1472 	 * those queues we are going to use.
1473 	 */
1474 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1475 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1476 
1477 	if (vm2vm_mode == VM2VM_HARDWARE) {
1478 		/* Enable VT loopback so the NIC's embedded L2 switch forwards VM2VM traffic. */
1479 		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1480 		RTE_LOG(DEBUG, VHOST_CONFIG,
1481 			"Enable loop back for L2 switch in vmdq.\n");
1482 	}
1483 
1484 	/* initialize all ports */
1485 	RTE_ETH_FOREACH_DEV(portid) {
1486 		/* skip ports that are not enabled */
1487 		if ((enabled_port_mask & (1 << portid)) == 0) {
1488 			RTE_LOG(INFO, VHOST_PORT,
1489 				"Skipping disabled port %d\n", portid);
1490 			continue;
1491 		}
1492 		if (port_init(portid) != 0)
1493 			rte_exit(EXIT_FAILURE,
1494 				"Cannot initialize network ports\n");
1495 	}
1496 
1497 	/* Enable stats if the user option is set. */
1498 	if (enable_stats) {
1499 		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1500 					print_stats, NULL);
1501 		if (ret < 0)
1502 			rte_exit(EXIT_FAILURE,
1503 				"Cannot create print-stats thread\n");
1504 	}
1505 
1506 	/* Launch all data cores. */
1507 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1508 		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1509 
1510 	if (client_mode)
1511 		flags |= RTE_VHOST_USER_CLIENT;
1512 
1513 	if (dequeue_zero_copy)
1514 		flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1515 
1516 	/* Register vhost user driver to handle vhost messages. */
1517 	for (i = 0; i < nb_sockets; i++) {
1518 		char *file = socket_files + i * PATH_MAX;
1519 		ret = rte_vhost_driver_register(file, flags);
1520 		if (ret != 0) {
1521 			unregister_drivers(i);
1522 			rte_exit(EXIT_FAILURE,
1523 				"vhost driver register failure.\n");
1524 		}
1525 
1526 		if (builtin_net_driver)
1527 			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1528 
1529 		if (mergeable == 0) {
1530 			rte_vhost_driver_disable_features(file,
1531 				1ULL << VIRTIO_NET_F_MRG_RXBUF);
1532 		}
1533 
1534 		if (enable_tx_csum == 0) {
1535 			rte_vhost_driver_disable_features(file,
1536 				1ULL << VIRTIO_NET_F_CSUM);
1537 		}
1538 
1539 		if (enable_tso == 0) {
1540 			rte_vhost_driver_disable_features(file,
1541 				1ULL << VIRTIO_NET_F_HOST_TSO4);
1542 			rte_vhost_driver_disable_features(file,
1543 				1ULL << VIRTIO_NET_F_HOST_TSO6);
1544 			rte_vhost_driver_disable_features(file,
1545 				1ULL << VIRTIO_NET_F_GUEST_TSO4);
1546 			rte_vhost_driver_disable_features(file,
1547 				1ULL << VIRTIO_NET_F_GUEST_TSO6);
1548 		}
1549 
1550 		if (promiscuous) {
1551 			rte_vhost_driver_enable_features(file,
1552 				1ULL << VIRTIO_NET_F_CTRL_RX);
1553 		}
1554 
1555 		ret = rte_vhost_driver_callback_register(file,
1556 			&virtio_net_device_ops);
1557 		if (ret != 0) {
1558 			rte_exit(EXIT_FAILURE,
1559 				"failed to register vhost driver callbacks.\n");
1560 		}
1561 
1562 		if (rte_vhost_driver_start(file) < 0) {
1563 			rte_exit(EXIT_FAILURE,
1564 				"failed to start vhost driver.\n");
1565 		}
1566 	}
1567 
1568 	RTE_LCORE_FOREACH_SLAVE(lcore_id)
1569 		rte_eal_wait_lcore(lcore_id);
1570 
1571 	return 0;
1572 
1573 }
1574