/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_vhost.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_pause.h>

#include "main.h"

#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

#define MBUF_CACHE_SIZE	128
#define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE

#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4	/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE    0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX	    1
#define DEVICE_SAFE_REMOVE  2

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by vmdq. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64
/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
static int mergeable;

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

static int client_mode;
static int dequeue_zero_copy;

static int builtin_net_driver;

/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Socket file paths. Can be set by user */
static char *socket_files;
static int nb_sockets;

/* empty vmdq configuration structure. Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * It is necessary for 1G NICs such as the I350; this fixes
		 * a bug where IPv4 forwarding in the guest cannot forward
		 * packets from one virtio dev to another virtio dev.
		 */
		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 1, /**< CRC stripped by hardware */
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};


static unsigned lcore_ids[RTE_MAX_LCORE];
static uint16_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

static struct vhost_dev_tailq_list vhost_dev_list =
	TAILQ_HEAD_INITIALIZER(vhost_dev_list);

static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

#define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
				 / US_PER_S * BURST_TX_DRAIN_US)
#define VLAN_HLEN       4

/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}
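/*
 * Illustration only: with num_devices == 8, the loop above produces a
 * pool map equivalent to
 *
 *     pool 0 <- VLAN 1000, pool 1 <- VLAN 1001, ... pool 7 <- VLAN 1007
 *
 * i.e. each VMDQ pool accepts exactly one entry of vlan_tags[], and
 * pool_map[i].pools is the single-bit mask (1UL << i) selecting pool i.
 */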
/*
 * Validate the device number according to the max pool number obtained
 * from dev_info. If the device number is invalid, give the error message
 * and return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
	if (num_devices > max_nb_devices) {
		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
		return -1;
	}
	return 0;
}

/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint16_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
	rte_eth_dev_info_get(port, &dev_info);

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* Enable vlan offload */
	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;

	/* Configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;

	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;

	/*
	 * When dequeue zero copy is enabled, guest Tx used vring will be
	 * updated only when corresponding mbuf is freed. Thus, the nb_tx_desc
	 * (tx_ring_size here) must be small enough so that the driver will
	 * hit the free threshold easily and free mbufs timely. Otherwise,
	 * guest Tx vring would be starved.
	 */
	if (dequeue_zero_copy)
		tx_ring_size = 64;

	tx_rings = (uint16_t)rte_lcore_count();

	retval = validate_num_devices(MAX_DEVICES);
	if (retval < 0)
		return retval;

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base  = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);

	if (port >= rte_eth_dev_count())
		return -1;

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
			port, strerror(-retval));
		return retval;
	}

	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
		&tx_ring_size);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
			"for port %u: %s.\n", port, strerror(-retval));
		return retval;
	}
	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
			"for Rx queues on port %u.\n", port);
		return -1;
	}

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						mbuf_pool);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup rx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup tx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
			port, strerror(-retval));
		return retval;
	}

	if (promiscuous)
		rte_eth_promiscuous_enable(port);

	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
		port,
		vmdq_ports_eth_addr[port].addr_bytes[0],
		vmdq_ports_eth_addr[port].addr_bytes[1],
		vmdq_ports_eth_addr[port].addr_bytes[2],
		vmdq_ports_eth_addr[port].addr_bytes[3],
		vmdq_ports_eth_addr[port].addr_bytes[4],
		vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}
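/*
 * Illustration only: with the VMDQ layout computed in port_init(), the
 * NIC Rx queue polled for a given vhost device is
 *
 *     vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base
 *
 * (see new_device() below). For example, assuming vmdq_queue_base == 8
 * and queues_per_pool == 2, device vid 3 would be served by Rx queue 14.
 * The concrete numbers depend entirely on the dev_info reported by the
 * NIC, so treat them as an example, not a guarantee.
 */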
/*
 * Set socket file path.
 */
static int
us_vhost_parse_socket_path(const char *q_arg)
{
	/* parse number string */
	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
		return -1;

	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
	snprintf(socket_files + nb_sockets * PATH_MAX, PATH_MAX, "%s", q_arg);
	nb_sockets++;

	return 0;
}

/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (pm == 0)
		return -1;

	return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"		--vm2vm [0|1|2]\n"
	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"		--socket-file <path>\n"
	"		--nb-devices ND\n"
	"		-p PORTMASK: Set mask for ports to be used by application\n"
	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This only takes effect if retries on rx are enabled\n"
	"		--rx-retry-num [0-N]: the number of retries on rx. This only takes effect if retries on rx are enabled\n"
	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"		--socket-file: The path of the socket file.\n"
	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
	"		--tso [0|1] disable/enable TCP segmentation offload.\n"
	"		--client register a vhost-user socket as client mode.\n"
	"		--dequeue-zero-copy enables dequeue zero copy\n",
	       prgname);
}
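/*
 * Example invocation (illustrative only; the EAL core/memory options and
 * the binary name must be adapted to the local build and platform):
 *
 *   ./vhost-switch -l 1-3 -n 4 --socket-mem 1024 -- \
 *           -p 0x1 --socket-file /tmp/sock0 --client --stats 1
 *
 * This enables port 0, registers /tmp/sock0 as a vhost-user socket in
 * client mode and prints statistics every second.
 */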
/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{"vm2vm", required_argument, NULL, 0},
		{"rx-retry", required_argument, NULL, 0},
		{"rx-retry-delay", required_argument, NULL, 0},
		{"rx-retry-num", required_argument, NULL, 0},
		{"mergeable", required_argument, NULL, 0},
		{"stats", required_argument, NULL, 0},
		{"socket-file", required_argument, NULL, 0},
		{"tx-csum", required_argument, NULL, 0},
		{"tso", required_argument, NULL, 0},
		{"client", no_argument, &client_mode, 1},
		{"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
		{"builtin-net-driver", no_argument, &builtin_net_driver, 1},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				ETH_VMDQ_ACCEPT_BROADCAST |
				ETH_VMDQ_ACCEPT_MULTICAST;

			break;

		case 0:
			/* Enable/disable vm2vm comms. */
			if (!strncmp(long_option[option_index].name, "vm2vm",
				MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for "
						"vm2vm [0|1|2]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					vm2vm_mode = (vm2vm_type)ret;
				}
			}

			/* Enable/disable retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_retry = ret;
				}
			}

			/* Enable/disable TX checksum offload. */
			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					enable_tx_csum = ret;
			}

			/* Enable/disable TSO offload. */
			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else
					enable_tso = ret;
			}

			/* Specify the retry delay time (in microseconds) on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_delay_time = ret;
				}
			}

			/* Specify the retry count on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					burst_rx_retry_num = ret;
				}
			}

			/* Enable/disable RX mergeable buffers. */
			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					mergeable = !!ret;
					if (ret) {
						vmdq_conf_default.rxmode.jumbo_frame = 1;
						vmdq_conf_default.rxmode.max_rx_pkt_len
							= JUMBO_FRAME_MAX_SIZE;
					}
				}
			}

			/* Enable/disable stats. */
			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for stats [0..N]\n");
					us_vhost_usage(prgname);
					return -1;
				} else {
					enable_stats = ret;
				}
			}

			/* Set socket file path. */
			if (!strncmp(long_option[option_index].name,
						"socket-file", MAX_LONG_OPT_SZ)) {
				if (us_vhost_parse_socket_path(optarg) == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for socket name (Max %d characters)\n",
					PATH_MAX);
					us_vhost_usage(prgname);
					return -1;
				}
			}

			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	return 0;
}

/*
 * Update the global var NUM_PORTS and array PORTS according to system ports number
 * and return valid ports number
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (ports[portid] >= nb_ports) {
			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
				ports[portid], (nb_ports - 1));
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}

static __rte_always_inline struct vhost_dev *
find_vhost_dev(struct ether_addr *mac)
{
	struct vhost_dev *vdev;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->ready == DEVICE_RX &&
		    is_same_ether_addr(mac, &vdev->mac_address))
			return vdev;
	}

	return NULL;
}

/*
 * This function learns the MAC address of the device and registers this along with a
 * vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct ether_hdr *pkt_hdr;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	if (find_vhost_dev(&pkt_hdr->s_addr)) {
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) device is using a registered MAC!\n",
			vdev->vid);
		return -1;
	}

	for (i = 0; i < ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[vdev->vid];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA,
		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
		vdev->vid,
		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
				(uint32_t)vdev->vid + vmdq_pool_base);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) failed to add device MAC address to VMDQ\n",
			vdev->vid);

	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}
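/*
 * Illustration only: for the first vhost device (vid 0), link_vmdq()
 * binds the MAC address learnt from the guest's first packet to VMDQ
 * pool (vmdq_pool_base + 0) and uses vlan_tags[0] == 1000 as its VLAN
 * tag, so subsequent frames with that destination MAC land in the
 * device's dedicated Rx queue.
 */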
/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* clear MAC and VLAN settings */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < 6; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
			(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

static __rte_always_inline void
virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
	    struct rte_mbuf *m)
{
	uint16_t ret;

	if (builtin_net_driver) {
		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
	} else {
		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
	}

	if (enable_stats) {
		rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
		rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
		src_vdev->stats.tx_total++;
		src_vdev->stats.tx += ret;
	}
}

/*
 * Check if the packet destination MAC address is for a local device. If so then put
 * the packet on that device's RX queue. If not then return.
 */
static __rte_always_inline int
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct ether_hdr *pkt_hdr;
	struct vhost_dev *dst_vdev;

	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
	if (!dst_vdev)
		return -1;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return 0;
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is local\n", dst_vdev->vid);

	if (unlikely(dst_vdev->remove)) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) device is marked for removal\n", dst_vdev->vid);
		return 0;
	}

	virtio_xmit(dst_vdev, vdev, m);
	return 0;
}
/*
 * Check if the destination MAC of a packet is one local VM,
 * and get its vlan tag, and offset if it is.
 */
static __rte_always_inline int
find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct vhost_dev *dst_vdev;
	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
	if (!dst_vdev)
		return 0;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return -1;
	}

	/*
	 * HW vlan strip will reduce the packet length by the length of
	 * the vlan tag, so we need to restore the packet length by
	 * adding it back.
	 */
	*offset   = VLAN_HLEN;
	*vlan_tag = vlan_tags[vdev->vid];

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
		vdev->vid, dst_vdev->vid, *vlan_tag);

	return 0;
}

static uint16_t
get_psd_sum(void *l3_hdr, uint64_t ol_flags)
{
	if (ol_flags & PKT_TX_IPV4)
		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
	else /* assume ethertype == ETHER_TYPE_IPv6 */
		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
}

static void virtio_tx_offload(struct rte_mbuf *m)
{
	void *l3_hdr;
	struct ipv4_hdr *ipv4_hdr = NULL;
	struct tcp_hdr *tcp_hdr = NULL;
	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	l3_hdr = (char *)eth_hdr + m->l2_len;

	if (m->ol_flags & PKT_TX_IPV4) {
		ipv4_hdr = l3_hdr;
		ipv4_hdr->hdr_checksum = 0;
		m->ol_flags |= PKT_TX_IP_CKSUM;
	}

	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
}

static inline void
free_pkts(struct rte_mbuf **pkts, uint16_t n)
{
	while (n--)
		rte_pktmbuf_free(pkts[n]);
}

static __rte_always_inline void
do_drain_mbuf_table(struct mbuf_table *tx_q)
{
	uint16_t count;

	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
				 tx_q->m_table, tx_q->len);
	if (unlikely(count < tx_q->len))
		free_pkts(&tx_q->m_table[count], tx_q->len - count);

	tx_q->len = 0;
}
/*
 * This function routes the TX packet to the correct interface. This
 * may be a local device or the physical port.
 */
static __rte_always_inline void
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	unsigned offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct ether_hdr *nh;


	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
	if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
		struct vhost_dev *vdev2;

		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
			if (vdev2 != vdev)
				virtio_xmit(vdev2, vdev, m);
		}
		goto queue2nic;
	}

	/* check if destination is local VM */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
		rte_pktmbuf_free(m);
		return;
	}

	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
		if (unlikely(find_local_dest(vdev, m, &offset,
					     &vlan_tag) != 0)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is external\n", vdev->vid);

queue2nic:

	/* Add packet to the port tx queue */
	tx_q = &lcore_tx_queue[lcore_id];

	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
		/* Guest has inserted the vlan tag. */
		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
		if ((vm2vm_mode == VM2VM_HARDWARE) &&
			(vh->vlan_tci != vlan_tag_be))
			vh->vlan_tci = vlan_tag_be;
	} else {
		m->ol_flags |= PKT_TX_VLAN_PKT;

		/*
		 * Find the right seg to adjust the data len when offset is
		 * bigger than tail room size.
		 */
		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
			if (likely(offset <= rte_pktmbuf_tailroom(m)))
				m->data_len += offset;
			else {
				struct rte_mbuf *seg = m;

				while ((seg->next != NULL) &&
					(offset > rte_pktmbuf_tailroom(seg)))
					seg = seg->next;

				seg->data_len += offset;
			}
			m->pkt_len += offset;
		}

		m->vlan_tci = vlan_tag;
	}

	if (m->ol_flags & PKT_TX_TCP_SEG)
		virtio_tx_offload(m);

	tx_q->m_table[tx_q->len++] = m;
	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(tx_q->len == MAX_PKT_BURST))
		do_drain_mbuf_table(tx_q);
}


static __rte_always_inline void
drain_mbuf_table(struct mbuf_table *tx_q)
{
	static uint64_t prev_tsc;
	uint64_t cur_tsc;

	if (tx_q->len == 0)
		return;

	cur_tsc = rte_rdtsc();
	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
		prev_tsc = cur_tsc;

		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"TX queue drained after timeout with burst size %u\n",
			tx_q->len);
		do_drain_mbuf_table(tx_q);
	}
}
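/*
 * Worked example (illustrative only): MBUF_TABLE_DRAIN_TSC rounds the
 * TSC frequency up to whole cycles-per-microsecond and multiplies by
 * BURST_TX_DRAIN_US. On a CPU with a 2.0 GHz TSC this is
 * (2e9 / 1e6) * 100 = 200000 cycles, i.e. a partially filled TX table
 * is flushed roughly every 100 us even if the burst never fills up.
 */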
static __rte_always_inline void
drain_eth_rx(struct vhost_dev *vdev)
{
	uint16_t rx_count, enqueue_count;
	struct rte_mbuf *pkts[MAX_PKT_BURST];

	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
				    pkts, MAX_PKT_BURST);
	if (!rx_count)
		return;

	/*
	 * When "enable_retry" is set, here we wait and retry when there
	 * are not enough free slots in the queue to hold @rx_count
	 * packets, to diminish packet loss.
	 */
	if (enable_retry &&
	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
			VIRTIO_RXQ))) {
		uint32_t retry;

		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
					VIRTIO_RXQ))
				break;
		}
	}

	if (builtin_net_driver) {
		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
						pkts, rx_count);
	} else {
		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
						pkts, rx_count);
	}
	if (enable_stats) {
		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
	}

	free_pkts(pkts, rx_count);
}
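/*
 * Note (illustrative only): with the defaults, the retry loop above
 * waits at most BURST_RX_RETRIES * BURST_RX_WAIT_US = 4 * 15 = 60 us
 * for the guest to free Rx descriptors before enqueueing anyway; both
 * knobs can be changed with --rx-retry-num and --rx-retry-delay.
 */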
static __rte_always_inline void
drain_virtio_tx(struct vhost_dev *vdev)
{
	struct rte_mbuf *pkts[MAX_PKT_BURST];
	uint16_t count;
	uint16_t i;

	if (builtin_net_driver) {
		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
					pkts, MAX_PKT_BURST);
	} else {
		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
					mbuf_pool, pkts, MAX_PKT_BURST);
	}

	/* setup VMDq for the first packet */
	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
			free_pkts(pkts, count);
	}

	for (i = 0; i < count; ++i)
		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
}

/*
 * Main function of vhost-switch. It basically does:
 *
 * for each vhost device {
 *    - drain_eth_rx()
 *
 *      Which drains the host eth Rx queue linked to the vhost device,
 *      and delivers all of them to the guest virtio Rx ring associated
 *      with this vhost device.
 *
 *    - drain_virtio_tx()
 *
 *      Which drains the guest virtio Tx queue and delivers all of them
 *      to the target, which could be another vhost device, or the
 *      physical eth dev. The route is done in function "virtio_tx_route".
 * }
 */
static int
switch_worker(void *arg __rte_unused)
{
	unsigned i;
	unsigned lcore_id = rte_lcore_id();
	struct vhost_dev *vdev;
	struct mbuf_table *tx_q;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < rte_lcore_count(); i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		drain_mbuf_table(tx_q);

		/*
		 * Inform the configuration core that we have exited the
		 * linked list and that no devices are in use if requested.
		 */
		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process vhost devices
		 */
		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
			      lcore_vdev_entry) {
			if (unlikely(vdev->remove)) {
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}

			if (likely(vdev->ready == DEVICE_RX))
				drain_eth_rx(vdev);

			if (likely(!vdev->remove))
				drain_virtio_tx(vdev);
		}
	}

	return 0;
}
/*
 * Remove a device from the specific data core linked list and from the
 * main linked list. Synchronization occurs through the use of the
 * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
 */
static void
destroy_device(int vid)
{
	struct vhost_dev *vdev = NULL;
	int lcore;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return;
	/* set the remove flag. */
	vdev->remove = 1;
	while (vdev->ready != DEVICE_SAFE_REMOVE) {
		rte_pause();
	}

	if (builtin_net_driver)
		vs_vhost_net_remove(vdev);

	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
		     lcore_vdev_entry);
	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);


	/* Set the dev_removal_flag on each lcore. */
	RTE_LCORE_FOREACH_SLAVE(lcore)
		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;

	/*
	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
	 * we can be sure that they can no longer access the device removed
	 * from the linked lists and that the devices are no longer in use.
	 */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
			rte_pause();
	}

	lcore_info[vdev->coreid].device_num--;

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been removed from data core\n",
		vdev->vid);

	rte_free(vdev);
}

/*
 * A new device is added to a data core. First the device is added to the main linked list
 * and then allocated to a specific data core.
 */
static int
new_device(int vid)
{
	int lcore, core_add = 0;
	uint32_t device_num_min = num_devices;
	struct vhost_dev *vdev;

	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
	if (vdev == NULL) {
		RTE_LOG(INFO, VHOST_DATA,
			"(%d) couldn't allocate memory for vhost dev\n",
			vid);
		return -1;
	}
	vdev->vid = vid;

	if (builtin_net_driver)
		vs_vhost_net_setup(vdev);

	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;

	/* reset ready flag */
	vdev->ready = DEVICE_MAC_LEARNING;
	vdev->remove = 0;

	/* Find a suitable lcore to add the device. */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		if (lcore_info[lcore].device_num < device_num_min) {
			device_num_min = lcore_info[lcore].device_num;
			core_add = lcore;
		}
	}
	vdev->coreid = core_add;

	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
			  lcore_vdev_entry);
	lcore_info[vdev->coreid].device_num++;

	/* Disable notifications. */
	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been added to data core %d\n",
		vid, vdev->coreid);

	return 0;
}

/*
 * These callbacks allow devices to be added to the data core when
 * configuration has fully completed.
 */
static const struct vhost_device_ops virtio_net_device_ops =
{
	.new_device =  new_device,
	.destroy_device = destroy_device,
};
/*
 * This is a thread that will wake up after a period to print stats if
 * the user has enabled them.
 */
static void
print_stats(void)
{
	struct vhost_dev *vdev;
	uint64_t tx_dropped, rx_dropped;
	uint64_t tx, tx_total, rx, rx_total;
	const char clr[] = { 27, '[', '2', 'J', '\0' };
	const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

	while (1) {
		sleep(enable_stats);

		/* Clear screen and move to top left */
		printf("%s%s\n", clr, top_left);
		printf("Device statistics =================================\n");

		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
			tx_total   = vdev->stats.tx_total;
			tx         = vdev->stats.tx;
			tx_dropped = tx_total - tx;

			rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
			rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
			rx_dropped = rx_total - rx;

			printf("Statistics for device %d\n"
				"-----------------------\n"
				"TX total: %" PRIu64 "\n"
				"TX dropped: %" PRIu64 "\n"
				"TX successful: %" PRIu64 "\n"
				"RX total: %" PRIu64 "\n"
				"RX dropped: %" PRIu64 "\n"
				"RX successful: %" PRIu64 "\n",
				vdev->vid,
				tx_total, tx_dropped, tx,
				rx_total, rx_dropped, rx);
		}

		printf("===================================================\n");
	}
}

static void
unregister_drivers(int socket_num)
{
	int i, ret;

	for (i = 0; i < socket_num; i++) {
		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
		if (ret != 0)
			RTE_LOG(ERR, VHOST_CONFIG,
				"Fail to unregister vhost driver for %s.\n",
				socket_files + i * PATH_MAX);
	}
}

/* When we receive a INT signal, unregister vhost driver */
static void
sigint_handler(__rte_unused int signum)
{
	/* Unregister vhost driver. */
	unregister_drivers(nb_sockets);

	exit(0);
}

/*
 * While creating an mbuf pool, one key thing is to figure out how
 * many mbuf entries are enough for our use. FYI, here are some
 * guidelines:
 *
 * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
 *
 * - For each switch core (a CPU core that does the packet switch), we
 *   also need to make some reservation for receiving the packets from
 *   the virtio Tx queue. How many is enough depends on the usage. It's
 *   normally a simple calculation like the following:
 *
 *       MAX_PKT_BURST * max packet size / mbuf size
 *
 *   So, we definitely need to allocate more mbufs when TSO is enabled.
 *
 * - Similarly, for each switching core, we should reserve @nr_rx_desc
 *   mbufs for receiving the packets from the physical NIC device.
 *
 * - We also need to make sure, for each switch core, we have allocated
 *   enough mbufs to fill up the mbuf cache.
 */
static void
create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
{
	uint32_t nr_mbufs;
	uint32_t nr_mbufs_per_core;
	uint32_t mtu = 1500;

	if (mergeable)
		mtu = 9000;
	if (enable_tso)
		mtu = 64 * 1024;

	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
			(mbuf_size - RTE_PKTMBUF_HEADROOM);
	nr_mbufs_per_core += nr_rx_desc;
	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);

	nr_mbufs  = nr_queues * nr_rx_desc;
	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
	nr_mbufs *= nr_port;

	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
					    nr_mbuf_cache, 0, mbuf_size,
					    rte_socket_id());
	if (mbuf_pool == NULL)
		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
}
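/*
 * Worked example (illustrative only): with mergeable buffers and TSO
 * disabled (mtu = 1500), mbuf_size = MBUF_DATA_SIZE (2048 bytes of data
 * room plus RTE_PKTMBUF_HEADROOM) and assuming MAX_PKT_BURST is 32, the
 * per-core term above is roughly (1500 + 2176) * 32 / 2048 ~= 57 mbufs,
 * to which nr_rx_desc and the cache size are added. The exact figures
 * depend on the build-time constants, so treat this as a sketch of the
 * arithmetic rather than a fixed result.
 */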
/*
 * Main function, does initialisation and calls the per-lcore functions.
 */
int
main(int argc, char *argv[])
{
	unsigned lcore_id, core_id = 0;
	unsigned nb_ports, valid_num_ports;
	int ret, i;
	uint16_t portid;
	static pthread_t tid;
	char thread_name[RTE_MAX_THREAD_NAME_LEN];
	uint64_t flags = 0;

	signal(SIGINT, sigint_handler);

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
	argc -= ret;
	argv += ret;

	/* parse app arguments */
	ret = us_vhost_parse_args(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid argument\n");

	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);

		if (rte_lcore_is_enabled(lcore_id))
			lcore_ids[core_id++] = lcore_id;
	}

	if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE, "Not enough cores\n");

	/* Get the number of physical ports. */
	nb_ports = rte_eth_dev_count();

	/*
	 * Update the global var NUM_PORTS and global array PORTS
	 * and get value of var VALID_NUM_PORTS according to system ports number
	 */
	valid_num_ports = check_ports_num(nb_ports);

	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	/*
	 * FIXME: here we are trying to allocate mbufs big enough for
	 * @MAX_QUEUES, but the truth is we're never going to use that
	 * many queues here. We probably should only do allocation for
	 * those queues we are going to use.
	 */
	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/* Enable VT loop back to let L2 switch to do it. */
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
		RTE_LOG(DEBUG, VHOST_CONFIG,
			"Enable loop back for L2 switch in vmdq.\n");
	}

	/* initialize all ports */
	for (portid = 0; portid < nb_ports; portid++) {
		/* skip ports that are not enabled */
		if ((enabled_port_mask & (1 << portid)) == 0) {
			RTE_LOG(INFO, VHOST_PORT,
				"Skipping disabled port %d\n", portid);
			continue;
		}
		if (port_init(portid) != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot initialize network ports\n");
	}

	/* Enable stats if the user option is set. */
	if (enable_stats) {
		ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
		if (ret != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot create print-stats thread\n");

		/* Set thread_name for aid in debugging. */
		snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
		ret = rte_thread_setname(tid, thread_name);
		if (ret != 0)
			RTE_LOG(DEBUG, VHOST_CONFIG,
				"Cannot set print-stats name\n");
	}

	/* Launch all data cores. */
	RTE_LCORE_FOREACH_SLAVE(lcore_id)
		rte_eal_remote_launch(switch_worker, NULL, lcore_id);

	if (client_mode)
		flags |= RTE_VHOST_USER_CLIENT;

	if (dequeue_zero_copy)
		flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;

	/* Register vhost user driver to handle vhost messages. */
	for (i = 0; i < nb_sockets; i++) {
		char *file = socket_files + i * PATH_MAX;
		ret = rte_vhost_driver_register(file, flags);
		if (ret != 0) {
			unregister_drivers(i);
			rte_exit(EXIT_FAILURE,
				"vhost driver register failure.\n");
		}

		if (builtin_net_driver)
			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);

		if (mergeable == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_MRG_RXBUF);
		}

		if (enable_tx_csum == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_CSUM);
		}

		if (enable_tso == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO6);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO6);
		}

		if (promiscuous) {
			rte_vhost_driver_enable_features(file,
				1ULL << VIRTIO_NET_F_CTRL_RX);
		}

		ret = rte_vhost_driver_callback_register(file,
			&virtio_net_device_ops);
		if (ret != 0) {
			rte_exit(EXIT_FAILURE,
				"failed to register vhost driver callbacks.\n");
		}

		if (rte_vhost_driver_start(file) < 0) {
			rte_exit(EXIT_FAILURE,
				"failed to start vhost driver.\n");
		}
	}

	RTE_LCORE_FOREACH_SLAVE(lcore_id)
		rte_eal_wait_lcore(lcore_id);

	return 0;
}
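/*
 * Connecting a VM (illustrative only; the exact QEMU options depend on
 * the QEMU version and on whether --client is used on the DPDK side):
 *
 *   qemu-system-x86_64 ... \
 *     -object memory-backend-file,id=mem,size=1G,mem-path=/dev/hugepages,share=on \
 *     -numa node,memdev=mem \
 *     -chardev socket,id=char0,path=/tmp/sock0 \
 *     -netdev type=vhost-user,id=net0,chardev=char0 \
 *     -device virtio-net-pci,netdev=net0
 *
 * The guest memory must be backed by a shared hugepage file so the
 * vhost-user backend can map it.
 */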