/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_eth_bond.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"
#include "ff_memory.h"

#ifdef FF_KNI
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

int enable_kni;
static int kni_accept;
static int knictl_action = FF_KNICTL_ACTION_DEFAULT;
#endif

static int numa_on;

static unsigned idle_sleep;
static unsigned pkt_tx_delay;

static struct rte_timer freebsd_clock;

// Mellanox Linux's driver key
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

static int use_rsskey_52bytes = 0;
static uint8_t default_rsskey_52bytes[52] = {
    0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23,
    0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30,
    0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02,
    0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c,
    0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55,
    0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e,
    0x81, 0x15, 0x03, 0x66
};

struct lcore_conf lcore_conf;

struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

#define BOND_DRIVER_NAME "net_bonding"

static inline int send_single_packet(struct rte_mbuf *m, uint8_t port);

struct ff_msg_ring {
    char ring_name[FF_MSG_NUM][RTE_RING_NAMESIZE];
    /* ring[0] for lcore recv msg, other send */
    /* ring[1] for lcore send msg, other read */
    struct rte_ring *ring[FF_MSG_NUM];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;
static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

static struct ff_top_args ff_top_status;
static struct ff_traffic_args ff_traffic;

extern void ff_hardclock(void);

static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg)
{
    ff_hardclock();
    ff_update_current_ts();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
#define CHECK_INTERVAL 100 /* 100ms */
#define MAX_CHECK_TIME 90 /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u "
                        "Mbps - %s\n", (int)portid,
                        (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

static int
init_lcore_conf(void)
{
    uint8_t nb_dev_ports = rte_eth_dev_count_avail();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
            ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    if (!rte_lcore_is_enabled(lcore_id)) {
        rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
    }

    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %d\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        /* Enable pcap dump */
        if (ff_global_cfg.pcap.enable) {
            ff_enable_pcap(ff_global_cfg.pcap.save_path, ff_global_cfg.pcap.snap_len);
        }

        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}

static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;
    uint16_t max_portid = ff_global_cfg.dpdk.max_portid;

    unsigned nb_mbuf = RTE_ALIGN_CEIL(
        (nb_rx_queue * (max_portid + 1) * 2 * RX_QUEUE_SIZE +
        nb_ports * (max_portid + 1) * 2 * nb_lcores * MAX_PKT_BURST +
        nb_ports * (max_portid + 1) * 2 * nb_tx_queue * TX_QUEUE_SIZE +
        nb_lcores * MEMPOOL_CACHE_SIZE +
#ifdef FF_KNI
        nb_ports * KNI_MBUF_MAX +
        nb_ports * KNI_QUEUE_SIZE +
#endif
        nb_lcores * nb_ports * DISPATCH_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }

#ifdef FF_USE_PAGE_ARRAY
        nb_mbuf = RTE_ALIGN_CEIL(
            nb_ports * nb_lcores * MAX_PKT_BURST +
            nb_ports * nb_tx_queue * TX_QUEUE_SIZE +
            nb_lcores * MEMPOOL_CACHE_SIZE,
            (unsigned)4096);
        ff_init_ref_pool(nb_mbuf, socketid);
#endif
    }

    return 0;
}

static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL) {
        rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ring = rte_ring_create(name, count, socket_id, flags);
    } else {
        ring = rte_ring_lookup(name);
    }

    if (ring == NULL) {
        rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
    }

    return ring;
}

static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create rings according to the ports actually being used. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
}

static int
init_msg_ring(void)
{
    uint16_t i, j;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
            MSG_RING_SIZE * 2 * nb_procs,
            MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
            NULL, NULL, ff_msg_init, NULL,
            socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[0]);

        for (j = FF_SYSCTL; j < FF_MSG_NUM; j++) {
            snprintf(msg_ring[i].ring_name[j], RTE_RING_NAMESIZE,
                "%s%u_%u", FF_MSG_RING_OUT, i, j);
            msg_ring[i].ring[j] = create_ring(msg_ring[i].ring_name[j],
                MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
            if (msg_ring[i].ring[j] == NULL)
                rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[j]);
        }
    }

    return 0;
}

#ifdef FF_KNI

static enum FF_KNICTL_CMD
get_kni_action(const char *c)
{
    if (!c)
        return FF_KNICTL_ACTION_DEFAULT;
    if (0 == strcasecmp(c, "alltokni")) {
        return FF_KNICTL_ACTION_ALL_TO_KNI;
    } else if (0 == strcasecmp(c, "alltoff")) {
        return FF_KNICTL_ACTION_ALL_TO_FF;
    } else if (0 == strcasecmp(c, "default")) {
        return FF_KNICTL_ACTION_DEFAULT;
    } else {
        return FF_KNICTL_ACTION_DEFAULT;
    }
}

static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count_avail();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;
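    /*
     * Editor's note (assumption drawn from how kni_accept is used in
     * process_packets() below): with method "accept", only traffic that
     * protocol_filter() explicitly classifies as FILTER_KNI is handed to
     * the kernel; with any other method, traffic the F-Stack stack does
     * not recognize (FILTER_UNKNOWN) falls through to KNI instead.
     */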

    knictl_action = get_kni_action(ff_global_cfg.kni.kni_action);

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i, ret;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
#endif

static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}

static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i, j;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id, u_port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[u_port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        for (j = 0; j <= pconf->nb_slaves; j++) {
            if (j < pconf->nb_slaves) {
                port_id = pconf->slave_portid_list[j];
                printf("Initializing %s's slave %d (port %d)\n",
                    ff_global_cfg.dpdk.bond_cfgs->name,
                    j, port_id);
            } else {
                port_id = u_port_id;
            }

            struct rte_eth_dev_info dev_info;
            struct rte_eth_conf port_conf = {0};
            struct rte_eth_rxconf rxq_conf;
            struct rte_eth_txconf txq_conf;

            int ret = rte_eth_dev_info_get(port_id, &dev_info);
            if (ret != 0)
                rte_exit(EXIT_FAILURE,
                    "Error during getting device (port %u) info: %s\n",
                    port_id, strerror(-ret));

            if (nb_queues > dev_info.max_rx_queues) {
                rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_rx_queues);
            }

            if (nb_queues > dev_info.max_tx_queues) {
                rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_tx_queues);
            }

            struct rte_ether_addr addr;
            rte_eth_macaddr_get(port_id, &addr);
            printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                (unsigned)port_id,
                addr.addr_bytes[0], addr.addr_bytes[1],
                addr.addr_bytes[2], addr.addr_bytes[3],
                addr.addr_bytes[4], addr.addr_bytes[5]);

            rte_memcpy(pconf->mac,
                addr.addr_bytes, RTE_ETHER_ADDR_LEN);

            /* Set RSS mode */
            uint64_t default_rss_hf = ETH_RSS_PROTO_MASK;
            port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
            port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf;
            if (dev_info.hash_key_size == 52) {
                port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_52bytes;
                port_conf.rx_adv_conf.rss_conf.rss_key_len = 52;
                use_rsskey_52bytes = 1;
            } else {
                port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
                port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;
            }
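            /*
             * Editor's note: the key selected here is reused by
             * ff_rss_check() for the software Toeplitz hash, so hardware
             * and software RSS must agree. The 52-byte key is assumed to
             * be for NICs whose reported hash_key_size is 52 (e.g. some
             * Mellanox devices); everything else gets the 40-byte default
             * key defined at the top of this file.
             */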
            port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads;
            if (port_conf.rx_adv_conf.rss_conf.rss_hf !=
                ETH_RSS_PROTO_MASK) {
                printf("Port %u modified RSS hash function based on hardware support, "
                    "requested:%#"PRIx64" configured:%#"PRIx64"\n",
                    port_id, default_rss_hf,
                    port_conf.rx_adv_conf.rss_conf.rss_hf);
            }

            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) {
                port_conf.txmode.offloads |=
                    DEV_TX_OFFLOAD_MBUF_FAST_FREE;
            }

            /* Set Rx VLAN stripping */
            if (ff_global_cfg.dpdk.vlan_strip) {
                if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                    port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
                }
            }

            /* Enable HW CRC stripping */
            port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_KEEP_CRC;

            /* FIXME: Enable TCP LRO ? */
#if 0
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
                printf("LRO is supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
                pconf->hw_features.rx_lro = 1;
            }
#endif

            /* Set Rx checksum checking */
            if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
                printf("RX checksum offload supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
                pconf->hw_features.rx_csum = 1;
            }

            if (ff_global_cfg.dpdk.tx_csum_offoad_skip == 0) {
                if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
                    printf("TX ip checksum offload supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
                    pconf->hw_features.tx_csum_ip = 1;
                }

                if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
                    (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
                    printf("TX TCP&UDP checksum offload supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM;
                    pconf->hw_features.tx_csum_l4 = 1;
                }
            } else {
                printf("TX checksum offload is disabled\n");
            }

            if (ff_global_cfg.dpdk.tso) {
                if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                    printf("TSO is supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
                    pconf->hw_features.tx_tso = 1;
                }
            } else {
                printf("TSO is disabled\n");
            }

            if (dev_info.reta_size) {
                /* reta size must be power of 2 */
                assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

                rss_reta_size[port_id] = dev_info.reta_size;
                printf("port[%d]: rss table size: %d\n", port_id,
                    dev_info.reta_size);
            }

            if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
                continue;
            }

            ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
            if (ret != 0) {
                return ret;
            }

            static uint16_t nb_rxd = RX_QUEUE_SIZE;
            static uint16_t nb_txd = TX_QUEUE_SIZE;
            ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
            if (ret < 0)
                printf("Could not adjust number of descriptors "
                    "for port%u (%d)\n", (unsigned)port_id, ret);

            uint16_t q;
            for (q = 0; q < nb_queues; q++) {
                if (numa_on) {
                    uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                    socketid = rte_lcore_to_socket_id(lcore_id);
                }
                mbuf_pool = pktmbuf_pool[socketid];

                txq_conf = dev_info.default_txconf;
                txq_conf.offloads = port_conf.txmode.offloads;
                ret = rte_eth_tx_queue_setup(port_id, q, nb_txd,
                    socketid, &txq_conf);
                if (ret < 0) {
                    return ret;
                }

                rxq_conf = dev_info.default_rxconf;
                rxq_conf.offloads = port_conf.rxmode.offloads;
                ret = rte_eth_rx_queue_setup(port_id, q, nb_rxd,
                    socketid, &rxq_conf, mbuf_pool);
                if (ret < 0) {
                    return ret;
                }
            }

            if (strncmp(dev_info.driver_name, BOND_DRIVER_NAME,
                strlen(dev_info.driver_name)) == 0) {

                rte_eth_macaddr_get(port_id, &addr);
                printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                    " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                    (unsigned)port_id,
                    addr.addr_bytes[0], addr.addr_bytes[1],
                    addr.addr_bytes[2], addr.addr_bytes[3],
                    addr.addr_bytes[4], addr.addr_bytes[5]);

                rte_memcpy(pconf->mac,
                    addr.addr_bytes, RTE_ETHER_ADDR_LEN);

                int mode, count, x;
                uint16_t slaves[RTE_MAX_ETHPORTS], len = RTE_MAX_ETHPORTS;

                mode = rte_eth_bond_mode_get(port_id);
                printf("Port %u, bond mode:%d\n", port_id, mode);

                count = rte_eth_bond_slaves_get(port_id, slaves, len);
                printf("Port %u, %s's slave ports count:%d\n", port_id,
                    ff_global_cfg.dpdk.bond_cfgs->name, count);
                for (x = 0; x < count; x++) {
                    printf("Port %u, %s's slave port[%u]\n", port_id,
                        ff_global_cfg.dpdk.bond_cfgs->name, slaves[x]);
                }
            }

            ret = rte_eth_dev_start(port_id);
            if (ret < 0) {
                return ret;
            }

            if (nb_queues > 1) {
                /* set HW rss hash function to Toeplitz. */
                if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                    struct rte_eth_hash_filter_info info = {0};
                    info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                    info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                    if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                        RTE_ETH_FILTER_SET, &info) < 0) {
                        rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                            port_id);
                    }
                }

                set_rss_table(port_id, dev_info.reta_size, nb_queues);
            }

            /* Enable RX in promiscuous mode for the Ethernet device. */
            if (ff_global_cfg.dpdk.promiscuous) {
                ret = rte_eth_promiscuous_enable(port_id);
                if (ret == 0) {
                    printf("set port %u to promiscuous mode ok\n", port_id);
                } else {
                    printf("set port %u to promiscuous mode error\n", port_id);
                }
            }
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    idle_sleep = ff_global_cfg.dpdk.idle_sleep;
    pkt_tx_delay = ff_global_cfg.dpdk.pkt_tx_delay > BURST_TX_DRAIN_US ?
        BURST_TX_DRAIN_US : ff_global_cfg.dpdk.pkt_tx_delay;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

#ifdef FF_KNI
    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }
#endif

#ifdef FF_USE_PAGE_ARRAY
    ff_mmap_init();
#endif

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    void *data = rte_pktmbuf_mtod(pkt, void*);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    if (pkt->ol_flags & PKT_RX_VLAN_STRIPPED) {
        ff_mbuf_set_vlan_info(hdr, pkt->vlan_tci);
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void*);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < RTE_ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct rte_ether_hdr *hdr;
    const struct rte_vlan_hdr *vlanhdr;
    hdr = (const struct rte_ether_hdr *)data;
    uint16_t ether_type = rte_be_to_cpu_16(hdr->ether_type);
    data += RTE_ETHER_HDR_LEN;
    len -= RTE_ETHER_HDR_LEN;

    if (ether_type == RTE_ETHER_TYPE_VLAN) {
        vlanhdr = (struct rte_vlan_hdr *)data;
        ether_type = rte_be_to_cpu_16(vlanhdr->eth_proto);
        data += sizeof(struct rte_vlan_hdr);
        len -= sizeof(struct rte_vlan_hdr);
    }

    if (ether_type == RTE_ETHER_TYPE_ARP)
        return FILTER_ARP;

#ifdef INET6
    if (ether_type == RTE_ETHER_TYPE_IPV6) {
        return ff_kni_proto_filter(data,
            len, ether_type);
    }
#endif

#ifndef FF_KNI
    return FILTER_UNKNOWN;
#else
    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ether_type != RTE_ETHER_TYPE_IPV4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data,
        len, ether_type);
#endif
}

static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}

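/*
 * Editor's note: a deep clone (full data copy) is used here instead of
 * rte_pktmbuf_clone()'s reference-counted attach, because the copies are
 * handed to other lcores' dispatch rings or to KNI while the original mbuf
 * keeps flowing through this lcore's stack, so each consumer must own and
 * free its buffer independently (rationale inferred from the call sites in
 * process_packets() below).
 */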
/* copied from rte_pktmbuf_clone */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of a new segment failed */
    if (unlikely(mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}

static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(ff_global_cfg.pcap.enable)) {
            if (!pkts_from_ring) {
                ff_dump_packets(ff_global_cfg.pcap.save_path, rtem,
                    ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void*);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring) {
            ff_traffic.rx_packets++;
            ff_traffic.rx_bytes += len;
        }

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues);
            if (ret == FF_DISPATCH_RESPONSE) {
                rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len;

                /*
                 * VLAN TX offload (re-insertion) is not supported yet,
                 * so put the VLAN header back into the payload by hand.
                 */
                if (rtem->vlan_tci) {
                    data = rte_pktmbuf_prepend(rtem, sizeof(struct rte_vlan_hdr));
                    if (data != NULL) {
                        memmove(data, data + sizeof(struct rte_vlan_hdr), RTE_ETHER_HDR_LEN);
                        struct rte_ether_hdr *etherhdr = (struct rte_ether_hdr *)data;
                        struct rte_vlan_hdr *vlanhdr = (struct rte_vlan_hdr *)(data + RTE_ETHER_HDR_LEN);
                        vlanhdr->vlan_tci = rte_cpu_to_be_16(rtem->vlan_tci);
                        vlanhdr->eth_proto = etherhdr->ether_type;
                        etherhdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN);
                    }
                }
                send_single_packet(rtem, port_id);
                continue;
            }

            if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
#ifdef INET6
        if (filter == FILTER_ARP || filter == FILTER_NDP) {
#else
        if (filter == FILTER_ARP) {
#endif
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }
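            /*
             * Editor's note (assumption): ARP/NDP frames are also mirrored
             * to KNI here so that the Linux kernel's neighbor tables stay
             * consistent with what F-Stack sees on the wire.
             */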
#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni) {
            if (knictl_action == FF_KNICTL_ACTION_ALL_TO_KNI) {
                ff_kni_enqueue(port_id, rtem);
            } else if (knictl_action == FF_KNICTL_ACTION_ALL_TO_FF) {
                ff_veth_input(ctx, rtem);
            } else if (knictl_action == FF_KNICTL_ACTION_DEFAULT) {
                if (enable_kni &&
                    ((filter == FILTER_KNI && kni_accept) ||
                    (filter == FILTER_UNKNOWN && !kni_accept))) {
                    ff_kni_enqueue(port_id, rtem);
                } else {
                    ff_veth_input(ctx, rtem);
                }
            } else {
                ff_veth_input(ctx, rtem);
            }
#endif
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* read packet from ring buf and to process */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
#ifdef INET6
    if (msg->msg_type == FF_IOCTL6) {
        fd = ff_socket(AF_INET6, SOCK_DGRAM, 0);
    } else
#endif
        fd = ff_socket(AF_INET, SOCK_DGRAM, 0);

    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_top_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_traffic_msg(struct ff_msg *msg)
{
    msg->traffic = ff_traffic;
    msg->result = 0;
}

#ifdef FF_KNI
static inline void
handle_knictl_msg(struct ff_msg *msg)
{
    if (msg->knictl.kni_cmd == FF_KNICTL_CMD_SET) {
        switch (msg->knictl.kni_action) {
            case FF_KNICTL_ACTION_ALL_TO_FF:
                knictl_action = FF_KNICTL_ACTION_ALL_TO_FF;
                msg->result = 0;
                printf("new kni action: alltoff\n");
                break;
            case FF_KNICTL_ACTION_ALL_TO_KNI:
                knictl_action = FF_KNICTL_ACTION_ALL_TO_KNI;
                msg->result = 0;
                printf("new kni action: alltokni\n");
                break;
            case FF_KNICTL_ACTION_DEFAULT:
                knictl_action = FF_KNICTL_ACTION_DEFAULT;
                msg->result = 0;
                printf("new kni action: default\n");
                break;
            default:
                msg->result = -1;
        }
    } else if (msg->knictl.kni_cmd == FF_KNICTL_CMD_GET) {
        msg->knictl.kni_action = knictl_action;
    } else {
        msg->result = -2;
    }
}
#endif

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
#ifdef INET6
        case FF_IOCTL6:
#endif
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        case FF_TRAFFIC:
            handle_traffic_msg(msg);
            break;
#ifdef FF_KNI
        case FF_KNICTL:
            handle_knictl_msg(msg);
            break;
#endif
        default:
            handle_default_msg(msg);
            break;
    }
    rte_ring_enqueue(msg_ring[proc_id].ring[msg->msg_type], msg);
}

static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(ff_global_cfg.pcap.enable)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(ff_global_cfg.pcap.save_path, m_table[i],
                ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
        }
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    ff_traffic.tx_packets += ret;
    uint16_t i;
    for (i = 0; i < ret; i++) {
        ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]);
#ifdef FF_USE_PAGE_ARRAY
        if (qconf->tx_mbufs[port].bsd_m_table[i])
            ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs);
#endif
    }
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
#ifdef FF_USE_PAGE_ARRAY
            if (qconf->tx_mbufs[port].bsd_m_table[ret])
                ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]);
#endif
        } while (++ret < n);
    }
    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}

int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
#ifdef FF_USE_PAGE_ARRAY
    struct lcore_conf *qconf = &lcore_conf;
    int len = 0;

    len = ff_if_send_onepkt(ctx, m, total);
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, ctx->port_id);
        len = 0;
    }
    qconf->tx_mbufs[ctx->port_id].len = len;
    return 0;
#endif
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        head->nb_segs++;

        prev = cur;
        void *data = rte_pktmbuf_mtod(cur, void*);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        cur->data_len = len;
        off += len;
        total -= len;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void*);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct rte_ipv4_hdr *iph;
        int iph_len;
        iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = RTE_ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct rte_ipv4_hdr *iph;
        int iph_len;
        iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = RTE_ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         * TCP segmentation offload.
         *
         * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *   implies PKT_TX_TCP_CKSUM)
         * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         * - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *   write the IP checksum to 0 in the packet
         * - fill the mbuf offload information: l2_len,
         *   l3_len, l4_len, tso_segsz
         * - calculate the pseudo header checksum without taking ip_len
         *   in account, and set it in the TCP header. Refer to
         *   rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *   used as helpers.
         */
        if (offload.tso_seg_size) {
            struct rte_tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct rte_tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = RTE_ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc,
        sys_tsc, end_tsc, idle_sleep_tsc;
    int i, j, nb_rx, idle;
    uint16_t port_id, queue_id;
    struct lcore_conf *qconf;
    uint64_t drain_tsc = 0;
    struct ff_dpdk_if_context *ctx;

    if (pkt_tx_delay) {
        drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * pkt_tx_delay;
    }

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc >= drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packets from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }
#endif

            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                    pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                    j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc >= drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        idle_sleep_tsc = rte_rdtsc();
        if (likely(idle && idle_sleep)) {
            usleep(idle_sleep);
            end_tsc = rte_rdtsc();
        } else {
            end_tsc = idle_sleep_tsc;
        }

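        /*
         * Editor's note on the accounting below (a reading of the code,
         * not authoritative): the span from cur_tsc to div_tsc is the
         * stack/driver work and is charged to "sys" when the iteration was
         * not idle; div_tsc to idle_sleep_tsc is the user loop ("usr",
         * only when lr->loop actually ran this iteration); the optional
         * usleep() is counted as idle. ff_top exposes these counters.
         */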
        if (usch_tsc == cur_tsc) {
            usr_tsc = idle_sleep_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_top_status.sys_tsc += sys_tsc;
        }

        ff_top_status.usr_tsc += usr_tsc;
        ff_top_status.work_tsc += end_tsc - cur_tsc;
        ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_top_status.loops++;
    }

    return 0;
}

int
ff_dpdk_if_up(void)
{
    int i;
    struct lcore_conf *qconf = &lcore_conf;
    for (i = 0; i < qconf->nb_tx_port; i++) {
        uint16_t port_id = qconf->tx_port_id[i];

        struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
        veth_ctx[port_id] = ff_veth_attach(pconf);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed\n");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg)
{
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1<<(7-b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i+4] & (1<<(7-b))))
                v |= 1;
        }
    }
    return (hash);
}

int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;
    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];

    if (nb_queues <= 1) {
        return 1;
    }

    uint16_t reta_size = rss_reta_size[ctx->port_id];
    uint16_t queueid = qconf->tx_queue_id[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = 0;
    if (!use_rsskey_52bytes)
        hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
            default_rsskey_40bytes, datalen, data);
    else
        hash = toeplitz_hash(sizeof(default_rsskey_52bytes),
            default_rsskey_52bytes, datalen, data);

    return ((hash & (reta_size - 1)) % nb_queues) == queueid;
}

void
ff_regist_packet_dispatcher(dispatch_func_t func)
{
    packet_dispatcher = func;
}

uint64_t
ff_get_tsc_ns(void)
{
    uint64_t cur_tsc = rte_rdtsc();
    uint64_t hz = rte_get_tsc_hz();
    return ((double)cur_tsc / (double)hz) * NS_PER_S;
}
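/*
 * Usage sketch (editor's addition, not part of the build): a minimal
 * application drives this module roughly as follows, assuming the
 * configuration has already been loaded into ff_global_cfg by the
 * higher-level F-Stack initialization code that lives outside this file.
 * The loop signature follows loop_func_t as used by ff_dpdk_run() above.
 *
 *     static int my_loop(void *arg)
 *     {
 *         // poll application sockets via the ff_* API here
 *         return 0;
 *     }
 *
 *     int main(int argc, char **argv)
 *     {
 *         ff_dpdk_init(argc, argv);        // EAL, ports, rings, clock
 *         ff_dpdk_run(my_loop, NULL);      // per-lcore main_loop; does not
 *                                          // return in normal operation
 *         return 0;
 *     }
 *
 * ff_dpdk_if_up() attaches the veth contexts once the FreeBSD stack is
 * initialized (its caller lives outside this file), and
 * ff_regist_packet_dispatcher() may be called before ff_dpdk_run() to
 * install a custom dispatch hook.
 */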