/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_eth_bond.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"
#include "ff_memory.h"

#ifdef FF_KNI
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

int enable_kni;
static int kni_accept;
static int knictl_action = FF_KNICTL_ACTION_DEFAULT;
#endif

static int numa_on;

static unsigned idle_sleep;
static unsigned pkt_tx_delay;

static struct rte_timer freebsd_clock;

/* Mellanox Linux driver's default RSS key */
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

static int use_rsskey_52bytes = 0;
static uint8_t default_rsskey_52bytes[52] = {
    0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23,
    0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30,
    0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02,
    0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c,
    0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55,
    0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e,
    0x81, 0x15, 0x03, 0x66
};

struct lcore_conf lcore_conf;

struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

#define BOND_DRIVER_NAME "net_bonding"

static inline int send_single_packet(struct rte_mbuf *m, uint8_t port);

struct ff_msg_ring {
    char ring_name[FF_MSG_NUM][RTE_RING_NAMESIZE];
    /* ring[0] for lcore recv msg, other send */
    /* ring[1] for lcore send msg, other read */
    struct rte_ring *ring[FF_MSG_NUM];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;
static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

static struct ff_top_args ff_top_status;
static struct ff_traffic_args ff_traffic;

extern void ff_hardclock(void);

static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
    ff_update_current_ts();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
#define CHECK_INTERVAL 100 /* 100ms */
#define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u Mbps - %s\n",
                        (int)portid, (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
190 ("full-duplex") : ("half-duplex\n")); 191 } else { 192 printf("Port %d Link Down\n", (int)portid); 193 } 194 continue; 195 } 196 /* clear all_ports_up flag if any link down */ 197 if (link.link_status == 0) { 198 all_ports_up = 0; 199 break; 200 } 201 } 202 203 /* after finally printing all link status, get out */ 204 if (print_flag == 1) 205 break; 206 207 if (all_ports_up == 0) { 208 printf("."); 209 fflush(stdout); 210 rte_delay_ms(CHECK_INTERVAL); 211 } 212 213 /* set the print_flag if all ports up or timeout */ 214 if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) { 215 print_flag = 1; 216 printf("done\n"); 217 } 218 } 219 } 220 221 static int 222 init_lcore_conf(void) 223 { 224 uint8_t nb_dev_ports = rte_eth_dev_count_avail(); 225 if (nb_dev_ports == 0) { 226 rte_exit(EXIT_FAILURE, "No probed ethernet devices\n"); 227 } 228 229 if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) { 230 rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n", 231 ff_global_cfg.dpdk.max_portid); 232 } 233 234 lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs; 235 lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id; 236 237 uint16_t proc_id; 238 for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) { 239 uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id]; 240 if (!lcore_config[lcore_id].detected) { 241 rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id); 242 } 243 } 244 245 uint16_t socket_id = 0; 246 if (numa_on) { 247 socket_id = rte_lcore_to_socket_id(rte_lcore_id()); 248 } 249 250 lcore_conf.socket_id = socket_id; 251 252 uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id]; 253 int j; 254 for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) { 255 uint16_t port_id = ff_global_cfg.dpdk.portid_list[j]; 256 struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id]; 257 258 int queueid = -1; 259 int i; 260 for (i = 0; i < pconf->nb_lcores; i++) { 261 if (pconf->lcore_list[i] == lcore_id) { 262 queueid = i; 263 } 264 } 265 if (queueid < 0) { 266 continue; 267 } 268 printf("lcore: %u, port: %u, queue: %u\n", lcore_id, port_id, queueid); 269 uint16_t nb_rx_queue = lcore_conf.nb_rx_queue; 270 lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id; 271 lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid; 272 lcore_conf.nb_rx_queue++; 273 274 lcore_conf.tx_queue_id[port_id] = queueid; 275 lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id; 276 lcore_conf.nb_tx_port++; 277 278 /* Enable pcap dump */ 279 if (ff_global_cfg.pcap.enable) { 280 ff_enable_pcap(ff_global_cfg.pcap.save_path, ff_global_cfg.pcap.snap_len); 281 } 282 283 lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores; 284 } 285 286 if (lcore_conf.nb_rx_queue == 0) { 287 rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id); 288 } 289 290 return 0; 291 } 292 293 static int 294 init_mem_pool(void) 295 { 296 uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports; 297 uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs; 298 uint32_t nb_tx_queue = nb_lcores; 299 uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores; 300 uint16_t max_portid = ff_global_cfg.dpdk.max_portid; 301 302 unsigned nb_mbuf = RTE_ALIGN_CEIL ( 303 (nb_rx_queue * (max_portid + 1) * 2 * RX_QUEUE_SIZE + 304 nb_ports * (max_portid + 1) * 2 * nb_lcores * MAX_PKT_BURST + 305 nb_ports * (max_portid + 1) * 2 * nb_tx_queue * TX_QUEUE_SIZE + 306 nb_lcores * MEMPOOL_CACHE_SIZE + 307 #ifdef FF_KNI 308 nb_ports * KNI_MBUF_MAX + 309 nb_ports * KNI_QUEUE_SIZE + 310 #endif 311 nb_lcores * nb_ports * DISPATCH_RING_SIZE), 
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
                socketid, i, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }

#ifdef FF_USE_PAGE_ARRAY
        nb_mbuf = RTE_ALIGN_CEIL(
            nb_ports * nb_lcores * MAX_PKT_BURST +
            nb_ports * nb_tx_queue * TX_QUEUE_SIZE +
            nb_lcores * MEMPOOL_CACHE_SIZE,
            (unsigned)4096);
        ff_init_ref_pool(nb_mbuf, socketid);
#endif
    }

    return 0;
}

static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL) {
        rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ring = rte_ring_create(name, count, socket_id, flags);
    } else {
        ring = rte_ring_lookup(name);
    }

    if (ring == NULL) {
        rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
    }

    return ring;
}

static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create rings according to the ports actually being used. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
}

static int
init_msg_ring(void)
{
    uint16_t i, j;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
            MSG_RING_SIZE * 2 * nb_procs,
            MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
            NULL, NULL, ff_msg_init, NULL,
            socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[0]);

        for (j = FF_SYSCTL; j < FF_MSG_NUM; j++) {
            snprintf(msg_ring[i].ring_name[j], RTE_RING_NAMESIZE,
                "%s%u_%u", FF_MSG_RING_OUT, i, j);
            msg_ring[i].ring[j] = create_ring(msg_ring[i].ring_name[j],
                MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
            if (msg_ring[i].ring[j] == NULL)
                rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[j]);
        }
    }

    return 0;
}

#ifdef FF_KNI

static enum FF_KNICTL_CMD
get_kni_action(const char *c)
{
    if (!c)
        return FF_KNICTL_ACTION_DEFAULT;
    if (0 == strcasecmp(c, "alltokni")) {
        return FF_KNICTL_ACTION_ALL_TO_KNI;
    } else if (0 == strcasecmp(c, "alltoff")) {
        return FF_KNICTL_ACTION_ALL_TO_FF;
    } else if (0 == strcasecmp(c, "default")) {
        return FF_KNICTL_ACTION_DEFAULT;
    } else {
        return FF_KNICTL_ACTION_DEFAULT;
    }
}

static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count_avail();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    knictl_action = get_kni_action(ff_global_cfg.kni.kni_action);

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i, ret;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
#endif

static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}

static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i, j;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id, u_port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[u_port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        for (j = 0; j <= pconf->nb_slaves; j++) {
            if (j < pconf->nb_slaves) {
                port_id = pconf->slave_portid_list[j];
                printf("To init %s's slave %d (port id %d)\n",
                    ff_global_cfg.dpdk.bond_cfgs->name,
                    j, port_id);
            } else {
                port_id = u_port_id;
            }

            struct rte_eth_dev_info dev_info;
            struct rte_eth_conf port_conf = {0};
            struct rte_eth_rxconf rxq_conf;
            struct rte_eth_txconf txq_conf;

            rte_eth_dev_info_get(port_id, &dev_info);

            if (nb_queues > dev_info.max_rx_queues) {
                rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_rx_queues);
            }

            if (nb_queues > dev_info.max_tx_queues) {
                rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_tx_queues);
            }

            struct ether_addr addr;
            rte_eth_macaddr_get(port_id, &addr);
            printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                (unsigned)port_id,
                addr.addr_bytes[0], addr.addr_bytes[1],
                addr.addr_bytes[2], addr.addr_bytes[3],
                addr.addr_bytes[4], addr.addr_bytes[5]);

            rte_memcpy(pconf->mac,
                addr.addr_bytes, ETHER_ADDR_LEN);

            /* Set RSS mode */
            uint64_t default_rss_hf = ETH_RSS_PROTO_MASK;
            port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
            port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf;
            if (dev_info.hash_key_size == 52) {
                port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_52bytes;
                port_conf.rx_adv_conf.rss_conf.rss_key_len = 52;
                use_rsskey_52bytes = 1;
            } else {
                port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
                port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;
            }
            port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads;
            if (port_conf.rx_adv_conf.rss_conf.rss_hf !=
                ETH_RSS_PROTO_MASK) {
                printf("Port %u modified RSS hash function based on hardware support,"
                    "requested:%#"PRIx64" configured:%#"PRIx64"\n",
                    port_id, default_rss_hf,
                    port_conf.rx_adv_conf.rss_conf.rss_hf);
            }

            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) {
                port_conf.txmode.offloads |=
                    DEV_TX_OFFLOAD_MBUF_FAST_FREE;
            }

            /* Set Rx VLAN stripping */
            if (ff_global_cfg.dpdk.vlan_strip) {
                if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                    port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
                }
            }

            /* Enable HW CRC stripping */
            port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_KEEP_CRC;

            /* FIXME: Enable TCP LRO? */
#if 0
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
                printf("LRO is supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
                pconf->hw_features.rx_lro = 1;
            }
#endif

            /* Set Rx checksum checking */
            if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
                printf("RX checksum offload supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
                pconf->hw_features.rx_csum = 1;
            }

            if (ff_global_cfg.dpdk.tx_csum_offoad_skip == 0) {
                if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
                    printf("TX ip checksum offload supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
                    pconf->hw_features.tx_csum_ip = 1;
                }

                if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
                    (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
                    printf("TX TCP&UDP checksum offload supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM;
                    pconf->hw_features.tx_csum_l4 = 1;
                }
            } else {
                printf("TX checksum offload is disabled\n");
            }

            if (ff_global_cfg.dpdk.tso) {
                if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                    printf("TSO is supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
                    pconf->hw_features.tx_tso = 1;
                }
            } else {
                printf("TSO is disabled\n");
            }

            if (dev_info.reta_size) {
                /* reta size must be power of 2 */
                assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

                rss_reta_size[port_id] = dev_info.reta_size;
                printf("port[%d]: rss table size: %d\n", port_id,
                    dev_info.reta_size);
            }

            if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
                continue;
            }

            int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
            if (ret != 0) {
                return ret;
            }

            static uint16_t nb_rxd = RX_QUEUE_SIZE;
            static uint16_t nb_txd = TX_QUEUE_SIZE;
            ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
            if (ret < 0)
                printf("Could not adjust number of descriptors "
                    "for port%u (%d)\n", (unsigned)port_id, ret);

            uint16_t q;
            for (q = 0; q < nb_queues; q++) {
                if (numa_on) {
                    uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                    socketid = rte_lcore_to_socket_id(lcore_id);
                }
                mbuf_pool = pktmbuf_pool[socketid];

                txq_conf = dev_info.default_txconf;
                txq_conf.offloads = port_conf.txmode.offloads;
                ret = rte_eth_tx_queue_setup(port_id, q, nb_txd,
                    socketid, &txq_conf);
                if (ret < 0) {
                    return ret;
                }

                rxq_conf = dev_info.default_rxconf;
                rxq_conf.offloads = port_conf.rxmode.offloads;
                ret = rte_eth_rx_queue_setup(port_id, q, nb_rxd,
                    socketid, &rxq_conf, mbuf_pool);
                if (ret < 0) {
                    return ret;
                }
            }

            if (strncmp(dev_info.driver_name, BOND_DRIVER_NAME,
                strlen(dev_info.driver_name)) == 0) {

                rte_eth_macaddr_get(port_id, &addr);
                printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                    " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                    (unsigned)port_id,
                    addr.addr_bytes[0], addr.addr_bytes[1],
                    addr.addr_bytes[2], addr.addr_bytes[3],
                    addr.addr_bytes[4], addr.addr_bytes[5]);

                rte_memcpy(pconf->mac,
                    addr.addr_bytes, ETHER_ADDR_LEN);

                int mode, count, x;
                uint16_t slaves[RTE_MAX_ETHPORTS], len = RTE_MAX_ETHPORTS;

                mode = rte_eth_bond_mode_get(port_id);
                printf("Port %u, bond mode:%d\n", port_id, mode);

                count = rte_eth_bond_slaves_get(port_id, slaves, len);
                printf("Port %u, %s's slave ports count:%d\n", port_id,
                    ff_global_cfg.dpdk.bond_cfgs->name, count);
                for (x = 0; x < count; x++) {
                    printf("Port %u, %s's slave port[%u]\n", port_id,
                        ff_global_cfg.dpdk.bond_cfgs->name, slaves[x]);
                }
            }

            ret = rte_eth_dev_start(port_id);
            if (ret < 0) {
                return ret;
            }

            if (nb_queues > 1) {
                /* set HW rss hash function to Toeplitz. */
                if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                    struct rte_eth_hash_filter_info info = {0};
                    info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                    info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                    if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                        RTE_ETH_FILTER_SET, &info) < 0) {
                        rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                            port_id);
                    }
                }

                set_rss_table(port_id, dev_info.reta_size, nb_queues);
            }

            /* Enable RX in promiscuous mode for the Ethernet device. */
            if (ff_global_cfg.dpdk.promiscuous) {
                rte_eth_promiscuous_enable(port_id);
                ret = rte_eth_promiscuous_get(port_id);
                if (ret == 1) {
                    printf("set port %u to promiscuous mode ok\n", port_id);
                } else {
                    printf("set port %u to promiscuous mode error\n", port_id);
                }
            }
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    idle_sleep = ff_global_cfg.dpdk.idle_sleep;
    pkt_tx_delay = ff_global_cfg.dpdk.pkt_tx_delay > BURST_TX_DRAIN_US ?
        BURST_TX_DRAIN_US : ff_global_cfg.dpdk.pkt_tx_delay;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

#ifdef FF_KNI
    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }
#endif

#ifdef FF_USE_PAGE_ARRAY
    ff_mmap_init();
#endif

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    void *data = rte_pktmbuf_mtod(pkt, void *);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    if (pkt->ol_flags & PKT_RX_VLAN_STRIPPED) {
        ff_mbuf_set_vlan_info(hdr, pkt->vlan_tci);
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void *);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct ether_hdr *hdr;
    const struct vlan_hdr *vlanhdr;
    hdr = (const struct ether_hdr *)data;
    uint16_t ether_type = rte_be_to_cpu_16(hdr->ether_type);
    data += ETHER_HDR_LEN;
    len -= ETHER_HDR_LEN;

    if (ether_type == ETHER_TYPE_VLAN) {
        vlanhdr = (const struct vlan_hdr *)data;
        ether_type = rte_be_to_cpu_16(vlanhdr->eth_proto);
        data += sizeof(struct vlan_hdr);
        len -= sizeof(struct vlan_hdr);
    }

    if (ether_type == ETHER_TYPE_ARP)
        return FILTER_ARP;

#ifdef INET6
    if (ether_type == ETHER_TYPE_IPv6) {
        return ff_kni_proto_filter(data,
            len, ether_type);
    }
#endif

#ifndef FF_KNI
    return FILTER_UNKNOWN;
#else
    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ether_type != ETHER_TYPE_IPv4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data,
        len, ether_type);
#endif
}

static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}

/* copied from rte_pktmbuf_clone */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of new indirect segment failed */
    if (unlikely(mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}

static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(ff_global_cfg.pcap.enable)) {
            if (!pkts_from_ring) {
                ff_dump_packets(ff_global_cfg.pcap.save_path, rtem,
                    ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void *);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring) {
            ff_traffic.rx_packets++;
            ff_traffic.rx_bytes += len;
        }

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues);
            if (ret == FF_DISPATCH_RESPONSE) {
                rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len;

                /*
                 * Outbound VLAN insertion is not offloaded yet,
                 * so re-insert the VLAN header by hand before sending.
                 */
                if (rtem->vlan_tci) {
                    data = rte_pktmbuf_prepend(rtem, sizeof(struct vlan_hdr));
                    if (data != NULL) {
                        memmove(data, data + sizeof(struct vlan_hdr), ETHER_HDR_LEN);
                        struct ether_hdr *etherhdr = (struct ether_hdr *)data;
                        struct vlan_hdr *vlanhdr = (struct vlan_hdr *)(data + ETHER_HDR_LEN);
                        vlanhdr->vlan_tci = rte_cpu_to_be_16(rtem->vlan_tci);
                        vlanhdr->eth_proto = etherhdr->ether_type;
                        etherhdr->ether_type = rte_cpu_to_be_16(ETHER_TYPE_VLAN);
                    }
                }
                send_single_packet(rtem, port_id);
                continue;
            }

            if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
#ifdef INET6
        if (filter == FILTER_ARP || filter == FILTER_NDP) {
#else
        if (filter == FILTER_ARP) {
#endif
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni) {
            if (knictl_action == FF_KNICTL_ACTION_ALL_TO_KNI) {
                ff_kni_enqueue(port_id, rtem);
            } else if (knictl_action == FF_KNICTL_ACTION_ALL_TO_FF) {
                ff_veth_input(ctx, rtem);
            } else if (knictl_action == FF_KNICTL_ACTION_DEFAULT) {
                if (enable_kni &&
                    ((filter == FILTER_KNI && kni_accept) ||
                    (filter == FILTER_UNKNOWN && !kni_accept))) {
                    ff_kni_enqueue(port_id, rtem);
                } else {
                    ff_veth_input(ctx, rtem);
                }
            } else {
                ff_veth_input(ctx, rtem);
            }
#endif
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* read packets from the dispatch ring and process them */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
#ifdef INET6
    if (msg->msg_type == FF_IOCTL6) {
        fd = ff_socket(AF_INET6, SOCK_DGRAM, 0);
    } else
#endif
        fd = ff_socket(AF_INET, SOCK_DGRAM, 0);

    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_top_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
    case FF_IPFW_GET:
        ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
            msg->ipfw.optname, msg->ipfw.optval,
            msg->ipfw.optlen);
        break;
    case FF_IPFW_SET:
        ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
            msg->ipfw.optname, msg->ipfw.optval,
            *(msg->ipfw.optlen));
        break;
    default:
        ret = -1;
        errno = ENOTSUP;
        break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_traffic_msg(struct ff_msg *msg)
{
    msg->traffic = ff_traffic;
    msg->result = 0;
}

#ifdef FF_KNI
static inline void
handle_knictl_msg(struct ff_msg *msg)
{
    if (msg->knictl.kni_cmd == FF_KNICTL_CMD_SET) {
        switch (msg->knictl.kni_action) {
        case FF_KNICTL_ACTION_ALL_TO_FF:
            knictl_action = FF_KNICTL_ACTION_ALL_TO_FF;
            msg->result = 0;
            printf("new kni action: alltoff\n");
            break;
        case FF_KNICTL_ACTION_ALL_TO_KNI:
            knictl_action = FF_KNICTL_ACTION_ALL_TO_KNI;
            msg->result = 0;
            printf("new kni action: alltokni\n");
            break;
        case FF_KNICTL_ACTION_DEFAULT:
            knictl_action = FF_KNICTL_ACTION_DEFAULT;
            msg->result = 0;
            printf("new kni action: default\n");
            break;
        default:
            msg->result = -1;
        }
    } else if (msg->knictl.kni_cmd == FF_KNICTL_CMD_GET) {
        msg->knictl.kni_action = knictl_action;
        msg->result = 0;
    } else {
        msg->result = -2;
    }
}
#endif

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
    case FF_SYSCTL:
        handle_sysctl_msg(msg);
        break;
    case FF_IOCTL:
#ifdef INET6
    case FF_IOCTL6:
#endif
        handle_ioctl_msg(msg);
        break;
    case FF_ROUTE:
        handle_route_msg(msg);
        break;
    case FF_TOP:
        handle_top_msg(msg);
        break;
#ifdef FF_NETGRAPH
    case FF_NGCTL:
        handle_ngctl_msg(msg);
        break;
#endif
#ifdef FF_IPFW
    case FF_IPFW_CTL:
        handle_ipfw_msg(msg);
        break;
#endif
    case FF_TRAFFIC:
        handle_traffic_msg(msg);
        break;
#ifdef FF_KNI
    case FF_KNICTL:
        handle_knictl_msg(msg);
        break;
#endif
    default:
        handle_default_msg(msg);
        break;
    }
    rte_ring_enqueue(msg_ring[proc_id].ring[msg->msg_type], msg);
}

static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(ff_global_cfg.pcap.enable)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(ff_global_cfg.pcap.save_path, m_table[i],
                ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
        }
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    ff_traffic.tx_packets += ret;
    uint16_t i;
    for (i = 0; i < ret; i++) {
        ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]);
#ifdef FF_USE_PAGE_ARRAY
        if (qconf->tx_mbufs[port].bsd_m_table[i])
            ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs);
#endif
    }
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
#ifdef FF_USE_PAGE_ARRAY
            if (qconf->tx_mbufs[port].bsd_m_table[ret])
                ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]);
#endif
        } while (++ret < n);
    }
    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}

int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
#ifdef FF_USE_PAGE_ARRAY
    struct lcore_conf *qconf = &lcore_conf;
    int len = 0;

    len = ff_if_send_onepkt(ctx, m, total);
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, ctx->port_id);
        len = 0;
    }
    qconf->tx_mbufs[ctx->port_id].len = len;
    return 0;
#endif
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        head->nb_segs++;

        prev = cur;
        void *data = rte_pktmbuf_mtod(cur, void *);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        cur->data_len = len;
        off += len;
        total -= len;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void *);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         * TCP segmentation offload.
         *
         * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *   implies PKT_TX_TCP_CKSUM)
         * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         * - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *   write the IP checksum to 0 in the packet
         * - fill the mbuf offload information: l2_len,
         *   l3_len, l4_len, tso_segsz
         * - calculate the pseudo header checksum without taking ip_len
         *   in account, and set it in the TCP header. Refer to
         *   rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *   used as helpers.
         */
        if (offload.tso_seg_size) {
            struct tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc;
    int i, j, nb_rx, idle;
    uint16_t port_id, queue_id;
    struct lcore_conf *qconf;
    uint64_t drain_tsc = 0;
    struct ff_dpdk_if_context *ctx;

    if (pkt_tx_delay) {
        drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * pkt_tx_delay;
    }

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc >= drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packet from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }
#endif

            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                    pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                    j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc >= drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        idle_sleep_tsc = rte_rdtsc();
        if (likely(idle && idle_sleep)) {
            usleep(idle_sleep);
            end_tsc = rte_rdtsc();
        } else {
            end_tsc = idle_sleep_tsc;
        }

        if (usch_tsc == cur_tsc) {
            usr_tsc = idle_sleep_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_top_status.sys_tsc += sys_tsc;
        }

        ff_top_status.usr_tsc += usr_tsc;
        ff_top_status.work_tsc += end_tsc - cur_tsc;
        ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_top_status.loops++;
    }

    return 0;
}

int
ff_dpdk_if_up(void) {
    int i;
    struct lcore_conf *qconf = &lcore_conf;
    for (i = 0; i < qconf->nb_tx_port; i++) {
        uint16_t port_id = qconf->tx_port_id[i];

        struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
        veth_ctx[port_id] = ff_veth_attach(pconf);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg) {
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1<<(7-b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i+4] & (1<<(7-b))))
                v |= 1;
        }
    }
    return (hash);
}

int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;
    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];

    if (nb_queues <= 1) {
        return 1;
    }

    uint16_t reta_size = rss_reta_size[ctx->port_id];
    uint16_t queueid = qconf->tx_queue_id[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = 0;
    if (!use_rsskey_52bytes)
        hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
            default_rsskey_40bytes, datalen, data);
    else
        hash = toeplitz_hash(sizeof(default_rsskey_52bytes),
            default_rsskey_52bytes, datalen, data);
    return ((hash & (reta_size - 1)) % nb_queues) == queueid;
}

void
ff_regist_packet_dispatcher(dispatch_func_t func)
{
    packet_dispatcher = func;
}

uint64_t
ff_get_tsc_ns()
{
    uint64_t cur_tsc = rte_rdtsc();
    uint64_t hz = rte_get_tsc_hz();
    return ((double)cur_tsc / (double)hz) * NS_PER_S;
}