1 /* 2 * Copyright (C) 2017 THL A29 Limited, a Tencent company. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * 8 * 1. Redistributions of source code must retain the above copyright notice, this 9 * list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright notice, 11 * this list of conditions and the following disclaimer in the documentation 12 * and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 * 25 */ 26 #include <assert.h> 27 #include <unistd.h> 28 #include <sys/mman.h> 29 #include <errno.h> 30 31 #include <rte_common.h> 32 #include <rte_byteorder.h> 33 #include <rte_log.h> 34 #include <rte_memory.h> 35 #include <rte_memcpy.h> 36 #include <rte_memzone.h> 37 #include <rte_config.h> 38 #include <rte_eal.h> 39 #include <rte_pci.h> 40 #include <rte_mbuf.h> 41 #include <rte_memory.h> 42 #include <rte_lcore.h> 43 #include <rte_launch.h> 44 #include <rte_ethdev.h> 45 #include <rte_debug.h> 46 #include <rte_common.h> 47 #include <rte_ether.h> 48 #include <rte_malloc.h> 49 #include <rte_cycles.h> 50 #include <rte_timer.h> 51 #include <rte_thash.h> 52 #include <rte_ip.h> 53 #include <rte_tcp.h> 54 #include <rte_udp.h> 55 #include <rte_eth_bond.h> 56 57 #include "ff_dpdk_if.h" 58 #include "ff_dpdk_pcap.h" 59 #include "ff_dpdk_kni.h" 60 #include "ff_config.h" 61 #include "ff_veth.h" 62 #include "ff_host_interface.h" 63 #include "ff_msg.h" 64 #include "ff_api.h" 65 #include "ff_memory.h" 66 67 #ifdef FF_KNI 68 #define KNI_MBUF_MAX 2048 69 #define KNI_QUEUE_SIZE 2048 70 71 int enable_kni; 72 static int kni_accept; 73 static int knictl_action = FF_KNICTL_ACTION_DEFAULT; 74 #endif 75 76 static int numa_on; 77 78 static unsigned idle_sleep; 79 static unsigned pkt_tx_delay; 80 81 static struct rte_timer freebsd_clock; 82 83 // Mellanox Linux's driver key 84 static uint8_t default_rsskey_40bytes[40] = { 85 0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b, 86 0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb, 87 0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c, 88 0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9, 89 0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc 90 }; 91 92 static uint8_t default_rsskey_52bytes[52] = { 93 0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23, 94 0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30, 95 0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02, 96 0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c, 97 0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55, 98 0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e, 99 0x81, 0x15, 0x03, 0x66 100 }; 101 102 static uint8_t symmetric_rsskey[52] = { 103 0x6d, 0x5a, 
0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 104 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 105 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 106 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 107 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 108 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 109 0x6d, 0x5a, 0x6d, 0x5a 110 }; 111 112 static int rsskey_len = sizeof(default_rsskey_40bytes); 113 static uint8_t *rsskey = default_rsskey_40bytes; 114 115 struct lcore_conf lcore_conf; 116 117 struct rte_mempool *pktmbuf_pool[NB_SOCKETS]; 118 119 static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS]; 120 static dispatch_func_t packet_dispatcher; 121 122 static uint16_t rss_reta_size[RTE_MAX_ETHPORTS]; 123 124 #define BOND_DRIVER_NAME "net_bonding" 125 126 static inline int send_single_packet(struct rte_mbuf *m, uint8_t port); 127 128 struct ff_msg_ring { 129 char ring_name[FF_MSG_NUM][RTE_RING_NAMESIZE]; 130 /* ring[0] for lcore recv msg, other send */ 131 /* ring[1] for lcore send msg, other read */ 132 struct rte_ring *ring[FF_MSG_NUM]; 133 } __rte_cache_aligned; 134 135 static struct ff_msg_ring msg_ring[RTE_MAX_LCORE]; 136 static struct rte_mempool *message_pool; 137 static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS]; 138 139 static struct ff_top_args ff_top_status; 140 static struct ff_traffic_args ff_traffic; 141 extern void ff_hardclock(void); 142 143 static void 144 ff_hardclock_job(__rte_unused struct rte_timer *timer, 145 __rte_unused void *arg) { 146 ff_hardclock(); 147 ff_update_current_ts(); 148 } 149 150 struct ff_dpdk_if_context * 151 ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg) 152 { 153 struct ff_dpdk_if_context *ctx; 154 155 ctx = calloc(1, sizeof(struct ff_dpdk_if_context)); 156 if (ctx == NULL) 157 return NULL; 158 159 ctx->sc = sc; 160 ctx->ifp = ifp; 161 ctx->port_id = cfg->port_id; 162 ctx->hw_features = cfg->hw_features; 163 164 return ctx; 165 } 166 167 void 168 ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx) 169 { 170 free(ctx); 171 } 172 173 static void 174 check_all_ports_link_status(void) 175 { 176 #define CHECK_INTERVAL 100 /* 100ms */ 177 #define MAX_CHECK_TIME 90 /* 9s (90 * 100ms) in total */ 178 179 uint16_t portid; 180 uint8_t count, all_ports_up, print_flag = 0; 181 struct rte_eth_link link; 182 183 printf("\nChecking link status"); 184 fflush(stdout); 185 186 int i, nb_ports; 187 nb_ports = ff_global_cfg.dpdk.nb_ports; 188 for (count = 0; count <= MAX_CHECK_TIME; count++) { 189 all_ports_up = 1; 190 for (i = 0; i < nb_ports; i++) { 191 uint16_t portid = ff_global_cfg.dpdk.portid_list[i]; 192 memset(&link, 0, sizeof(link)); 193 rte_eth_link_get_nowait(portid, &link); 194 195 /* print link status if flag set */ 196 if (print_flag == 1) { 197 if (link.link_status) { 198 printf("Port %d Link Up - speed %u " 199 "Mbps - %s\n", (int)portid, 200 (unsigned)link.link_speed, 201 (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? 
202 ("full-duplex") : ("half-duplex\n")); 203 } else { 204 printf("Port %d Link Down\n", (int)portid); 205 } 206 continue; 207 } 208 /* clear all_ports_up flag if any link down */ 209 if (link.link_status == 0) { 210 all_ports_up = 0; 211 break; 212 } 213 } 214 215 /* after finally printing all link status, get out */ 216 if (print_flag == 1) 217 break; 218 219 if (all_ports_up == 0) { 220 printf("."); 221 fflush(stdout); 222 rte_delay_ms(CHECK_INTERVAL); 223 } 224 225 /* set the print_flag if all ports up or timeout */ 226 if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) { 227 print_flag = 1; 228 printf("done\n"); 229 } 230 } 231 } 232 233 static int 234 init_lcore_conf(void) 235 { 236 uint8_t nb_dev_ports = rte_eth_dev_count_avail(); 237 if (nb_dev_ports == 0) { 238 rte_exit(EXIT_FAILURE, "No probed ethernet devices\n"); 239 } 240 241 if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) { 242 rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n", 243 ff_global_cfg.dpdk.max_portid); 244 } 245 246 lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs; 247 lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id; 248 249 uint16_t socket_id = 0; 250 if (numa_on) { 251 socket_id = rte_lcore_to_socket_id(rte_lcore_id()); 252 } 253 254 lcore_conf.socket_id = socket_id; 255 256 uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id]; 257 if (!rte_lcore_is_enabled(lcore_id)) { 258 rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id); 259 } 260 261 int j; 262 for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) { 263 uint16_t port_id = ff_global_cfg.dpdk.portid_list[j]; 264 struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id]; 265 266 int queueid = -1; 267 int i; 268 for (i = 0; i < pconf->nb_lcores; i++) { 269 if (pconf->lcore_list[i] == lcore_id) { 270 queueid = i; 271 } 272 } 273 if (queueid < 0) { 274 continue; 275 } 276 printf("lcore: %u, port: %u, queue: %u\n", lcore_id, port_id, queueid); 277 uint16_t nb_rx_queue = lcore_conf.nb_rx_queue; 278 lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id; 279 lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid; 280 lcore_conf.nb_rx_queue++; 281 282 lcore_conf.tx_queue_id[port_id] = queueid; 283 lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id; 284 lcore_conf.nb_tx_port++; 285 286 /* Enable pcap dump */ 287 if (ff_global_cfg.pcap.enable) { 288 ff_enable_pcap(ff_global_cfg.pcap.save_path, ff_global_cfg.pcap.snap_len); 289 } 290 291 lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores; 292 } 293 294 if (lcore_conf.nb_rx_queue == 0) { 295 rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id); 296 } 297 298 return 0; 299 } 300 301 static int 302 init_mem_pool(void) 303 { 304 uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports; 305 uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs; 306 uint32_t nb_tx_queue = nb_lcores; 307 uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores; 308 uint16_t max_portid = ff_global_cfg.dpdk.max_portid; 309 310 unsigned nb_mbuf = RTE_ALIGN_CEIL ( 311 (nb_rx_queue * (max_portid + 1) * 2 * RX_QUEUE_SIZE + 312 nb_ports * (max_portid + 1) * 2 * nb_lcores * MAX_PKT_BURST + 313 nb_ports * (max_portid + 1) * 2 * nb_tx_queue * TX_QUEUE_SIZE + 314 nb_lcores * MEMPOOL_CACHE_SIZE + 315 #ifdef FF_KNI 316 nb_ports * KNI_MBUF_MAX + 317 nb_ports * KNI_QUEUE_SIZE + 318 #endif 319 nb_lcores * nb_ports * DISPATCH_RING_SIZE), 320 (unsigned)8192); 321 322 unsigned socketid = 0; 323 uint16_t i, lcore_id; 324 char s[64]; 325 326 for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) { 327 lcore_id 
= ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }

#ifdef FF_USE_PAGE_ARRAY
        nb_mbuf = RTE_ALIGN_CEIL(
            nb_ports * nb_lcores * MAX_PKT_BURST +
            nb_ports * nb_tx_queue * TX_QUEUE_SIZE +
            nb_lcores * MEMPOOL_CACHE_SIZE,
            (unsigned)4096);
        ff_init_ref_pool(nb_mbuf, socketid);
#endif
    }

    return 0;
}

static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL) {
        rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ring = rte_ring_create(name, count, socket_id, flags);
    } else {
        ring = rte_ring_lookup(name);
    }

    if (ring == NULL) {
        rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
    }

    return ring;
}

static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create rings according to the ports actually being used.
*/ 403 int nb_ports = ff_global_cfg.dpdk.nb_ports; 404 for (j = 0; j < nb_ports; j++) { 405 uint16_t portid = ff_global_cfg.dpdk.portid_list[j]; 406 struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid]; 407 int nb_queues = pconf->nb_lcores; 408 if (dispatch_ring[portid] == NULL) { 409 snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid); 410 411 dispatch_ring[portid] = rte_zmalloc(name_buf, 412 sizeof(struct rte_ring *) * nb_queues, 413 RTE_CACHE_LINE_SIZE); 414 if (dispatch_ring[portid] == NULL) { 415 rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) " 416 "failed\n", name_buf); 417 } 418 } 419 420 for(queueid = 0; queueid < nb_queues; ++queueid) { 421 snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d", 422 portid, queueid); 423 dispatch_ring[portid][queueid] = create_ring(name_buf, 424 DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ); 425 426 if (dispatch_ring[portid][queueid] == NULL) 427 rte_panic("create ring:%s failed!\n", name_buf); 428 429 printf("create ring:%s success, %u ring entries are now free!\n", 430 name_buf, rte_ring_free_count(dispatch_ring[portid][queueid])); 431 } 432 } 433 434 return 0; 435 } 436 437 static void 438 ff_msg_init(struct rte_mempool *mp, 439 __attribute__((unused)) void *opaque_arg, 440 void *obj, __attribute__((unused)) unsigned i) 441 { 442 struct ff_msg *msg = (struct ff_msg *)obj; 443 msg->msg_type = FF_UNKNOWN; 444 msg->buf_addr = (char *)msg + sizeof(struct ff_msg); 445 msg->buf_len = mp->elt_size - sizeof(struct ff_msg); 446 } 447 448 static int 449 init_msg_ring(void) 450 { 451 uint16_t i, j; 452 uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs; 453 unsigned socketid = lcore_conf.socket_id; 454 455 /* Create message buffer pool */ 456 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 457 message_pool = rte_mempool_create(FF_MSG_POOL, 458 MSG_RING_SIZE * 2 * nb_procs, 459 MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0, 460 NULL, NULL, ff_msg_init, NULL, 461 socketid, 0); 462 } else { 463 message_pool = rte_mempool_lookup(FF_MSG_POOL); 464 } 465 466 if (message_pool == NULL) { 467 rte_panic("Create msg mempool failed\n"); 468 } 469 470 for(i = 0; i < nb_procs; ++i) { 471 snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE, 472 "%s%u", FF_MSG_RING_IN, i); 473 msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0], 474 MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ); 475 if (msg_ring[i].ring[0] == NULL) 476 rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[0]); 477 478 for (j = FF_SYSCTL; j < FF_MSG_NUM; j++) { 479 snprintf(msg_ring[i].ring_name[j], RTE_RING_NAMESIZE, 480 "%s%u_%u", FF_MSG_RING_OUT, i, j); 481 msg_ring[i].ring[j] = create_ring(msg_ring[i].ring_name[j], 482 MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ); 483 if (msg_ring[i].ring[j] == NULL) 484 rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[j]); 485 } 486 } 487 488 return 0; 489 } 490 491 #ifdef FF_KNI 492 493 static enum FF_KNICTL_CMD get_kni_action(const char *c){ 494 if (!c) 495 return FF_KNICTL_ACTION_DEFAULT; 496 if (0 == strcasecmp(c, "alltokni")){ 497 return FF_KNICTL_ACTION_ALL_TO_KNI; 498 } else if (0 == strcasecmp(c, "alltoff")){ 499 return FF_KNICTL_ACTION_ALL_TO_FF; 500 } else if (0 == strcasecmp(c, "default")){ 501 return FF_KNICTL_ACTION_DEFAULT; 502 } else { 503 return FF_KNICTL_ACTION_DEFAULT; 504 } 505 } 506 507 static int 508 init_kni(void) 509 { 510 int nb_ports = rte_eth_dev_count_avail(); 511 kni_accept = 0; 512 if(strcasecmp(ff_global_cfg.kni.method, "accept") == 0) 513 kni_accept = 1; 
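    /*
     * kni_accept mirrors the configured kni "method": when set ("accept"),
     * only packets classified as FILTER_KNI (matched against the configured
     * KNI tcp/udp port lists) are handed to the kernel; when clear
     * ("reject"), traffic f-stack does not recognise (FILTER_UNKNOWN) is
     * handed to the kernel instead. See the FF_KNICTL_ACTION_DEFAULT branch
     * in process_packets().
     */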
514 515 knictl_action = get_kni_action(ff_global_cfg.kni.kni_action); 516 517 ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port, 518 ff_global_cfg.kni.udp_port); 519 520 unsigned socket_id = lcore_conf.socket_id; 521 struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id]; 522 523 nb_ports = ff_global_cfg.dpdk.nb_ports; 524 int i, ret; 525 for (i = 0; i < nb_ports; i++) { 526 uint16_t port_id = ff_global_cfg.dpdk.portid_list[i]; 527 ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE); 528 } 529 530 return 0; 531 } 532 #endif 533 534 static void 535 set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues) 536 { 537 if (reta_size == 0) { 538 return; 539 } 540 541 int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE); 542 struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size]; 543 544 /* config HW indirection table */ 545 unsigned i, j, hash=0; 546 for (i = 0; i < reta_conf_size; i++) { 547 reta_conf[i].mask = ~0ULL; 548 for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) { 549 reta_conf[i].reta[j] = hash++ % nb_queues; 550 } 551 } 552 553 if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) { 554 rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n", 555 port_id); 556 } 557 } 558 559 static int 560 init_port_start(void) 561 { 562 int nb_ports = ff_global_cfg.dpdk.nb_ports; 563 unsigned socketid = 0; 564 struct rte_mempool *mbuf_pool; 565 uint16_t i, j; 566 567 for (i = 0; i < nb_ports; i++) { 568 uint16_t port_id, u_port_id = ff_global_cfg.dpdk.portid_list[i]; 569 struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[u_port_id]; 570 uint16_t nb_queues = pconf->nb_lcores; 571 572 for (j=0; j<=pconf->nb_slaves; j++) { 573 if (j < pconf->nb_slaves) { 574 port_id = pconf->slave_portid_list[j]; 575 printf("To init %s's %d'st slave port[%d]\n", 576 ff_global_cfg.dpdk.bond_cfgs->name, 577 j, port_id); 578 } else { 579 port_id = u_port_id; 580 } 581 582 struct rte_eth_dev_info dev_info; 583 struct rte_eth_conf port_conf = {0}; 584 struct rte_eth_rxconf rxq_conf; 585 struct rte_eth_txconf txq_conf; 586 587 int ret = rte_eth_dev_info_get(port_id, &dev_info); 588 if (ret != 0) 589 rte_exit(EXIT_FAILURE, 590 "Error during getting device (port %u) info: %s\n", 591 port_id, strerror(-ret)); 592 593 if (nb_queues > dev_info.max_rx_queues) { 594 rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n", 595 nb_queues, 596 dev_info.max_rx_queues); 597 } 598 599 if (nb_queues > dev_info.max_tx_queues) { 600 rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n", 601 nb_queues, 602 dev_info.max_tx_queues); 603 } 604 605 struct rte_ether_addr addr; 606 rte_eth_macaddr_get(port_id, &addr); 607 printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8 608 " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n", 609 (unsigned)port_id, 610 addr.addr_bytes[0], addr.addr_bytes[1], 611 addr.addr_bytes[2], addr.addr_bytes[3], 612 addr.addr_bytes[4], addr.addr_bytes[5]); 613 614 rte_memcpy(pconf->mac, 615 addr.addr_bytes, RTE_ETHER_ADDR_LEN); 616 617 /* Set RSS mode */ 618 uint64_t default_rss_hf = ETH_RSS_PROTO_MASK; 619 port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS; 620 port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf; 621 if (dev_info.hash_key_size == 52) { 622 rsskey = default_rsskey_52bytes; 623 rsskey_len = 52; 624 } 625 if (ff_global_cfg.dpdk.symmetric_rss) { 626 printf("Use symmetric Receive-side Scaling(RSS) key\n"); 627 rsskey = symmetric_rsskey; 628 } 629 port_conf.rx_adv_conf.rss_conf.rss_key = rsskey; 630 
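            /*
             * RSS key selection (see the assignments above): the 40-byte
             * default key is used unless the device reports a 52-byte hash
             * key, and the repeating 0x6d5a key is substituted when
             * symmetric_rss is enabled so that both directions of a flow
             * hash to the same queue.
             */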
            port_conf.rx_adv_conf.rss_conf.rss_key_len = rsskey_len;
            port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads;
            if (port_conf.rx_adv_conf.rss_conf.rss_hf !=
                    ETH_RSS_PROTO_MASK) {
                printf("Port %u modified RSS hash function based on hardware support, "
                    "requested:%#"PRIx64" configured:%#"PRIx64"\n",
                    port_id, default_rss_hf,
                    port_conf.rx_adv_conf.rss_conf.rss_hf);
            }

            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) {
                port_conf.txmode.offloads |=
                    DEV_TX_OFFLOAD_MBUF_FAST_FREE;
            }

            /* Set Rx VLAN stripping */
            if (ff_global_cfg.dpdk.vlan_strip) {
                if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                    port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
                }
            }

            /* Enable HW CRC stripping */
            port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_KEEP_CRC;

            /* FIXME: Enable TCP LRO? */
#if 0
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
                printf("LRO is supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
                pconf->hw_features.rx_lro = 1;
            }
#endif

            /* Set Rx checksum checking */
            if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
                printf("RX checksum offload supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
                pconf->hw_features.rx_csum = 1;
            }

            /* Note: the config field keeps its original spelling from ff_config. */
            if (ff_global_cfg.dpdk.tx_csum_offoad_skip == 0) {
                if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
                    printf("TX ip checksum offload supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
                    pconf->hw_features.tx_csum_ip = 1;
                }

                if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
                    (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
                    printf("TX TCP&UDP checksum offload supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM;
                    pconf->hw_features.tx_csum_l4 = 1;
                }
            } else {
                printf("TX checksum offload is disabled\n");
            }

            if (ff_global_cfg.dpdk.tso) {
                if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                    printf("TSO is supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
                    pconf->hw_features.tx_tso = 1;
                }
            } else {
                printf("TSO is disabled\n");
            }

            if (dev_info.reta_size) {
                /* reta size must be power of 2 */
                assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

                rss_reta_size[port_id] = dev_info.reta_size;
                printf("port[%d]: rss table size: %d\n", port_id,
                    dev_info.reta_size);
            }

            if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
                continue;
            }

            ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
            if (ret != 0) {
                return ret;
            }

            static uint16_t nb_rxd = RX_QUEUE_SIZE;
            static uint16_t nb_txd = TX_QUEUE_SIZE;
            ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
            if (ret < 0)
                printf("Could not adjust number of descriptors "
                    "for port %u (%d)\n", (unsigned)port_id, ret);

            uint16_t q;
            for (q = 0; q < nb_queues; q++) {
                if (numa_on) {
                    uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                    socketid = rte_lcore_to_socket_id(lcore_id);
                }
                mbuf_pool = pktmbuf_pool[socketid];

                txq_conf = dev_info.default_txconf;
                txq_conf.offloads = port_conf.txmode.offloads;
                ret =
rte_eth_tx_queue_setup(port_id, q, nb_txd, 736 socketid, &txq_conf); 737 if (ret < 0) { 738 return ret; 739 } 740 741 rxq_conf = dev_info.default_rxconf; 742 rxq_conf.offloads = port_conf.rxmode.offloads; 743 ret = rte_eth_rx_queue_setup(port_id, q, nb_rxd, 744 socketid, &rxq_conf, mbuf_pool); 745 if (ret < 0) { 746 return ret; 747 } 748 } 749 750 751 if (strncmp(dev_info.driver_name, BOND_DRIVER_NAME, 752 strlen(dev_info.driver_name)) == 0) { 753 754 rte_eth_macaddr_get(port_id, &addr); 755 printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8 756 " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n", 757 (unsigned)port_id, 758 addr.addr_bytes[0], addr.addr_bytes[1], 759 addr.addr_bytes[2], addr.addr_bytes[3], 760 addr.addr_bytes[4], addr.addr_bytes[5]); 761 762 rte_memcpy(pconf->mac, 763 addr.addr_bytes, RTE_ETHER_ADDR_LEN); 764 765 int mode, count, x; 766 uint16_t slaves[RTE_MAX_ETHPORTS], len = RTE_MAX_ETHPORTS; 767 768 mode = rte_eth_bond_mode_get(port_id); 769 printf("Port %u, bond mode:%d\n", port_id, mode); 770 771 count = rte_eth_bond_slaves_get(port_id, slaves, len); 772 printf("Port %u, %s's slave ports count:%d\n", port_id, 773 ff_global_cfg.dpdk.bond_cfgs->name, count); 774 for (x=0; x<count; x++) { 775 printf("Port %u, %s's slave port[%u]\n", port_id, 776 ff_global_cfg.dpdk.bond_cfgs->name, slaves[x]); 777 } 778 } 779 780 ret = rte_eth_dev_start(port_id); 781 if (ret < 0) { 782 return ret; 783 } 784 785 if (nb_queues > 1) { 786 /* set HW rss hash function to Toeplitz. */ 787 if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) { 788 struct rte_eth_hash_filter_info info = {0}; 789 info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG; 790 info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ; 791 792 if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH, 793 RTE_ETH_FILTER_SET, &info) < 0) { 794 rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n", 795 port_id); 796 } 797 } 798 799 set_rss_table(port_id, dev_info.reta_size, nb_queues); 800 } 801 802 /* Enable RX in promiscuous mode for the Ethernet device. 
*/ 803 if (ff_global_cfg.dpdk.promiscuous) { 804 ret = rte_eth_promiscuous_enable(port_id); 805 if (ret == 0) { 806 printf("set port %u to promiscuous mode ok\n", port_id); 807 } else { 808 printf("set port %u to promiscuous mode error\n", port_id); 809 } 810 } 811 } 812 } 813 814 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 815 check_all_ports_link_status(); 816 } 817 818 return 0; 819 } 820 821 static int 822 init_clock(void) 823 { 824 rte_timer_subsystem_init(); 825 uint64_t hz = rte_get_timer_hz(); 826 uint64_t intrs = MS_PER_S/ff_global_cfg.freebsd.hz; 827 uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S*intrs; 828 829 rte_timer_init(&freebsd_clock); 830 rte_timer_reset(&freebsd_clock, tsc, PERIODICAL, 831 rte_lcore_id(), &ff_hardclock_job, NULL); 832 833 ff_update_current_ts(); 834 835 return 0; 836 } 837 838 int 839 ff_dpdk_init(int argc, char **argv) 840 { 841 if (ff_global_cfg.dpdk.nb_procs < 1 || 842 ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE || 843 ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs || 844 ff_global_cfg.dpdk.proc_id < 0) { 845 printf("param num_procs[%d] or proc_id[%d] error!\n", 846 ff_global_cfg.dpdk.nb_procs, 847 ff_global_cfg.dpdk.proc_id); 848 exit(1); 849 } 850 851 int ret = rte_eal_init(argc, argv); 852 if (ret < 0) { 853 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 854 } 855 856 numa_on = ff_global_cfg.dpdk.numa_on; 857 858 idle_sleep = ff_global_cfg.dpdk.idle_sleep; 859 pkt_tx_delay = ff_global_cfg.dpdk.pkt_tx_delay > BURST_TX_DRAIN_US ? \ 860 BURST_TX_DRAIN_US : ff_global_cfg.dpdk.pkt_tx_delay; 861 862 init_lcore_conf(); 863 864 init_mem_pool(); 865 866 init_dispatch_ring(); 867 868 init_msg_ring(); 869 870 #ifdef FF_KNI 871 enable_kni = ff_global_cfg.kni.enable; 872 if (enable_kni) { 873 init_kni(); 874 } 875 #endif 876 877 #ifdef FF_USE_PAGE_ARRAY 878 ff_mmap_init(); 879 #endif 880 881 ret = init_port_start(); 882 if (ret < 0) { 883 rte_exit(EXIT_FAILURE, "init_port_start failed\n"); 884 } 885 886 init_clock(); 887 888 return 0; 889 } 890 891 static void 892 ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt) 893 { 894 uint8_t rx_csum = ctx->hw_features.rx_csum; 895 if (rx_csum) { 896 if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) { 897 rte_pktmbuf_free(pkt); 898 return; 899 } 900 } 901 902 void *data = rte_pktmbuf_mtod(pkt, void*); 903 uint16_t len = rte_pktmbuf_data_len(pkt); 904 905 void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum); 906 if (hdr == NULL) { 907 rte_pktmbuf_free(pkt); 908 return; 909 } 910 911 if (pkt->ol_flags & PKT_RX_VLAN_STRIPPED) { 912 ff_mbuf_set_vlan_info(hdr, pkt->vlan_tci); 913 } 914 915 struct rte_mbuf *pn = pkt->next; 916 void *prev = hdr; 917 while(pn != NULL) { 918 data = rte_pktmbuf_mtod(pn, void*); 919 len = rte_pktmbuf_data_len(pn); 920 921 void *mb = ff_mbuf_get(prev, data, len); 922 if (mb == NULL) { 923 ff_mbuf_free(hdr); 924 rte_pktmbuf_free(pkt); 925 return; 926 } 927 pn = pn->next; 928 prev = mb; 929 } 930 931 ff_veth_process_packet(ctx->ifp, hdr); 932 } 933 934 static enum FilterReturn 935 protocol_filter(const void *data, uint16_t len) 936 { 937 if(len < RTE_ETHER_ADDR_LEN) 938 return FILTER_UNKNOWN; 939 940 const struct rte_ether_hdr *hdr; 941 const struct rte_vlan_hdr *vlanhdr; 942 hdr = (const struct rte_ether_hdr *)data; 943 uint16_t ether_type = rte_be_to_cpu_16(hdr->ether_type); 944 data += RTE_ETHER_HDR_LEN; 945 len -= RTE_ETHER_HDR_LEN; 946 947 if (ether_type == RTE_ETHER_TYPE_VLAN) { 948 vlanhdr = (struct rte_vlan_hdr 
*)data; 949 ether_type = rte_be_to_cpu_16(vlanhdr->eth_proto); 950 data += sizeof(struct rte_vlan_hdr); 951 len -= sizeof(struct rte_vlan_hdr); 952 } 953 954 if(ether_type == RTE_ETHER_TYPE_ARP) 955 return FILTER_ARP; 956 957 #ifdef INET6 958 if (ether_type == RTE_ETHER_TYPE_IPV6) { 959 return ff_kni_proto_filter(data, 960 len, ether_type); 961 } 962 #endif 963 964 #ifndef FF_KNI 965 return FILTER_UNKNOWN; 966 #else 967 if (!enable_kni) { 968 return FILTER_UNKNOWN; 969 } 970 971 if(ether_type != RTE_ETHER_TYPE_IPV4) 972 return FILTER_UNKNOWN; 973 974 return ff_kni_proto_filter(data, 975 len, ether_type); 976 #endif 977 } 978 979 static inline void 980 pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m) 981 { 982 struct rte_mbuf *md; 983 void *src, *dst; 984 985 dst = rte_pktmbuf_mtod(mi, void *); 986 src = rte_pktmbuf_mtod(m, void *); 987 988 mi->data_len = m->data_len; 989 rte_memcpy(dst, src, m->data_len); 990 991 mi->port = m->port; 992 mi->vlan_tci = m->vlan_tci; 993 mi->vlan_tci_outer = m->vlan_tci_outer; 994 mi->tx_offload = m->tx_offload; 995 mi->hash = m->hash; 996 mi->ol_flags = m->ol_flags; 997 mi->packet_type = m->packet_type; 998 } 999 1000 /* copied from rte_pktmbuf_clone */ 1001 static inline struct rte_mbuf * 1002 pktmbuf_deep_clone(const struct rte_mbuf *md, 1003 struct rte_mempool *mp) 1004 { 1005 struct rte_mbuf *mc, *mi, **prev; 1006 uint32_t pktlen; 1007 uint8_t nseg; 1008 1009 if (unlikely ((mc = rte_pktmbuf_alloc(mp)) == NULL)) 1010 return NULL; 1011 1012 mi = mc; 1013 prev = &mi->next; 1014 pktlen = md->pkt_len; 1015 nseg = 0; 1016 1017 do { 1018 nseg++; 1019 pktmbuf_deep_attach(mi, md); 1020 *prev = mi; 1021 prev = &mi->next; 1022 } while ((md = md->next) != NULL && 1023 (mi = rte_pktmbuf_alloc(mp)) != NULL); 1024 1025 *prev = NULL; 1026 mc->nb_segs = nseg; 1027 mc->pkt_len = pktlen; 1028 1029 /* Allocation of new indirect segment failed */ 1030 if (unlikely (mi == NULL)) { 1031 rte_pktmbuf_free(mc); 1032 return NULL; 1033 } 1034 1035 __rte_mbuf_sanity_check(mc, 1); 1036 return mc; 1037 } 1038 1039 static inline void 1040 process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs, 1041 uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring) 1042 { 1043 struct lcore_conf *qconf = &lcore_conf; 1044 uint16_t nb_queues = qconf->nb_queue_list[port_id]; 1045 1046 uint16_t i; 1047 for (i = 0; i < count; i++) { 1048 struct rte_mbuf *rtem = bufs[i]; 1049 1050 if (unlikely( ff_global_cfg.pcap.enable)) { 1051 if (!pkts_from_ring) { 1052 ff_dump_packets( ff_global_cfg.pcap.save_path, rtem, ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len); 1053 } 1054 } 1055 1056 void *data = rte_pktmbuf_mtod(rtem, void*); 1057 uint16_t len = rte_pktmbuf_data_len(rtem); 1058 1059 if (!pkts_from_ring) { 1060 ff_traffic.rx_packets++; 1061 ff_traffic.rx_bytes += len; 1062 } 1063 1064 if (!pkts_from_ring && packet_dispatcher) { 1065 int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues); 1066 if (ret == FF_DISPATCH_RESPONSE) { 1067 rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len; 1068 1069 /* 1070 * We have not support vlan out strip 1071 */ 1072 if (rtem->vlan_tci) { 1073 data = rte_pktmbuf_prepend(rtem, sizeof(struct rte_vlan_hdr)); 1074 if (data != NULL) { 1075 memmove(data, data + sizeof(struct rte_vlan_hdr), RTE_ETHER_HDR_LEN); 1076 struct rte_ether_hdr *etherhdr = (struct rte_ether_hdr *)data; 1077 struct rte_vlan_hdr *vlanhdr = (struct rte_vlan_hdr *)(data + RTE_ETHER_HDR_LEN); 1078 vlanhdr->vlan_tci = 
rte_cpu_to_be_16(rtem->vlan_tci); 1079 vlanhdr->eth_proto = etherhdr->ether_type; 1080 etherhdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN); 1081 } 1082 } 1083 send_single_packet(rtem, port_id); 1084 continue; 1085 } 1086 1087 if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) { 1088 rte_pktmbuf_free(rtem); 1089 continue; 1090 } 1091 1092 if (ret != queue_id) { 1093 ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem); 1094 if (ret < 0) 1095 rte_pktmbuf_free(rtem); 1096 1097 continue; 1098 } 1099 } 1100 1101 enum FilterReturn filter = protocol_filter(data, len); 1102 #ifdef INET6 1103 if (filter == FILTER_ARP || filter == FILTER_NDP) { 1104 #else 1105 if (filter == FILTER_ARP) { 1106 #endif 1107 struct rte_mempool *mbuf_pool; 1108 struct rte_mbuf *mbuf_clone; 1109 if (!pkts_from_ring) { 1110 uint16_t j; 1111 for(j = 0; j < nb_queues; ++j) { 1112 if(j == queue_id) 1113 continue; 1114 1115 unsigned socket_id = 0; 1116 if (numa_on) { 1117 uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j]; 1118 socket_id = rte_lcore_to_socket_id(lcore_id); 1119 } 1120 mbuf_pool = pktmbuf_pool[socket_id]; 1121 mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool); 1122 if(mbuf_clone) { 1123 int ret = rte_ring_enqueue(dispatch_ring[port_id][j], 1124 mbuf_clone); 1125 if (ret < 0) 1126 rte_pktmbuf_free(mbuf_clone); 1127 } 1128 } 1129 } 1130 1131 #ifdef FF_KNI 1132 if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) { 1133 mbuf_pool = pktmbuf_pool[qconf->socket_id]; 1134 mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool); 1135 if(mbuf_clone) { 1136 ff_kni_enqueue(port_id, mbuf_clone); 1137 } 1138 } 1139 #endif 1140 ff_veth_input(ctx, rtem); 1141 #ifdef FF_KNI 1142 } else if (enable_kni) { 1143 if (knictl_action == FF_KNICTL_ACTION_ALL_TO_KNI){ 1144 ff_kni_enqueue(port_id, rtem); 1145 } else if (knictl_action == FF_KNICTL_ACTION_ALL_TO_FF){ 1146 ff_veth_input(ctx, rtem); 1147 } else if (knictl_action == FF_KNICTL_ACTION_DEFAULT){ 1148 if (enable_kni && 1149 ((filter == FILTER_KNI && kni_accept) || 1150 (filter == FILTER_UNKNOWN && !kni_accept)) ) { 1151 ff_kni_enqueue(port_id, rtem); 1152 } else { 1153 ff_veth_input(ctx, rtem); 1154 } 1155 } else { 1156 ff_veth_input(ctx, rtem); 1157 } 1158 #endif 1159 } else { 1160 ff_veth_input(ctx, rtem); 1161 } 1162 } 1163 } 1164 1165 static inline int 1166 process_dispatch_ring(uint16_t port_id, uint16_t queue_id, 1167 struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx) 1168 { 1169 /* read packet from ring buf and to process */ 1170 uint16_t nb_rb; 1171 nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id], 1172 (void **)pkts_burst, MAX_PKT_BURST, NULL); 1173 1174 if(nb_rb > 0) { 1175 process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1); 1176 } 1177 1178 return 0; 1179 } 1180 1181 static inline void 1182 handle_sysctl_msg(struct ff_msg *msg) 1183 { 1184 int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen, 1185 msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new, 1186 msg->sysctl.newlen); 1187 1188 if (ret < 0) { 1189 msg->result = errno; 1190 } else { 1191 msg->result = 0; 1192 } 1193 } 1194 1195 static inline void 1196 handle_ioctl_msg(struct ff_msg *msg) 1197 { 1198 int fd, ret; 1199 #ifdef INET6 1200 if (msg->msg_type == FF_IOCTL6) { 1201 fd = ff_socket(AF_INET6, SOCK_DGRAM, 0); 1202 } else 1203 #endif 1204 fd = ff_socket(AF_INET, SOCK_DGRAM, 0); 1205 1206 if (fd < 0) { 1207 ret = -1; 1208 goto done; 1209 } 1210 1211 ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data); 1212 1213 ff_close(fd); 1214 
1215 done: 1216 if (ret < 0) { 1217 msg->result = errno; 1218 } else { 1219 msg->result = 0; 1220 } 1221 } 1222 1223 static inline void 1224 handle_route_msg(struct ff_msg *msg) 1225 { 1226 int ret = ff_rtioctl(msg->route.fib, msg->route.data, 1227 &msg->route.len, msg->route.maxlen); 1228 if (ret < 0) { 1229 msg->result = errno; 1230 } else { 1231 msg->result = 0; 1232 } 1233 } 1234 1235 static inline void 1236 handle_top_msg(struct ff_msg *msg) 1237 { 1238 msg->top = ff_top_status; 1239 msg->result = 0; 1240 } 1241 1242 #ifdef FF_NETGRAPH 1243 static inline void 1244 handle_ngctl_msg(struct ff_msg *msg) 1245 { 1246 int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data); 1247 if (ret < 0) { 1248 msg->result = errno; 1249 } else { 1250 msg->result = 0; 1251 msg->ngctl.ret = ret; 1252 } 1253 } 1254 #endif 1255 1256 #ifdef FF_IPFW 1257 static inline void 1258 handle_ipfw_msg(struct ff_msg *msg) 1259 { 1260 int fd, ret; 1261 fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW); 1262 if (fd < 0) { 1263 ret = -1; 1264 goto done; 1265 } 1266 1267 switch (msg->ipfw.cmd) { 1268 case FF_IPFW_GET: 1269 ret = ff_getsockopt_freebsd(fd, msg->ipfw.level, 1270 msg->ipfw.optname, msg->ipfw.optval, 1271 msg->ipfw.optlen); 1272 break; 1273 case FF_IPFW_SET: 1274 ret = ff_setsockopt_freebsd(fd, msg->ipfw.level, 1275 msg->ipfw.optname, msg->ipfw.optval, 1276 *(msg->ipfw.optlen)); 1277 break; 1278 default: 1279 ret = -1; 1280 errno = ENOTSUP; 1281 break; 1282 } 1283 1284 ff_close(fd); 1285 1286 done: 1287 if (ret < 0) { 1288 msg->result = errno; 1289 } else { 1290 msg->result = 0; 1291 } 1292 } 1293 #endif 1294 1295 static inline void 1296 handle_traffic_msg(struct ff_msg *msg) 1297 { 1298 msg->traffic = ff_traffic; 1299 msg->result = 0; 1300 } 1301 1302 #ifdef FF_KNI 1303 static inline void 1304 handle_knictl_msg(struct ff_msg *msg) 1305 { 1306 if (msg->knictl.kni_cmd == FF_KNICTL_CMD_SET){ 1307 switch (msg->knictl.kni_action){ 1308 case FF_KNICTL_ACTION_ALL_TO_FF: knictl_action = FF_KNICTL_ACTION_ALL_TO_FF; msg->result = 0; printf("new kni action: alltoff\n"); break; 1309 case FF_KNICTL_ACTION_ALL_TO_KNI: knictl_action = FF_KNICTL_ACTION_ALL_TO_KNI; msg->result = 0; printf("new kni action: alltokni\n"); break; 1310 case FF_KNICTL_ACTION_DEFAULT: knictl_action = FF_KNICTL_ACTION_DEFAULT; msg->result = 0; printf("new kni action: default\n"); break; 1311 default: msg->result = -1; 1312 } 1313 } 1314 else if (msg->knictl.kni_cmd == FF_KNICTL_CMD_GET){ 1315 msg->knictl.kni_action = knictl_action; 1316 } else { 1317 msg->result = -2; 1318 } 1319 } 1320 #endif 1321 1322 static inline void 1323 handle_default_msg(struct ff_msg *msg) 1324 { 1325 msg->result = ENOTSUP; 1326 } 1327 1328 static inline void 1329 handle_msg(struct ff_msg *msg, uint16_t proc_id) 1330 { 1331 switch (msg->msg_type) { 1332 case FF_SYSCTL: 1333 handle_sysctl_msg(msg); 1334 break; 1335 case FF_IOCTL: 1336 #ifdef INET6 1337 case FF_IOCTL6: 1338 #endif 1339 handle_ioctl_msg(msg); 1340 break; 1341 case FF_ROUTE: 1342 handle_route_msg(msg); 1343 break; 1344 case FF_TOP: 1345 handle_top_msg(msg); 1346 break; 1347 #ifdef FF_NETGRAPH 1348 case FF_NGCTL: 1349 handle_ngctl_msg(msg); 1350 break; 1351 #endif 1352 #ifdef FF_IPFW 1353 case FF_IPFW_CTL: 1354 handle_ipfw_msg(msg); 1355 break; 1356 #endif 1357 case FF_TRAFFIC: 1358 handle_traffic_msg(msg); 1359 break; 1360 #ifdef FF_KNI 1361 case FF_KNICTL: 1362 handle_knictl_msg(msg); 1363 break; 1364 #endif 1365 default: 1366 handle_default_msg(msg); 1367 break; 1368 } 1369 
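    /*
     * Reply path: ring[0] of each proc carries incoming requests, so the
     * completed message is pushed back on the per-msg_type output ring
     * created in init_msg_ring().
     */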
rte_ring_enqueue(msg_ring[proc_id].ring[msg->msg_type], msg); 1370 } 1371 1372 static inline int 1373 process_msg_ring(uint16_t proc_id) 1374 { 1375 void *msg; 1376 int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg); 1377 1378 if (unlikely(ret == 0)) { 1379 handle_msg((struct ff_msg *)msg, proc_id); 1380 } 1381 1382 return 0; 1383 } 1384 1385 /* Send burst of packets on an output interface */ 1386 static inline int 1387 send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port) 1388 { 1389 struct rte_mbuf **m_table; 1390 int ret; 1391 uint16_t queueid; 1392 1393 queueid = qconf->tx_queue_id[port]; 1394 m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table; 1395 1396 if (unlikely(ff_global_cfg.pcap.enable)) { 1397 uint16_t i; 1398 for (i = 0; i < n; i++) { 1399 ff_dump_packets( ff_global_cfg.pcap.save_path, m_table[i], 1400 ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len); 1401 } 1402 } 1403 1404 ret = rte_eth_tx_burst(port, queueid, m_table, n); 1405 ff_traffic.tx_packets += ret; 1406 uint16_t i; 1407 for (i = 0; i < ret; i++) { 1408 ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]); 1409 #ifdef FF_USE_PAGE_ARRAY 1410 if (qconf->tx_mbufs[port].bsd_m_table[i]) 1411 ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs); 1412 #endif 1413 } 1414 if (unlikely(ret < n)) { 1415 do { 1416 rte_pktmbuf_free(m_table[ret]); 1417 #ifdef FF_USE_PAGE_ARRAY 1418 if ( qconf->tx_mbufs[port].bsd_m_table[ret] ) 1419 ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]); 1420 #endif 1421 } while (++ret < n); 1422 } 1423 return 0; 1424 } 1425 1426 /* Enqueue a single packet, and send burst if queue is filled */ 1427 static inline int 1428 send_single_packet(struct rte_mbuf *m, uint8_t port) 1429 { 1430 uint16_t len; 1431 struct lcore_conf *qconf; 1432 1433 qconf = &lcore_conf; 1434 len = qconf->tx_mbufs[port].len; 1435 qconf->tx_mbufs[port].m_table[len] = m; 1436 len++; 1437 1438 /* enough pkts to be sent */ 1439 if (unlikely(len == MAX_PKT_BURST)) { 1440 send_burst(qconf, MAX_PKT_BURST, port); 1441 len = 0; 1442 } 1443 1444 qconf->tx_mbufs[port].len = len; 1445 return 0; 1446 } 1447 1448 int 1449 ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m, 1450 int total) 1451 { 1452 #ifdef FF_USE_PAGE_ARRAY 1453 struct lcore_conf *qconf = &lcore_conf; 1454 int len = 0; 1455 1456 len = ff_if_send_onepkt(ctx, m,total); 1457 if (unlikely(len == MAX_PKT_BURST)) { 1458 send_burst(qconf, MAX_PKT_BURST, ctx->port_id); 1459 len = 0; 1460 } 1461 qconf->tx_mbufs[ctx->port_id].len = len; 1462 return 0; 1463 #endif 1464 struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id]; 1465 struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool); 1466 if (head == NULL) { 1467 ff_mbuf_free(m); 1468 return -1; 1469 } 1470 1471 head->pkt_len = total; 1472 head->nb_segs = 0; 1473 1474 int off = 0; 1475 struct rte_mbuf *cur = head, *prev = NULL; 1476 while(total > 0) { 1477 if (cur == NULL) { 1478 cur = rte_pktmbuf_alloc(mbuf_pool); 1479 if (cur == NULL) { 1480 rte_pktmbuf_free(head); 1481 ff_mbuf_free(m); 1482 return -1; 1483 } 1484 } 1485 1486 if (prev != NULL) { 1487 prev->next = cur; 1488 } 1489 head->nb_segs++; 1490 1491 prev = cur; 1492 void *data = rte_pktmbuf_mtod(cur, void*); 1493 int len = total > RTE_MBUF_DEFAULT_DATAROOM ? 
RTE_MBUF_DEFAULT_DATAROOM : total; 1494 int ret = ff_mbuf_copydata(m, data, off, len); 1495 if (ret < 0) { 1496 rte_pktmbuf_free(head); 1497 ff_mbuf_free(m); 1498 return -1; 1499 } 1500 1501 1502 cur->data_len = len; 1503 off += len; 1504 total -= len; 1505 cur = NULL; 1506 } 1507 1508 struct ff_tx_offload offload = {0}; 1509 ff_mbuf_tx_offload(m, &offload); 1510 1511 void *data = rte_pktmbuf_mtod(head, void*); 1512 1513 if (offload.ip_csum) { 1514 /* ipv6 not supported yet */ 1515 struct rte_ipv4_hdr *iph; 1516 int iph_len; 1517 iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN); 1518 iph_len = (iph->version_ihl & 0x0f) << 2; 1519 1520 head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4; 1521 head->l2_len = RTE_ETHER_HDR_LEN; 1522 head->l3_len = iph_len; 1523 } 1524 1525 if (ctx->hw_features.tx_csum_l4) { 1526 struct rte_ipv4_hdr *iph; 1527 int iph_len; 1528 iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN); 1529 iph_len = (iph->version_ihl & 0x0f) << 2; 1530 1531 if (offload.tcp_csum) { 1532 head->ol_flags |= PKT_TX_TCP_CKSUM; 1533 head->l2_len = RTE_ETHER_HDR_LEN; 1534 head->l3_len = iph_len; 1535 } 1536 1537 /* 1538 * TCP segmentation offload. 1539 * 1540 * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag 1541 * implies PKT_TX_TCP_CKSUM) 1542 * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6 1543 * - if it's IPv4, set the PKT_TX_IP_CKSUM flag and 1544 * write the IP checksum to 0 in the packet 1545 * - fill the mbuf offload information: l2_len, 1546 * l3_len, l4_len, tso_segsz 1547 * - calculate the pseudo header checksum without taking ip_len 1548 * in account, and set it in the TCP header. Refer to 1549 * rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be 1550 * used as helpers. 1551 */ 1552 if (offload.tso_seg_size) { 1553 struct rte_tcp_hdr *tcph; 1554 int tcph_len; 1555 tcph = (struct rte_tcp_hdr *)((char *)iph + iph_len); 1556 tcph_len = (tcph->data_off & 0xf0) >> 2; 1557 tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG); 1558 1559 head->ol_flags |= PKT_TX_TCP_SEG; 1560 head->l4_len = tcph_len; 1561 head->tso_segsz = offload.tso_seg_size; 1562 } 1563 1564 if (offload.udp_csum) { 1565 head->ol_flags |= PKT_TX_UDP_CKSUM; 1566 head->l2_len = RTE_ETHER_HDR_LEN; 1567 head->l3_len = iph_len; 1568 } 1569 } 1570 1571 ff_mbuf_free(m); 1572 1573 return send_single_packet(head, ctx->port_id); 1574 } 1575 1576 static int 1577 main_loop(void *arg) 1578 { 1579 struct loop_routine *lr = (struct loop_routine *)arg; 1580 1581 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1582 uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc; 1583 int i, j, nb_rx, idle; 1584 uint16_t port_id, queue_id; 1585 struct lcore_conf *qconf; 1586 uint64_t drain_tsc = 0; 1587 struct ff_dpdk_if_context *ctx; 1588 1589 if (pkt_tx_delay) { 1590 drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * pkt_tx_delay; 1591 } 1592 1593 prev_tsc = 0; 1594 usch_tsc = 0; 1595 1596 qconf = &lcore_conf; 1597 1598 while (1) { 1599 cur_tsc = rte_rdtsc(); 1600 if (unlikely(freebsd_clock.expire < cur_tsc)) { 1601 rte_timer_manage(); 1602 } 1603 1604 idle = 1; 1605 sys_tsc = 0; 1606 usr_tsc = 0; 1607 1608 /* 1609 * TX burst queue drain 1610 */ 1611 diff_tsc = cur_tsc - prev_tsc; 1612 if (unlikely(diff_tsc >= drain_tsc)) { 1613 for (i = 0; i < qconf->nb_tx_port; i++) { 1614 port_id = qconf->tx_port_id[i]; 1615 if (qconf->tx_mbufs[port_id].len == 0) 1616 continue; 1617 1618 idle = 0; 1619 1620 send_burst(qconf, 1621 qconf->tx_mbufs[port_id].len, 1622 port_id); 1623 
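                /*
                 * send_burst() hands every queued mbuf to the NIC (and frees
                 * any it could not send), so the soft TX queue can be reset.
                 */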
qconf->tx_mbufs[port_id].len = 0; 1624 } 1625 1626 prev_tsc = cur_tsc; 1627 } 1628 1629 /* 1630 * Read packet from RX queues 1631 */ 1632 for (i = 0; i < qconf->nb_rx_queue; ++i) { 1633 port_id = qconf->rx_queue_list[i].port_id; 1634 queue_id = qconf->rx_queue_list[i].queue_id; 1635 ctx = veth_ctx[port_id]; 1636 1637 #ifdef FF_KNI 1638 if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) { 1639 ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST); 1640 } 1641 #endif 1642 1643 process_dispatch_ring(port_id, queue_id, pkts_burst, ctx); 1644 1645 nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst, 1646 MAX_PKT_BURST); 1647 if (nb_rx == 0) 1648 continue; 1649 1650 idle = 0; 1651 1652 /* Prefetch first packets */ 1653 for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) { 1654 rte_prefetch0(rte_pktmbuf_mtod( 1655 pkts_burst[j], void *)); 1656 } 1657 1658 /* Prefetch and handle already prefetched packets */ 1659 for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) { 1660 rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[ 1661 j + PREFETCH_OFFSET], void *)); 1662 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0); 1663 } 1664 1665 /* Handle remaining prefetched packets */ 1666 for (; j < nb_rx; j++) { 1667 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0); 1668 } 1669 } 1670 1671 process_msg_ring(qconf->proc_id); 1672 1673 div_tsc = rte_rdtsc(); 1674 1675 if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc >= drain_tsc))) { 1676 usch_tsc = cur_tsc; 1677 lr->loop(lr->arg); 1678 } 1679 1680 idle_sleep_tsc = rte_rdtsc(); 1681 if (likely(idle && idle_sleep)) { 1682 usleep(idle_sleep); 1683 end_tsc = rte_rdtsc(); 1684 } else { 1685 end_tsc = idle_sleep_tsc; 1686 } 1687 1688 if (usch_tsc == cur_tsc) { 1689 usr_tsc = idle_sleep_tsc - div_tsc; 1690 } 1691 1692 if (!idle) { 1693 sys_tsc = div_tsc - cur_tsc; 1694 ff_top_status.sys_tsc += sys_tsc; 1695 } 1696 1697 ff_top_status.usr_tsc += usr_tsc; 1698 ff_top_status.work_tsc += end_tsc - cur_tsc; 1699 ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc; 1700 1701 ff_top_status.loops++; 1702 } 1703 1704 return 0; 1705 } 1706 1707 int 1708 ff_dpdk_if_up(void) { 1709 int i; 1710 struct lcore_conf *qconf = &lcore_conf; 1711 for (i = 0; i < qconf->nb_tx_port; i++) { 1712 uint16_t port_id = qconf->tx_port_id[i]; 1713 1714 struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id]; 1715 veth_ctx[port_id] = ff_veth_attach(pconf); 1716 if (veth_ctx[port_id] == NULL) { 1717 rte_exit(EXIT_FAILURE, "ff_veth_attach failed"); 1718 } 1719 } 1720 1721 return 0; 1722 } 1723 1724 void 1725 ff_dpdk_run(loop_func_t loop, void *arg) { 1726 struct loop_routine *lr = rte_malloc(NULL, 1727 sizeof(struct loop_routine), 0); 1728 lr->loop = loop; 1729 lr->arg = arg; 1730 rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER); 1731 rte_eal_mp_wait_lcore(); 1732 rte_free(lr); 1733 } 1734 1735 void 1736 ff_dpdk_pktmbuf_free(void *m) 1737 { 1738 rte_pktmbuf_free((struct rte_mbuf *)m); 1739 } 1740 1741 static uint32_t 1742 toeplitz_hash(unsigned keylen, const uint8_t *key, 1743 unsigned datalen, const uint8_t *data) 1744 { 1745 uint32_t hash = 0, v; 1746 u_int i, b; 1747 1748 /* XXXRW: Perhaps an assertion about key length vs. data length? 
*/ 1749 1750 v = (key[0]<<24) + (key[1]<<16) + (key[2] <<8) + key[3]; 1751 for (i = 0; i < datalen; i++) { 1752 for (b = 0; b < 8; b++) { 1753 if (data[i] & (1<<(7-b))) 1754 hash ^= v; 1755 v <<= 1; 1756 if ((i + 4) < keylen && 1757 (key[i+4] & (1<<(7-b)))) 1758 v |= 1; 1759 } 1760 } 1761 return (hash); 1762 } 1763 1764 int 1765 ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr, 1766 uint16_t sport, uint16_t dport) 1767 { 1768 struct lcore_conf *qconf = &lcore_conf; 1769 struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc); 1770 uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id]; 1771 1772 if (nb_queues <= 1) { 1773 return 1; 1774 } 1775 1776 uint16_t reta_size = rss_reta_size[ctx->port_id]; 1777 uint16_t queueid = qconf->tx_queue_id[ctx->port_id]; 1778 1779 uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) + 1780 sizeof(dport)]; 1781 1782 unsigned datalen = 0; 1783 1784 bcopy(&saddr, &data[datalen], sizeof(saddr)); 1785 datalen += sizeof(saddr); 1786 1787 bcopy(&daddr, &data[datalen], sizeof(daddr)); 1788 datalen += sizeof(daddr); 1789 1790 bcopy(&sport, &data[datalen], sizeof(sport)); 1791 datalen += sizeof(sport); 1792 1793 bcopy(&dport, &data[datalen], sizeof(dport)); 1794 datalen += sizeof(dport); 1795 1796 uint32_t hash = 0; 1797 hash = toeplitz_hash(rsskey_len, rsskey, datalen, data); 1798 1799 return ((hash & (reta_size - 1)) % nb_queues) == queueid; 1800 } 1801 1802 void 1803 ff_regist_packet_dispatcher(dispatch_func_t func) 1804 { 1805 packet_dispatcher = func; 1806 } 1807 1808 uint64_t 1809 ff_get_tsc_ns() 1810 { 1811 uint64_t cur_tsc = rte_rdtsc(); 1812 uint64_t hz = rte_get_tsc_hz(); 1813 return ((double)cur_tsc/(double)hz) * NS_PER_S; 1814 } 1815 1816