/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_eth_bond.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"
#include "ff_memory.h"

#ifdef FF_KNI
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

int enable_kni;
static int kni_accept;
static int knictl_action = FF_KNICTL_ACTION_DEFAULT;
#endif

static int numa_on;

static unsigned idle_sleep;
static unsigned pkt_tx_delay;

static struct rte_timer freebsd_clock;

/* Default RSS key of the Mellanox Linux driver */
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

static uint8_t default_rsskey_52bytes[52] = {
    0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23,
    0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30,
    0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02,
    0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c,
    0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55,
    0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e,
    0x81, 0x15, 0x03, 0x66
};

static uint8_t symmetric_rsskey[52] = {
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a
};

static int rsskey_len = sizeof(default_rsskey_40bytes);
static uint8_t *rsskey = default_rsskey_40bytes;

struct lcore_conf lcore_conf;

struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

#define BOND_DRIVER_NAME "net_bonding"

static inline int send_single_packet(struct rte_mbuf *m, uint8_t port);

struct ff_msg_ring {
    char ring_name[FF_MSG_NUM][RTE_RING_NAMESIZE];
    /* ring[0]: messages received by this lcore, sent by other processes */
    /* ring[1..]: messages sent by this lcore, read by other processes */
    struct rte_ring *ring[FF_MSG_NUM];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;
static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

static struct ff_top_args ff_top_status;
static struct ff_traffic_args ff_traffic;

extern void ff_hardclock(void);

static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
    ff_update_current_ts();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
#define CHECK_INTERVAL 100 /* 100ms */
#define MAX_CHECK_TIME 90 /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u "
                        "Mbps - %s\n", (int)portid,
                        (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

static int
init_lcore_conf(void)
{
    uint8_t nb_dev_ports = rte_eth_dev_count_avail();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
            ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    if (!rte_lcore_is_enabled(lcore_id)) {
        rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
    }

    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %d\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        /* Enable pcap dump */
        if (ff_global_cfg.pcap.enable) {
            ff_enable_pcap(ff_global_cfg.pcap.save_path, ff_global_cfg.pcap.snap_len);
        }

        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}

static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;
    uint16_t max_portid = ff_global_cfg.dpdk.max_portid;

    unsigned nb_mbuf = RTE_ALIGN_CEIL(
        (nb_rx_queue * (max_portid + 1) * 2 * RX_QUEUE_SIZE +
        nb_ports * (max_portid + 1) * 2 * nb_lcores * MAX_PKT_BURST +
        nb_ports * (max_portid + 1) * 2 * nb_tx_queue * TX_QUEUE_SIZE +
        nb_lcores * MEMPOOL_CACHE_SIZE +
#ifdef FF_KNI
        nb_ports * KNI_MBUF_MAX +
        nb_ports * KNI_QUEUE_SIZE +
#endif
        nb_lcores * nb_ports * DISPATCH_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %u of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("created mbuf pool on socket %d\n", socketid);
        }

#ifdef FF_USE_PAGE_ARRAY
        nb_mbuf = RTE_ALIGN_CEIL(
            nb_ports * nb_lcores * MAX_PKT_BURST +
            nb_ports * nb_tx_queue * TX_QUEUE_SIZE +
            nb_lcores * MEMPOOL_CACHE_SIZE,
            (unsigned)4096);
        ff_init_ref_pool(nb_mbuf, socketid);
#endif
    }

    return 0;
}

static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL) {
        rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ring = rte_ring_create(name, count, socket_id, flags);
    } else {
        ring = rte_ring_lookup(name);
    }

    if (ring == NULL) {
        rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
    }

    return ring;
}

static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create rings according to the ports actually in use. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("created ring:%s, %u ring entries are free\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
    msg->original_buf = NULL;
    msg->original_buf_len = 0;
}

static int
init_msg_ring(void)
{
    uint16_t i, j;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
            MSG_RING_SIZE * 2 * nb_procs,
            MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
            NULL, NULL, ff_msg_init, NULL,
            socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[0]);

        for (j = FF_SYSCTL; j < FF_MSG_NUM; j++) {
            snprintf(msg_ring[i].ring_name[j], RTE_RING_NAMESIZE,
                "%s%u_%u", FF_MSG_RING_OUT, i, j);
            msg_ring[i].ring[j] = create_ring(msg_ring[i].ring_name[j],
                MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
            if (msg_ring[i].ring[j] == NULL)
                rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[j]);
        }
    }

    return 0;
}

#ifdef FF_KNI

static enum FF_KNICTL_CMD get_kni_action(const char *c) {
    if (!c)
        return FF_KNICTL_ACTION_DEFAULT;
    if (0 == strcasecmp(c, "alltokni")) {
        return FF_KNICTL_ACTION_ALL_TO_KNI;
    } else if (0 == strcasecmp(c, "alltoff")) {
        return FF_KNICTL_ACTION_ALL_TO_FF;
    } else if (0 == strcasecmp(c, "default")) {
        return FF_KNICTL_ACTION_DEFAULT;
    } else {
        return FF_KNICTL_ACTION_DEFAULT;
    }
}

static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count_avail();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    knictl_action = get_kni_action(ff_global_cfg.kni.kni_action);

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i, ret;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
#endif

static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}

static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i, j;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id, u_port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[u_port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        for (j = 0; j <= pconf->nb_slaves; j++) {
            if (j < pconf->nb_slaves) {
                port_id = pconf->slave_portid_list[j];
                printf("Initializing %s's slave %d: port[%d]\n",
                    ff_global_cfg.dpdk.bond_cfgs->name,
                    j, port_id);
            } else {
                port_id = u_port_id;
            }

            struct rte_eth_dev_info dev_info;
            struct rte_eth_conf port_conf = {0};
            struct rte_eth_rxconf rxq_conf;
            struct rte_eth_txconf txq_conf;

            int ret = rte_eth_dev_info_get(port_id, &dev_info);
            if (ret != 0)
                rte_exit(EXIT_FAILURE,
                    "Error during getting device (port %u) info: %s\n",
                    port_id, strerror(-ret));

            if (nb_queues > dev_info.max_rx_queues) {
                rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_rx_queues);
            }

            if (nb_queues > dev_info.max_tx_queues) {
                rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_tx_queues);
            }

            struct rte_ether_addr addr;
            rte_eth_macaddr_get(port_id, &addr);
            printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                (unsigned)port_id,
                addr.addr_bytes[0], addr.addr_bytes[1],
                addr.addr_bytes[2], addr.addr_bytes[3],
                addr.addr_bytes[4], addr.addr_bytes[5]);

            rte_memcpy(pconf->mac,
                addr.addr_bytes, RTE_ETHER_ADDR_LEN);

            /* Set RSS mode */
            uint64_t default_rss_hf = ETH_RSS_PROTO_MASK;
            port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
            port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf;
            if (dev_info.hash_key_size == 52) {
                rsskey = default_rsskey_52bytes;
                rsskey_len = 52;
            }
            if (ff_global_cfg.dpdk.symmetric_rss) {
                printf("Using symmetric Receive-side Scaling (RSS) key\n");
                rsskey = symmetric_rsskey;
            }
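            /*
             * Program the RSS key chosen above and trim the requested hash
             * types down to what this port's hardware actually supports.
             */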
            port_conf.rx_adv_conf.rss_conf.rss_key = rsskey;
            port_conf.rx_adv_conf.rss_conf.rss_key_len = rsskey_len;
            port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads;
            if (port_conf.rx_adv_conf.rss_conf.rss_hf !=
                ETH_RSS_PROTO_MASK) {
                printf("Port %u modified RSS hash function based on hardware support, "
                    "requested:%#"PRIx64" configured:%#"PRIx64"\n",
                    port_id, default_rss_hf,
                    port_conf.rx_adv_conf.rss_conf.rss_hf);
            }

            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) {
                port_conf.txmode.offloads |=
                    DEV_TX_OFFLOAD_MBUF_FAST_FREE;
            }

            /* Set Rx VLAN stripping */
            if (ff_global_cfg.dpdk.vlan_strip) {
                if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                    port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
                }
            }

            /* Enable HW CRC stripping */
            port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_KEEP_CRC;

            /* FIXME: enable TCP LRO? */
#if 0
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
                printf("LRO is supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
                pconf->hw_features.rx_lro = 1;
            }
#endif

            /* Set Rx checksum checking */
            if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
                printf("RX checksum offload supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
                pconf->hw_features.rx_csum = 1;
            }

            if (ff_global_cfg.dpdk.tx_csum_offoad_skip == 0) {
                if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
                    printf("TX ip checksum offload supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
                    pconf->hw_features.tx_csum_ip = 1;
                }

                if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
                    (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
                    printf("TX TCP&UDP checksum offload supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM;
                    pconf->hw_features.tx_csum_l4 = 1;
                }
            } else {
                printf("TX checksum offload is disabled\n");
            }

            if (ff_global_cfg.dpdk.tso) {
                if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                    printf("TSO is supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
                    pconf->hw_features.tx_tso = 1;
                }
            } else {
                printf("TSO is disabled\n");
            }

            if (dev_info.reta_size) {
                /* reta size must be power of 2 */
                assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

                rss_reta_size[port_id] = dev_info.reta_size;
                printf("port[%d]: rss table size: %d\n", port_id,
                    dev_info.reta_size);
            }

            if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
                continue;
            }

            ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
            if (ret != 0) {
                return ret;
            }

            static uint16_t nb_rxd = RX_QUEUE_SIZE;
            static uint16_t nb_txd = TX_QUEUE_SIZE;
            ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
            if (ret < 0)
                printf("Could not adjust number of descriptors "
                    "for port%u (%d)\n", (unsigned)port_id, ret);

            uint16_t q;
            for (q = 0; q < nb_queues; q++) {
                if (numa_on) {
                    uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                    socketid = rte_lcore_to_socket_id(lcore_id);
                }
                mbuf_pool = pktmbuf_pool[socketid];

                txq_conf = dev_info.default_txconf;
                txq_conf.offloads = port_conf.txmode.offloads;
                ret = rte_eth_tx_queue_setup(port_id, q, nb_txd,
                    socketid, &txq_conf);
                if (ret < 0) {
                    return ret;
                }

                rxq_conf = dev_info.default_rxconf;
                rxq_conf.offloads = port_conf.rxmode.offloads;
                ret = rte_eth_rx_queue_setup(port_id, q, nb_rxd,
                    socketid, &rxq_conf, mbuf_pool);
                if (ret < 0) {
                    return ret;
                }
            }

            if (strncmp(dev_info.driver_name, BOND_DRIVER_NAME,
                strlen(dev_info.driver_name)) == 0) {

                rte_eth_macaddr_get(port_id, &addr);
                printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                    " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                    (unsigned)port_id,
                    addr.addr_bytes[0], addr.addr_bytes[1],
                    addr.addr_bytes[2], addr.addr_bytes[3],
                    addr.addr_bytes[4], addr.addr_bytes[5]);

                rte_memcpy(pconf->mac,
                    addr.addr_bytes, RTE_ETHER_ADDR_LEN);

                int mode, count, x;
                uint16_t slaves[RTE_MAX_ETHPORTS], len = RTE_MAX_ETHPORTS;

                mode = rte_eth_bond_mode_get(port_id);
                printf("Port %u, bond mode:%d\n", port_id, mode);

                count = rte_eth_bond_slaves_get(port_id, slaves, len);
                printf("Port %u, %s's slave ports count:%d\n", port_id,
                    ff_global_cfg.dpdk.bond_cfgs->name, count);
                for (x = 0; x < count; x++) {
                    printf("Port %u, %s's slave port[%u]\n", port_id,
                        ff_global_cfg.dpdk.bond_cfgs->name, slaves[x]);
                }
            }

            ret = rte_eth_dev_start(port_id);
            if (ret < 0) {
                return ret;
            }

            if (nb_queues > 1) {
                /* Set the HW RSS hash function to Toeplitz. */
                if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                    struct rte_eth_hash_filter_info info = {0};
                    info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                    info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                    if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                        RTE_ETH_FILTER_SET, &info) < 0) {
                        rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                            port_id);
                    }
                }

                set_rss_table(port_id, dev_info.reta_size, nb_queues);
            }

            /* Enable RX in promiscuous mode for the Ethernet device. */
            if (ff_global_cfg.dpdk.promiscuous) {
                ret = rte_eth_promiscuous_enable(port_id);
                if (ret == 0) {
                    printf("set port %u to promiscuous mode ok\n", port_id);
                } else {
                    printf("set port %u to promiscuous mode error\n", port_id);
                }
            }
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    idle_sleep = ff_global_cfg.dpdk.idle_sleep;
    pkt_tx_delay = ff_global_cfg.dpdk.pkt_tx_delay > BURST_TX_DRAIN_US ?
        BURST_TX_DRAIN_US : ff_global_cfg.dpdk.pkt_tx_delay;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

#ifdef FF_KNI
    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }
#endif

#ifdef FF_USE_PAGE_ARRAY
    ff_mmap_init();
#endif

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    void *data = rte_pktmbuf_mtod(pkt, void*);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    if (pkt->ol_flags & PKT_RX_VLAN_STRIPPED) {
        ff_mbuf_set_vlan_info(hdr, pkt->vlan_tci);
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void*);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < RTE_ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct rte_ether_hdr *hdr;
    const struct rte_vlan_hdr *vlanhdr;
    hdr = (const struct rte_ether_hdr *)data;
    uint16_t ether_type = rte_be_to_cpu_16(hdr->ether_type);
    data += RTE_ETHER_HDR_LEN;
    len -= RTE_ETHER_HDR_LEN;

    if (ether_type == RTE_ETHER_TYPE_VLAN) {
        vlanhdr = (const struct rte_vlan_hdr *)data;
        ether_type = rte_be_to_cpu_16(vlanhdr->eth_proto);
        data += sizeof(struct rte_vlan_hdr);
        len -= sizeof(struct rte_vlan_hdr);
    }

    if (ether_type == RTE_ETHER_TYPE_ARP)
        return FILTER_ARP;

#ifdef INET6
    if (ether_type == RTE_ETHER_TYPE_IPV6) {
        return ff_kni_proto_filter(data,
            len, ether_type);
    }
#endif

#ifndef FF_KNI
    return FILTER_UNKNOWN;
#else
    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ether_type != RTE_ETHER_TYPE_IPV4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data,
        len, ether_type);
#endif
}

static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}

/* copied from rte_pktmbuf_clone */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of a new segment failed */
    if (unlikely(mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}

static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(ff_global_cfg.pcap.enable)) {
            if (!pkts_from_ring) {
                ff_dump_packets(ff_global_cfg.pcap.save_path, rtem,
                    ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void*);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring) {
            ff_traffic.rx_packets++;
            ff_traffic.rx_bytes += len;
        }

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues);
            if (ret == FF_DISPATCH_RESPONSE) {
                rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len;

                /*
                 * TX VLAN offload is not supported yet, so re-insert the
                 * stripped VLAN tag by hand before sending the response.
                 */
                if (rtem->vlan_tci) {
                    data = rte_pktmbuf_prepend(rtem, sizeof(struct rte_vlan_hdr));
                    if (data != NULL) {
                        memmove(data, data + sizeof(struct rte_vlan_hdr), RTE_ETHER_HDR_LEN);
                        struct rte_ether_hdr *etherhdr = (struct rte_ether_hdr *)data;
                        struct rte_vlan_hdr *vlanhdr = (struct rte_vlan_hdr *)(data + RTE_ETHER_HDR_LEN);
                        vlanhdr->vlan_tci = rte_cpu_to_be_16(rtem->vlan_tci);
                        vlanhdr->eth_proto = etherhdr->ether_type;
                        etherhdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN);
                    }
                }
                send_single_packet(rtem, port_id);
                continue;
            }

            if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
#ifdef INET6
        if (filter == FILTER_ARP || filter == FILTER_NDP) {
#else
        if (filter == FILTER_ARP) {
#endif
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni) {
            if (knictl_action == FF_KNICTL_ACTION_ALL_TO_KNI) {
                ff_kni_enqueue(port_id, rtem);
            } else if (knictl_action == FF_KNICTL_ACTION_ALL_TO_FF) {
                ff_veth_input(ctx, rtem);
            } else if (knictl_action == FF_KNICTL_ACTION_DEFAULT) {
                if (enable_kni &&
                    ((filter == FILTER_KNI && kni_accept) ||
                    (filter == FILTER_UNKNOWN && !kni_accept))) {
                    ff_kni_enqueue(port_id, rtem);
                } else {
                    ff_veth_input(ctx, rtem);
                }
            } else {
                ff_veth_input(ctx, rtem);
            }
#endif
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* Read packets from the dispatch ring and process them */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
#ifdef INET6
    if (msg->msg_type == FF_IOCTL6) {
        fd = ff_socket(AF_INET6, SOCK_DGRAM, 0);
    } else
#endif
    fd = ff_socket(AF_INET, SOCK_DGRAM, 0);

    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

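    /* Both the error and the success path end up here: ret < 0 is mapped to
     * errno in msg->result, success to 0. */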
done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_top_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_traffic_msg(struct ff_msg *msg)
{
    msg->traffic = ff_traffic;
    msg->result = 0;
}

#ifdef FF_KNI
static inline void
handle_knictl_msg(struct ff_msg *msg)
{
    if (msg->knictl.kni_cmd == FF_KNICTL_CMD_SET) {
        switch (msg->knictl.kni_action) {
            case FF_KNICTL_ACTION_ALL_TO_FF:
                knictl_action = FF_KNICTL_ACTION_ALL_TO_FF;
                msg->result = 0;
                printf("new kni action: alltoff\n");
                break;
            case FF_KNICTL_ACTION_ALL_TO_KNI:
                knictl_action = FF_KNICTL_ACTION_ALL_TO_KNI;
                msg->result = 0;
                printf("new kni action: alltokni\n");
                break;
            case FF_KNICTL_ACTION_DEFAULT:
                knictl_action = FF_KNICTL_ACTION_DEFAULT;
                msg->result = 0;
                printf("new kni action: default\n");
                break;
            default:
                msg->result = -1;
        }
    } else if (msg->knictl.kni_cmd == FF_KNICTL_CMD_GET) {
        msg->knictl.kni_action = knictl_action;
    } else {
        msg->result = -2;
    }
}
#endif

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
#ifdef INET6
        case FF_IOCTL6:
#endif
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        case FF_TRAFFIC:
            handle_traffic_msg(msg);
            break;
#ifdef FF_KNI
        case FF_KNICTL:
            handle_knictl_msg(msg);
            break;
#endif
        default:
            handle_default_msg(msg);
            break;
    }

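    /* Hand the completed message back on the reply ring that matches its
     * msg_type; the requesting process dequeues the result from there. */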
    rte_ring_enqueue(msg_ring[proc_id].ring[msg->msg_type], msg);
}

static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(ff_global_cfg.pcap.enable)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(ff_global_cfg.pcap.save_path, m_table[i],
                ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
        }
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    ff_traffic.tx_packets += ret;
    uint16_t i;
    for (i = 0; i < ret; i++) {
        ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]);
#ifdef FF_USE_PAGE_ARRAY
        if (qconf->tx_mbufs[port].bsd_m_table[i])
            ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs);
#endif
    }
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
#ifdef FF_USE_PAGE_ARRAY
            if (qconf->tx_mbufs[port].bsd_m_table[ret])
                ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]);
#endif
        } while (++ret < n);
    }
    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}

int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
#ifdef FF_USE_PAGE_ARRAY
    struct lcore_conf *qconf = &lcore_conf;
    int len = 0;

    len = ff_if_send_onepkt(ctx, m, total);
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, ctx->port_id);
        len = 0;
    }
    qconf->tx_mbufs[ctx->port_id].len = len;
    return 0;
#endif
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        head->nb_segs++;

        prev = cur;
        void *data = rte_pktmbuf_mtod(cur, void*);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ?
            RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        cur->data_len = len;
        off += len;
        total -= len;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void*);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct rte_ipv4_hdr *iph;
        int iph_len;
        iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = RTE_ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct rte_ipv4_hdr *iph;
        int iph_len;
        iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = RTE_ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         * TCP segmentation offload.
         *
         * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *   implies PKT_TX_TCP_CKSUM)
         * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         * - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *   write the IP checksum to 0 in the packet
         * - fill the mbuf offload information: l2_len,
         *   l3_len, l4_len, tso_segsz
         * - calculate the pseudo header checksum without taking ip_len
         *   in account, and set it in the TCP header. Refer to
         *   rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *   used as helpers.
         */
        if (offload.tso_seg_size) {
            struct rte_tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct rte_tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = RTE_ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc;
    int i, j, nb_rx, idle;
    uint16_t port_id, queue_id;
    struct lcore_conf *qconf;
    uint64_t drain_tsc = 0;
    struct ff_dpdk_if_context *ctx;

    if (pkt_tx_delay) {
        drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * pkt_tx_delay;
    }

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc >= drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packets from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }
#endif

            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                    pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                    j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc >= drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        idle_sleep_tsc = rte_rdtsc();
        if (likely(idle && idle_sleep)) {
            usleep(idle_sleep);
            end_tsc = rte_rdtsc();
        } else {
            end_tsc = idle_sleep_tsc;
        }

        if (usch_tsc == cur_tsc) {
            usr_tsc = idle_sleep_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_top_status.sys_tsc += sys_tsc;
        }

        ff_top_status.usr_tsc += usr_tsc;
        ff_top_status.work_tsc += end_tsc - cur_tsc;
        ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_top_status.loops++;
    }

    return 0;
}

int
ff_dpdk_if_up(void) {
    int i;
    struct lcore_conf *qconf = &lcore_conf;
    for (i = 0; i < qconf->nb_tx_port; i++) {
        uint16_t port_id = qconf->tx_port_id[i];

        struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
        veth_ctx[port_id] = ff_veth_attach(pconf);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed\n");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg) {
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0] << 24) + (key[1] << 16) + (key[2] << 8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1 << (7 - b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i + 4] & (1 << (7 - b))))
                v |= 1;
        }
    }
    return (hash);
}

int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;
    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];

    if (nb_queues <= 1) {
        return 1;
    }

    uint16_t reta_size = rss_reta_size[ctx->port_id];
    uint16_t queueid = qconf->tx_queue_id[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = 0;
    hash = toeplitz_hash(rsskey_len, rsskey, datalen, data);

    return ((hash & (reta_size - 1)) % nb_queues) == queueid;
}

void
ff_regist_packet_dispatcher(dispatch_func_t func)
{
    packet_dispatcher = func;
}

uint64_t
ff_get_tsc_ns(void)
{
    uint64_t cur_tsc = rte_rdtsc();
    uint64_t hz = rte_get_tsc_hz();
    return ((double)cur_tsc / (double)hz) * NS_PER_S;
}
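/*
 * Illustrative sketch only (kept out of the build with #if 0): a minimal
 * packet dispatcher an application could register through
 * ff_regist_packet_dispatcher().  Returning a queue index steers the packet
 * to that queue, FF_DISPATCH_ERROR drops it, and FF_DISPATCH_RESPONSE sends
 * it straight back out of the port (see process_packets()).  The function
 * name and the UDP port number below are made up for the example.
 */
#if 0
static int
example_packet_dispatcher(void *data, uint16_t *len, uint16_t queue_id,
    uint16_t nb_queues)
{
    struct rte_ether_hdr *eth = (struct rte_ether_hdr *)data;

    if (rte_be_to_cpu_16(eth->ether_type) != RTE_ETHER_TYPE_IPV4)
        return queue_id;    /* leave non-IPv4 traffic on the receiving queue */

    struct rte_ipv4_hdr *ip = (struct rte_ipv4_hdr *)(eth + 1);
    if (ip->next_proto_id != IPPROTO_UDP)
        return queue_id;

    struct rte_udp_hdr *udp =
        (struct rte_udp_hdr *)((char *)ip + ((ip->version_ihl & 0x0f) << 2));

    /* Pin one hypothetical UDP service port to queue 0, spread the rest. */
    if (udp->dst_port == rte_cpu_to_be_16(9000))
        return 0;

    return rte_be_to_cpu_16(udp->dst_port) % nb_queues;
}

/* Registered once before ff_dpdk_run(), e.g.:
 *     ff_regist_packet_dispatcher(example_packet_dispatcher);
 */
#endif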