/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_memory.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_common.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_eth_bond.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"
#include "ff_memory.h"

#ifdef FF_KNI
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

int enable_kni;
static int kni_accept;
#endif

static int numa_on;

static unsigned idle_sleep;
static unsigned pkt_tx_delay;

static struct rte_timer freebsd_clock;

// Default 40-byte RSS hash key, as used by the Mellanox Linux driver
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

static int use_rsskey_52bytes = 0;
static uint8_t default_rsskey_52bytes[52] = {
    0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23,
    0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30,
    0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02,
    0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c,
    0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55,
    0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e,
    0x81, 0x15, 0x03, 0x66
};
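
/*
 * Note: these keys are not only handed to the NIC via rss_conf in
 * init_port_start(); the same bytes are fed to the software toeplitz_hash()
 * in ff_rss_check() below, so the hash computed in software should match the
 * one the hardware uses to pick an RX queue.  Which key is selected depends
 * on the dev_info.hash_key_size reported by the port (52 bytes on some NICs,
 * 40 bytes otherwise).
 */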
struct lcore_conf lcore_conf;

struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

#define BOND_DRIVER_NAME "net_bonding"

static inline int send_single_packet(struct rte_mbuf *m, uint8_t port);

struct ff_msg_ring {
    char ring_name[FF_MSG_NUM][RTE_RING_NAMESIZE];
    /* ring[0]: messages sent to this lcore by other processes */
    /* ring[1]: messages sent by this lcore for other processes to read */
    struct rte_ring *ring[FF_MSG_NUM];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;
static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

static struct ff_top_args ff_top_status;
static struct ff_traffic_args ff_traffic;
extern void ff_hardclock(void);

static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
    ff_update_current_ts();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
#define CHECK_INTERVAL 100 /* 100ms */
#define MAX_CHECK_TIME 90 /* 9s (90 * 100ms) in total */

    uint16_t portid;
    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u "
                        "Mbps - %s\n", (int)portid,
                        (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

static int
init_lcore_conf(void)
{
    uint8_t nb_dev_ports = rte_eth_dev_count_avail();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
            ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    uint16_t proc_id;
    for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) {
        uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id];
        if (!lcore_config[lcore_id].detected) {
            rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
        }
    }

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %u\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        lcore_conf.pcap[port_id] = pconf->pcap;
        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}

static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;
    uint16_t max_portid = ff_global_cfg.dpdk.max_portid;

    unsigned nb_mbuf = RTE_ALIGN_CEIL (
        (nb_rx_queue * (max_portid + 1) * 2 * RX_QUEUE_SIZE +
        nb_ports * (max_portid + 1) * 2 * nb_lcores * MAX_PKT_BURST +
        nb_ports * (max_portid + 1) * 2 * nb_tx_queue * TX_QUEUE_SIZE +
        nb_lcores * MEMPOOL_CACHE_SIZE +
#ifdef FF_KNI
        nb_ports * KNI_MBUF_MAX +
        nb_ports * KNI_QUEUE_SIZE +
#endif
        nb_lcores * nb_ports * DISPATCH_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

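    /*
     * Sizing note: nb_mbuf is an upper bound that leaves room for full RX
     * and TX rings on every queue, in-flight bursts, per-lcore mempool
     * caches and (optionally) the KNI and dispatch rings, rounded up to a
     * multiple of 8192 by RTE_ALIGN_CEIL above.
     *
     * One pool is created per NUMA socket in use; in secondary processes the
     * loop below only looks the pools up by name ("mbuf_pool_<socket>").
     */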
    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }

#ifdef FF_USE_PAGE_ARRAY
        nb_mbuf = RTE_ALIGN_CEIL (
            nb_ports * nb_lcores * MAX_PKT_BURST +
            nb_ports * nb_tx_queue * TX_QUEUE_SIZE +
            nb_lcores * MEMPOOL_CACHE_SIZE,
            (unsigned)4096);
        ff_init_ref_pool(nb_mbuf, socketid);
#endif
    }

    return 0;
}

static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL) {
        rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ring = rte_ring_create(name, count, socket_id, flags);
    } else {
        ring = rte_ring_lookup(name);
    }

    if (ring == NULL) {
        rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
    }

    return ring;
}
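
/*
 * The dispatch rings created below form a simple software steering fabric:
 * for every port there is one single-consumer ring per RX queue (named
 * "dispatch_ring_p<port>_q<queue>").  When process_packets() decides a
 * packet belongs to another queue -- either via a user dispatch callback or
 * because broadcast traffic such as ARP must be seen by every queue -- it
 * enqueues the mbuf (or a deep clone) onto that queue's ring, and the owning
 * lcore drains it in process_dispatch_ring().
 */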
static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create rings according to the ports actually being used. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for(queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
}

static int
init_msg_ring(void)
{
    uint16_t i, j;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
            MSG_RING_SIZE * 2 * nb_procs,
            MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
            NULL, NULL, ff_msg_init, NULL,
            socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for(i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[0]);

        for (j = FF_SYSCTL; j < FF_MSG_NUM; j++) {
            snprintf(msg_ring[i].ring_name[j], RTE_RING_NAMESIZE,
                "%s%u_%u", FF_MSG_RING_OUT, i, j);
            msg_ring[i].ring[j] = create_ring(msg_ring[i].ring_name[j],
                MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
            if (msg_ring[i].ring[j] == NULL)
                rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[j]);
        }
    }

    return 0;
}

#ifdef FF_KNI
static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count_avail();
    kni_accept = 0;
    if(strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i, ret;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
#endif
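
/*
 * set_rss_table() programs the NIC's RSS indirection table (RETA) so that
 * hash buckets are spread round-robin over the RX queues: with, say, a
 * 128-entry RETA and 4 queues the entries become 0,1,2,3,0,1,2,3,...  This
 * is what ties a packet's Toeplitz hash to the queue (and therefore the
 * f-stack process) that will handle it.
 */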
static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}

static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i, j;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id, u_port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[u_port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        for (j = 0; j <= pconf->nb_slaves; j++) {
            if (j < pconf->nb_slaves) {
                port_id = pconf->slave_portid_list[j];
                printf("To init %s's slave %d (port %d)\n",
                    ff_global_cfg.dpdk.bond_cfgs->name,
                    j, port_id);
            } else {
                port_id = u_port_id;
            }

            struct rte_eth_dev_info dev_info;
            struct rte_eth_conf port_conf = {0};
            struct rte_eth_rxconf rxq_conf;
            struct rte_eth_txconf txq_conf;

            rte_eth_dev_info_get(port_id, &dev_info);

            if (nb_queues > dev_info.max_rx_queues) {
                rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_rx_queues);
            }

            if (nb_queues > dev_info.max_tx_queues) {
                rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_tx_queues);
            }

            struct ether_addr addr;
            rte_eth_macaddr_get(port_id, &addr);
            printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                (unsigned)port_id,
                addr.addr_bytes[0], addr.addr_bytes[1],
                addr.addr_bytes[2], addr.addr_bytes[3],
                addr.addr_bytes[4], addr.addr_bytes[5]);

            rte_memcpy(pconf->mac,
                addr.addr_bytes, ETHER_ADDR_LEN);

            /* Set RSS mode */
            uint64_t default_rss_hf = ETH_RSS_PROTO_MASK;
            port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
            port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf;
            if (dev_info.hash_key_size == 52) {
                port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_52bytes;
                port_conf.rx_adv_conf.rss_conf.rss_key_len = 52;
                use_rsskey_52bytes = 1;
            } else {
                port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
                port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;
            }
            port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads;
            if (port_conf.rx_adv_conf.rss_conf.rss_hf !=
                ETH_RSS_PROTO_MASK) {
                printf("Port %u modified RSS hash function based on hardware support, "
                    "requested:%#"PRIx64" configured:%#"PRIx64"\n",
                    port_id, default_rss_hf,
                    port_conf.rx_adv_conf.rss_conf.rss_hf);
            }
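
            /*
             * Example: if ETH_RSS_PROTO_MASK requests IPv4+IPv6+TCP+UDP
             * hashing but the NIC only advertises IPv4/TCP support in
             * flow_type_rss_offloads, the unsupported bits are silently
             * dropped above and the printf reports the reduced mask.
             */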
            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) {
                port_conf.txmode.offloads |=
                    DEV_TX_OFFLOAD_MBUF_FAST_FREE;
            }

            /* Set Rx VLAN stripping */
            if (ff_global_cfg.dpdk.vlan_strip) {
                if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                    port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
                }
            }

            /* Enable HW CRC stripping */
            port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_KEEP_CRC;

            /* FIXME: Enable TCP LRO ? */
#if 0
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
                printf("LRO is supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
                pconf->hw_features.rx_lro = 1;
            }
#endif

            /* Set Rx checksum checking */
            if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
                printf("RX checksum offload supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
                pconf->hw_features.rx_csum = 1;
            }

            if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
                printf("TX ip checksum offload supported\n");
                port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
                pconf->hw_features.tx_csum_ip = 1;
            }

            if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
                (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
                printf("TX TCP&UDP checksum offload supported\n");
                port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM;
                pconf->hw_features.tx_csum_l4 = 1;
            }

            if (ff_global_cfg.dpdk.tso) {
                if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                    printf("TSO is supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
                    pconf->hw_features.tx_tso = 1;
                }
            } else {
                printf("TSO is disabled\n");
            }

            if (dev_info.reta_size) {
                /* reta size must be power of 2 */
                assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

                rss_reta_size[port_id] = dev_info.reta_size;
                printf("port[%d]: rss table size: %d\n", port_id,
                    dev_info.reta_size);
            }

            if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
                continue;
            }

            int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
            if (ret != 0) {
                return ret;
            }

            static uint16_t nb_rxd = RX_QUEUE_SIZE;
            static uint16_t nb_txd = TX_QUEUE_SIZE;
            ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
            if (ret < 0)
                printf("Could not adjust number of descriptors "
                    "for port%u (%d)\n", (unsigned)port_id, ret);

            uint16_t q;
            for (q = 0; q < nb_queues; q++) {
                if (numa_on) {
                    uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                    socketid = rte_lcore_to_socket_id(lcore_id);
                }
                mbuf_pool = pktmbuf_pool[socketid];

                txq_conf = dev_info.default_txconf;
                txq_conf.offloads = port_conf.txmode.offloads;
                ret = rte_eth_tx_queue_setup(port_id, q, nb_txd,
                    socketid, &txq_conf);
                if (ret < 0) {
                    return ret;
                }

                rxq_conf = dev_info.default_rxconf;
                rxq_conf.offloads = port_conf.rxmode.offloads;
                ret = rte_eth_rx_queue_setup(port_id, q, nb_rxd,
                    socketid, &rxq_conf, mbuf_pool);
                if (ret < 0) {
                    return ret;
                }
            }

            if (strncmp(dev_info.driver_name, BOND_DRIVER_NAME,
                strlen(dev_info.driver_name)) == 0) {

                rte_eth_macaddr_get(port_id, &addr);
                printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                    " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                    (unsigned)port_id,
                    addr.addr_bytes[0], addr.addr_bytes[1],
                    addr.addr_bytes[2], addr.addr_bytes[3],
                    addr.addr_bytes[4], addr.addr_bytes[5]);

                rte_memcpy(pconf->mac,
                    addr.addr_bytes, ETHER_ADDR_LEN);
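
                /*
                 * For ports backed by the net_bonding PMD the MAC address is
                 * read again here: the bond device may take over a slave's
                 * address once the slaves are attached, so the copy stored in
                 * pconf->mac above could otherwise be stale.  The bond mode
                 * and slave list are printed below purely for diagnostics.
                 */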
                int mode, count, x;
                uint16_t slaves[RTE_MAX_ETHPORTS], len = RTE_MAX_ETHPORTS;

                mode = rte_eth_bond_mode_get(port_id);
                printf("Port %u, bond mode:%d\n", port_id, mode);

                count = rte_eth_bond_slaves_get(port_id, slaves, len);
                printf("Port %u, %s's slave ports count:%d\n", port_id,
                    ff_global_cfg.dpdk.bond_cfgs->name, count);
                for (x = 0; x < count; x++) {
                    printf("Port %u, %s's slave port[%u]\n", port_id,
                        ff_global_cfg.dpdk.bond_cfgs->name, slaves[x]);
                }
            }

            ret = rte_eth_dev_start(port_id);
            if (ret < 0) {
                return ret;
            }

            if (nb_queues > 1) {
                /* set HW rss hash function to Toeplitz. */
                if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                    struct rte_eth_hash_filter_info info = {0};
                    info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                    info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                    if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                        RTE_ETH_FILTER_SET, &info) < 0) {
                        rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                            port_id);
                    }
                }

                set_rss_table(port_id, dev_info.reta_size, nb_queues);
            }

            /* Enable RX in promiscuous mode for the Ethernet device. */
            if (ff_global_cfg.dpdk.promiscuous) {
                rte_eth_promiscuous_enable(port_id);
                ret = rte_eth_promiscuous_get(port_id);
                if (ret == 1) {
                    printf("set port %u to promiscuous mode ok\n", port_id);
                } else {
                    printf("set port %u to promiscuous mode error\n", port_id);
                }
            }

            /* Enable pcap dump */
            if (pconf->pcap) {
                ff_enable_pcap(pconf->pcap);
            }
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}
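
/*
 * Worked example (assuming freebsd.hz = 100 in the configuration): intrs
 * becomes 1000 / 100 = 10 ms per tick, and tsc is the number of TSC cycles
 * in 10 ms, so ff_hardclock_job() -- and therefore the FreeBSD hardclock()
 * tick -- runs every 10 ms on the lcore that called init_clock().
 */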
int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    idle_sleep = ff_global_cfg.dpdk.idle_sleep;
    pkt_tx_delay = ff_global_cfg.dpdk.pkt_tx_delay > BURST_TX_DRAIN_US ?
        BURST_TX_DRAIN_US : ff_global_cfg.dpdk.pkt_tx_delay;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

#ifdef FF_KNI
    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }
#endif

#ifdef FF_USE_PAGE_ARRAY
    ff_mmap_init();
#endif

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    void *data = rte_pktmbuf_mtod(pkt, void*);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    if (pkt->ol_flags & PKT_RX_VLAN_STRIPPED) {
        ff_mbuf_set_vlan_info(hdr, pkt->vlan_tci);
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while(pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void*);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if(len < ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct ether_hdr *hdr;
    const struct vlan_hdr *vlanhdr;
    hdr = (const struct ether_hdr *)data;
    uint16_t ether_type = rte_be_to_cpu_16(hdr->ether_type);
    data += ETHER_HDR_LEN;
    len -= ETHER_HDR_LEN;

    if (ether_type == ETHER_TYPE_VLAN) {
        vlanhdr = (struct vlan_hdr *)data;
        ether_type = rte_be_to_cpu_16(vlanhdr->eth_proto);
        data += sizeof(struct vlan_hdr);
        len -= sizeof(struct vlan_hdr);
    }

    if(ether_type == ETHER_TYPE_ARP)
        return FILTER_ARP;

#ifdef INET6
    if (ether_type == ETHER_TYPE_IPv6) {
        return ff_kni_proto_filter(data,
            len, ether_type);
    }
#endif

#ifndef FF_KNI
    return FILTER_UNKNOWN;
#else
    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if(ether_type != ETHER_TYPE_IPv4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data,
        len, ether_type);
#endif
}

static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    struct rte_mbuf *md;
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}

/* copied from rte_pktmbuf_clone */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely ((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

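    /*
     * Unlike rte_pktmbuf_clone(), which creates indirect mbufs that share
     * the original data, the loop below copies every segment's payload into
     * freshly allocated mbufs (via pktmbuf_deep_attach), so the clone stays
     * valid even after the source mbuf has been freed or handed to another
     * queue.
     */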
    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of a new segment failed */
    if (unlikely (mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}

static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(qconf->pcap[port_id] != NULL)) {
            if (!pkts_from_ring) {
                ff_dump_packets(qconf->pcap[port_id], rtem);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void*);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring) {
            ff_traffic.rx_packets++;
            ff_traffic.rx_bytes += len;
        }

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues);
            if (ret == FF_DISPATCH_RESPONSE) {
                rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len;

                /*
                 * Outbound VLAN tag insertion is not offloaded yet, so
                 * re-insert the stripped VLAN header by hand before the
                 * response is sent back out.
                 */
                if (rtem->vlan_tci) {
                    data = rte_pktmbuf_prepend(rtem, sizeof(struct vlan_hdr));
                    if (data != NULL) {
                        memmove(data, data + sizeof(struct vlan_hdr), ETHER_HDR_LEN);
                        struct ether_hdr *etherhdr = (struct ether_hdr *)data;
                        struct vlan_hdr *vlanhdr = (struct vlan_hdr *)(data + ETHER_HDR_LEN);
                        vlanhdr->vlan_tci = rte_cpu_to_be_16(rtem->vlan_tci);
                        vlanhdr->eth_proto = etherhdr->ether_type;
                        etherhdr->ether_type = rte_cpu_to_be_16(ETHER_TYPE_VLAN);
                    }
                }
                send_single_packet(rtem, port_id);
                continue;
            }

            if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
#ifdef INET6
        if (filter == FILTER_ARP || filter == FILTER_NDP) {
#else
        if (filter == FILTER_ARP) {
#endif
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                uint16_t j;
                for(j = 0; j < nb_queues; ++j) {
                    if(j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if(mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if(mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni &&
            ((filter == FILTER_KNI && kni_accept) ||
            (filter == FILTER_UNKNOWN && !kni_accept)) ) {
            ff_kni_enqueue(port_id, rtem);
#endif
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* read packets from the dispatch ring and process them */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if(nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
#ifdef INET6
    if (msg->msg_type == FF_IOCTL6) {
        fd = ff_socket(AF_INET6, SOCK_DGRAM, 0);
    } else
#endif
        fd = ff_socket(AF_INET, SOCK_DGRAM, 0);

    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_top_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_traffic_msg(struct ff_msg *msg)
{
    msg->traffic = ff_traffic;
    msg->result = 0;
}

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
#ifdef INET6
        case FF_IOCTL6:
#endif
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        case FF_TRAFFIC:
            handle_traffic_msg(msg);
            break;
        default:
            handle_default_msg(msg);
            break;
    }
    rte_ring_enqueue(msg_ring[proc_id].ring[msg->msg_type], msg);
}

static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(qconf->pcap[port] != NULL)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(qconf->pcap[port], m_table[i]);
        }
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    ff_traffic.tx_packets += ret;
    uint16_t i;
    for (i = 0; i < ret; i++) {
        ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]);
#ifdef FF_USE_PAGE_ARRAY
        if (qconf->tx_mbufs[port].bsd_m_table[i])
            ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs);
#endif
    }
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
#ifdef FF_USE_PAGE_ARRAY
            if (qconf->tx_mbufs[port].bsd_m_table[ret])
                ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]);
#endif
        } while (++ret < n);
    }
    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}

int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
#ifdef FF_USE_PAGE_ARRAY
    struct lcore_conf *qconf = &lcore_conf;
    int len = 0;

    len = ff_if_send_onepkt(ctx, m, total);
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, ctx->port_id);
        len = 0;
    }
    qconf->tx_mbufs[ctx->port_id].len = len;
    return 0;
#endif
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while(total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        head->nb_segs++;

        prev = cur;
        void *data = rte_pktmbuf_mtod(cur, void*);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        cur->data_len = len;
        off += len;
        total -= len;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void*);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         * TCP segmentation offload.
         *
         * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *   implies PKT_TX_TCP_CKSUM)
         * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         * - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *   write the IP checksum to 0 in the packet
         * - fill the mbuf offload information: l2_len,
         *   l3_len, l4_len, tso_segsz
         * - calculate the pseudo header checksum without taking ip_len
         *   in account, and set it in the TCP header. Refer to
         *   rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *   used as helpers.
         */
        if (offload.tso_seg_size) {
            struct tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc;
    int i, j, nb_rx, idle;
    uint16_t port_id, queue_id;
    struct lcore_conf *qconf;
    uint64_t drain_tsc = 0;
    struct ff_dpdk_if_context *ctx;

    if (pkt_tx_delay) {
        drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * pkt_tx_delay;
    }

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc >= drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packet from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }
#endif

            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                    pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                    j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc >= drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        idle_sleep_tsc = rte_rdtsc();
        if (likely(idle && idle_sleep)) {
            usleep(idle_sleep);
            end_tsc = rte_rdtsc();
        } else {
            end_tsc = idle_sleep_tsc;
        }

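        /*
         * Rough CPU accounting for `ff_top`: sys_tsc covers the packet
         * processing above (cur_tsc..div_tsc), usr_tsc covers the user loop
         * callback (div_tsc..idle_sleep_tsc, only counted on iterations
         * where it actually ran), and whatever remains of work_tsc is
         * treated as idle time.
         */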
        if (usch_tsc == cur_tsc) {
            usr_tsc = idle_sleep_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_top_status.sys_tsc += sys_tsc;
        }

        ff_top_status.usr_tsc += usr_tsc;
        ff_top_status.work_tsc += end_tsc - cur_tsc;
        ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_top_status.loops++;
    }

    return 0;
}

int
ff_dpdk_if_up(void) {
    int i;
    struct lcore_conf *qconf = &lcore_conf;
    for (i = 0; i < qconf->nb_tx_port; i++) {
        uint16_t port_id = qconf->tx_port_id[i];

        struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
        veth_ctx[port_id] = ff_veth_attach(pconf);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed\n");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg) {
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1<<(7-b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i+4] & (1<<(7-b))))
                v |= 1;
        }
    }
    return (hash);
}

int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;
    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];

    if (nb_queues <= 1) {
        return 1;
    }

    uint16_t reta_size = rss_reta_size[ctx->port_id];
    uint16_t queueid = qconf->tx_queue_id[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = 0;
    if (!use_rsskey_52bytes)
        hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
            default_rsskey_40bytes, datalen, data);
    else
        hash = toeplitz_hash(sizeof(default_rsskey_52bytes),
            default_rsskey_52bytes, datalen, data);
    return ((hash & (reta_size - 1)) % nb_queues) == queueid;
}

void
ff_regist_packet_dispatcher(dispatch_func_t func)
{
    packet_dispatcher = func;
}

uint64_t
ff_get_tsc_ns()
{
    uint64_t cur_tsc = rte_rdtsc();
    uint64_t hz = rte_get_tsc_hz();
    return ((double)cur_tsc / (double)hz) * NS_PER_S;
}
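
/*
 * Typical call order, as driven by the higher-level ff_api/ff_init code (a
 * sketch, not a prescriptive API contract): ff_dpdk_init() brings up the EAL,
 * mempools, rings and ports; ff_dpdk_if_up() attaches one veth context per TX
 * port via ff_veth_attach(); and ff_dpdk_run(loop, arg) finally enters
 * main_loop() on every lcore until the process exits.
 */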