1 /* 2 * Copyright (C) 2017 THL A29 Limited, a Tencent company. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * 8 * 1. Redistributions of source code must retain the above copyright notice, this 9 * list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright notice, 11 * this list of conditions and the following disclaimer in the documentation 12 * and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 * 25 */ 26 #include <assert.h> 27 #include <unistd.h> 28 #include <sys/mman.h> 29 #include <errno.h> 30 31 #include <rte_common.h> 32 #include <rte_byteorder.h> 33 #include <rte_log.h> 34 #include <rte_memory.h> 35 #include <rte_memcpy.h> 36 #include <rte_memzone.h> 37 #include <rte_config.h> 38 #include <rte_eal.h> 39 #include <rte_pci.h> 40 #include <rte_mbuf.h> 41 #include <rte_memory.h> 42 #include <rte_lcore.h> 43 #include <rte_launch.h> 44 #include <rte_ethdev.h> 45 #include <rte_debug.h> 46 #include <rte_common.h> 47 #include <rte_ether.h> 48 #include <rte_malloc.h> 49 #include <rte_cycles.h> 50 #include <rte_timer.h> 51 #include <rte_thash.h> 52 #include <rte_ip.h> 53 #include <rte_tcp.h> 54 #include <rte_udp.h> 55 #include <rte_eth_bond.h> 56 57 #include "ff_dpdk_if.h" 58 #include "ff_dpdk_pcap.h" 59 #include "ff_dpdk_kni.h" 60 #include "ff_config.h" 61 #include "ff_veth.h" 62 #include "ff_host_interface.h" 63 #include "ff_msg.h" 64 #include "ff_api.h" 65 #include "ff_memory.h" 66 67 #ifdef FF_KNI 68 #define KNI_MBUF_MAX 2048 69 #define KNI_QUEUE_SIZE 2048 70 71 int enable_kni; 72 static int kni_accept; 73 #endif 74 75 static int numa_on; 76 77 static unsigned idle_sleep; 78 static unsigned pkt_tx_delay; 79 80 static struct rte_timer freebsd_clock; 81 82 // Mellanox Linux's driver key 83 static uint8_t default_rsskey_40bytes[40] = { 84 0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b, 85 0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb, 86 0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c, 87 0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9, 88 0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc 89 }; 90 91 static int use_rsskey_52bytes = 0; 92 static uint8_t default_rsskey_52bytes[52] = { 93 0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23, 94 0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30, 95 0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02, 96 0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c, 97 0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55, 98 0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e, 99 0x81, 0x15, 0x03, 0x66 100 }; 101 102 struct lcore_conf lcore_conf; 103 104 struct rte_mempool *pktmbuf_pool[NB_SOCKETS]; 105 106 static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS]; 107 static dispatch_func_t packet_dispatcher; 108 109 static uint16_t rss_reta_size[RTE_MAX_ETHPORTS]; 110 111 #define BOND_DRIVER_NAME "net_bonding" 112 113 static inline int send_single_packet(struct rte_mbuf *m, uint8_t port); 114 115 struct ff_msg_ring { 116 char ring_name[FF_MSG_NUM][RTE_RING_NAMESIZE]; 117 /* ring[0] for lcore recv msg, other send */ 118 /* ring[1] for lcore send msg, other read */ 119 struct rte_ring *ring[FF_MSG_NUM]; 120 } __rte_cache_aligned; 121 122 static struct ff_msg_ring msg_ring[RTE_MAX_LCORE]; 123 static struct rte_mempool *message_pool; 124 static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS]; 125 126 static struct ff_top_args ff_top_status; 127 static struct ff_traffic_args ff_traffic; 128 extern void ff_hardclock(void); 129 130 static void 131 ff_hardclock_job(__rte_unused struct rte_timer *timer, 132 __rte_unused void *arg) { 133 ff_hardclock(); 134 ff_update_current_ts(); 135 } 136 137 struct ff_dpdk_if_context * 138 ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg) 139 { 140 struct ff_dpdk_if_context *ctx; 141 142 ctx = calloc(1, sizeof(struct ff_dpdk_if_context)); 143 if (ctx == NULL) 144 return NULL; 145 146 ctx->sc = sc; 147 ctx->ifp = ifp; 148 ctx->port_id = cfg->port_id; 149 ctx->hw_features = cfg->hw_features; 150 151 return ctx; 152 } 153 154 void 155 ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx) 156 { 157 free(ctx); 158 } 159 160 static void 161 check_all_ports_link_status(void) 162 { 163 #define CHECK_INTERVAL 100 /* 100ms */ 164 #define MAX_CHECK_TIME 90 /* 9s (90 * 100ms) in total */ 165 166 uint16_t portid; 167 uint8_t count, all_ports_up, print_flag = 0; 168 struct rte_eth_link link; 169 170 printf("\nChecking link status"); 171 fflush(stdout); 172 173 int i, nb_ports; 174 nb_ports = ff_global_cfg.dpdk.nb_ports; 175 for (count = 0; count <= MAX_CHECK_TIME; count++) { 176 all_ports_up = 1; 177 for (i = 0; i < nb_ports; i++) { 178 uint16_t portid = ff_global_cfg.dpdk.portid_list[i]; 179 memset(&link, 0, sizeof(link)); 180 rte_eth_link_get_nowait(portid, &link); 181 182 /* print link status if flag set */ 183 if (print_flag == 1) { 184 if (link.link_status) { 185 printf("Port %d Link Up - speed %u " 186 "Mbps - %s\n", (int)portid, 187 (unsigned)link.link_speed, 188 (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? 189 ("full-duplex") : ("half-duplex\n")); 190 } else { 191 printf("Port %d Link Down\n", (int)portid); 192 } 193 continue; 194 } 195 /* clear all_ports_up flag if any link down */ 196 if (link.link_status == 0) { 197 all_ports_up = 0; 198 break; 199 } 200 } 201 202 /* after finally printing all link status, get out */ 203 if (print_flag == 1) 204 break; 205 206 if (all_ports_up == 0) { 207 printf("."); 208 fflush(stdout); 209 rte_delay_ms(CHECK_INTERVAL); 210 } 211 212 /* set the print_flag if all ports up or timeout */ 213 if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) { 214 print_flag = 1; 215 printf("done\n"); 216 } 217 } 218 } 219 220 static int 221 init_lcore_conf(void) 222 { 223 uint8_t nb_dev_ports = rte_eth_dev_count_avail(); 224 if (nb_dev_ports == 0) { 225 rte_exit(EXIT_FAILURE, "No probed ethernet devices\n"); 226 } 227 228 if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) { 229 rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n", 230 ff_global_cfg.dpdk.max_portid); 231 } 232 233 lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs; 234 lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id; 235 236 uint16_t proc_id; 237 for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) { 238 uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id]; 239 if (!lcore_config[lcore_id].detected) { 240 rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id); 241 } 242 } 243 244 uint16_t socket_id = 0; 245 if (numa_on) { 246 socket_id = rte_lcore_to_socket_id(rte_lcore_id()); 247 } 248 249 lcore_conf.socket_id = socket_id; 250 251 uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id]; 252 int j; 253 for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) { 254 uint16_t port_id = ff_global_cfg.dpdk.portid_list[j]; 255 struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id]; 256 257 int queueid = -1; 258 int i; 259 for (i = 0; i < pconf->nb_lcores; i++) { 260 if (pconf->lcore_list[i] == lcore_id) { 261 queueid = i; 262 } 263 } 264 if (queueid < 0) { 265 continue; 266 } 267 printf("lcore: %u, port: %u, queue: %u\n", lcore_id, port_id, queueid); 268 uint16_t nb_rx_queue = lcore_conf.nb_rx_queue; 269 lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id; 270 lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid; 271 lcore_conf.nb_rx_queue++; 272 273 lcore_conf.tx_queue_id[port_id] = queueid; 274 lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id; 275 lcore_conf.nb_tx_port++; 276 277 lcore_conf.pcap[port_id] = pconf->pcap; 278 lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores; 279 } 280 281 if (lcore_conf.nb_rx_queue == 0) { 282 rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id); 283 } 284 285 return 0; 286 } 287 288 static int 289 init_mem_pool(void) 290 { 291 uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports; 292 uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs; 293 uint32_t nb_tx_queue = nb_lcores; 294 uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores; 295 uint16_t max_portid = ff_global_cfg.dpdk.max_portid; 296 297 unsigned nb_mbuf = RTE_ALIGN_CEIL ( 298 (nb_rx_queue * (max_portid + 1) * 2 * RX_QUEUE_SIZE + 299 nb_ports * (max_portid + 1) * 2 * nb_lcores * MAX_PKT_BURST + 300 nb_ports * (max_portid + 1) * 2 * nb_tx_queue * TX_QUEUE_SIZE + 301 nb_lcores * MEMPOOL_CACHE_SIZE + 302 #ifdef FF_KNI 303 nb_ports * KNI_MBUF_MAX + 304 nb_ports * KNI_QUEUE_SIZE + 305 #endif 306 nb_lcores * nb_ports * DISPATCH_RING_SIZE), 307 (unsigned)8192); 308 309 unsigned socketid = 0; 310 uint16_t i, lcore_id; 311 char s[64]; 312 313 for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) { 314 lcore_id = ff_global_cfg.dpdk.proc_lcore[i]; 315 if (numa_on) { 316 socketid = rte_lcore_to_socket_id(lcore_id); 317 } 318 319 if (socketid >= NB_SOCKETS) { 320 rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n", 321 socketid, i, NB_SOCKETS); 322 } 323 324 if (pktmbuf_pool[socketid] != NULL) { 325 continue; 326 } 327 328 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 329 snprintf(s, sizeof(s), "mbuf_pool_%d", socketid); 330 pktmbuf_pool[socketid] = 331 rte_pktmbuf_pool_create(s, nb_mbuf, 332 MEMPOOL_CACHE_SIZE, 0, 333 RTE_MBUF_DEFAULT_BUF_SIZE, socketid); 334 } else { 335 snprintf(s, sizeof(s), "mbuf_pool_%d", socketid); 336 pktmbuf_pool[socketid] = rte_mempool_lookup(s); 337 } 338 339 if (pktmbuf_pool[socketid] == NULL) { 340 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid); 341 } else { 342 printf("create mbuf pool on socket %d\n", socketid); 343 } 344 345 #ifdef FF_USE_PAGE_ARRAY 346 nb_mbuf = RTE_ALIGN_CEIL ( 347 nb_ports*nb_lcores*MAX_PKT_BURST + 348 nb_ports*nb_tx_queue*TX_QUEUE_SIZE + 349 nb_lcores*MEMPOOL_CACHE_SIZE, 350 (unsigned)4096); 351 ff_init_ref_pool(nb_mbuf, socketid); 352 #endif 353 } 354 355 return 0; 356 } 357 358 static struct rte_ring * 359 create_ring(const char *name, unsigned count, int socket_id, unsigned flags) 360 { 361 struct rte_ring *ring; 362 363 if (name == NULL) { 364 rte_exit(EXIT_FAILURE, "create ring failed, no name!\n"); 365 } 366 367 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 368 ring = rte_ring_create(name, count, socket_id, flags); 369 } else { 370 ring = rte_ring_lookup(name); 371 } 372 373 if (ring == NULL) { 374 rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name); 375 } 376 377 return ring; 378 } 379 380 static int 381 init_dispatch_ring(void) 382 { 383 int j; 384 char name_buf[RTE_RING_NAMESIZE]; 385 int queueid; 386 387 unsigned socketid = lcore_conf.socket_id; 388 389 /* Create ring according to ports actually being used. */ 390 int nb_ports = ff_global_cfg.dpdk.nb_ports; 391 for (j = 0; j < nb_ports; j++) { 392 uint16_t portid = ff_global_cfg.dpdk.portid_list[j]; 393 struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid]; 394 int nb_queues = pconf->nb_lcores; 395 if (dispatch_ring[portid] == NULL) { 396 snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid); 397 398 dispatch_ring[portid] = rte_zmalloc(name_buf, 399 sizeof(struct rte_ring *) * nb_queues, 400 RTE_CACHE_LINE_SIZE); 401 if (dispatch_ring[portid] == NULL) { 402 rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) " 403 "failed\n", name_buf); 404 } 405 } 406 407 for(queueid = 0; queueid < nb_queues; ++queueid) { 408 snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d", 409 portid, queueid); 410 dispatch_ring[portid][queueid] = create_ring(name_buf, 411 DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ); 412 413 if (dispatch_ring[portid][queueid] == NULL) 414 rte_panic("create ring:%s failed!\n", name_buf); 415 416 printf("create ring:%s success, %u ring entries are now free!\n", 417 name_buf, rte_ring_free_count(dispatch_ring[portid][queueid])); 418 } 419 } 420 421 return 0; 422 } 423 424 static void 425 ff_msg_init(struct rte_mempool *mp, 426 __attribute__((unused)) void *opaque_arg, 427 void *obj, __attribute__((unused)) unsigned i) 428 { 429 struct ff_msg *msg = (struct ff_msg *)obj; 430 msg->msg_type = FF_UNKNOWN; 431 msg->buf_addr = (char *)msg + sizeof(struct ff_msg); 432 msg->buf_len = mp->elt_size - sizeof(struct ff_msg); 433 } 434 435 static int 436 init_msg_ring(void) 437 { 438 uint16_t i, j; 439 uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs; 440 unsigned socketid = lcore_conf.socket_id; 441 442 /* Create message buffer pool */ 443 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 444 message_pool = rte_mempool_create(FF_MSG_POOL, 445 MSG_RING_SIZE * 2 * nb_procs, 446 MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0, 447 NULL, NULL, ff_msg_init, NULL, 448 socketid, 0); 449 } else { 450 message_pool = rte_mempool_lookup(FF_MSG_POOL); 451 } 452 453 if (message_pool == NULL) { 454 rte_panic("Create msg mempool failed\n"); 455 } 456 457 for(i = 0; i < nb_procs; ++i) { 458 snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE, 459 "%s%u", FF_MSG_RING_IN, i); 460 msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0], 461 MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ); 462 if (msg_ring[i].ring[0] == NULL) 463 rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[0]); 464 465 for (j = FF_SYSCTL; j < FF_MSG_NUM; j++) { 466 snprintf(msg_ring[i].ring_name[j], RTE_RING_NAMESIZE, 467 "%s%u_%u", FF_MSG_RING_OUT, i, j); 468 msg_ring[i].ring[j] = create_ring(msg_ring[i].ring_name[j], 469 MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ); 470 if (msg_ring[i].ring[j] == NULL) 471 rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[j]); 472 } 473 } 474 475 return 0; 476 } 477 478 #ifdef FF_KNI 479 static int 480 init_kni(void) 481 { 482 int nb_ports = rte_eth_dev_count_avail(); 483 kni_accept = 0; 484 if(strcasecmp(ff_global_cfg.kni.method, "accept") == 0) 485 kni_accept = 1; 486 487 ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port, 488 ff_global_cfg.kni.udp_port); 489 490 unsigned socket_id = lcore_conf.socket_id; 491 struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id]; 492 493 nb_ports = ff_global_cfg.dpdk.nb_ports; 494 int i, ret; 495 for (i = 0; i < nb_ports; i++) { 496 uint16_t port_id = ff_global_cfg.dpdk.portid_list[i]; 497 ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE); 498 } 499 500 return 0; 501 } 502 #endif 503 504 static void 505 set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues) 506 { 507 if (reta_size == 0) { 508 return; 509 } 510 511 int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE); 512 struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size]; 513 514 /* config HW indirection table */ 515 unsigned i, j, hash=0; 516 for (i = 0; i < reta_conf_size; i++) { 517 reta_conf[i].mask = ~0ULL; 518 for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) { 519 reta_conf[i].reta[j] = hash++ % nb_queues; 520 } 521 } 522 523 if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) { 524 rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n", 525 port_id); 526 } 527 } 528 529 static int 530 init_port_start(void) 531 { 532 int nb_ports = ff_global_cfg.dpdk.nb_ports; 533 unsigned socketid = 0; 534 struct rte_mempool *mbuf_pool; 535 uint16_t i, j; 536 537 for (i = 0; i < nb_ports; i++) { 538 uint16_t port_id, u_port_id = ff_global_cfg.dpdk.portid_list[i]; 539 struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[u_port_id]; 540 uint16_t nb_queues = pconf->nb_lcores; 541 542 for (j=0; j<=pconf->nb_slaves; j++) { 543 if (j < pconf->nb_slaves) { 544 port_id = pconf->slave_portid_list[j]; 545 printf("To init %s's %d'st slave port[%d]\n", 546 ff_global_cfg.dpdk.bond_cfgs->name, 547 j, port_id); 548 } else { 549 port_id = u_port_id; 550 } 551 552 struct rte_eth_dev_info dev_info; 553 struct rte_eth_conf port_conf = {0}; 554 struct rte_eth_rxconf rxq_conf; 555 struct rte_eth_txconf txq_conf; 556 557 rte_eth_dev_info_get(port_id, &dev_info); 558 559 if (nb_queues > dev_info.max_rx_queues) { 560 rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n", 561 nb_queues, 562 dev_info.max_rx_queues); 563 } 564 565 if (nb_queues > dev_info.max_tx_queues) { 566 rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n", 567 nb_queues, 568 dev_info.max_tx_queues); 569 } 570 571 struct ether_addr addr; 572 rte_eth_macaddr_get(port_id, &addr); 573 printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8 574 " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n", 575 (unsigned)port_id, 576 addr.addr_bytes[0], addr.addr_bytes[1], 577 addr.addr_bytes[2], addr.addr_bytes[3], 578 addr.addr_bytes[4], addr.addr_bytes[5]); 579 580 rte_memcpy(pconf->mac, 581 addr.addr_bytes, ETHER_ADDR_LEN); 582 583 /* Set RSS mode */ 584 uint64_t default_rss_hf = ETH_RSS_PROTO_MASK; 585 port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS; 586 port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf; 587 if (dev_info.hash_key_size == 52) { 588 port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_52bytes; 589 port_conf.rx_adv_conf.rss_conf.rss_key_len = 52; 590 use_rsskey_52bytes = 1; 591 } else { 592 port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes; 593 port_conf.rx_adv_conf.rss_conf.rss_key_len = 40; 594 } 595 port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads; 596 if (port_conf.rx_adv_conf.rss_conf.rss_hf != 597 ETH_RSS_PROTO_MASK) { 598 printf("Port %u modified RSS hash function based on hardware support," 599 "requested:%#"PRIx64" configured:%#"PRIx64"\n", 600 port_id, default_rss_hf, 601 port_conf.rx_adv_conf.rss_conf.rss_hf); 602 } 603 604 if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) { 605 port_conf.txmode.offloads |= 606 DEV_TX_OFFLOAD_MBUF_FAST_FREE; 607 } 608 609 /* Set Rx VLAN stripping */ 610 if (ff_global_cfg.dpdk.vlan_strip) { 611 if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) { 612 port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP; 613 } 614 } 615 616 /* Enable HW CRC stripping */ 617 port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_KEEP_CRC; 618 619 /* FIXME: Enable TCP LRO ?*/ 620 #if 0 621 if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) { 622 printf("LRO is supported\n"); 623 port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO; 624 pconf->hw_features.rx_lro = 1; 625 } 626 #endif 627 628 /* Set Rx checksum checking */ 629 if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) && 630 (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) && 631 (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) { 632 printf("RX checksum offload supported\n"); 633 port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM; 634 pconf->hw_features.rx_csum = 1; 635 } 636 637 if (ff_global_cfg.dpdk.tx_csum_offoad_skip == 0) { 638 if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) { 639 printf("TX ip checksum offload supported\n"); 640 port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM; 641 pconf->hw_features.tx_csum_ip = 1; 642 } 643 644 if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) && 645 (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) { 646 printf("TX TCP&UDP checksum offload supported\n"); 647 port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM; 648 pconf->hw_features.tx_csum_l4 = 1; 649 } 650 } else { 651 printf("TX checksum offoad is disabled\n"); 652 } 653 654 if (ff_global_cfg.dpdk.tso) { 655 if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) { 656 printf("TSO is supported\n"); 657 port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO; 658 pconf->hw_features.tx_tso = 1; 659 } 660 } else { 661 printf("TSO is disabled\n"); 662 } 663 664 if (dev_info.reta_size) { 665 /* reta size must be power of 2 */ 666 assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0); 667 668 rss_reta_size[port_id] = dev_info.reta_size; 669 printf("port[%d]: rss table size: %d\n", port_id, 670 dev_info.reta_size); 671 } 672 673 if (rte_eal_process_type() != RTE_PROC_PRIMARY) { 674 continue; 675 } 676 677 int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf); 678 if (ret != 0) { 679 return ret; 680 } 681 682 static uint16_t nb_rxd = RX_QUEUE_SIZE; 683 static uint16_t nb_txd = TX_QUEUE_SIZE; 684 ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd); 685 if (ret < 0) 686 printf("Could not adjust number of descriptors " 687 "for port%u (%d)\n", (unsigned)port_id, ret); 688 689 uint16_t q; 690 for (q = 0; q < nb_queues; q++) { 691 if (numa_on) { 692 uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q]; 693 socketid = rte_lcore_to_socket_id(lcore_id); 694 } 695 mbuf_pool = pktmbuf_pool[socketid]; 696 697 txq_conf = dev_info.default_txconf; 698 txq_conf.offloads = port_conf.txmode.offloads; 699 ret = rte_eth_tx_queue_setup(port_id, q, nb_txd, 700 socketid, &txq_conf); 701 if (ret < 0) { 702 return ret; 703 } 704 705 rxq_conf = dev_info.default_rxconf; 706 rxq_conf.offloads = port_conf.rxmode.offloads; 707 ret = rte_eth_rx_queue_setup(port_id, q, nb_rxd, 708 socketid, &rxq_conf, mbuf_pool); 709 if (ret < 0) { 710 return ret; 711 } 712 } 713 714 715 if (strncmp(dev_info.driver_name, BOND_DRIVER_NAME, 716 strlen(dev_info.driver_name)) == 0) { 717 718 rte_eth_macaddr_get(port_id, &addr); 719 printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8 720 " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n", 721 (unsigned)port_id, 722 addr.addr_bytes[0], addr.addr_bytes[1], 723 addr.addr_bytes[2], addr.addr_bytes[3], 724 addr.addr_bytes[4], addr.addr_bytes[5]); 725 726 rte_memcpy(pconf->mac, 727 addr.addr_bytes, ETHER_ADDR_LEN); 728 729 int mode, count, x; 730 uint16_t slaves[RTE_MAX_ETHPORTS], len = RTE_MAX_ETHPORTS; 731 732 mode = rte_eth_bond_mode_get(port_id); 733 printf("Port %u, bond mode:%d\n", port_id, mode); 734 735 count = rte_eth_bond_slaves_get(port_id, slaves, len); 736 printf("Port %u, %s's slave ports count:%d\n", port_id, 737 ff_global_cfg.dpdk.bond_cfgs->name, count); 738 for (x=0; x<count; x++) { 739 printf("Port %u, %s's slave port[%u]\n", port_id, 740 ff_global_cfg.dpdk.bond_cfgs->name, slaves[x]); 741 } 742 } 743 744 ret = rte_eth_dev_start(port_id); 745 if (ret < 0) { 746 return ret; 747 } 748 749 if (nb_queues > 1) { 750 /* set HW rss hash function to Toeplitz. */ 751 if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) { 752 struct rte_eth_hash_filter_info info = {0}; 753 info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG; 754 info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ; 755 756 if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH, 757 RTE_ETH_FILTER_SET, &info) < 0) { 758 rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n", 759 port_id); 760 } 761 } 762 763 set_rss_table(port_id, dev_info.reta_size, nb_queues); 764 } 765 766 /* Enable RX in promiscuous mode for the Ethernet device. */ 767 if (ff_global_cfg.dpdk.promiscuous) { 768 rte_eth_promiscuous_enable(port_id); 769 ret = rte_eth_promiscuous_get(port_id); 770 if (ret == 1) { 771 printf("set port %u to promiscuous mode ok\n", port_id); 772 } else { 773 printf("set port %u to promiscuous mode error\n", port_id); 774 } 775 } 776 777 /* Enable pcap dump */ 778 if (pconf->pcap) { 779 ff_enable_pcap(pconf->pcap); 780 } 781 } 782 } 783 784 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 785 check_all_ports_link_status(); 786 } 787 788 return 0; 789 } 790 791 static int 792 init_clock(void) 793 { 794 rte_timer_subsystem_init(); 795 uint64_t hz = rte_get_timer_hz(); 796 uint64_t intrs = MS_PER_S/ff_global_cfg.freebsd.hz; 797 uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S*intrs; 798 799 rte_timer_init(&freebsd_clock); 800 rte_timer_reset(&freebsd_clock, tsc, PERIODICAL, 801 rte_lcore_id(), &ff_hardclock_job, NULL); 802 803 ff_update_current_ts(); 804 805 return 0; 806 } 807 808 int 809 ff_dpdk_init(int argc, char **argv) 810 { 811 if (ff_global_cfg.dpdk.nb_procs < 1 || 812 ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE || 813 ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs || 814 ff_global_cfg.dpdk.proc_id < 0) { 815 printf("param num_procs[%d] or proc_id[%d] error!\n", 816 ff_global_cfg.dpdk.nb_procs, 817 ff_global_cfg.dpdk.proc_id); 818 exit(1); 819 } 820 821 int ret = rte_eal_init(argc, argv); 822 if (ret < 0) { 823 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 824 } 825 826 numa_on = ff_global_cfg.dpdk.numa_on; 827 828 idle_sleep = ff_global_cfg.dpdk.idle_sleep; 829 pkt_tx_delay = ff_global_cfg.dpdk.pkt_tx_delay > BURST_TX_DRAIN_US ? \ 830 BURST_TX_DRAIN_US : ff_global_cfg.dpdk.pkt_tx_delay; 831 832 init_lcore_conf(); 833 834 init_mem_pool(); 835 836 init_dispatch_ring(); 837 838 init_msg_ring(); 839 840 #ifdef FF_KNI 841 enable_kni = ff_global_cfg.kni.enable; 842 if (enable_kni) { 843 init_kni(); 844 } 845 #endif 846 847 #ifdef FF_USE_PAGE_ARRAY 848 ff_mmap_init(); 849 #endif 850 851 ret = init_port_start(); 852 if (ret < 0) { 853 rte_exit(EXIT_FAILURE, "init_port_start failed\n"); 854 } 855 856 init_clock(); 857 858 return 0; 859 } 860 861 static void 862 ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt) 863 { 864 uint8_t rx_csum = ctx->hw_features.rx_csum; 865 if (rx_csum) { 866 if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) { 867 rte_pktmbuf_free(pkt); 868 return; 869 } 870 } 871 872 void *data = rte_pktmbuf_mtod(pkt, void*); 873 uint16_t len = rte_pktmbuf_data_len(pkt); 874 875 void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum); 876 if (hdr == NULL) { 877 rte_pktmbuf_free(pkt); 878 return; 879 } 880 881 if (pkt->ol_flags & PKT_RX_VLAN_STRIPPED) { 882 ff_mbuf_set_vlan_info(hdr, pkt->vlan_tci); 883 } 884 885 struct rte_mbuf *pn = pkt->next; 886 void *prev = hdr; 887 while(pn != NULL) { 888 data = rte_pktmbuf_mtod(pn, void*); 889 len = rte_pktmbuf_data_len(pn); 890 891 void *mb = ff_mbuf_get(prev, data, len); 892 if (mb == NULL) { 893 ff_mbuf_free(hdr); 894 rte_pktmbuf_free(pkt); 895 return; 896 } 897 pn = pn->next; 898 prev = mb; 899 } 900 901 ff_veth_process_packet(ctx->ifp, hdr); 902 } 903 904 static enum FilterReturn 905 protocol_filter(const void *data, uint16_t len) 906 { 907 if(len < ETHER_HDR_LEN) 908 return FILTER_UNKNOWN; 909 910 const struct ether_hdr *hdr; 911 const struct vlan_hdr *vlanhdr; 912 hdr = (const struct ether_hdr *)data; 913 uint16_t ether_type = rte_be_to_cpu_16(hdr->ether_type); 914 data += ETHER_HDR_LEN; 915 len -= ETHER_HDR_LEN; 916 917 if (ether_type == ETHER_TYPE_VLAN) { 918 vlanhdr = (struct vlan_hdr *)data; 919 ether_type = rte_be_to_cpu_16(vlanhdr->eth_proto); 920 data += sizeof(struct vlan_hdr); 921 len -= sizeof(struct vlan_hdr); 922 } 923 924 if(ether_type == ETHER_TYPE_ARP) 925 return FILTER_ARP; 926 927 #ifdef INET6 928 if (ether_type == ETHER_TYPE_IPv6) { 929 return ff_kni_proto_filter(data, 930 len, ether_type); 931 } 932 #endif 933 934 #ifndef FF_KNI 935 return FILTER_UNKNOWN; 936 #else 937 if (!enable_kni) { 938 return FILTER_UNKNOWN; 939 } 940 941 if(ether_type != ETHER_TYPE_IPv4) 942 return FILTER_UNKNOWN; 943 944 return ff_kni_proto_filter(data, 945 len, ether_type); 946 #endif 947 } 948 949 static inline void 950 pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m) 951 { 952 struct rte_mbuf *md; 953 void *src, *dst; 954 955 dst = rte_pktmbuf_mtod(mi, void *); 956 src = rte_pktmbuf_mtod(m, void *); 957 958 mi->data_len = m->data_len; 959 rte_memcpy(dst, src, m->data_len); 960 961 mi->port = m->port; 962 mi->vlan_tci = m->vlan_tci; 963 mi->vlan_tci_outer = m->vlan_tci_outer; 964 mi->tx_offload = m->tx_offload; 965 mi->hash = m->hash; 966 mi->ol_flags = m->ol_flags; 967 mi->packet_type = m->packet_type; 968 } 969 970 /* copied from rte_pktmbuf_clone */ 971 static inline struct rte_mbuf * 972 pktmbuf_deep_clone(const struct rte_mbuf *md, 973 struct rte_mempool *mp) 974 { 975 struct rte_mbuf *mc, *mi, **prev; 976 uint32_t pktlen; 977 uint8_t nseg; 978 979 if (unlikely ((mc = rte_pktmbuf_alloc(mp)) == NULL)) 980 return NULL; 981 982 mi = mc; 983 prev = &mi->next; 984 pktlen = md->pkt_len; 985 nseg = 0; 986 987 do { 988 nseg++; 989 pktmbuf_deep_attach(mi, md); 990 *prev = mi; 991 prev = &mi->next; 992 } while ((md = md->next) != NULL && 993 (mi = rte_pktmbuf_alloc(mp)) != NULL); 994 995 *prev = NULL; 996 mc->nb_segs = nseg; 997 mc->pkt_len = pktlen; 998 999 /* Allocation of new indirect segment failed */ 1000 if (unlikely (mi == NULL)) { 1001 rte_pktmbuf_free(mc); 1002 return NULL; 1003 } 1004 1005 __rte_mbuf_sanity_check(mc, 1); 1006 return mc; 1007 } 1008 1009 static inline void 1010 process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs, 1011 uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring) 1012 { 1013 struct lcore_conf *qconf = &lcore_conf; 1014 uint16_t nb_queues = qconf->nb_queue_list[port_id]; 1015 1016 uint16_t i; 1017 for (i = 0; i < count; i++) { 1018 struct rte_mbuf *rtem = bufs[i]; 1019 1020 if (unlikely(qconf->pcap[port_id] != NULL)) { 1021 if (!pkts_from_ring) { 1022 ff_dump_packets(qconf->pcap[port_id], rtem); 1023 } 1024 } 1025 1026 void *data = rte_pktmbuf_mtod(rtem, void*); 1027 uint16_t len = rte_pktmbuf_data_len(rtem); 1028 1029 if (!pkts_from_ring) { 1030 ff_traffic.rx_packets++; 1031 ff_traffic.rx_bytes += len; 1032 } 1033 1034 if (!pkts_from_ring && packet_dispatcher) { 1035 int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues); 1036 if (ret == FF_DISPATCH_RESPONSE) { 1037 rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len; 1038 1039 /* 1040 * We have not support vlan out strip 1041 */ 1042 if (rtem->vlan_tci) { 1043 data = rte_pktmbuf_prepend(rtem, sizeof(struct vlan_hdr)); 1044 if (data != NULL) { 1045 memmove(data, data + sizeof(struct vlan_hdr), ETHER_HDR_LEN); 1046 struct ether_hdr *etherhdr = (struct ether_hdr *)data; 1047 struct vlan_hdr *vlanhdr = (struct vlan_hdr *)(data + ETHER_HDR_LEN); 1048 vlanhdr->vlan_tci = rte_cpu_to_be_16(rtem->vlan_tci); 1049 vlanhdr->eth_proto = etherhdr->ether_type; 1050 etherhdr->ether_type = rte_cpu_to_be_16(ETHER_TYPE_VLAN); 1051 } 1052 } 1053 send_single_packet(rtem, port_id); 1054 continue; 1055 } 1056 1057 if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) { 1058 rte_pktmbuf_free(rtem); 1059 continue; 1060 } 1061 1062 if (ret != queue_id) { 1063 ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem); 1064 if (ret < 0) 1065 rte_pktmbuf_free(rtem); 1066 1067 continue; 1068 } 1069 } 1070 1071 enum FilterReturn filter = protocol_filter(data, len); 1072 #ifdef INET6 1073 if (filter == FILTER_ARP || filter == FILTER_NDP) { 1074 #else 1075 if (filter == FILTER_ARP) { 1076 #endif 1077 struct rte_mempool *mbuf_pool; 1078 struct rte_mbuf *mbuf_clone; 1079 if (!pkts_from_ring) { 1080 uint16_t j; 1081 for(j = 0; j < nb_queues; ++j) { 1082 if(j == queue_id) 1083 continue; 1084 1085 unsigned socket_id = 0; 1086 if (numa_on) { 1087 uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j]; 1088 socket_id = rte_lcore_to_socket_id(lcore_id); 1089 } 1090 mbuf_pool = pktmbuf_pool[socket_id]; 1091 mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool); 1092 if(mbuf_clone) { 1093 int ret = rte_ring_enqueue(dispatch_ring[port_id][j], 1094 mbuf_clone); 1095 if (ret < 0) 1096 rte_pktmbuf_free(mbuf_clone); 1097 } 1098 } 1099 } 1100 1101 #ifdef FF_KNI 1102 if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) { 1103 mbuf_pool = pktmbuf_pool[qconf->socket_id]; 1104 mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool); 1105 if(mbuf_clone) { 1106 ff_kni_enqueue(port_id, mbuf_clone); 1107 } 1108 } 1109 #endif 1110 ff_veth_input(ctx, rtem); 1111 #ifdef FF_KNI 1112 } else if (enable_kni && 1113 ((filter == FILTER_KNI && kni_accept) || 1114 (filter == FILTER_UNKNOWN && !kni_accept)) ) { 1115 ff_kni_enqueue(port_id, rtem); 1116 #endif 1117 } else { 1118 ff_veth_input(ctx, rtem); 1119 } 1120 } 1121 } 1122 1123 static inline int 1124 process_dispatch_ring(uint16_t port_id, uint16_t queue_id, 1125 struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx) 1126 { 1127 /* read packet from ring buf and to process */ 1128 uint16_t nb_rb; 1129 nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id], 1130 (void **)pkts_burst, MAX_PKT_BURST, NULL); 1131 1132 if(nb_rb > 0) { 1133 process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1); 1134 } 1135 1136 return 0; 1137 } 1138 1139 static inline void 1140 handle_sysctl_msg(struct ff_msg *msg) 1141 { 1142 int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen, 1143 msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new, 1144 msg->sysctl.newlen); 1145 1146 if (ret < 0) { 1147 msg->result = errno; 1148 } else { 1149 msg->result = 0; 1150 } 1151 } 1152 1153 static inline void 1154 handle_ioctl_msg(struct ff_msg *msg) 1155 { 1156 int fd, ret; 1157 #ifdef INET6 1158 if (msg->msg_type == FF_IOCTL6) { 1159 fd = ff_socket(AF_INET6, SOCK_DGRAM, 0); 1160 } else 1161 #endif 1162 fd = ff_socket(AF_INET, SOCK_DGRAM, 0); 1163 1164 if (fd < 0) { 1165 ret = -1; 1166 goto done; 1167 } 1168 1169 ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data); 1170 1171 ff_close(fd); 1172 1173 done: 1174 if (ret < 0) { 1175 msg->result = errno; 1176 } else { 1177 msg->result = 0; 1178 } 1179 } 1180 1181 static inline void 1182 handle_route_msg(struct ff_msg *msg) 1183 { 1184 int ret = ff_rtioctl(msg->route.fib, msg->route.data, 1185 &msg->route.len, msg->route.maxlen); 1186 if (ret < 0) { 1187 msg->result = errno; 1188 } else { 1189 msg->result = 0; 1190 } 1191 } 1192 1193 static inline void 1194 handle_top_msg(struct ff_msg *msg) 1195 { 1196 msg->top = ff_top_status; 1197 msg->result = 0; 1198 } 1199 1200 #ifdef FF_NETGRAPH 1201 static inline void 1202 handle_ngctl_msg(struct ff_msg *msg) 1203 { 1204 int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data); 1205 if (ret < 0) { 1206 msg->result = errno; 1207 } else { 1208 msg->result = 0; 1209 msg->ngctl.ret = ret; 1210 } 1211 } 1212 #endif 1213 1214 #ifdef FF_IPFW 1215 static inline void 1216 handle_ipfw_msg(struct ff_msg *msg) 1217 { 1218 int fd, ret; 1219 fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW); 1220 if (fd < 0) { 1221 ret = -1; 1222 goto done; 1223 } 1224 1225 switch (msg->ipfw.cmd) { 1226 case FF_IPFW_GET: 1227 ret = ff_getsockopt_freebsd(fd, msg->ipfw.level, 1228 msg->ipfw.optname, msg->ipfw.optval, 1229 msg->ipfw.optlen); 1230 break; 1231 case FF_IPFW_SET: 1232 ret = ff_setsockopt_freebsd(fd, msg->ipfw.level, 1233 msg->ipfw.optname, msg->ipfw.optval, 1234 *(msg->ipfw.optlen)); 1235 break; 1236 default: 1237 ret = -1; 1238 errno = ENOTSUP; 1239 break; 1240 } 1241 1242 ff_close(fd); 1243 1244 done: 1245 if (ret < 0) { 1246 msg->result = errno; 1247 } else { 1248 msg->result = 0; 1249 } 1250 } 1251 #endif 1252 1253 static inline void 1254 handle_traffic_msg(struct ff_msg *msg) 1255 { 1256 msg->traffic = ff_traffic; 1257 msg->result = 0; 1258 } 1259 1260 static inline void 1261 handle_default_msg(struct ff_msg *msg) 1262 { 1263 msg->result = ENOTSUP; 1264 } 1265 1266 static inline void 1267 handle_msg(struct ff_msg *msg, uint16_t proc_id) 1268 { 1269 switch (msg->msg_type) { 1270 case FF_SYSCTL: 1271 handle_sysctl_msg(msg); 1272 break; 1273 case FF_IOCTL: 1274 #ifdef INET6 1275 case FF_IOCTL6: 1276 #endif 1277 handle_ioctl_msg(msg); 1278 break; 1279 case FF_ROUTE: 1280 handle_route_msg(msg); 1281 break; 1282 case FF_TOP: 1283 handle_top_msg(msg); 1284 break; 1285 #ifdef FF_NETGRAPH 1286 case FF_NGCTL: 1287 handle_ngctl_msg(msg); 1288 break; 1289 #endif 1290 #ifdef FF_IPFW 1291 case FF_IPFW_CTL: 1292 handle_ipfw_msg(msg); 1293 break; 1294 #endif 1295 case FF_TRAFFIC: 1296 handle_traffic_msg(msg); 1297 break; 1298 default: 1299 handle_default_msg(msg); 1300 break; 1301 } 1302 rte_ring_enqueue(msg_ring[proc_id].ring[msg->msg_type], msg); 1303 } 1304 1305 static inline int 1306 process_msg_ring(uint16_t proc_id) 1307 { 1308 void *msg; 1309 int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg); 1310 1311 if (unlikely(ret == 0)) { 1312 handle_msg((struct ff_msg *)msg, proc_id); 1313 } 1314 1315 return 0; 1316 } 1317 1318 /* Send burst of packets on an output interface */ 1319 static inline int 1320 send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port) 1321 { 1322 struct rte_mbuf **m_table; 1323 int ret; 1324 uint16_t queueid; 1325 1326 queueid = qconf->tx_queue_id[port]; 1327 m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table; 1328 1329 if (unlikely(qconf->pcap[port] != NULL)) { 1330 uint16_t i; 1331 for (i = 0; i < n; i++) { 1332 ff_dump_packets(qconf->pcap[port], m_table[i]); 1333 } 1334 } 1335 1336 ret = rte_eth_tx_burst(port, queueid, m_table, n); 1337 ff_traffic.tx_packets += ret; 1338 uint16_t i; 1339 for (i = 0; i < ret; i++) { 1340 ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]); 1341 #ifdef FF_USE_PAGE_ARRAY 1342 if (qconf->tx_mbufs[port].bsd_m_table[i]) 1343 ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs); 1344 #endif 1345 } 1346 if (unlikely(ret < n)) { 1347 do { 1348 rte_pktmbuf_free(m_table[ret]); 1349 #ifdef FF_USE_PAGE_ARRAY 1350 if ( qconf->tx_mbufs[port].bsd_m_table[ret] ) 1351 ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]); 1352 #endif 1353 } while (++ret < n); 1354 } 1355 return 0; 1356 } 1357 1358 /* Enqueue a single packet, and send burst if queue is filled */ 1359 static inline int 1360 send_single_packet(struct rte_mbuf *m, uint8_t port) 1361 { 1362 uint16_t len; 1363 struct lcore_conf *qconf; 1364 1365 qconf = &lcore_conf; 1366 len = qconf->tx_mbufs[port].len; 1367 qconf->tx_mbufs[port].m_table[len] = m; 1368 len++; 1369 1370 /* enough pkts to be sent */ 1371 if (unlikely(len == MAX_PKT_BURST)) { 1372 send_burst(qconf, MAX_PKT_BURST, port); 1373 len = 0; 1374 } 1375 1376 qconf->tx_mbufs[port].len = len; 1377 return 0; 1378 } 1379 1380 int 1381 ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m, 1382 int total) 1383 { 1384 #ifdef FF_USE_PAGE_ARRAY 1385 struct lcore_conf *qconf = &lcore_conf; 1386 int len = 0; 1387 1388 len = ff_if_send_onepkt(ctx, m,total); 1389 if (unlikely(len == MAX_PKT_BURST)) { 1390 send_burst(qconf, MAX_PKT_BURST, ctx->port_id); 1391 len = 0; 1392 } 1393 qconf->tx_mbufs[ctx->port_id].len = len; 1394 return 0; 1395 #endif 1396 struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id]; 1397 struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool); 1398 if (head == NULL) { 1399 ff_mbuf_free(m); 1400 return -1; 1401 } 1402 1403 head->pkt_len = total; 1404 head->nb_segs = 0; 1405 1406 int off = 0; 1407 struct rte_mbuf *cur = head, *prev = NULL; 1408 while(total > 0) { 1409 if (cur == NULL) { 1410 cur = rte_pktmbuf_alloc(mbuf_pool); 1411 if (cur == NULL) { 1412 rte_pktmbuf_free(head); 1413 ff_mbuf_free(m); 1414 return -1; 1415 } 1416 } 1417 1418 if (prev != NULL) { 1419 prev->next = cur; 1420 } 1421 head->nb_segs++; 1422 1423 prev = cur; 1424 void *data = rte_pktmbuf_mtod(cur, void*); 1425 int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total; 1426 int ret = ff_mbuf_copydata(m, data, off, len); 1427 if (ret < 0) { 1428 rte_pktmbuf_free(head); 1429 ff_mbuf_free(m); 1430 return -1; 1431 } 1432 1433 1434 cur->data_len = len; 1435 off += len; 1436 total -= len; 1437 cur = NULL; 1438 } 1439 1440 struct ff_tx_offload offload = {0}; 1441 ff_mbuf_tx_offload(m, &offload); 1442 1443 void *data = rte_pktmbuf_mtod(head, void*); 1444 1445 if (offload.ip_csum) { 1446 /* ipv6 not supported yet */ 1447 struct ipv4_hdr *iph; 1448 int iph_len; 1449 iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN); 1450 iph_len = (iph->version_ihl & 0x0f) << 2; 1451 1452 head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4; 1453 head->l2_len = ETHER_HDR_LEN; 1454 head->l3_len = iph_len; 1455 } 1456 1457 if (ctx->hw_features.tx_csum_l4) { 1458 struct ipv4_hdr *iph; 1459 int iph_len; 1460 iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN); 1461 iph_len = (iph->version_ihl & 0x0f) << 2; 1462 1463 if (offload.tcp_csum) { 1464 head->ol_flags |= PKT_TX_TCP_CKSUM; 1465 head->l2_len = ETHER_HDR_LEN; 1466 head->l3_len = iph_len; 1467 } 1468 1469 /* 1470 * TCP segmentation offload. 1471 * 1472 * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag 1473 * implies PKT_TX_TCP_CKSUM) 1474 * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6 1475 * - if it's IPv4, set the PKT_TX_IP_CKSUM flag and 1476 * write the IP checksum to 0 in the packet 1477 * - fill the mbuf offload information: l2_len, 1478 * l3_len, l4_len, tso_segsz 1479 * - calculate the pseudo header checksum without taking ip_len 1480 * in account, and set it in the TCP header. Refer to 1481 * rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be 1482 * used as helpers. 1483 */ 1484 if (offload.tso_seg_size) { 1485 struct tcp_hdr *tcph; 1486 int tcph_len; 1487 tcph = (struct tcp_hdr *)((char *)iph + iph_len); 1488 tcph_len = (tcph->data_off & 0xf0) >> 2; 1489 tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG); 1490 1491 head->ol_flags |= PKT_TX_TCP_SEG; 1492 head->l4_len = tcph_len; 1493 head->tso_segsz = offload.tso_seg_size; 1494 } 1495 1496 if (offload.udp_csum) { 1497 head->ol_flags |= PKT_TX_UDP_CKSUM; 1498 head->l2_len = ETHER_HDR_LEN; 1499 head->l3_len = iph_len; 1500 } 1501 } 1502 1503 ff_mbuf_free(m); 1504 1505 return send_single_packet(head, ctx->port_id); 1506 } 1507 1508 static int 1509 main_loop(void *arg) 1510 { 1511 struct loop_routine *lr = (struct loop_routine *)arg; 1512 1513 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1514 uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc; 1515 int i, j, nb_rx, idle; 1516 uint16_t port_id, queue_id; 1517 struct lcore_conf *qconf; 1518 uint64_t drain_tsc = 0; 1519 struct ff_dpdk_if_context *ctx; 1520 1521 if (pkt_tx_delay) { 1522 drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * pkt_tx_delay; 1523 } 1524 1525 prev_tsc = 0; 1526 usch_tsc = 0; 1527 1528 qconf = &lcore_conf; 1529 1530 while (1) { 1531 cur_tsc = rte_rdtsc(); 1532 if (unlikely(freebsd_clock.expire < cur_tsc)) { 1533 rte_timer_manage(); 1534 } 1535 1536 idle = 1; 1537 sys_tsc = 0; 1538 usr_tsc = 0; 1539 1540 /* 1541 * TX burst queue drain 1542 */ 1543 diff_tsc = cur_tsc - prev_tsc; 1544 if (unlikely(diff_tsc >= drain_tsc)) { 1545 for (i = 0; i < qconf->nb_tx_port; i++) { 1546 port_id = qconf->tx_port_id[i]; 1547 if (qconf->tx_mbufs[port_id].len == 0) 1548 continue; 1549 1550 idle = 0; 1551 1552 send_burst(qconf, 1553 qconf->tx_mbufs[port_id].len, 1554 port_id); 1555 qconf->tx_mbufs[port_id].len = 0; 1556 } 1557 1558 prev_tsc = cur_tsc; 1559 } 1560 1561 /* 1562 * Read packet from RX queues 1563 */ 1564 for (i = 0; i < qconf->nb_rx_queue; ++i) { 1565 port_id = qconf->rx_queue_list[i].port_id; 1566 queue_id = qconf->rx_queue_list[i].queue_id; 1567 ctx = veth_ctx[port_id]; 1568 1569 #ifdef FF_KNI 1570 if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) { 1571 ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST); 1572 } 1573 #endif 1574 1575 process_dispatch_ring(port_id, queue_id, pkts_burst, ctx); 1576 1577 nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst, 1578 MAX_PKT_BURST); 1579 if (nb_rx == 0) 1580 continue; 1581 1582 idle = 0; 1583 1584 /* Prefetch first packets */ 1585 for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) { 1586 rte_prefetch0(rte_pktmbuf_mtod( 1587 pkts_burst[j], void *)); 1588 } 1589 1590 /* Prefetch and handle already prefetched packets */ 1591 for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) { 1592 rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[ 1593 j + PREFETCH_OFFSET], void *)); 1594 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0); 1595 } 1596 1597 /* Handle remaining prefetched packets */ 1598 for (; j < nb_rx; j++) { 1599 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0); 1600 } 1601 } 1602 1603 process_msg_ring(qconf->proc_id); 1604 1605 div_tsc = rte_rdtsc(); 1606 1607 if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc >= drain_tsc))) { 1608 usch_tsc = cur_tsc; 1609 lr->loop(lr->arg); 1610 } 1611 1612 idle_sleep_tsc = rte_rdtsc(); 1613 if (likely(idle && idle_sleep)) { 1614 usleep(idle_sleep); 1615 end_tsc = rte_rdtsc(); 1616 } else { 1617 end_tsc = idle_sleep_tsc; 1618 } 1619 1620 if (usch_tsc == cur_tsc) { 1621 usr_tsc = idle_sleep_tsc - div_tsc; 1622 } 1623 1624 if (!idle) { 1625 sys_tsc = div_tsc - cur_tsc; 1626 ff_top_status.sys_tsc += sys_tsc; 1627 } 1628 1629 ff_top_status.usr_tsc += usr_tsc; 1630 ff_top_status.work_tsc += end_tsc - cur_tsc; 1631 ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc; 1632 1633 ff_top_status.loops++; 1634 } 1635 1636 return 0; 1637 } 1638 1639 int 1640 ff_dpdk_if_up(void) { 1641 int i; 1642 struct lcore_conf *qconf = &lcore_conf; 1643 for (i = 0; i < qconf->nb_tx_port; i++) { 1644 uint16_t port_id = qconf->tx_port_id[i]; 1645 1646 struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id]; 1647 veth_ctx[port_id] = ff_veth_attach(pconf); 1648 if (veth_ctx[port_id] == NULL) { 1649 rte_exit(EXIT_FAILURE, "ff_veth_attach failed"); 1650 } 1651 } 1652 1653 return 0; 1654 } 1655 1656 void 1657 ff_dpdk_run(loop_func_t loop, void *arg) { 1658 struct loop_routine *lr = rte_malloc(NULL, 1659 sizeof(struct loop_routine), 0); 1660 lr->loop = loop; 1661 lr->arg = arg; 1662 rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER); 1663 rte_eal_mp_wait_lcore(); 1664 rte_free(lr); 1665 } 1666 1667 void 1668 ff_dpdk_pktmbuf_free(void *m) 1669 { 1670 rte_pktmbuf_free((struct rte_mbuf *)m); 1671 } 1672 1673 static uint32_t 1674 toeplitz_hash(unsigned keylen, const uint8_t *key, 1675 unsigned datalen, const uint8_t *data) 1676 { 1677 uint32_t hash = 0, v; 1678 u_int i, b; 1679 1680 /* XXXRW: Perhaps an assertion about key length vs. data length? */ 1681 1682 v = (key[0]<<24) + (key[1]<<16) + (key[2] <<8) + key[3]; 1683 for (i = 0; i < datalen; i++) { 1684 for (b = 0; b < 8; b++) { 1685 if (data[i] & (1<<(7-b))) 1686 hash ^= v; 1687 v <<= 1; 1688 if ((i + 4) < keylen && 1689 (key[i+4] & (1<<(7-b)))) 1690 v |= 1; 1691 } 1692 } 1693 return (hash); 1694 } 1695 1696 int 1697 ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr, 1698 uint16_t sport, uint16_t dport) 1699 { 1700 struct lcore_conf *qconf = &lcore_conf; 1701 struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc); 1702 uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id]; 1703 1704 if (nb_queues <= 1) { 1705 return 1; 1706 } 1707 1708 uint16_t reta_size = rss_reta_size[ctx->port_id]; 1709 uint16_t queueid = qconf->tx_queue_id[ctx->port_id]; 1710 1711 uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) + 1712 sizeof(dport)]; 1713 1714 unsigned datalen = 0; 1715 1716 bcopy(&saddr, &data[datalen], sizeof(saddr)); 1717 datalen += sizeof(saddr); 1718 1719 bcopy(&daddr, &data[datalen], sizeof(daddr)); 1720 datalen += sizeof(daddr); 1721 1722 bcopy(&sport, &data[datalen], sizeof(sport)); 1723 datalen += sizeof(sport); 1724 1725 bcopy(&dport, &data[datalen], sizeof(dport)); 1726 datalen += sizeof(dport); 1727 1728 uint32_t hash = 0; 1729 if ( !use_rsskey_52bytes ) 1730 hash = toeplitz_hash(sizeof(default_rsskey_40bytes), 1731 default_rsskey_40bytes, datalen, data); 1732 else 1733 hash = toeplitz_hash(sizeof(default_rsskey_52bytes), 1734 default_rsskey_52bytes, datalen, data); 1735 return ((hash & (reta_size - 1)) % nb_queues) == queueid; 1736 } 1737 1738 void 1739 ff_regist_packet_dispatcher(dispatch_func_t func) 1740 { 1741 packet_dispatcher = func; 1742 } 1743 1744 uint64_t 1745 ff_get_tsc_ns() 1746 { 1747 uint64_t cur_tsc = rte_rdtsc(); 1748 uint64_t hz = rte_get_tsc_hz(); 1749 return ((double)cur_tsc/(double)hz) * NS_PER_S; 1750 } 1751 1752