1 /* 2 * Copyright (C) 2017 THL A29 Limited, a Tencent company. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * 8 * 1. Redistributions of source code must retain the above copyright notice, this 9 * list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright notice, 11 * this list of conditions and the following disclaimer in the documentation 12 * and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 *
 */

/* C library / POSIX headers. */
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>

/* DPDK headers.
 * NOTE(review): rte_common.h and rte_memory.h are included twice below;
 * harmless (header guards) but could be deduplicated. */
#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_memory.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_common.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_eth_bond.h>

/* F-Stack project headers. */
#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"
#include "ff_memory.h"

#ifdef FF_KNI
/* Sizing for the KNI share of the mbuf pool and the KNI queue ring. */
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

int enable_kni;        /* set from ff_global_cfg.kni.enable in ff_dpdk_init() */
static int kni_accept; /* 1 when kni.method == "accept" (see init_kni) */
#endif

static int numa_on;            /* copy of ff_global_cfg.dpdk.numa_on */

static unsigned idle_sleep;    /* copy of ff_global_cfg.dpdk.idle_sleep */
static unsigned pkt_tx_delay;  /* tx drain delay, capped at BURST_TX_DRAIN_US */

/* Periodic timer driving the FreeBSD stack clock (see ff_hardclock_job). */
static struct rte_timer freebsd_clock;

// Mellanox Linux's driver key
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

/* Set to 1 when the NIC reports a 52-byte hash key (see init_port_start). */
static int use_rsskey_52bytes = 0;
static uint8_t default_rsskey_52bytes[52] = {
    0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23,
    0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30,
    0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02,
    0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c,
    0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55,
    0x7d, 0x99, 0x58, 0x3a,
0xe1, 0x38, 0xc9, 0x2e, 99 0x81, 0x15, 0x03, 0x66 100 }; 101 102 struct lcore_conf lcore_conf; 103 104 struct rte_mempool *pktmbuf_pool[NB_SOCKETS]; 105 106 static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS]; 107 static dispatch_func_t packet_dispatcher; 108 109 static uint16_t rss_reta_size[RTE_MAX_ETHPORTS]; 110 111 #define BOND_DRIVER_NAME "net_bonding" 112 113 static inline int send_single_packet(struct rte_mbuf *m, uint8_t port); 114 115 struct ff_msg_ring { 116 char ring_name[FF_MSG_NUM][RTE_RING_NAMESIZE]; 117 /* ring[0] for lcore recv msg, other send */ 118 /* ring[1] for lcore send msg, other read */ 119 struct rte_ring *ring[FF_MSG_NUM]; 120 } __rte_cache_aligned; 121 122 static struct ff_msg_ring msg_ring[RTE_MAX_LCORE]; 123 static struct rte_mempool *message_pool; 124 static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS]; 125 126 static struct ff_top_args ff_top_status; 127 static struct ff_traffic_args ff_traffic; 128 extern void ff_hardclock(void); 129 130 static void 131 ff_hardclock_job(__rte_unused struct rte_timer *timer, 132 __rte_unused void *arg) { 133 ff_hardclock(); 134 ff_update_current_ts(); 135 } 136 137 struct ff_dpdk_if_context * 138 ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg) 139 { 140 struct ff_dpdk_if_context *ctx; 141 142 ctx = calloc(1, sizeof(struct ff_dpdk_if_context)); 143 if (ctx == NULL) 144 return NULL; 145 146 ctx->sc = sc; 147 ctx->ifp = ifp; 148 ctx->port_id = cfg->port_id; 149 ctx->hw_features = cfg->hw_features; 150 151 return ctx; 152 } 153 154 void 155 ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx) 156 { 157 free(ctx); 158 } 159 160 static void 161 check_all_ports_link_status(void) 162 { 163 #define CHECK_INTERVAL 100 /* 100ms */ 164 #define MAX_CHECK_TIME 90 /* 9s (90 * 100ms) in total */ 165 166 uint16_t portid; 167 uint8_t count, all_ports_up, print_flag = 0; 168 struct rte_eth_link link; 169 170 printf("\nChecking link status"); 171 fflush(stdout); 172 173 int i, nb_ports; 
174 nb_ports = ff_global_cfg.dpdk.nb_ports; 175 for (count = 0; count <= MAX_CHECK_TIME; count++) { 176 all_ports_up = 1; 177 for (i = 0; i < nb_ports; i++) { 178 uint16_t portid = ff_global_cfg.dpdk.portid_list[i]; 179 memset(&link, 0, sizeof(link)); 180 rte_eth_link_get_nowait(portid, &link); 181 182 /* print link status if flag set */ 183 if (print_flag == 1) { 184 if (link.link_status) { 185 printf("Port %d Link Up - speed %u " 186 "Mbps - %s\n", (int)portid, 187 (unsigned)link.link_speed, 188 (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? 189 ("full-duplex") : ("half-duplex\n")); 190 } else { 191 printf("Port %d Link Down\n", (int)portid); 192 } 193 continue; 194 } 195 /* clear all_ports_up flag if any link down */ 196 if (link.link_status == 0) { 197 all_ports_up = 0; 198 break; 199 } 200 } 201 202 /* after finally printing all link status, get out */ 203 if (print_flag == 1) 204 break; 205 206 if (all_ports_up == 0) { 207 printf("."); 208 fflush(stdout); 209 rte_delay_ms(CHECK_INTERVAL); 210 } 211 212 /* set the print_flag if all ports up or timeout */ 213 if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) { 214 print_flag = 1; 215 printf("done\n"); 216 } 217 } 218 } 219 220 static int 221 init_lcore_conf(void) 222 { 223 uint8_t nb_dev_ports = rte_eth_dev_count_avail(); 224 if (nb_dev_ports == 0) { 225 rte_exit(EXIT_FAILURE, "No probed ethernet devices\n"); 226 } 227 228 if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) { 229 rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n", 230 ff_global_cfg.dpdk.max_portid); 231 } 232 233 lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs; 234 lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id; 235 236 uint16_t proc_id; 237 for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) { 238 uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id]; 239 if (!lcore_config[lcore_id].detected) { 240 rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id); 241 } 242 } 243 244 uint16_t socket_id = 0; 245 
if (numa_on) { 246 socket_id = rte_lcore_to_socket_id(rte_lcore_id()); 247 } 248 249 lcore_conf.socket_id = socket_id; 250 251 uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id]; 252 int j; 253 for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) { 254 uint16_t port_id = ff_global_cfg.dpdk.portid_list[j]; 255 struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id]; 256 257 int queueid = -1; 258 int i; 259 for (i = 0; i < pconf->nb_lcores; i++) { 260 if (pconf->lcore_list[i] == lcore_id) { 261 queueid = i; 262 } 263 } 264 if (queueid < 0) { 265 continue; 266 } 267 printf("lcore: %u, port: %u, queue: %u\n", lcore_id, port_id, queueid); 268 uint16_t nb_rx_queue = lcore_conf.nb_rx_queue; 269 lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id; 270 lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid; 271 lcore_conf.nb_rx_queue++; 272 273 lcore_conf.tx_queue_id[port_id] = queueid; 274 lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id; 275 lcore_conf.nb_tx_port++; 276 277 /* Enable pcap dump */ 278 if (ff_global_cfg.pcap.enable) { 279 ff_enable_pcap(ff_global_cfg.pcap.save_path, ff_global_cfg.pcap.snap_len); 280 } 281 282 lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores; 283 } 284 285 if (lcore_conf.nb_rx_queue == 0) { 286 rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id); 287 } 288 289 return 0; 290 } 291 292 static int 293 init_mem_pool(void) 294 { 295 uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports; 296 uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs; 297 uint32_t nb_tx_queue = nb_lcores; 298 uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores; 299 uint16_t max_portid = ff_global_cfg.dpdk.max_portid; 300 301 unsigned nb_mbuf = RTE_ALIGN_CEIL ( 302 (nb_rx_queue * (max_portid + 1) * 2 * RX_QUEUE_SIZE + 303 nb_ports * (max_portid + 1) * 2 * nb_lcores * MAX_PKT_BURST + 304 nb_ports * (max_portid + 1) * 2 * nb_tx_queue * TX_QUEUE_SIZE + 305 nb_lcores * MEMPOOL_CACHE_SIZE + 306 #ifdef FF_KNI 307 nb_ports * 
KNI_MBUF_MAX + 308 nb_ports * KNI_QUEUE_SIZE + 309 #endif 310 nb_lcores * nb_ports * DISPATCH_RING_SIZE), 311 (unsigned)8192); 312 313 unsigned socketid = 0; 314 uint16_t i, lcore_id; 315 char s[64]; 316 317 for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) { 318 lcore_id = ff_global_cfg.dpdk.proc_lcore[i]; 319 if (numa_on) { 320 socketid = rte_lcore_to_socket_id(lcore_id); 321 } 322 323 if (socketid >= NB_SOCKETS) { 324 rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n", 325 socketid, i, NB_SOCKETS); 326 } 327 328 if (pktmbuf_pool[socketid] != NULL) { 329 continue; 330 } 331 332 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 333 snprintf(s, sizeof(s), "mbuf_pool_%d", socketid); 334 pktmbuf_pool[socketid] = 335 rte_pktmbuf_pool_create(s, nb_mbuf, 336 MEMPOOL_CACHE_SIZE, 0, 337 RTE_MBUF_DEFAULT_BUF_SIZE, socketid); 338 } else { 339 snprintf(s, sizeof(s), "mbuf_pool_%d", socketid); 340 pktmbuf_pool[socketid] = rte_mempool_lookup(s); 341 } 342 343 if (pktmbuf_pool[socketid] == NULL) { 344 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid); 345 } else { 346 printf("create mbuf pool on socket %d\n", socketid); 347 } 348 349 #ifdef FF_USE_PAGE_ARRAY 350 nb_mbuf = RTE_ALIGN_CEIL ( 351 nb_ports*nb_lcores*MAX_PKT_BURST + 352 nb_ports*nb_tx_queue*TX_QUEUE_SIZE + 353 nb_lcores*MEMPOOL_CACHE_SIZE, 354 (unsigned)4096); 355 ff_init_ref_pool(nb_mbuf, socketid); 356 #endif 357 } 358 359 return 0; 360 } 361 362 static struct rte_ring * 363 create_ring(const char *name, unsigned count, int socket_id, unsigned flags) 364 { 365 struct rte_ring *ring; 366 367 if (name == NULL) { 368 rte_exit(EXIT_FAILURE, "create ring failed, no name!\n"); 369 } 370 371 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 372 ring = rte_ring_create(name, count, socket_id, flags); 373 } else { 374 ring = rte_ring_lookup(name); 375 } 376 377 if (ring == NULL) { 378 rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name); 379 } 380 381 return ring; 382 } 383 384 
static int 385 init_dispatch_ring(void) 386 { 387 int j; 388 char name_buf[RTE_RING_NAMESIZE]; 389 int queueid; 390 391 unsigned socketid = lcore_conf.socket_id; 392 393 /* Create ring according to ports actually being used. */ 394 int nb_ports = ff_global_cfg.dpdk.nb_ports; 395 for (j = 0; j < nb_ports; j++) { 396 uint16_t portid = ff_global_cfg.dpdk.portid_list[j]; 397 struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid]; 398 int nb_queues = pconf->nb_lcores; 399 if (dispatch_ring[portid] == NULL) { 400 snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid); 401 402 dispatch_ring[portid] = rte_zmalloc(name_buf, 403 sizeof(struct rte_ring *) * nb_queues, 404 RTE_CACHE_LINE_SIZE); 405 if (dispatch_ring[portid] == NULL) { 406 rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) " 407 "failed\n", name_buf); 408 } 409 } 410 411 for(queueid = 0; queueid < nb_queues; ++queueid) { 412 snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d", 413 portid, queueid); 414 dispatch_ring[portid][queueid] = create_ring(name_buf, 415 DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ); 416 417 if (dispatch_ring[portid][queueid] == NULL) 418 rte_panic("create ring:%s failed!\n", name_buf); 419 420 printf("create ring:%s success, %u ring entries are now free!\n", 421 name_buf, rte_ring_free_count(dispatch_ring[portid][queueid])); 422 } 423 } 424 425 return 0; 426 } 427 428 static void 429 ff_msg_init(struct rte_mempool *mp, 430 __attribute__((unused)) void *opaque_arg, 431 void *obj, __attribute__((unused)) unsigned i) 432 { 433 struct ff_msg *msg = (struct ff_msg *)obj; 434 msg->msg_type = FF_UNKNOWN; 435 msg->buf_addr = (char *)msg + sizeof(struct ff_msg); 436 msg->buf_len = mp->elt_size - sizeof(struct ff_msg); 437 } 438 439 static int 440 init_msg_ring(void) 441 { 442 uint16_t i, j; 443 uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs; 444 unsigned socketid = lcore_conf.socket_id; 445 446 /* Create message buffer pool */ 447 if (rte_eal_process_type() == 
RTE_PROC_PRIMARY) { 448 message_pool = rte_mempool_create(FF_MSG_POOL, 449 MSG_RING_SIZE * 2 * nb_procs, 450 MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0, 451 NULL, NULL, ff_msg_init, NULL, 452 socketid, 0); 453 } else { 454 message_pool = rte_mempool_lookup(FF_MSG_POOL); 455 } 456 457 if (message_pool == NULL) { 458 rte_panic("Create msg mempool failed\n"); 459 } 460 461 for(i = 0; i < nb_procs; ++i) { 462 snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE, 463 "%s%u", FF_MSG_RING_IN, i); 464 msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0], 465 MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ); 466 if (msg_ring[i].ring[0] == NULL) 467 rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[0]); 468 469 for (j = FF_SYSCTL; j < FF_MSG_NUM; j++) { 470 snprintf(msg_ring[i].ring_name[j], RTE_RING_NAMESIZE, 471 "%s%u_%u", FF_MSG_RING_OUT, i, j); 472 msg_ring[i].ring[j] = create_ring(msg_ring[i].ring_name[j], 473 MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ); 474 if (msg_ring[i].ring[j] == NULL) 475 rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[j]); 476 } 477 } 478 479 return 0; 480 } 481 482 #ifdef FF_KNI 483 static int 484 init_kni(void) 485 { 486 int nb_ports = rte_eth_dev_count_avail(); 487 kni_accept = 0; 488 if(strcasecmp(ff_global_cfg.kni.method, "accept") == 0) 489 kni_accept = 1; 490 491 ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port, 492 ff_global_cfg.kni.udp_port); 493 494 unsigned socket_id = lcore_conf.socket_id; 495 struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id]; 496 497 nb_ports = ff_global_cfg.dpdk.nb_ports; 498 int i, ret; 499 for (i = 0; i < nb_ports; i++) { 500 uint16_t port_id = ff_global_cfg.dpdk.portid_list[i]; 501 ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE); 502 } 503 504 return 0; 505 } 506 #endif 507 508 static void 509 set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues) 510 { 511 if (reta_size == 0) { 512 return; 513 } 514 515 int reta_conf_size = 
RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash=0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}

/*
 * Configure and start every configured port (bonding slaves included):
 * probe capabilities, pick RSS key/offloads, then — primary process
 * only — configure the device, set up rx/tx queues, start it, force the
 * Toeplitz hash and program the RETA. Returns 0 on success, or the
 * negative rte_eth_* error from the first failing call.
 * NOTE(review): the statement order (configure -> queue setup -> start)
 * is required by the ethdev API; do not reorder.
 */
static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i, j;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id, u_port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[u_port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        /* Iterate bonding slaves first (j < nb_slaves), then the
         * port itself (j == nb_slaves). */
        for (j=0; j<=pconf->nb_slaves; j++) {
            if (j < pconf->nb_slaves) {
                port_id = pconf->slave_portid_list[j];
                printf("To init %s's %d'st slave port[%d]\n",
                    ff_global_cfg.dpdk.bond_cfgs->name,
                    j, port_id);
            } else {
                port_id = u_port_id;
            }

            struct rte_eth_dev_info dev_info;
            struct rte_eth_conf port_conf = {0};
            struct rte_eth_rxconf rxq_conf;
            struct rte_eth_txconf txq_conf;

            rte_eth_dev_info_get(port_id, &dev_info);

            if (nb_queues > dev_info.max_rx_queues) {
                rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_rx_queues);
            }

            if (nb_queues > dev_info.max_tx_queues) {
                rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_tx_queues);
            }

            struct ether_addr addr;
            rte_eth_macaddr_get(port_id, &addr);
            printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                (unsigned)port_id,
                addr.addr_bytes[0], addr.addr_bytes[1],
                addr.addr_bytes[2], addr.addr_bytes[3],
                addr.addr_bytes[4], addr.addr_bytes[5]);

            rte_memcpy(pconf->mac,
                addr.addr_bytes, ETHER_ADDR_LEN);

            /* Set RSS mode; pick the key length the NIC expects
             * (52 bytes on some NICs, 40 otherwise). */
            uint64_t default_rss_hf = ETH_RSS_PROTO_MASK;
            port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
            port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf;
            if (dev_info.hash_key_size == 52) {
                port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_52bytes;
                port_conf.rx_adv_conf.rss_conf.rss_key_len = 52;
                use_rsskey_52bytes = 1;
            } else {
                port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
                port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;
            }
            /* Trim the requested hash types to what the HW supports. */
            port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads;
            if (port_conf.rx_adv_conf.rss_conf.rss_hf !=
                ETH_RSS_PROTO_MASK) {
                printf("Port %u modified RSS hash function based on hardware support,"
                    "requested:%#"PRIx64" configured:%#"PRIx64"\n",
                    port_id, default_rss_hf,
                    port_conf.rx_adv_conf.rss_conf.rss_hf);
            }

            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) {
                port_conf.txmode.offloads |=
                    DEV_TX_OFFLOAD_MBUF_FAST_FREE;
            }

            /* Set Rx VLAN stripping */
            if (ff_global_cfg.dpdk.vlan_strip) {
                if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                    port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
                }
            }

            /* Enable HW CRC stripping */
            port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_KEEP_CRC;

            /* FIXME: Enable TCP LRO ?*/
#if 0
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
                printf("LRO is supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
                pconf->hw_features.rx_lro = 1;
            }
#endif

            /* Set Rx checksum checking */
            if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
                printf("RX checksum offload supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
                pconf->hw_features.rx_csum = 1;
            }

            if (ff_global_cfg.dpdk.tx_csum_offoad_skip == 0) {
                if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
                    printf("TX ip checksum offload supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
                    pconf->hw_features.tx_csum_ip = 1;
                }

                if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
                    (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
                    printf("TX TCP&UDP checksum offload supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM;
                    pconf->hw_features.tx_csum_l4 = 1;
                }
            } else {
                printf("TX checksum offoad is disabled\n");
            }

            if (ff_global_cfg.dpdk.tso) {
                if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                    printf("TSO is supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
                    pconf->hw_features.tx_tso = 1;
                }
            } else {
                printf("TSO is disabled\n");
            }

            if (dev_info.reta_size) {
                /* reta size must be power of 2 */
                assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

                rss_reta_size[port_id] = dev_info.reta_size;
                printf("port[%d]: rss table size: %d\n", port_id,
                    dev_info.reta_size);
            }

            /* Only the primary process configures/starts devices;
             * secondaries attach to already-running ports. */
            if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
                continue;
            }

            int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
            if (ret != 0) {
                return ret;
            }

            static uint16_t nb_rxd = RX_QUEUE_SIZE;
            static uint16_t nb_txd = TX_QUEUE_SIZE;
            ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
            if (ret < 0)
                printf("Could not adjust number of descriptors "
                    "for port%u (%d)\n", (unsigned)port_id, ret);

            uint16_t q;
            for (q = 0; q < nb_queues; q++) {
                /* Place each queue's descriptors/pool on the socket of
                 * the lcore that will poll it. */
                if (numa_on) {
                    uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                    socketid = rte_lcore_to_socket_id(lcore_id);
                }
                mbuf_pool = pktmbuf_pool[socketid];

                txq_conf = dev_info.default_txconf;
                txq_conf.offloads = port_conf.txmode.offloads;
                ret = rte_eth_tx_queue_setup(port_id, q, nb_txd,
                    socketid, &txq_conf);
                if (ret < 0) {
                    return ret;
                }

                rxq_conf = dev_info.default_rxconf;
                rxq_conf.offloads = port_conf.rxmode.offloads;
                ret = rte_eth_rx_queue_setup(port_id, q, nb_rxd,
                    socketid, &rxq_conf, mbuf_pool);
                if (ret < 0) {
                    return ret;
                }
            }

            /* Bonding device: report its mode, MAC and slave list. */
            if (strncmp(dev_info.driver_name, BOND_DRIVER_NAME,
                strlen(dev_info.driver_name)) == 0) {

                rte_eth_macaddr_get(port_id, &addr);
                printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                    " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                    (unsigned)port_id,
                    addr.addr_bytes[0], addr.addr_bytes[1],
                    addr.addr_bytes[2], addr.addr_bytes[3],
                    addr.addr_bytes[4], addr.addr_bytes[5]);

                rte_memcpy(pconf->mac,
                    addr.addr_bytes, ETHER_ADDR_LEN);

                int mode, count, x;
                uint16_t slaves[RTE_MAX_ETHPORTS], len = RTE_MAX_ETHPORTS;

                mode = rte_eth_bond_mode_get(port_id);
                printf("Port %u, bond mode:%d\n", port_id, mode);

                count = rte_eth_bond_slaves_get(port_id, slaves, len);
                printf("Port %u, %s's slave ports count:%d\n", port_id,
                    ff_global_cfg.dpdk.bond_cfgs->name, count);
                for (x=0; x<count; x++) {
                    printf("Port %u, %s's slave port[%u]\n", port_id,
                        ff_global_cfg.dpdk.bond_cfgs->name, slaves[x]);
                }
            }

            ret = rte_eth_dev_start(port_id);
            if (ret < 0) {
                return ret;
            }

            if (nb_queues > 1) {
                /* set HW rss hash function to Toeplitz. */
                if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                    struct rte_eth_hash_filter_info info = {0};
                    info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                    info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                    if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                        RTE_ETH_FILTER_SET, &info) < 0) {
                        rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                            port_id);
                    }
                }

                set_rss_table(port_id, dev_info.reta_size, nb_queues);
            }

            /* Enable RX in promiscuous mode for the Ethernet device. */
            if (ff_global_cfg.dpdk.promiscuous) {
                rte_eth_promiscuous_enable(port_id);
                ret = rte_eth_promiscuous_get(port_id);
                if (ret == 1) {
                    printf("set port %u to promiscuous mode ok\n", port_id);
                } else {
                    printf("set port %u to promiscuous mode error\n", port_id);
                }
            }
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

/*
 * Start the periodic rte_timer that ticks the FreeBSD stack clock at
 * the configured freebsd.hz rate.
 */
static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S/ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S*intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}

/*
 * Public entry point: validate the proc configuration, initialize the
 * DPDK EAL and then all f-stack DPDK state (lcore conf, pools, rings,
 * optional KNI, ports, clock). Exits the process on failure.
 */
int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on =
ff_global_cfg.dpdk.numa_on; 826 827 idle_sleep = ff_global_cfg.dpdk.idle_sleep; 828 pkt_tx_delay = ff_global_cfg.dpdk.pkt_tx_delay > BURST_TX_DRAIN_US ? \ 829 BURST_TX_DRAIN_US : ff_global_cfg.dpdk.pkt_tx_delay; 830 831 init_lcore_conf(); 832 833 init_mem_pool(); 834 835 init_dispatch_ring(); 836 837 init_msg_ring(); 838 839 #ifdef FF_KNI 840 enable_kni = ff_global_cfg.kni.enable; 841 if (enable_kni) { 842 init_kni(); 843 } 844 #endif 845 846 #ifdef FF_USE_PAGE_ARRAY 847 ff_mmap_init(); 848 #endif 849 850 ret = init_port_start(); 851 if (ret < 0) { 852 rte_exit(EXIT_FAILURE, "init_port_start failed\n"); 853 } 854 855 init_clock(); 856 857 return 0; 858 } 859 860 static void 861 ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt) 862 { 863 uint8_t rx_csum = ctx->hw_features.rx_csum; 864 if (rx_csum) { 865 if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) { 866 rte_pktmbuf_free(pkt); 867 return; 868 } 869 } 870 871 void *data = rte_pktmbuf_mtod(pkt, void*); 872 uint16_t len = rte_pktmbuf_data_len(pkt); 873 874 void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum); 875 if (hdr == NULL) { 876 rte_pktmbuf_free(pkt); 877 return; 878 } 879 880 if (pkt->ol_flags & PKT_RX_VLAN_STRIPPED) { 881 ff_mbuf_set_vlan_info(hdr, pkt->vlan_tci); 882 } 883 884 struct rte_mbuf *pn = pkt->next; 885 void *prev = hdr; 886 while(pn != NULL) { 887 data = rte_pktmbuf_mtod(pn, void*); 888 len = rte_pktmbuf_data_len(pn); 889 890 void *mb = ff_mbuf_get(prev, data, len); 891 if (mb == NULL) { 892 ff_mbuf_free(hdr); 893 rte_pktmbuf_free(pkt); 894 return; 895 } 896 pn = pn->next; 897 prev = mb; 898 } 899 900 ff_veth_process_packet(ctx->ifp, hdr); 901 } 902 903 static enum FilterReturn 904 protocol_filter(const void *data, uint16_t len) 905 { 906 if(len < ETHER_HDR_LEN) 907 return FILTER_UNKNOWN; 908 909 const struct ether_hdr *hdr; 910 const struct vlan_hdr *vlanhdr; 911 hdr = (const struct ether_hdr *)data; 912 uint16_t ether_type = 
rte_be_to_cpu_16(hdr->ether_type); 913 data += ETHER_HDR_LEN; 914 len -= ETHER_HDR_LEN; 915 916 if (ether_type == ETHER_TYPE_VLAN) { 917 vlanhdr = (struct vlan_hdr *)data; 918 ether_type = rte_be_to_cpu_16(vlanhdr->eth_proto); 919 data += sizeof(struct vlan_hdr); 920 len -= sizeof(struct vlan_hdr); 921 } 922 923 if(ether_type == ETHER_TYPE_ARP) 924 return FILTER_ARP; 925 926 #ifdef INET6 927 if (ether_type == ETHER_TYPE_IPv6) { 928 return ff_kni_proto_filter(data, 929 len, ether_type); 930 } 931 #endif 932 933 #ifndef FF_KNI 934 return FILTER_UNKNOWN; 935 #else 936 if (!enable_kni) { 937 return FILTER_UNKNOWN; 938 } 939 940 if(ether_type != ETHER_TYPE_IPv4) 941 return FILTER_UNKNOWN; 942 943 return ff_kni_proto_filter(data, 944 len, ether_type); 945 #endif 946 } 947 948 static inline void 949 pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m) 950 { 951 struct rte_mbuf *md; 952 void *src, *dst; 953 954 dst = rte_pktmbuf_mtod(mi, void *); 955 src = rte_pktmbuf_mtod(m, void *); 956 957 mi->data_len = m->data_len; 958 rte_memcpy(dst, src, m->data_len); 959 960 mi->port = m->port; 961 mi->vlan_tci = m->vlan_tci; 962 mi->vlan_tci_outer = m->vlan_tci_outer; 963 mi->tx_offload = m->tx_offload; 964 mi->hash = m->hash; 965 mi->ol_flags = m->ol_flags; 966 mi->packet_type = m->packet_type; 967 } 968 969 /* copied from rte_pktmbuf_clone */ 970 static inline struct rte_mbuf * 971 pktmbuf_deep_clone(const struct rte_mbuf *md, 972 struct rte_mempool *mp) 973 { 974 struct rte_mbuf *mc, *mi, **prev; 975 uint32_t pktlen; 976 uint8_t nseg; 977 978 if (unlikely ((mc = rte_pktmbuf_alloc(mp)) == NULL)) 979 return NULL; 980 981 mi = mc; 982 prev = &mi->next; 983 pktlen = md->pkt_len; 984 nseg = 0; 985 986 do { 987 nseg++; 988 pktmbuf_deep_attach(mi, md); 989 *prev = mi; 990 prev = &mi->next; 991 } while ((md = md->next) != NULL && 992 (mi = rte_pktmbuf_alloc(mp)) != NULL); 993 994 *prev = NULL; 995 mc->nb_segs = nseg; 996 mc->pkt_len = pktlen; 997 998 /* Allocation of new 
indirect segment failed */
    if (unlikely (mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}

/*
 * Handle a burst of received mbufs for one (port, queue).
 *
 * For each packet: optionally dump it to pcap, account RX traffic,
 * run the user dispatch callback (which may answer directly, drop,
 * or redirect to another queue's dispatch ring), then classify it:
 * ARP/NDP is broadcast to all sibling queues (and KNI) before being
 * injected into the stack; KNI-filtered traffic goes to the kernel;
 * everything else goes to the FreeBSD stack via ff_veth_input().
 *
 * pkts_from_ring is non-zero when the packets were already dispatched
 * once (pulled from a dispatch ring); such packets are not re-counted,
 * re-dispatched, re-broadcast or dumped again.
 *
 * Ownership: every mbuf in bufs[] is consumed on all paths
 * (sent, enqueued, freed, or handed to the stack / KNI).
 */
static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        /* Capture RX packets once (not again when re-read from a ring). */
        if (unlikely(ff_global_cfg.pcap.enable)) {
            if (!pkts_from_ring) {
                ff_dump_packets(ff_global_cfg.pcap.save_path, rtem,
                    ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void*);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring) {
            ff_traffic.rx_packets++;
            ff_traffic.rx_bytes += len;
        }

        /* User dispatch hook runs only on fresh (not ring-forwarded) packets. */
        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues);
            if (ret == FF_DISPATCH_RESPONSE) {
                /* Callback rewrote the packet in place; send it back out. */
                rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len;

                /*
                 * VLAN TX stripping is not supported yet: if RX stripped a
                 * VLAN tag into vlan_tci, re-insert the 802.1Q header by
                 * hand before transmitting the response.
                 */
                if (rtem->vlan_tci) {
                    data = rte_pktmbuf_prepend(rtem, sizeof(struct vlan_hdr));
                    if (data != NULL) {
                        /* Move the Ethernet header forward, then splice the
                         * VLAN header in between it and the payload. */
                        memmove(data, data + sizeof(struct vlan_hdr), ETHER_HDR_LEN);
                        struct ether_hdr *etherhdr = (struct ether_hdr *)data;
                        struct vlan_hdr *vlanhdr = (struct vlan_hdr *)(data + ETHER_HDR_LEN);
                        vlanhdr->vlan_tci = rte_cpu_to_be_16(rtem->vlan_tci);
                        vlanhdr->eth_proto = etherhdr->ether_type;
                        etherhdr->ether_type = rte_cpu_to_be_16(ETHER_TYPE_VLAN);
                    }
                }
                send_single_packet(rtem, port_id);
                continue;
            }

            /* Invalid target queue: drop. */
            if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            /* Dispatched to another queue: hand over via its ring.
             * NOTE: ret is reused here for the enqueue result. */
            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
#ifdef INET6
        if (filter == FILTER_ARP || filter == FILTER_NDP) {
#else
        if (filter == FILTER_ARP) {
#endif
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            /* Broadcast ARP/NDP to every sibling queue so each lcore's
             * stack instance learns the neighbor entry. */
            if (!pkts_from_ring) {
                uint16_t j;
                for(j = 0; j < nb_queues; ++j) {
                    if(j == queue_id)
                        continue;

                    /* Clone from the pool local to the target lcore's
                     * NUMA node when NUMA awareness is enabled. */
                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if(mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            /* Also mirror ARP/NDP to the kernel via KNI (primary only). */
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if(mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni &&
            ((filter == FILTER_KNI && kni_accept) ||
            (filter == FILTER_UNKNOWN && !kni_accept)) ) {
            /* Traffic destined for the kernel path. */
            ff_kni_enqueue(port_id, rtem);
#endif
        } else {
            /* Default: inject into the FreeBSD stack. */
            ff_veth_input(ctx, rtem);
        }
    }
}

/*
 * Drain packets other lcores dispatched to this (port, queue) ring and
 * process them (pkts_from_ring = 1, so no re-dispatch/re-count).
 * Always returns 0.
 */
static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* read packet from ring buf and to process */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if(nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

/* Run a sysctl request from a secondary process; errno on failure. */
static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

/*
 * Run an ioctl request against a throwaway stack socket
 * (AF_INET6 for FF_IOCTL6 when INET6 is compiled in, else AF_INET).
 * msg->result is 0 on success, errno on failure.
 */
static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
#ifdef INET6
    if (msg->msg_type == FF_IOCTL6) {
        fd = ff_socket(AF_INET6, SOCK_DGRAM, 0);
    } else
#endif
    fd = ff_socket(AF_INET, SOCK_DGRAM, 0);

    if (fd < 0) {
        ret = -1;   /* ff_socket is assumed to set errno -- TODO confirm */
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

/* Routing-table ioctl; msg->route.len is updated in place. */
static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

/* Snapshot the CPU-usage counters for the `top` tool. */
static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_top_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
/* Netgraph control; on success the command's return value is
 * passed back in msg->ngctl.ret. */
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
/*
 * ipfw get/set implemented as raw-socket sockopts on the stack.
 * msg->result is 0 on success, errno otherwise.
 */
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname,
                msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

/* Snapshot the RX/TX traffic counters. */
static inline void
handle_traffic_msg(struct ff_msg *msg)
{
    msg->traffic = ff_traffic;
    msg->result = 0;
}

/* Unknown message type: report ENOTSUP to the requester. */
static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

/*
 * Dispatch one control message by type, then return it to the sender
 * on the per-type reply ring (enqueue result intentionally ignored;
 * presumably the reply ring is sized so this cannot fail -- TODO confirm).
 */
static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
#ifdef INET6
        case FF_IOCTL6:
#endif
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        case FF_TRAFFIC:
            handle_traffic_msg(msg);
            break;
        default:
            handle_default_msg(msg);
            break;
    }
    rte_ring_enqueue(msg_ring[proc_id].ring[msg->msg_type], msg);
}

/*
 * Poll the control-message request ring (slot 0) once per loop
 * iteration. rte_ring_dequeue() returns 0 when a message was
 * dequeued, which is the rare case -- hence unlikely().
 * Always returns 0.
 */
static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send burst of packets on an output interface */
/*
 * Transmit the first n mbufs queued in qconf->tx_mbufs[port].
 * Accounts TX traffic for packets the NIC accepted and frees the
 * untransmitted remainder (and their attached BSD mbufs when
 * FF_USE_PAGE_ARRAY is on). Always returns 0.
 */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    /* Capture outgoing packets before handing them to the NIC. */
    if (unlikely(ff_global_cfg.pcap.enable)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets( ff_global_cfg.pcap.save_path, m_table[i],
                ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
        }
    }

    /* ret = number of packets actually accepted by the device. */
    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    ff_traffic.tx_packets += ret;
    uint16_t i;
    for (i = 0; i < ret; i++) {
        ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]);
#ifdef FF_USE_PAGE_ARRAY
        /* Defer freeing of the BSD mbuf until the NIC is done with
         * the segments that reference its pages. */
        if (qconf->tx_mbufs[port].bsd_m_table[i])
            ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs);
#endif
    }
    /* Drop whatever the device refused. */
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
#ifdef FF_USE_PAGE_ARRAY
            if ( qconf->tx_mbufs[port].bsd_m_table[ret] )
                ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]);
#endif
        } while (++ret < n);
    }
    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}

/*
 * Copy a BSD mbuf chain (m, total bytes) into a DPDK mbuf chain,
 * apply TX checksum/TSO offload flags from the stack's metadata,
 * and queue it for transmission on ctx's port.
 *
 * Returns 0 on success; -1 if any DPDK mbuf allocation or data copy
 * fails (the BSD mbuf is always freed here, on every path).
 *
 * With FF_USE_PAGE_ARRAY the copy is avoided entirely: the BSD mbuf's
 * pages are mapped into DPDK mbufs by ff_if_send_onepkt() and the
 * function returns before the copy path below.
 */
int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
#ifdef FF_USE_PAGE_ARRAY
    struct lcore_conf *qconf = &lcore_conf;
    int len = 0;

    len = ff_if_send_onepkt(ctx, m,total);
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, ctx->port_id);
        len = 0;
    }
    qconf->tx_mbufs[ctx->port_id].len = len;
    return 0;
#endif
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;  /* incremented once per segment in the loop below */

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while(total > 0) {
        /* First iteration reuses head; later ones allocate a new segment. */
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        head->nb_segs++;

        prev = cur;
        void *data = rte_pktmbuf_mtod(cur, void*);
        /* Fill each segment up to the default mbuf data room. */
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }


        cur->data_len = len;
        off += len;
        total -= len;
        cur = NULL;
    }

    /* Pull checksum/TSO requests recorded by the stack for this packet. */
    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void*);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

    /*
     * TCP segmentation offload.
     *
     * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
     *   implies PKT_TX_TCP_CKSUM)
     * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
     * - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
     *   write the IP checksum to 0 in the packet
     * - fill the mbuf offload information: l2_len,
     *   l3_len, l4_len, tso_segsz
     * - calculate the pseudo header checksum without taking ip_len
     *   in account, and set it in the TCP header. Refer to
     *   rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
     *   used as helpers.
     */
        if (offload.tso_seg_size) {
            struct tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct tcp_hdr *)((char *)iph + iph_len);
            /* data_off upper nibble = header length in 32-bit words. */
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    /* Data has been copied out; the BSD mbuf chain is no longer needed. */
    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

/*
 * Per-lcore forever loop: drain TX queues, poll KNI and dispatch
 * rings, receive and process RX bursts, service control messages,
 * run the user loop callback, and account usr/sys/idle TSC cycles
 * into ff_top_status. arg is a struct loop_routine* owned by
 * ff_dpdk_run(). Never returns in practice (while (1)).
 */
static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc;
    int i, j, nb_rx, idle;
    uint16_t port_id, queue_id;
    struct lcore_conf *qconf;
    uint64_t drain_tsc = 0;
    struct ff_dpdk_if_context *ctx;

    /* Convert the configured TX delay (µs) into TSC ticks. */
    if (pkt_tx_delay) {
        drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * pkt_tx_delay;
    }

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

    while (1) {
        cur_tsc = rte_rdtsc();
        /* Drive the FreeBSD clock timer when it is due. */
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc >= drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packet from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

#ifdef FF_KNI
            /* Pump packets between the kernel and the NIC (primary only). */
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }
#endif

            /* Packets other lcores dispatched to this queue. */
            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                    pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                    j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        /* Service one pending control message, if any. */
        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        /* Run the user callback when busy, or at most once per drain
         * interval while idle. */
        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc >= drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        idle_sleep_tsc = rte_rdtsc();
        if (likely(idle && idle_sleep)) {
usleep(idle_sleep); 1615 end_tsc = rte_rdtsc(); 1616 } else { 1617 end_tsc = idle_sleep_tsc; 1618 } 1619 1620 if (usch_tsc == cur_tsc) { 1621 usr_tsc = idle_sleep_tsc - div_tsc; 1622 } 1623 1624 if (!idle) { 1625 sys_tsc = div_tsc - cur_tsc; 1626 ff_top_status.sys_tsc += sys_tsc; 1627 } 1628 1629 ff_top_status.usr_tsc += usr_tsc; 1630 ff_top_status.work_tsc += end_tsc - cur_tsc; 1631 ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc; 1632 1633 ff_top_status.loops++; 1634 } 1635 1636 return 0; 1637 } 1638 1639 int 1640 ff_dpdk_if_up(void) { 1641 int i; 1642 struct lcore_conf *qconf = &lcore_conf; 1643 for (i = 0; i < qconf->nb_tx_port; i++) { 1644 uint16_t port_id = qconf->tx_port_id[i]; 1645 1646 struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id]; 1647 veth_ctx[port_id] = ff_veth_attach(pconf); 1648 if (veth_ctx[port_id] == NULL) { 1649 rte_exit(EXIT_FAILURE, "ff_veth_attach failed"); 1650 } 1651 } 1652 1653 return 0; 1654 } 1655 1656 void 1657 ff_dpdk_run(loop_func_t loop, void *arg) { 1658 struct loop_routine *lr = rte_malloc(NULL, 1659 sizeof(struct loop_routine), 0); 1660 lr->loop = loop; 1661 lr->arg = arg; 1662 rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER); 1663 rte_eal_mp_wait_lcore(); 1664 rte_free(lr); 1665 } 1666 1667 void 1668 ff_dpdk_pktmbuf_free(void *m) 1669 { 1670 rte_pktmbuf_free((struct rte_mbuf *)m); 1671 } 1672 1673 static uint32_t 1674 toeplitz_hash(unsigned keylen, const uint8_t *key, 1675 unsigned datalen, const uint8_t *data) 1676 { 1677 uint32_t hash = 0, v; 1678 u_int i, b; 1679 1680 /* XXXRW: Perhaps an assertion about key length vs. data length? 
*/ 1681 1682 v = (key[0]<<24) + (key[1]<<16) + (key[2] <<8) + key[3]; 1683 for (i = 0; i < datalen; i++) { 1684 for (b = 0; b < 8; b++) { 1685 if (data[i] & (1<<(7-b))) 1686 hash ^= v; 1687 v <<= 1; 1688 if ((i + 4) < keylen && 1689 (key[i+4] & (1<<(7-b)))) 1690 v |= 1; 1691 } 1692 } 1693 return (hash); 1694 } 1695 1696 int 1697 ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr, 1698 uint16_t sport, uint16_t dport) 1699 { 1700 struct lcore_conf *qconf = &lcore_conf; 1701 struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc); 1702 uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id]; 1703 1704 if (nb_queues <= 1) { 1705 return 1; 1706 } 1707 1708 uint16_t reta_size = rss_reta_size[ctx->port_id]; 1709 uint16_t queueid = qconf->tx_queue_id[ctx->port_id]; 1710 1711 uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) + 1712 sizeof(dport)]; 1713 1714 unsigned datalen = 0; 1715 1716 bcopy(&saddr, &data[datalen], sizeof(saddr)); 1717 datalen += sizeof(saddr); 1718 1719 bcopy(&daddr, &data[datalen], sizeof(daddr)); 1720 datalen += sizeof(daddr); 1721 1722 bcopy(&sport, &data[datalen], sizeof(sport)); 1723 datalen += sizeof(sport); 1724 1725 bcopy(&dport, &data[datalen], sizeof(dport)); 1726 datalen += sizeof(dport); 1727 1728 uint32_t hash = 0; 1729 if ( !use_rsskey_52bytes ) 1730 hash = toeplitz_hash(sizeof(default_rsskey_40bytes), 1731 default_rsskey_40bytes, datalen, data); 1732 else 1733 hash = toeplitz_hash(sizeof(default_rsskey_52bytes), 1734 default_rsskey_52bytes, datalen, data); 1735 return ((hash & (reta_size - 1)) % nb_queues) == queueid; 1736 } 1737 1738 void 1739 ff_regist_packet_dispatcher(dispatch_func_t func) 1740 { 1741 packet_dispatcher = func; 1742 } 1743 1744 uint64_t 1745 ff_get_tsc_ns() 1746 { 1747 uint64_t cur_tsc = rte_rdtsc(); 1748 uint64_t hz = rte_get_tsc_hz(); 1749 return ((double)cur_tsc/(double)hz) * NS_PER_S; 1750 } 1751 1752