1 /* 2 * Copyright (C) 2017 THL A29 Limited, a Tencent company. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * 8 * 1. Redistributions of source code must retain the above copyright notice, this 9 * list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright notice, 11 * this list of conditions and the following disclaimer in the documentation 12 * and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 * 25 */ 26 #include <assert.h> 27 #include <unistd.h> 28 29 #include <rte_common.h> 30 #include <rte_byteorder.h> 31 #include <rte_log.h> 32 #include <rte_memory.h> 33 #include <rte_memcpy.h> 34 #include <rte_memzone.h> 35 #include <rte_config.h> 36 #include <rte_eal.h> 37 #include <rte_pci.h> 38 #include <rte_mbuf.h> 39 #include <rte_memory.h> 40 #include <rte_lcore.h> 41 #include <rte_launch.h> 42 #include <rte_ethdev.h> 43 #include <rte_debug.h> 44 #include <rte_common.h> 45 #include <rte_ether.h> 46 #include <rte_malloc.h> 47 #include <rte_cycles.h> 48 #include <rte_timer.h> 49 #include <rte_thash.h> 50 #include <rte_ip.h> 51 #include <rte_tcp.h> 52 #include <rte_udp.h> 53 54 #include "ff_dpdk_if.h" 55 #include "ff_dpdk_pcap.h" 56 #include "ff_dpdk_kni.h" 57 #include "ff_config.h" 58 #include "ff_veth.h" 59 #include "ff_host_interface.h" 60 #include "ff_msg.h" 61 #include "ff_api.h" 62 63 #define MEMPOOL_CACHE_SIZE 256 64 65 #define DISPATCH_RING_SIZE 2048 66 67 #define MSG_RING_SIZE 32 68 69 /* 70 * Configurable number of RX/TX ring descriptors 71 */ 72 #define RX_QUEUE_SIZE 512 73 #define TX_QUEUE_SIZE 512 74 75 #define MAX_PKT_BURST 32 76 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */ 77 78 /* 79 * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send. 80 */ 81 #define MAX_TX_BURST (MAX_PKT_BURST / 2) 82 83 #define NB_SOCKETS 8 84 85 /* Configure how many packets ahead to prefetch, when reading packets */ 86 #define PREFETCH_OFFSET 3 87 88 #define MAX_RX_QUEUE_PER_LCORE 16 89 #define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS 90 #define MAX_RX_QUEUE_PER_PORT 128 91 92 #ifdef FF_KNI 93 #define KNI_MBUF_MAX 2048 94 #define KNI_QUEUE_SIZE 2048 95 96 static int enable_kni; 97 static int kni_accept; 98 #endif 99 100 static int numa_on; 101 102 static unsigned idle_sleep; 103 104 static struct rte_timer freebsd_clock; 105 106 // Mellanox Linux's driver key 107 static uint8_t default_rsskey_40bytes[40] = { 108 0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b, 109 0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb, 110 0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c, 111 0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9, 112 0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc 113 }; 114 115 struct mbuf_table { 116 uint16_t len; 117 struct rte_mbuf *m_table[MAX_PKT_BURST]; 118 }; 119 120 struct lcore_rx_queue { 121 uint16_t port_id; 122 uint16_t queue_id; 123 } __rte_cache_aligned; 124 125 struct lcore_conf { 126 uint16_t proc_id; 127 uint16_t socket_id; 128 uint16_t nb_queue_list[RTE_MAX_ETHPORTS]; 129 struct ff_port_cfg *port_cfgs; 130 131 uint16_t nb_rx_queue; 132 struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE]; 133 uint16_t nb_tx_port; 134 uint16_t tx_port_id[RTE_MAX_ETHPORTS]; 135 uint16_t tx_queue_id[RTE_MAX_ETHPORTS]; 136 struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS]; 137 char *pcap[RTE_MAX_ETHPORTS]; 138 } __rte_cache_aligned; 139 140 static struct lcore_conf lcore_conf; 141 142 static struct rte_mempool *pktmbuf_pool[NB_SOCKETS]; 143 144 static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS]; 145 static dispatch_func_t packet_dispatcher; 146 147 static uint16_t rss_reta_size[RTE_MAX_ETHPORTS]; 148 149 struct ff_msg_ring { 150 char ring_name[2][RTE_RING_NAMESIZE]; 151 /* ring[0] for lcore recv msg, other send */ 152 /* ring[1] for lcore send msg, other read */ 153 struct rte_ring *ring[2]; 154 } __rte_cache_aligned; 155 156 static struct ff_msg_ring msg_ring[RTE_MAX_LCORE]; 157 static struct rte_mempool *message_pool; 158 159 struct ff_dpdk_if_context { 160 void *sc; 161 void *ifp; 162 uint16_t port_id; 163 struct ff_hw_features hw_features; 164 } __rte_cache_aligned; 165 166 static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS]; 167 168 static struct ff_top_args ff_top_status; 169 static struct ff_traffic_args ff_traffic; 170 171 extern void ff_hardclock(void); 172 173 static void 174 ff_hardclock_job(__rte_unused struct rte_timer *timer, 175 __rte_unused void *arg) { 176 ff_hardclock(); 177 ff_update_current_ts(); 178 } 179 180 struct ff_dpdk_if_context * 181 ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg) 182 { 183 struct ff_dpdk_if_context *ctx; 184 185 ctx = calloc(1, sizeof(struct ff_dpdk_if_context)); 186 if (ctx == NULL) 187 return NULL; 188 189 ctx->sc = sc; 190 ctx->ifp = ifp; 191 ctx->port_id = cfg->port_id; 192 ctx->hw_features = cfg->hw_features; 193 194 return ctx; 195 } 196 197 void 198 ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx) 199 { 200 free(ctx); 201 } 202 203 static void 204 check_all_ports_link_status(void) 205 { 206 #define CHECK_INTERVAL 100 /* 100ms */ 207 #define MAX_CHECK_TIME 90 /* 9s (90 * 100ms) in total */ 208 209 uint16_t portid; 210 uint8_t count, all_ports_up, print_flag = 0; 211 struct rte_eth_link link; 212 213 printf("\nChecking link status"); 214 fflush(stdout); 215 216 int i, nb_ports; 217 nb_ports = ff_global_cfg.dpdk.nb_ports; 218 for (count = 0; count <= MAX_CHECK_TIME; count++) { 219 all_ports_up = 1; 220 for (i = 0; i < nb_ports; i++) { 221 uint16_t portid = ff_global_cfg.dpdk.portid_list[i]; 222 memset(&link, 0, sizeof(link)); 223 rte_eth_link_get_nowait(portid, &link); 224 225 /* print link status if flag set */ 226 if (print_flag == 1) { 227 if (link.link_status) { 228 printf("Port %d Link Up - speed %u " 229 "Mbps - %s\n", (int)portid, 230 (unsigned)link.link_speed, 231 (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? 232 ("full-duplex") : ("half-duplex\n")); 233 } else { 234 printf("Port %d Link Down\n", (int)portid); 235 } 236 continue; 237 } 238 /* clear all_ports_up flag if any link down */ 239 if (link.link_status == 0) { 240 all_ports_up = 0; 241 break; 242 } 243 } 244 245 /* after finally printing all link status, get out */ 246 if (print_flag == 1) 247 break; 248 249 if (all_ports_up == 0) { 250 printf("."); 251 fflush(stdout); 252 rte_delay_ms(CHECK_INTERVAL); 253 } 254 255 /* set the print_flag if all ports up or timeout */ 256 if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) { 257 print_flag = 1; 258 printf("done\n"); 259 } 260 } 261 } 262 263 static int 264 init_lcore_conf(void) 265 { 266 uint8_t nb_dev_ports = rte_eth_dev_count_avail(); 267 if (nb_dev_ports == 0) { 268 rte_exit(EXIT_FAILURE, "No probed ethernet devices\n"); 269 } 270 271 if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) { 272 rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n", 273 ff_global_cfg.dpdk.max_portid); 274 } 275 276 lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs; 277 lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id; 278 279 uint16_t proc_id; 280 for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) { 281 uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id]; 282 if (!lcore_config[lcore_id].detected) { 283 rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id); 284 } 285 } 286 287 uint16_t socket_id = 0; 288 if (numa_on) { 289 socket_id = rte_lcore_to_socket_id(rte_lcore_id()); 290 } 291 292 lcore_conf.socket_id = socket_id; 293 294 uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id]; 295 int j; 296 for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) { 297 uint16_t port_id = ff_global_cfg.dpdk.portid_list[j]; 298 struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id]; 299 300 int queueid = -1; 301 int i; 302 for (i = 0; i < pconf->nb_lcores; i++) { 303 if (pconf->lcore_list[i] == lcore_id) { 304 queueid = i; 305 } 306 } 307 if (queueid < 0) { 308 continue; 309 } 310 printf("lcore: %u, port: %u, queue: %u\n", lcore_id, port_id, queueid); 311 uint16_t nb_rx_queue = lcore_conf.nb_rx_queue; 312 lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id; 313 lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid; 314 lcore_conf.nb_rx_queue++; 315 316 lcore_conf.tx_queue_id[port_id] = queueid; 317 lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id; 318 lcore_conf.nb_tx_port++; 319 320 lcore_conf.pcap[port_id] = pconf->pcap; 321 lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores; 322 } 323 324 if (lcore_conf.nb_rx_queue == 0) { 325 rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id); 326 } 327 328 return 0; 329 } 330 331 static int 332 init_mem_pool(void) 333 { 334 uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports; 335 uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs; 336 uint32_t nb_tx_queue = nb_lcores; 337 uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores; 338 339 unsigned nb_mbuf = RTE_MAX ( 340 (nb_rx_queue*RX_QUEUE_SIZE + 341 nb_ports*nb_lcores*MAX_PKT_BURST + 342 nb_ports*nb_tx_queue*TX_QUEUE_SIZE + 343 nb_lcores*MEMPOOL_CACHE_SIZE + 344 #ifdef FF_KNI 345 nb_ports*KNI_MBUF_MAX + 346 nb_ports*KNI_QUEUE_SIZE + 347 #endif 348 nb_lcores*nb_ports*DISPATCH_RING_SIZE), 349 (unsigned)8192); 350 351 unsigned socketid = 0; 352 uint16_t i, lcore_id; 353 char s[64]; 354 355 for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) { 356 lcore_id = ff_global_cfg.dpdk.proc_lcore[i]; 357 if (numa_on) { 358 socketid = rte_lcore_to_socket_id(lcore_id); 359 } 360 361 if (socketid >= NB_SOCKETS) { 362 rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n", 363 socketid, i, NB_SOCKETS); 364 } 365 366 if (pktmbuf_pool[socketid] != NULL) { 367 continue; 368 } 369 370 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 371 snprintf(s, sizeof(s), "mbuf_pool_%d", socketid); 372 pktmbuf_pool[socketid] = 373 rte_pktmbuf_pool_create(s, nb_mbuf, 374 MEMPOOL_CACHE_SIZE, 0, 375 RTE_MBUF_DEFAULT_BUF_SIZE, socketid); 376 } else { 377 snprintf(s, sizeof(s), "mbuf_pool_%d", socketid); 378 pktmbuf_pool[socketid] = rte_mempool_lookup(s); 379 } 380 381 if (pktmbuf_pool[socketid] == NULL) { 382 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid); 383 } else { 384 printf("create mbuf pool on socket %d\n", socketid); 385 } 386 } 387 388 return 0; 389 } 390 391 static struct rte_ring * 392 create_ring(const char *name, unsigned count, int socket_id, unsigned flags) 393 { 394 struct rte_ring *ring; 395 396 if (name == NULL) { 397 rte_exit(EXIT_FAILURE, "create ring failed, no name!\n"); 398 } 399 400 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 401 ring = rte_ring_create(name, count, socket_id, flags); 402 } else { 403 ring = rte_ring_lookup(name); 404 } 405 406 if (ring == NULL) { 407 rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name); 408 } 409 410 return ring; 411 } 412 413 static int 414 init_dispatch_ring(void) 415 { 416 int j; 417 char name_buf[RTE_RING_NAMESIZE]; 418 int queueid; 419 420 unsigned socketid = lcore_conf.socket_id; 421 422 /* Create ring according to ports actually being used. */ 423 int nb_ports = ff_global_cfg.dpdk.nb_ports; 424 for (j = 0; j < nb_ports; j++) { 425 uint16_t portid = ff_global_cfg.dpdk.portid_list[j]; 426 struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid]; 427 int nb_queues = pconf->nb_lcores; 428 if (dispatch_ring[portid] == NULL) { 429 snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid); 430 431 dispatch_ring[portid] = rte_zmalloc(name_buf, 432 sizeof(struct rte_ring *) * nb_queues, 433 RTE_CACHE_LINE_SIZE); 434 if (dispatch_ring[portid] == NULL) { 435 rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) " 436 "failed\n", name_buf); 437 } 438 } 439 440 for(queueid = 0; queueid < nb_queues; ++queueid) { 441 snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d", 442 portid, queueid); 443 dispatch_ring[portid][queueid] = create_ring(name_buf, 444 DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ); 445 446 if (dispatch_ring[portid][queueid] == NULL) 447 rte_panic("create ring:%s failed!\n", name_buf); 448 449 printf("create ring:%s success, %u ring entries are now free!\n", 450 name_buf, rte_ring_free_count(dispatch_ring[portid][queueid])); 451 } 452 } 453 454 return 0; 455 } 456 457 static void 458 ff_msg_init(struct rte_mempool *mp, 459 __attribute__((unused)) void *opaque_arg, 460 void *obj, __attribute__((unused)) unsigned i) 461 { 462 struct ff_msg *msg = (struct ff_msg *)obj; 463 msg->msg_type = FF_UNKNOWN; 464 msg->buf_addr = (char *)msg + sizeof(struct ff_msg); 465 msg->buf_len = mp->elt_size - sizeof(struct ff_msg); 466 } 467 468 static int 469 init_msg_ring(void) 470 { 471 uint16_t i; 472 uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs; 473 unsigned socketid = lcore_conf.socket_id; 474 475 /* Create message buffer pool */ 476 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 477 message_pool = rte_mempool_create(FF_MSG_POOL, 478 MSG_RING_SIZE * 2 * nb_procs, 479 MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0, 480 NULL, NULL, ff_msg_init, NULL, 481 socketid, 0); 482 } else { 483 message_pool = rte_mempool_lookup(FF_MSG_POOL); 484 } 485 486 if (message_pool == NULL) { 487 rte_panic("Create msg mempool failed\n"); 488 } 489 490 for(i = 0; i < nb_procs; ++i) { 491 snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE, 492 "%s%u", FF_MSG_RING_IN, i); 493 snprintf(msg_ring[i].ring_name[1], RTE_RING_NAMESIZE, 494 "%s%u", FF_MSG_RING_OUT, i); 495 496 msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0], 497 MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ); 498 if (msg_ring[i].ring[0] == NULL) 499 rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[0]); 500 501 msg_ring[i].ring[1] = create_ring(msg_ring[i].ring_name[1], 502 MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ); 503 if (msg_ring[i].ring[1] == NULL) 504 rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[0]); 505 } 506 507 return 0; 508 } 509 510 #ifdef FF_KNI 511 static int 512 init_kni(void) 513 { 514 int nb_ports = rte_eth_dev_count_avail(); 515 kni_accept = 0; 516 if(strcasecmp(ff_global_cfg.kni.method, "accept") == 0) 517 kni_accept = 1; 518 519 ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port, 520 ff_global_cfg.kni.udp_port); 521 522 unsigned socket_id = lcore_conf.socket_id; 523 struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id]; 524 525 nb_ports = ff_global_cfg.dpdk.nb_ports; 526 int i, ret; 527 for (i = 0; i < nb_ports; i++) { 528 uint16_t port_id = ff_global_cfg.dpdk.portid_list[i]; 529 ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE); 530 } 531 532 return 0; 533 } 534 #endif 535 536 static void 537 set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues) 538 { 539 if (reta_size == 0) { 540 return; 541 } 542 543 int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE); 544 struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size]; 545 546 /* config HW indirection table */ 547 unsigned i, j, hash=0; 548 for (i = 0; i < reta_conf_size; i++) { 549 reta_conf[i].mask = ~0ULL; 550 for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) { 551 reta_conf[i].reta[j] = hash++ % nb_queues; 552 } 553 } 554 555 if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) { 556 rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n", 557 port_id); 558 } 559 } 560 561 static int 562 init_port_start(void) 563 { 564 int nb_ports = ff_global_cfg.dpdk.nb_ports; 565 unsigned socketid = 0; 566 struct rte_mempool *mbuf_pool; 567 uint16_t i; 568 569 for (i = 0; i < nb_ports; i++) { 570 uint16_t port_id = ff_global_cfg.dpdk.portid_list[i]; 571 struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id]; 572 uint16_t nb_queues = pconf->nb_lcores; 573 574 struct rte_eth_dev_info dev_info; 575 rte_eth_dev_info_get(port_id, &dev_info); 576 577 if (nb_queues > dev_info.max_rx_queues) { 578 rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n", 579 nb_queues, 580 dev_info.max_rx_queues); 581 } 582 583 if (nb_queues > dev_info.max_tx_queues) { 584 rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n", 585 nb_queues, 586 dev_info.max_tx_queues); 587 } 588 589 struct ether_addr addr; 590 rte_eth_macaddr_get(port_id, &addr); 591 printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8 592 " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n", 593 (unsigned)port_id, 594 addr.addr_bytes[0], addr.addr_bytes[1], 595 addr.addr_bytes[2], addr.addr_bytes[3], 596 addr.addr_bytes[4], addr.addr_bytes[5]); 597 598 rte_memcpy(pconf->mac, 599 addr.addr_bytes, ETHER_ADDR_LEN); 600 601 struct rte_eth_conf port_conf = {0}; 602 603 /* Set RSS mode */ 604 uint64_t default_rss_hf = ETH_RSS_PROTO_MASK; 605 port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS; 606 port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf; 607 port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes; 608 port_conf.rx_adv_conf.rss_conf.rss_key_len = 40; 609 port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads; 610 if (port_conf.rx_adv_conf.rss_conf.rss_hf != 611 ETH_RSS_PROTO_MASK) { 612 printf("Port %u modified RSS hash function based on hardware support," 613 "requested:%#"PRIx64" configured:%#"PRIx64"\n", 614 port_id, default_rss_hf, 615 port_conf.rx_adv_conf.rss_conf.rss_hf); 616 } 617 618 if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) 619 port_conf.txmode.offloads |= 620 DEV_TX_OFFLOAD_MBUF_FAST_FREE; 621 622 /* Set Rx VLAN stripping */ 623 if (ff_global_cfg.dpdk.vlan_strip) { 624 if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) { 625 port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP; 626 } 627 } 628 629 /* Enable HW CRC stripping */ 630 if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_KEEP_CRC) { 631 port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_KEEP_CRC; 632 } 633 634 /* FIXME: Enable TCP LRO ?*/ 635 #if 0 636 if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) { 637 printf("LRO is supported\n"); 638 port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO; 639 pconf->hw_features.rx_lro = 1; 640 } 641 #endif 642 643 /* Set Rx checksum checking */ 644 if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) && 645 (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) && 646 (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) { 647 printf("RX checksum offload supported\n"); 648 port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM; 649 pconf->hw_features.rx_csum = 1; 650 } 651 652 if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) { 653 printf("TX ip checksum offload supported\n"); 654 pconf->hw_features.tx_csum_ip = 1; 655 } 656 657 if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) && 658 (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) { 659 printf("TX TCP&UDP checksum offload supported\n"); 660 pconf->hw_features.tx_csum_l4 = 1; 661 } 662 663 if (ff_global_cfg.dpdk.tso) { 664 if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) { 665 printf("TSO is supported\n"); 666 pconf->hw_features.tx_tso = 1; 667 } 668 } else { 669 printf("TSO is disabled\n"); 670 } 671 672 if (dev_info.reta_size) { 673 /* reta size must be power of 2 */ 674 assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0); 675 676 rss_reta_size[port_id] = dev_info.reta_size; 677 printf("port[%d]: rss table size: %d\n", port_id, 678 dev_info.reta_size); 679 } 680 681 if (rte_eal_process_type() != RTE_PROC_PRIMARY) { 682 continue; 683 } 684 685 int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf); 686 if (ret != 0) { 687 return ret; 688 } 689 uint16_t q; 690 for (q = 0; q < nb_queues; q++) { 691 if (numa_on) { 692 uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q]; 693 socketid = rte_lcore_to_socket_id(lcore_id); 694 } 695 mbuf_pool = pktmbuf_pool[socketid]; 696 697 ret = rte_eth_tx_queue_setup(port_id, q, TX_QUEUE_SIZE, 698 socketid, &dev_info.default_txconf); 699 if (ret < 0) { 700 return ret; 701 } 702 703 ret = rte_eth_rx_queue_setup(port_id, q, RX_QUEUE_SIZE, 704 socketid, &dev_info.default_rxconf, mbuf_pool); 705 if (ret < 0) { 706 return ret; 707 } 708 } 709 710 ret = rte_eth_dev_start(port_id); 711 if (ret < 0) { 712 return ret; 713 } 714 715 if (nb_queues > 1) { 716 /* set HW rss hash function to Toeplitz. */ 717 if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) { 718 struct rte_eth_hash_filter_info info = {0}; 719 info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG; 720 info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ; 721 722 if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH, 723 RTE_ETH_FILTER_SET, &info) < 0) { 724 rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n", 725 port_id); 726 } 727 } 728 729 set_rss_table(port_id, dev_info.reta_size, nb_queues); 730 } 731 732 /* Enable RX in promiscuous mode for the Ethernet device. */ 733 if (ff_global_cfg.dpdk.promiscuous) { 734 rte_eth_promiscuous_enable(port_id); 735 ret = rte_eth_promiscuous_get(port_id); 736 if (ret == 1) { 737 printf("set port %u to promiscuous mode ok\n", port_id); 738 } else { 739 printf("set port %u to promiscuous mode error\n", port_id); 740 } 741 } 742 743 /* Enable pcap dump */ 744 if (pconf->pcap) { 745 ff_enable_pcap(pconf->pcap); 746 } 747 } 748 749 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 750 check_all_ports_link_status(); 751 } 752 753 return 0; 754 } 755 756 static int 757 init_clock(void) 758 { 759 rte_timer_subsystem_init(); 760 uint64_t hz = rte_get_timer_hz(); 761 uint64_t intrs = MS_PER_S/ff_global_cfg.freebsd.hz; 762 uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S*intrs; 763 764 rte_timer_init(&freebsd_clock); 765 rte_timer_reset(&freebsd_clock, tsc, PERIODICAL, 766 rte_lcore_id(), &ff_hardclock_job, NULL); 767 768 ff_update_current_ts(); 769 770 return 0; 771 } 772 773 int 774 ff_dpdk_init(int argc, char **argv) 775 { 776 if (ff_global_cfg.dpdk.nb_procs < 1 || 777 ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE || 778 ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs || 779 ff_global_cfg.dpdk.proc_id < 0) { 780 printf("param num_procs[%d] or proc_id[%d] error!\n", 781 ff_global_cfg.dpdk.nb_procs, 782 ff_global_cfg.dpdk.proc_id); 783 exit(1); 784 } 785 786 int ret = rte_eal_init(argc, argv); 787 if (ret < 0) { 788 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 789 } 790 791 numa_on = ff_global_cfg.dpdk.numa_on; 792 793 idle_sleep = ff_global_cfg.dpdk.idle_sleep; 794 795 init_lcore_conf(); 796 797 init_mem_pool(); 798 799 init_dispatch_ring(); 800 801 init_msg_ring(); 802 803 #ifdef FF_KNI 804 enable_kni = ff_global_cfg.kni.enable; 805 if (enable_kni) { 806 init_kni(); 807 } 808 #endif 809 810 ret = init_port_start(); 811 if (ret < 0) { 812 rte_exit(EXIT_FAILURE, "init_port_start failed\n"); 813 } 814 815 init_clock(); 816 817 return 0; 818 } 819 820 static void 821 ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt) 822 { 823 uint8_t rx_csum = ctx->hw_features.rx_csum; 824 if (rx_csum) { 825 if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) { 826 rte_pktmbuf_free(pkt); 827 return; 828 } 829 } 830 831 /* 832 * FIXME: should we save pkt->vlan_tci 833 * if (pkt->ol_flags & PKT_RX_VLAN_PKT) 834 */ 835 836 void *data = rte_pktmbuf_mtod(pkt, void*); 837 uint16_t len = rte_pktmbuf_data_len(pkt); 838 839 void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum); 840 if (hdr == NULL) { 841 rte_pktmbuf_free(pkt); 842 return; 843 } 844 845 struct rte_mbuf *pn = pkt->next; 846 void *prev = hdr; 847 while(pn != NULL) { 848 data = rte_pktmbuf_mtod(pn, void*); 849 len = rte_pktmbuf_data_len(pn); 850 851 void *mb = ff_mbuf_get(prev, data, len); 852 if (mb == NULL) { 853 ff_mbuf_free(hdr); 854 rte_pktmbuf_free(pkt); 855 return; 856 } 857 pn = pn->next; 858 prev = mb; 859 } 860 861 ff_veth_process_packet(ctx->ifp, hdr); 862 } 863 864 static enum FilterReturn 865 protocol_filter(const void *data, uint16_t len) 866 { 867 if(len < ETHER_HDR_LEN) 868 return FILTER_UNKNOWN; 869 870 const struct ether_hdr *hdr; 871 hdr = (const struct ether_hdr *)data; 872 873 if(ntohs(hdr->ether_type) == ETHER_TYPE_ARP) 874 return FILTER_ARP; 875 876 #ifndef FF_KNI 877 return FILTER_UNKNOWN; 878 #else 879 if (!enable_kni) { 880 return FILTER_UNKNOWN; 881 } 882 883 if(ntohs(hdr->ether_type) != ETHER_TYPE_IPv4) 884 return FILTER_UNKNOWN; 885 886 return ff_kni_proto_filter(data + ETHER_HDR_LEN, 887 len - ETHER_HDR_LEN); 888 #endif 889 } 890 891 static inline void 892 pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m) 893 { 894 struct rte_mbuf *md; 895 void *src, *dst; 896 897 dst = rte_pktmbuf_mtod(mi, void *); 898 src = rte_pktmbuf_mtod(m, void *); 899 900 mi->data_len = m->data_len; 901 rte_memcpy(dst, src, m->data_len); 902 903 mi->port = m->port; 904 mi->vlan_tci = m->vlan_tci; 905 mi->vlan_tci_outer = m->vlan_tci_outer; 906 mi->tx_offload = m->tx_offload; 907 mi->hash = m->hash; 908 mi->ol_flags = m->ol_flags; 909 mi->packet_type = m->packet_type; 910 } 911 912 /* copied from rte_pktmbuf_clone */ 913 static inline struct rte_mbuf * 914 pktmbuf_deep_clone(const struct rte_mbuf *md, 915 struct rte_mempool *mp) 916 { 917 struct rte_mbuf *mc, *mi, **prev; 918 uint32_t pktlen; 919 uint8_t nseg; 920 921 if (unlikely ((mc = rte_pktmbuf_alloc(mp)) == NULL)) 922 return NULL; 923 924 mi = mc; 925 prev = &mi->next; 926 pktlen = md->pkt_len; 927 nseg = 0; 928 929 do { 930 nseg++; 931 pktmbuf_deep_attach(mi, md); 932 *prev = mi; 933 prev = &mi->next; 934 } while ((md = md->next) != NULL && 935 (mi = rte_pktmbuf_alloc(mp)) != NULL); 936 937 *prev = NULL; 938 mc->nb_segs = nseg; 939 mc->pkt_len = pktlen; 940 941 /* Allocation of new indirect segment failed */ 942 if (unlikely (mi == NULL)) { 943 rte_pktmbuf_free(mc); 944 return NULL; 945 } 946 947 __rte_mbuf_sanity_check(mc, 1); 948 return mc; 949 } 950 951 static inline void 952 process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs, 953 uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring) 954 { 955 struct lcore_conf *qconf = &lcore_conf; 956 uint16_t nb_queues = qconf->nb_queue_list[port_id]; 957 958 uint16_t i; 959 for (i = 0; i < count; i++) { 960 struct rte_mbuf *rtem = bufs[i]; 961 962 if (unlikely(qconf->pcap[port_id] != NULL)) { 963 if (!pkts_from_ring) { 964 ff_dump_packets(qconf->pcap[port_id], rtem); 965 } 966 } 967 968 void *data = rte_pktmbuf_mtod(rtem, void*); 969 uint16_t len = rte_pktmbuf_data_len(rtem); 970 971 if (!pkts_from_ring) { 972 ff_traffic.rx_packets++; 973 ff_traffic.rx_bytes += len; 974 } 975 976 if (!pkts_from_ring && packet_dispatcher) { 977 int ret = (*packet_dispatcher)(data, len, queue_id, nb_queues); 978 if (ret < 0 || ret >= nb_queues) { 979 rte_pktmbuf_free(rtem); 980 continue; 981 } 982 983 if (ret != queue_id) { 984 ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem); 985 if (ret < 0) 986 rte_pktmbuf_free(rtem); 987 988 continue; 989 } 990 } 991 992 enum FilterReturn filter = protocol_filter(data, len); 993 if (filter == FILTER_ARP) { 994 struct rte_mempool *mbuf_pool; 995 struct rte_mbuf *mbuf_clone; 996 if (!pkts_from_ring) { 997 uint16_t j; 998 for(j = 0; j < nb_queues; ++j) { 999 if(j == queue_id) 1000 continue; 1001 1002 unsigned socket_id = 0; 1003 if (numa_on) { 1004 uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j]; 1005 socket_id = rte_lcore_to_socket_id(lcore_id); 1006 } 1007 mbuf_pool = pktmbuf_pool[socket_id]; 1008 mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool); 1009 if(mbuf_clone) { 1010 int ret = rte_ring_enqueue(dispatch_ring[port_id][j], 1011 mbuf_clone); 1012 if (ret < 0) 1013 rte_pktmbuf_free(mbuf_clone); 1014 } 1015 } 1016 } 1017 1018 #ifdef FF_KNI 1019 if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) { 1020 mbuf_pool = pktmbuf_pool[qconf->socket_id]; 1021 mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool); 1022 if(mbuf_clone) { 1023 ff_kni_enqueue(port_id, mbuf_clone); 1024 } 1025 } 1026 #endif 1027 ff_veth_input(ctx, rtem); 1028 #ifdef FF_KNI 1029 } else if (enable_kni && 1030 ((filter == FILTER_KNI && kni_accept) || 1031 (filter == FILTER_UNKNOWN && !kni_accept)) ) { 1032 ff_kni_enqueue(port_id, rtem); 1033 #endif 1034 } else { 1035 ff_veth_input(ctx, rtem); 1036 } 1037 } 1038 } 1039 1040 static inline int 1041 process_dispatch_ring(uint16_t port_id, uint16_t queue_id, 1042 struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx) 1043 { 1044 /* read packet from ring buf and to process */ 1045 uint16_t nb_rb; 1046 nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id], 1047 (void **)pkts_burst, MAX_PKT_BURST, NULL); 1048 1049 if(nb_rb > 0) { 1050 process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1); 1051 } 1052 1053 return 0; 1054 } 1055 1056 static inline void 1057 handle_sysctl_msg(struct ff_msg *msg) 1058 { 1059 int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen, 1060 msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new, 1061 msg->sysctl.newlen); 1062 1063 if (ret < 0) { 1064 msg->result = errno; 1065 } else { 1066 msg->result = 0; 1067 } 1068 } 1069 1070 static inline void 1071 handle_ioctl_msg(struct ff_msg *msg) 1072 { 1073 int fd, ret; 1074 fd = ff_socket(AF_INET, SOCK_DGRAM, 0); 1075 if (fd < 0) { 1076 ret = -1; 1077 goto done; 1078 } 1079 1080 ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data); 1081 1082 ff_close(fd); 1083 1084 done: 1085 if (ret < 0) { 1086 msg->result = errno; 1087 } else { 1088 msg->result = 0; 1089 } 1090 } 1091 1092 static inline void 1093 handle_route_msg(struct ff_msg *msg) 1094 { 1095 int ret = ff_rtioctl(msg->route.fib, msg->route.data, 1096 &msg->route.len, msg->route.maxlen); 1097 if (ret < 0) { 1098 msg->result = errno; 1099 } else { 1100 msg->result = 0; 1101 } 1102 } 1103 1104 static inline void 1105 handle_top_msg(struct ff_msg *msg) 1106 { 1107 msg->top = ff_top_status; 1108 msg->result = 0; 1109 } 1110 1111 #ifdef FF_NETGRAPH 1112 static inline void 1113 handle_ngctl_msg(struct ff_msg *msg) 1114 { 1115 int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data); 1116 if (ret < 0) { 1117 msg->result = errno; 1118 } else { 1119 msg->result = 0; 1120 msg->ngctl.ret = ret; 1121 } 1122 } 1123 #endif 1124 1125 #ifdef FF_IPFW 1126 static inline void 1127 handle_ipfw_msg(struct ff_msg *msg) 1128 { 1129 int fd, ret; 1130 fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW); 1131 if (fd < 0) { 1132 ret = -1; 1133 goto done; 1134 } 1135 1136 switch (msg->ipfw.cmd) { 1137 case FF_IPFW_GET: 1138 ret = ff_getsockopt_freebsd(fd, msg->ipfw.level, 1139 msg->ipfw.optname, msg->ipfw.optval, 1140 msg->ipfw.optlen); 1141 break; 1142 case FF_IPFW_SET: 1143 ret = ff_setsockopt_freebsd(fd, msg->ipfw.level, 1144 msg->ipfw.optname, msg->ipfw.optval, 1145 *(msg->ipfw.optlen)); 1146 break; 1147 default: 1148 ret = -1; 1149 errno = ENOTSUP; 1150 break; 1151 } 1152 1153 ff_close(fd); 1154 1155 done: 1156 if (ret < 0) { 1157 msg->result = errno; 1158 } else { 1159 msg->result = 0; 1160 } 1161 } 1162 #endif 1163 1164 static inline void 1165 handle_traffic_msg(struct ff_msg *msg) 1166 { 1167 msg->traffic = ff_traffic; 1168 msg->result = 0; 1169 } 1170 1171 static inline void 1172 handle_default_msg(struct ff_msg *msg) 1173 { 1174 msg->result = ENOTSUP; 1175 } 1176 1177 static inline void 1178 handle_msg(struct ff_msg *msg, uint16_t proc_id) 1179 { 1180 switch (msg->msg_type) { 1181 case FF_SYSCTL: 1182 handle_sysctl_msg(msg); 1183 break; 1184 case FF_IOCTL: 1185 handle_ioctl_msg(msg); 1186 break; 1187 case FF_ROUTE: 1188 handle_route_msg(msg); 1189 break; 1190 case FF_TOP: 1191 handle_top_msg(msg); 1192 break; 1193 #ifdef FF_NETGRAPH 1194 case FF_NGCTL: 1195 handle_ngctl_msg(msg); 1196 break; 1197 #endif 1198 #ifdef FF_IPFW 1199 case FF_IPFW_CTL: 1200 handle_ipfw_msg(msg); 1201 break; 1202 #endif 1203 case FF_TRAFFIC: 1204 handle_traffic_msg(msg); 1205 break; 1206 default: 1207 handle_default_msg(msg); 1208 break; 1209 } 1210 rte_ring_enqueue(msg_ring[proc_id].ring[1], msg); 1211 } 1212 1213 static inline int 1214 process_msg_ring(uint16_t proc_id) 1215 { 1216 void *msg; 1217 int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg); 1218 1219 if (unlikely(ret == 0)) { 1220 handle_msg((struct ff_msg *)msg, proc_id); 1221 } 1222 1223 return 0; 1224 } 1225 1226 /* Send burst of packets on an output interface */ 1227 static inline int 1228 send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port) 1229 { 1230 struct rte_mbuf **m_table; 1231 int ret; 1232 uint16_t queueid; 1233 1234 queueid = qconf->tx_queue_id[port]; 1235 m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table; 1236 1237 if (unlikely(qconf->pcap[port] != NULL)) { 1238 uint16_t i; 1239 for (i = 0; i < n; i++) { 1240 ff_dump_packets(qconf->pcap[port], m_table[i]); 1241 } 1242 } 1243 1244 ff_traffic.tx_packets += n; 1245 uint16_t i; 1246 for (i = 0; i < n; i++) { 1247 ff_traffic.tx_bytes += rte_pktmbuf_data_len(m_table[i]); 1248 } 1249 1250 ret = rte_eth_tx_burst(port, queueid, m_table, n); 1251 if (unlikely(ret < n)) { 1252 do { 1253 rte_pktmbuf_free(m_table[ret]); 1254 } while (++ret < n); 1255 } 1256 1257 return 0; 1258 } 1259 1260 /* Enqueue a single packet, and send burst if queue is filled */ 1261 static inline int 1262 send_single_packet(struct rte_mbuf *m, uint8_t port) 1263 { 1264 uint16_t len; 1265 struct lcore_conf *qconf; 1266 1267 qconf = &lcore_conf; 1268 len = qconf->tx_mbufs[port].len; 1269 qconf->tx_mbufs[port].m_table[len] = m; 1270 len++; 1271 1272 /* enough pkts to be sent */ 1273 if (unlikely(len == MAX_PKT_BURST)) { 1274 send_burst(qconf, MAX_PKT_BURST, port); 1275 len = 0; 1276 } 1277 1278 qconf->tx_mbufs[port].len = len; 1279 return 0; 1280 } 1281 1282 int 1283 ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m, 1284 int total) 1285 { 1286 struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id]; 1287 struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool); 1288 if (head == NULL) { 1289 ff_mbuf_free(m); 1290 return -1; 1291 } 1292 1293 head->pkt_len = total; 1294 head->nb_segs = 0; 1295 1296 int off = 0; 1297 struct rte_mbuf *cur = head, *prev = NULL; 1298 while(total > 0) { 1299 if (cur == NULL) { 1300 cur = rte_pktmbuf_alloc(mbuf_pool); 1301 if (cur == NULL) { 1302 rte_pktmbuf_free(head); 1303 ff_mbuf_free(m); 1304 return -1; 1305 } 1306 } 1307 1308 if (prev != NULL) { 1309 prev->next = cur; 1310 } 1311 head->nb_segs++; 1312 1313 prev = cur; 1314 void *data = rte_pktmbuf_mtod(cur, void*); 1315 int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total; 1316 int ret = ff_mbuf_copydata(m, data, off, len); 1317 if (ret < 0) { 1318 rte_pktmbuf_free(head); 1319 ff_mbuf_free(m); 1320 return -1; 1321 } 1322 1323 1324 cur->data_len = len; 1325 off += len; 1326 total -= len; 1327 cur = NULL; 1328 } 1329 1330 struct ff_tx_offload offload = {0}; 1331 ff_mbuf_tx_offload(m, &offload); 1332 1333 void *data = rte_pktmbuf_mtod(head, void*); 1334 1335 if (offload.ip_csum) { 1336 /* ipv6 not supported yet */ 1337 struct ipv4_hdr *iph; 1338 int iph_len; 1339 iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN); 1340 iph_len = (iph->version_ihl & 0x0f) << 2; 1341 1342 head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4; 1343 head->l2_len = ETHER_HDR_LEN; 1344 head->l3_len = iph_len; 1345 } 1346 1347 if (ctx->hw_features.tx_csum_l4) { 1348 struct ipv4_hdr *iph; 1349 int iph_len; 1350 iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN); 1351 iph_len = (iph->version_ihl & 0x0f) << 2; 1352 1353 if (offload.tcp_csum) { 1354 head->ol_flags |= PKT_TX_TCP_CKSUM; 1355 head->l2_len = ETHER_HDR_LEN; 1356 head->l3_len = iph_len; 1357 } 1358 1359 /* 1360 * TCP segmentation offload. 1361 * 1362 * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag 1363 * implies PKT_TX_TCP_CKSUM) 1364 * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6 1365 * - if it's IPv4, set the PKT_TX_IP_CKSUM flag and 1366 * write the IP checksum to 0 in the packet 1367 * - fill the mbuf offload information: l2_len, 1368 * l3_len, l4_len, tso_segsz 1369 * - calculate the pseudo header checksum without taking ip_len 1370 * in account, and set it in the TCP header. Refer to 1371 * rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be 1372 * used as helpers. 1373 */ 1374 if (offload.tso_seg_size) { 1375 struct tcp_hdr *tcph; 1376 int tcph_len; 1377 tcph = (struct tcp_hdr *)((char *)iph + iph_len); 1378 tcph_len = (tcph->data_off & 0xf0) >> 2; 1379 tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG); 1380 1381 head->ol_flags |= PKT_TX_TCP_SEG; 1382 head->l4_len = tcph_len; 1383 head->tso_segsz = offload.tso_seg_size; 1384 } 1385 1386 if (offload.udp_csum) { 1387 head->ol_flags |= PKT_TX_UDP_CKSUM; 1388 head->l2_len = ETHER_HDR_LEN; 1389 head->l3_len = iph_len; 1390 } 1391 } 1392 1393 ff_mbuf_free(m); 1394 1395 return send_single_packet(head, ctx->port_id); 1396 } 1397 1398 static int 1399 main_loop(void *arg) 1400 { 1401 struct loop_routine *lr = (struct loop_routine *)arg; 1402 1403 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1404 uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc; 1405 int i, j, nb_rx, idle; 1406 uint16_t port_id, queue_id; 1407 struct lcore_conf *qconf; 1408 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / 1409 US_PER_S * BURST_TX_DRAIN_US; 1410 struct ff_dpdk_if_context *ctx; 1411 1412 prev_tsc = 0; 1413 usch_tsc = 0; 1414 1415 qconf = &lcore_conf; 1416 1417 while (1) { 1418 cur_tsc = rte_rdtsc(); 1419 if (unlikely(freebsd_clock.expire < cur_tsc)) { 1420 rte_timer_manage(); 1421 } 1422 1423 idle = 1; 1424 sys_tsc = 0; 1425 usr_tsc = 0; 1426 1427 /* 1428 * TX burst queue drain 1429 */ 1430 diff_tsc = cur_tsc - prev_tsc; 1431 if (unlikely(diff_tsc > drain_tsc)) { 1432 for (i = 0; i < qconf->nb_tx_port; i++) { 1433 port_id = qconf->tx_port_id[i]; 1434 if (qconf->tx_mbufs[port_id].len == 0) 1435 continue; 1436 1437 idle = 0; 1438 1439 send_burst(qconf, 1440 qconf->tx_mbufs[port_id].len, 1441 port_id); 1442 qconf->tx_mbufs[port_id].len = 0; 1443 } 1444 1445 prev_tsc = cur_tsc; 1446 } 1447 1448 /* 1449 * Read packet from RX queues 1450 */ 1451 for (i = 0; i < qconf->nb_rx_queue; ++i) { 1452 port_id = qconf->rx_queue_list[i].port_id; 1453 queue_id = qconf->rx_queue_list[i].queue_id; 1454 ctx = veth_ctx[port_id]; 1455 1456 #ifdef FF_KNI 1457 if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) { 1458 ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST); 1459 } 1460 #endif 1461 1462 process_dispatch_ring(port_id, queue_id, pkts_burst, ctx); 1463 1464 nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst, 1465 MAX_PKT_BURST); 1466 if (nb_rx == 0) 1467 continue; 1468 1469 idle = 0; 1470 1471 /* Prefetch first packets */ 1472 for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) { 1473 rte_prefetch0(rte_pktmbuf_mtod( 1474 pkts_burst[j], void *)); 1475 } 1476 1477 /* Prefetch and handle already prefetched packets */ 1478 for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) { 1479 rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[ 1480 j + PREFETCH_OFFSET], void *)); 1481 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0); 1482 } 1483 1484 /* Handle remaining prefetched packets */ 1485 for (; j < nb_rx; j++) { 1486 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0); 1487 } 1488 } 1489 1490 process_msg_ring(qconf->proc_id); 1491 1492 div_tsc = rte_rdtsc(); 1493 1494 if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc > drain_tsc))) { 1495 usch_tsc = cur_tsc; 1496 lr->loop(lr->arg); 1497 } 1498 1499 idle_sleep_tsc = rte_rdtsc(); 1500 if (likely(idle && idle_sleep)) { 1501 usleep(idle_sleep); 1502 end_tsc = rte_rdtsc(); 1503 } else { 1504 end_tsc = idle_sleep_tsc; 1505 } 1506 1507 end_tsc = rte_rdtsc(); 1508 1509 if (usch_tsc == cur_tsc) { 1510 usr_tsc = idle_sleep_tsc - div_tsc; 1511 } 1512 1513 if (!idle) { 1514 sys_tsc = div_tsc - cur_tsc; 1515 ff_top_status.sys_tsc += sys_tsc; 1516 } 1517 1518 ff_top_status.usr_tsc += usr_tsc; 1519 ff_top_status.work_tsc += end_tsc - cur_tsc; 1520 ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc; 1521 1522 ff_top_status.loops++; 1523 } 1524 1525 return 0; 1526 } 1527 1528 int 1529 ff_dpdk_if_up(void) { 1530 int i; 1531 struct lcore_conf *qconf = &lcore_conf; 1532 for (i = 0; i < qconf->nb_tx_port; i++) { 1533 uint16_t port_id = qconf->tx_port_id[i]; 1534 1535 struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id]; 1536 veth_ctx[port_id] = ff_veth_attach(pconf); 1537 if (veth_ctx[port_id] == NULL) { 1538 rte_exit(EXIT_FAILURE, "ff_veth_attach failed"); 1539 } 1540 } 1541 1542 return 0; 1543 } 1544 1545 void 1546 ff_dpdk_run(loop_func_t loop, void *arg) { 1547 struct loop_routine *lr = rte_malloc(NULL, 1548 sizeof(struct loop_routine), 0); 1549 lr->loop = loop; 1550 lr->arg = arg; 1551 rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER); 1552 rte_eal_mp_wait_lcore(); 1553 rte_free(lr); 1554 } 1555 1556 void 1557 ff_dpdk_pktmbuf_free(void *m) 1558 { 1559 rte_pktmbuf_free((struct rte_mbuf *)m); 1560 } 1561 1562 static uint32_t 1563 toeplitz_hash(unsigned keylen, const uint8_t *key, 1564 unsigned datalen, const uint8_t *data) 1565 { 1566 uint32_t hash = 0, v; 1567 u_int i, b; 1568 1569 /* XXXRW: Perhaps an assertion about key length vs. data length? */ 1570 1571 v = (key[0]<<24) + (key[1]<<16) + (key[2] <<8) + key[3]; 1572 for (i = 0; i < datalen; i++) { 1573 for (b = 0; b < 8; b++) { 1574 if (data[i] & (1<<(7-b))) 1575 hash ^= v; 1576 v <<= 1; 1577 if ((i + 4) < keylen && 1578 (key[i+4] & (1<<(7-b)))) 1579 v |= 1; 1580 } 1581 } 1582 return (hash); 1583 } 1584 1585 int 1586 ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr, 1587 uint16_t sport, uint16_t dport) 1588 { 1589 struct lcore_conf *qconf = &lcore_conf; 1590 struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc); 1591 uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id]; 1592 1593 if (nb_queues <= 1) { 1594 return 1; 1595 } 1596 1597 uint16_t reta_size = rss_reta_size[ctx->port_id]; 1598 uint16_t queueid = qconf->tx_queue_id[ctx->port_id]; 1599 1600 uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) + 1601 sizeof(dport)]; 1602 1603 unsigned datalen = 0; 1604 1605 bcopy(&saddr, &data[datalen], sizeof(saddr)); 1606 datalen += sizeof(saddr); 1607 1608 bcopy(&daddr, &data[datalen], sizeof(daddr)); 1609 datalen += sizeof(daddr); 1610 1611 bcopy(&sport, &data[datalen], sizeof(sport)); 1612 datalen += sizeof(sport); 1613 1614 bcopy(&dport, &data[datalen], sizeof(dport)); 1615 datalen += sizeof(dport); 1616 1617 uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes), 1618 default_rsskey_40bytes, datalen, data); 1619 1620 return ((hash & (reta_size - 1)) % nb_queues) == queueid; 1621 } 1622 1623 void 1624 ff_regist_packet_dispatcher(dispatch_func_t func) 1625 { 1626 packet_dispatcher = func; 1627 } 1628 1629 uint64_t 1630 ff_get_tsc_ns() 1631 { 1632 uint64_t cur_tsc = rte_rdtsc(); 1633 uint64_t hz = rte_get_tsc_hz(); 1634 return ((double)cur_tsc/(double)hz) * NS_PER_S; 1635 } 1636 1637