/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <ctype.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"

#define MEMPOOL_CACHE_SIZE 256

#define ARP_RING_SIZE 2048

/*
 * Configurable number of RX/TX ring descriptors
 */
#define RX_QUEUE_SIZE 512
#define TX_QUEUE_SIZE 256

#define MAX_PKT_BURST 32
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
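
/*
 * For illustration (numbers follow the defines above): the TX path batches
 * up to MAX_PKT_BURST (32) mbufs per port before calling rte_eth_tx_burst(),
 * and the main loop drains any partial batch roughly every
 * BURST_TX_DRAIN_US (100us), so low-rate traffic is not held back.
 */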

/*
 * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send.
 */
#define MAX_TX_BURST (MAX_PKT_BURST / 2)

#define NB_SOCKETS 8

/* Configure how many packets ahead to prefetch, when reading packets */
#define PREFETCH_OFFSET 3

#define MAX_RX_QUEUE_PER_LCORE 16
#define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
#define MAX_RX_QUEUE_PER_PORT 128

#define BITS_PER_HEX 4

static int enable_kni;
static int kni_accept;

static struct rte_timer freebsd_clock;

/* Same 40-byte RSS key as the Mellanox Linux driver uses. */
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

static struct rte_eth_conf default_port_conf = {
    .rxmode = {
        .mq_mode = ETH_MQ_RX_RSS,
        .max_rx_pkt_len = ETHER_MAX_LEN,
        .split_hdr_size = 0,  /**< Header buffer size */
        .header_split = 0,    /**< Header split disabled */
        .hw_ip_checksum = 0,  /**< IP checksum offload disabled */
        .hw_vlan_filter = 0,  /**< VLAN filtering disabled */
        .hw_vlan_strip = 0,   /**< VLAN strip disabled */
        .hw_vlan_extend = 0,  /**< Extended VLAN disabled */
        .jumbo_frame = 0,     /**< Jumbo frame support disabled */
        .hw_strip_crc = 0,    /**< CRC stripping by hardware disabled */
        .enable_lro = 0,      /**< LRO disabled */
    },
    .rx_adv_conf = {
        .rss_conf = {
            .rss_key = default_rsskey_40bytes,
            .rss_key_len = 40,
            .rss_hf = ETH_RSS_PROTO_MASK,
        },
    },
    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
};

struct mbuf_table {
    uint16_t len;
    struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct lcore_rx_queue {
    uint8_t port_id;
    uint8_t queue_id;
} __rte_cache_aligned;

struct lcore_conf {
    uint16_t proc_id;
    uint16_t nb_procs;
    uint16_t socket_id;
    uint16_t nb_rx_queue;
    uint16_t *lcore_proc;
    struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
    uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
    struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
    char *pcap[RTE_MAX_ETHPORTS];
} __rte_cache_aligned;

static struct lcore_conf lcore_conf;

static struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **arp_ring[RTE_MAX_LCORE];

struct ff_dpdk_if_context {
    void *sc;
    void *ifp;
    uint16_t port_id;
    struct ff_hw_features hw_features;
} __rte_cache_aligned;

static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

extern void ff_hardclock(void);

static void
freebsd_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}
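
/*
 * Illustrative pairing (a sketch, not a call site in this file): the veth
 * layer is expected to create and destroy the context around an interface:
 *
 *     struct ff_dpdk_if_context *ctx = ff_dpdk_register_if(sc, ifp, cfg);
 *     ...
 *     ff_dpdk_deregister_if(ctx);
 */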

static void
check_all_ports_link_status(void)
{
#define CHECK_INTERVAL 100 /* 100ms */
#define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint8_t portid = ff_global_cfg.dpdk.port_cfgs[i].port_id;
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u Mbps - %s\n",
                        (int)portid, (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

/* Convert one hex digit to its value, e.g. xdigit2val('f') == 15. */
static int
xdigit2val(unsigned char c)
{
    int val;

    if (isdigit(c))
        val = c - '0';
    else if (isupper(c))
        val = c - 'A' + 10;
    else
        val = c - 'a' + 10;
    return val;
}
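
/*
 * Example for parse_lcore_mask() below (illustrative): coremask "0x6" with
 * nb_procs == 2 parses from the least-significant hex digit upward, yielding
 * lcore_proc[] == {1, 2}; the parse fails if the mask selects fewer than
 * nb_procs detected lcores.
 */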

static int
parse_lcore_mask(const char *coremask, uint16_t *lcore_proc,
    uint16_t nb_procs)
{
    int i, j, idx = 0;
    unsigned count = 0;
    char c;
    int val;

    if (coremask == NULL)
        return -1;

    /*
     * Strip leading/trailing blank characters and an
     * optional 0x/0X prefix.
     */
    while (isblank(*coremask))
        coremask++;
    if (coremask[0] == '0' && ((coremask[1] == 'x')
        || (coremask[1] == 'X')))
        coremask += 2;

    i = strlen(coremask);
    while ((i > 0) && isblank(coremask[i - 1]))
        i--;

    if (i == 0)
        return -1;

    for (i = i - 1; i >= 0 && idx < RTE_MAX_LCORE && count < nb_procs; i--) {
        c = coremask[i];
        if (isxdigit(c) == 0) {
            return -1;
        }
        val = xdigit2val(c);
        for (j = 0; j < BITS_PER_HEX && idx < RTE_MAX_LCORE && count < nb_procs;
            j++, idx++) {
            if ((1 << j) & val) {
                if (!lcore_config[idx].detected) {
                    RTE_LOG(ERR, EAL, "lcore %u unavailable\n", idx);
                    return -1;
                }
                lcore_proc[count] = idx;
                count++;
            }
        }
    }

    /* Any remaining characters must be '0'. */
    for (; i >= 0; i--)
        if (coremask[i] != '0')
            return -1;

    if (count < nb_procs)
        return -1;

    return 0;
}

static int
init_lcore_conf(void)
{
    uint8_t nb_ports = rte_eth_dev_count();
    if (nb_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;
    lcore_conf.nb_procs = ff_global_cfg.dpdk.nb_procs;
    lcore_conf.lcore_proc = rte_zmalloc(NULL,
        sizeof(uint16_t) * lcore_conf.nb_procs, 0);
    if (lcore_conf.lcore_proc == NULL) {
        rte_exit(EXIT_FAILURE, "rte_zmalloc lcore_proc failed\n");
    }

    int ret = parse_lcore_mask(ff_global_cfg.dpdk.lcore_mask,
        lcore_conf.lcore_proc, lcore_conf.nb_procs);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "parse_lcore_mask failed: %s\n",
            ff_global_cfg.dpdk.lcore_mask);
    }

    uint16_t socket_id = 0;
    if (ff_global_cfg.dpdk.numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;
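
    /*
     * Illustration of the mapping set up below (assumed numbers): with
     * nb_procs == 4, the process with proc_id == 1 polls RX queue 1 and
     * owns TX queue 1 on every enabled port.
     */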

    /* Currently, proc id maps 1:1 to the rx/tx queue id per port. */
    uint8_t port_id, enabled_ports = 0;
    for (port_id = 0; port_id < nb_ports; port_id++) {
        if (ff_global_cfg.dpdk.port_mask &&
            (ff_global_cfg.dpdk.port_mask & (1 << port_id)) == 0) {
            printf("\nSkipping disabled port %d\n", port_id);
            continue;
        }

        if (port_id >= ff_global_cfg.dpdk.nb_ports) {
            printf("\nSkipping non-configured port %d\n", port_id);
            break;
        }

        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = lcore_conf.proc_id;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = lcore_conf.proc_id;
        lcore_conf.pcap[port_id] = ff_global_cfg.dpdk.port_cfgs[enabled_ports].pcap;

        ff_global_cfg.dpdk.port_cfgs[enabled_ports].port_id = port_id;

        enabled_ports++;
    }

    ff_global_cfg.dpdk.nb_ports = enabled_ports;

    return 0;
}

static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;

    unsigned nb_mbuf = RTE_MAX(
        (nb_rx_queue * RX_QUEUE_SIZE +
        nb_ports * nb_lcores * MAX_PKT_BURST +
        nb_ports * nb_tx_queue * TX_QUEUE_SIZE +
        nb_lcores * MEMPOOL_CACHE_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];
    int numa_on = ff_global_cfg.dpdk.numa_on;

    for (i = 0; i < lcore_conf.nb_procs; i++) {
        lcore_id = lcore_conf.lcore_proc[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }
    }

    return 0;
}

static int
init_arp_ring(void)
{
    int i;
    char name_buf[RTE_RING_NAMESIZE];
    int nb_procs = ff_global_cfg.dpdk.nb_procs;
    int proc_id = ff_global_cfg.dpdk.proc_id;

    /* Allocate arp ring ptr according to eth dev count. */
    int nb_ports = rte_eth_dev_count();
    for (i = 0; i < nb_procs; ++i) {
        snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_%d_%d",
            proc_id, i);

        arp_ring[i] = rte_zmalloc(name_buf,
            sizeof(struct rte_ring *) * nb_ports,
            RTE_CACHE_LINE_SIZE);
        if (arp_ring[i] == NULL) {
            rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                "failed\n", name_buf);
        }
    }

    unsigned socketid = lcore_conf.socket_id;
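
    /*
     * Layout note (illustrative): arp_ring[proc][port] is a multi-producer,
     * single-consumer ring; an RX proc clones each inbound ARP frame into
     * every other proc's ring so all FreeBSD instances learn the entry.
     */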

    /* Create rings according to the ports actually being used. */
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (i = 0; i < nb_ports; i++) {
        uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[i].port_id;

        /* Note: the inner loop must not reuse `i`, or the outer loop breaks. */
        int j;
        for (j = 0; j < nb_procs; ++j) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_%d_%d", j, port_id);
            if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
                arp_ring[j][port_id] = rte_ring_create(name_buf,
                    ARP_RING_SIZE, socketid,
                    RING_F_SC_DEQ);
                if (rte_ring_lookup(name_buf) != arp_ring[j][port_id])
                    rte_panic("lookup arp ring:%s failed!\n", name_buf);
            } else {
                arp_ring[j][port_id] = rte_ring_lookup(name_buf);
            }

            if (arp_ring[j][port_id] == NULL)
                rte_panic("create arp ring:%s failed!\n", name_buf);

            printf("create arp ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(arp_ring[j][port_id]));
        }
    }

    return 0;
}

static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[i].port_id;
        ff_kni_alloc(port_id, socket_id, mbuf_pool);
    }

    return 0;
}

static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = rte_lcore_to_socket_id(rte_lcore_id());
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socketid];
    uint16_t i;

    for (i = 0; i < nb_ports; i++) {
        uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[i].port_id;

        struct rte_eth_dev_info dev_info;
        rte_eth_dev_info_get(port_id, &dev_info);

        if (nb_procs > dev_info.max_rx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                nb_procs,
                dev_info.max_rx_queues);
        }

        if (nb_procs > dev_info.max_tx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                nb_procs,
                dev_info.max_tx_queues);
        }

        struct ether_addr addr;
        rte_eth_macaddr_get(port_id, &addr);
        printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
            " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
            (unsigned)port_id,
            addr.addr_bytes[0], addr.addr_bytes[1],
            addr.addr_bytes[2], addr.addr_bytes[3],
            addr.addr_bytes[4], addr.addr_bytes[5]);

        rte_memcpy(ff_global_cfg.dpdk.port_cfgs[i].mac,
            addr.addr_bytes, ETHER_ADDR_LEN);

        /* Clear txq_flags - we do not need multi-mempool and refcnt */
        dev_info.default_txconf.txq_flags = ETH_TXQ_FLAGS_NOMULTMEMP |
            ETH_TXQ_FLAGS_NOREFCOUNT;

        /* Disable features that are not supported by port's HW */
        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMUDP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMTCP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMSCTP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL;
        }
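
        /*
         * Note (illustrative): the txq_flags raised above are only hints
         * passed to rte_eth_tx_queue_setup() below; e.g. a PMD that sees
         * ETH_TXQ_FLAGS_NOMULTSEGS may pick a faster single-segment TX
         * routine.
         */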

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) &&
            !(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_TSO)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
        }

        struct rte_eth_conf port_conf = {0};

        /* Set RSS mode */
        port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
        port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK;
        port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
        port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;

        /* Set Rx VLAN stripping */
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
            port_conf.rxmode.hw_vlan_strip = 1;
        }

        /* Enable HW CRC stripping */
        port_conf.rxmode.hw_strip_crc = 1;

        /* FIXME: Enable TCP LRO? */
#if 0
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
            printf("LRO is supported\n");
            port_conf.rxmode.enable_lro = 1;
            ff_global_cfg.dpdk.port_cfgs[i].hw_features.rx_lro = 1;
        }
#endif

        /* Set Rx checksum checking */
        if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
            printf("RX checksum offload supported\n");
            port_conf.rxmode.hw_ip_checksum = 1;
            ff_global_cfg.dpdk.port_cfgs[i].hw_features.rx_csum = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
            printf("TX ip checksum offload supported\n");
            ff_global_cfg.dpdk.port_cfgs[i].hw_features.tx_csum_ip = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            printf("TX TCP&UDP checksum offload supported\n");
            ff_global_cfg.dpdk.port_cfgs[i].hw_features.tx_csum_l4 = 1;
        }

        if (ff_global_cfg.dpdk.tso) {
            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                printf("TSO is supported\n");
                ff_global_cfg.dpdk.port_cfgs[i].hw_features.tx_tso = 1;
            }
        } else {
            printf("TSO is disabled\n");
        }

        /*
         * Only the primary process configures and starts the port;
         * secondaries still record hw_features for every port above.
         */
        if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
            continue;
        }

        /* Currently, proc id maps 1:1 to queue id per port. */
        int ret = rte_eth_dev_configure(port_id, nb_procs, nb_procs, &port_conf);
        if (ret != 0) {
            return ret;
        }

        uint16_t q;
        for (q = 0; q < nb_procs; q++) {
            ret = rte_eth_tx_queue_setup(port_id, q, TX_QUEUE_SIZE,
                socketid, &dev_info.default_txconf);
            if (ret < 0) {
                return ret;
            }

            ret = rte_eth_rx_queue_setup(port_id, q, RX_QUEUE_SIZE,
                socketid, &dev_info.default_rxconf, mbuf_pool);
            if (ret < 0) {
                return ret;
            }
        }

        ret = rte_eth_dev_start(port_id);
        if (ret < 0) {
            return ret;
        }

        /* Enable RX in promiscuous mode for the Ethernet device. */
        if (ff_global_cfg.dpdk.promiscuous) {
            rte_eth_promiscuous_enable(port_id);
            ret = rte_eth_promiscuous_get(port_id);
            if (ret == 1) {
                printf("set port %u to promiscuous mode ok\n", port_id);
            } else {
                printf("set port %u to promiscuous mode error\n", port_id);
            }
        }

        /* Enable pcap dump */
        if (ff_global_cfg.dpdk.port_cfgs[i].pcap) {
            ff_enable_pcap(ff_global_cfg.dpdk.port_cfgs[i].pcap);
        }
    }

    return 0;
}

static int
init_freebsd_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    /* Fire ff_hardclock() every (MS_PER_S / freebsd.hz) ms; e.g. hz=100 -> 10ms. */
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &freebsd_hardclock_job, NULL);

    return 0;
}

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    init_lcore_conf();

    init_mem_pool();

    init_arp_ring();

    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    check_all_ports_link_status();

    init_freebsd_clock();

    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            /* Drop (and free) packets that failed the HW checksum check. */
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    /*
     * FIXME: should we save pkt->vlan_tci
     * if (pkt->ol_flags & PKT_RX_VLAN_PKT)
     */

    void *data = rte_pktmbuf_mtod(pkt, void *);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    pkt = pkt->next;
    void *prev = hdr;
    while (pkt != NULL) {
        data = rte_pktmbuf_mtod(pkt, void *);
        len = rte_pktmbuf_data_len(pkt);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            return;
        }
        pkt = pkt->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < sizeof(struct ether_hdr))
        return FILTER_UNKNOWN;

    const struct ether_hdr *hdr;
    hdr = (const struct ether_hdr *)data;

    if (ntohs(hdr->ether_type) == ETHER_TYPE_ARP)
        return FILTER_ARP;

    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ntohs(hdr->ether_type) != ETHER_TYPE_IPv4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter((const char *)data + sizeof(struct ether_hdr),
        len - sizeof(struct ether_hdr));
}
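
/*
 * Dispatch sketch (illustrative) for process_packets() below: an ARP frame
 * is cloned to every other proc's arp_ring and, when KNI is enabled on the
 * primary, to the kernel too, so every FreeBSD instance and the Linux stack
 * see it; other frames go either to KNI or to ff_veth_input() depending on
 * the filter result and kni_accept.
 */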

static inline void
process_packets(uint8_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(qconf->pcap[port_id] != NULL)) {
            ff_dump_packets(qconf->pcap[port_id], rtem);
        }

        void *data = rte_pktmbuf_mtod(rtem, void *);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        enum FilterReturn filter = protocol_filter(data, len);
        if (filter == FILTER_ARP) {
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (pkts_from_ring == 0) {
                uint16_t j;
                for (j = 0; j < qconf->nb_procs; ++j) {
                    if (j == queue_id)
                        continue;

                    mbuf_pool = pktmbuf_pool[rte_lcore_to_socket_id(qconf->lcore_proc[j])];
                    mbuf_clone = rte_pktmbuf_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(arp_ring[j][port_id], mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = rte_pktmbuf_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    /* Hand the clone to KNI; the original goes on to ff_veth_input(). */
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }

            ff_veth_input(ctx, rtem);
        } else if (enable_kni && ((filter == FILTER_KNI && kni_accept) ||
                   (filter == FILTER_UNKNOWN && !kni_accept))) {
            ff_kni_enqueue(port_id, rtem);
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_arp_ring(uint8_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* Read packets cloned into this proc's ring and process them. */
    uint16_t nb_rx;
    nb_rx = rte_ring_dequeue_burst(arp_ring[queue_id][port_id],
        (void **)pkts_burst, MAX_PKT_BURST);

    if (nb_rx > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rx, ctx, 1);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(qconf->pcap[port] != NULL)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(qconf->pcap[port], m_table[i]);
        }
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    if (unlikely(ret < n)) {
        /* Free the mbufs the NIC did not accept. */
        do {
            rte_pktmbuf_free(m_table[ret]);
        } while (++ret < n);
    }

    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}
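
/*
 * Worked example for ff_dpdk_if_send() below (sizes assumed): a 3000-byte
 * frame with RTE_MBUF_DEFAULT_DATAROOM == 2048 is copied into a two-mbuf
 * chain: data_len 2048 + 952, pkt_len 3000, nb_segs 2.
 */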

int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        void *data = rte_pktmbuf_mtod(cur, void *);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        prev = cur;

        cur->data_len = len;
        off += len;
        total -= len;
        head->nb_segs++;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    if (offload.ip_csum) {
        head->ol_flags |= PKT_TX_IP_CKSUM;
        head->l2_len = sizeof(struct ether_hdr);
        head->l3_len = sizeof(struct ipv4_hdr);
    }

    if (ctx->hw_features.tx_csum_l4) {
        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = sizeof(struct ether_hdr);
            head->l3_len = sizeof(struct ipv4_hdr);
        }

        if (offload.tso_seg_size) {
            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = sizeof(struct tcp_hdr);
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = sizeof(struct ether_hdr);
            head->l3_len = sizeof(struct ipv4_hdr);
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    unsigned lcore_id;
    uint64_t prev_tsc, diff_tsc, cur_tsc;
    int i, j, nb_rx;
    uint8_t port_id, queue_id;
    struct lcore_conf *qconf;
    /* e.g. at a 2 GHz TSC, drain_tsc = 2000 * BURST_TX_DRAIN_US ticks (~100us). */
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
        US_PER_S * BURST_TX_DRAIN_US;
    struct ff_dpdk_if_context *ctx;

    prev_tsc = 0;

    lcore_id = rte_lcore_id();
    qconf = &lcore_conf;

    if (qconf->nb_rx_queue == 0) {
        printf("lcore %u has nothing to do\n", lcore_id);
        return 0;
    }

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc > drain_tsc)) {
            /*
             * This could be optimized (use queueid instead of
             * portid), but it is not called so often
             */
            for (port_id = 0; port_id < RTE_MAX_ETHPORTS; port_id++) {
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;
                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packet from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }

            process_arp_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                    pkts_burst[j], void *));
            }
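
            /*
             * Pipeline illustration (assumed nb_rx == 32, PREFETCH_OFFSET == 3):
             * packets 0-2 are prefetched above; the loop below prefetches
             * packet j+3 while handling packet j; the tail loop handles the
             * final 3 packets without issuing further prefetches.
             */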

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                    j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        if (likely(lr->loop != NULL)) {
            lr->loop(lr->arg);
        }
    }
}

int
ff_dpdk_if_up(void) {
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[i].port_id;
        veth_ctx[port_id] = ff_veth_attach(ff_global_cfg.dpdk.port_cfgs + i);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed\n");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg) {
    struct loop_routine *lr = malloc(sizeof(struct loop_routine));
    if (lr == NULL) {
        rte_exit(EXIT_FAILURE, "malloc loop_routine failed\n");
    }
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    free(lr);
}

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

/*
 * Standard bit-by-bit Toeplitz hash of `data` under `key`, matching the
 * Microsoft RSS specification (and the FreeBSD reference implementation).
 */
static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    unsigned i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0] << 24) + (key[1] << 16) + (key[2] << 8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1 << (7 - b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i + 4] & (1 << (7 - b))))
                v |= 1;
        }
    }
    return (hash);
}

int
ff_rss_check(uint32_t saddr, uint32_t daddr, uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;

    if (qconf->nb_procs == 1) {
        return 1;
    }

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
        default_rsskey_40bytes, datalen, data);

    return (hash % qconf->nb_procs) == qconf->proc_id;
}
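
/*
 * Usage sketch (illustrative; callers live outside this file): a process in
 * a multi-proc group can test whether a 4-tuple's RSS hash lands on its own
 * queue before binding it:
 *
 *     if (ff_rss_check(saddr, daddr, sport, dport)) {
 *         // this process's RSS queue will receive the flow
 *     }
 */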