/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <ctype.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <arpa/inet.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"

#define MEMPOOL_CACHE_SIZE 256

#define ARP_RING_SIZE 2048

/*
 * Configurable number of RX/TX ring descriptors
 */
#define RX_QUEUE_SIZE 512
#define TX_QUEUE_SIZE 256

#define MAX_PKT_BURST 32
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
/*
 * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send.
 */
#define MAX_TX_BURST (MAX_PKT_BURST / 2)

#define NB_SOCKETS 8

/* Configure how many packets ahead to prefetch, when reading packets */
#define PREFETCH_OFFSET 3

#define MAX_RX_QUEUE_PER_LCORE 16
#define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
#define MAX_RX_QUEUE_PER_PORT 128

#define BITS_PER_HEX 4

static int enable_kni;
static int kni_accept;

static struct rte_timer freebsd_clock;

/* Default RSS hash key, as used by Mellanox's Linux driver. */
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

static struct rte_eth_conf default_port_conf = {
    .rxmode = {
        .mq_mode = ETH_MQ_RX_RSS,
        .max_rx_pkt_len = ETHER_MAX_LEN,
        .split_hdr_size = 0, /**< hdr buf size */
        .header_split   = 0, /**< Header Split disabled */
        .hw_ip_checksum = 0, /**< IP checksum offload disabled */
        .hw_vlan_filter = 0, /**< VLAN filtering disabled */
        .hw_vlan_strip  = 0, /**< VLAN strip disabled. */
        .hw_vlan_extend = 0, /**< Extended VLAN disabled. */
        .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
        .hw_strip_crc   = 0, /**< CRC stripping by hardware disabled */
        .enable_lro     = 0, /**< LRO disabled */
    },
    .rx_adv_conf = {
        .rss_conf = {
            .rss_key = default_rsskey_40bytes,
            .rss_key_len = 40,
            .rss_hf = ETH_RSS_PROTO_MASK,
        },
    },
    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
};

struct mbuf_table {
    uint16_t len;
    struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct lcore_rx_queue {
    uint8_t port_id;
    uint8_t queue_id;
} __rte_cache_aligned;

struct lcore_conf {
    uint16_t proc_id;
    uint16_t nb_procs;
    uint16_t socket_id;
    uint16_t nb_rx_queue;
    uint16_t *lcore_proc;
    struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
    uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
    struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
    char *pcap[RTE_MAX_ETHPORTS];
} __rte_cache_aligned;

static struct lcore_conf lcore_conf;

static struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **arp_ring[RTE_MAX_LCORE];

struct ff_dpdk_if_context {
    void *sc;
    void *ifp;
    uint16_t port_id;
    struct ff_hw_features hw_features;
} __rte_cache_aligned;

static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

extern void ff_hardclock(void);

static void
freebsd_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}
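/*
 * Note: an if-context is created per port, presumably via
 * ff_veth_attach() (see ff_dpdk_if_up() below); it ties the FreeBSD
 * softc/ifp pair to a DPDK port id plus the hardware features probed
 * in init_port_start(), so both the rx and tx paths can consult them.
 */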
static void
check_all_ports_link_status(void)
{
#define CHECK_INTERVAL 100 /* 100ms */
#define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint8_t portid = ff_global_cfg.dpdk.port_cfgs[i].port_id;
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u Mbps - %s\n",
                        (int)portid, (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

/* Convert a hex digit to its numeric value; caller ensures isxdigit(c). */
static int
xdigit2val(unsigned char c)
{
    int val;

    if (isdigit(c))
        val = c - '0';
    else if (isupper(c))
        val = c - 'A' + 10;
    else
        val = c - 'a' + 10;
    return val;
}
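/*
 * parse_lcore_mask() scans the hex mask from its lowest digit upward
 * and records the index of every set bit until nb_procs lcores are
 * found.  For example, coremask = "0x6" with nb_procs = 2 yields
 * lcore_proc[0] = 1 and lcore_proc[1] = 2; it fails if the mask
 * selects fewer than nb_procs detected lcores.
 */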
static int
parse_lcore_mask(const char *coremask, uint16_t *lcore_proc,
    uint16_t nb_procs)
{
    int i, j, idx = 0;
    unsigned count = 0;
    char c;
    int val;

    if (coremask == NULL)
        return -1;

    /* Remove all blank characters ahead and after.
     * Remove 0x/0X if exists.
     */
    while (isblank(*coremask))
        coremask++;
    if (coremask[0] == '0' && ((coremask[1] == 'x')
        || (coremask[1] == 'X')))
        coremask += 2;

    i = strlen(coremask);
    while ((i > 0) && isblank(coremask[i - 1]))
        i--;

    if (i == 0)
        return -1;

    for (i = i - 1; i >= 0 && idx < RTE_MAX_LCORE && count < nb_procs; i--) {
        c = coremask[i];
        if (isxdigit(c) == 0) {
            return -1;
        }
        val = xdigit2val(c);
        for (j = 0; j < BITS_PER_HEX && idx < RTE_MAX_LCORE && count < nb_procs;
            j++, idx++) {
            if ((1 << j) & val) {
                if (!lcore_config[idx].detected) {
                    RTE_LOG(ERR, EAL, "lcore %u unavailable\n", idx);
                    return -1;
                }
                lcore_proc[count] = idx;
                count++;
            }
        }
    }

    /* Once nb_procs lcores are collected, only '0' digits may remain. */
    for (; i >= 0; i--)
        if (coremask[i] != '0')
            return -1;

    if (count < nb_procs)
        return -1;

    return 0;
}
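/*
 * Per-process queue layout: proc_id maps 1:1 to the rx/tx queue id on
 * every enabled port.  E.g. with nb_procs = 4, process 2 reads rx
 * queue 2 and writes tx queue 2 of each port, and init_port_start()
 * later configures 4 rx and 4 tx queues per port to match.
 */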
static int
init_lcore_conf(void)
{
    uint8_t nb_ports = rte_eth_dev_count();
    if (nb_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;
    lcore_conf.nb_procs = ff_global_cfg.dpdk.nb_procs;
    lcore_conf.lcore_proc = rte_zmalloc(NULL,
        sizeof(uint16_t) * lcore_conf.nb_procs, 0);
    if (lcore_conf.lcore_proc == NULL) {
        rte_exit(EXIT_FAILURE, "rte_zmalloc lcore_proc failed\n");
    }

    int ret = parse_lcore_mask(ff_global_cfg.dpdk.lcore_mask,
        lcore_conf.lcore_proc, lcore_conf.nb_procs);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "parse_lcore_mask failed: %s\n",
            ff_global_cfg.dpdk.lcore_mask);
    }

    uint16_t socket_id = 0;
    if (ff_global_cfg.dpdk.numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    /* Currently, proc id 1:1 map to rx/tx queue id per port. */
    uint8_t port_id, enabled_ports = 0;
    for (port_id = 0; port_id < nb_ports; port_id++) {
        if (ff_global_cfg.dpdk.port_mask &&
            (ff_global_cfg.dpdk.port_mask & (1 << port_id)) == 0) {
            printf("\nSkipping disabled port %d\n", port_id);
            continue;
        }

        if (port_id >= ff_global_cfg.dpdk.nb_ports) {
            printf("\nSkipping non-configured port %d\n", port_id);
            break;
        }

        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = lcore_conf.proc_id;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = lcore_conf.proc_id;
        lcore_conf.pcap[port_id] = ff_global_cfg.dpdk.port_cfgs[enabled_ports].pcap;

        ff_global_cfg.dpdk.port_cfgs[enabled_ports].port_id = port_id;

        enabled_ports++;
    }

    ff_global_cfg.dpdk.nb_ports = enabled_ports;

    return 0;
}

static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;

    /* Size the pool for all rx descriptors, in-flight tx bursts,
     * tx descriptors and per-lcore caches, with a floor of 8192.
     */
    unsigned nb_mbuf = RTE_MAX(
        (nb_rx_queue * RX_QUEUE_SIZE +
        nb_ports * nb_lcores * MAX_PKT_BURST +
        nb_ports * nb_tx_queue * TX_QUEUE_SIZE +
        nb_lcores * MEMPOOL_CACHE_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];
    int numa_on = ff_global_cfg.dpdk.numa_on;

    for (i = 0; i < lcore_conf.nb_procs; i++) {
        lcore_id = lcore_conf.lcore_proc[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        /* The primary process creates the per-socket pool;
         * secondaries look up the one the primary created.
         */
        snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }
    }

    return 0;
}
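/*
 * RSS steers an ARP reply to exactly one rx queue, i.e. one process,
 * but every process needs it to keep its ARP table current.  So one
 * ring is created per (process, port) pair, and the process that
 * receives an ARP frame clones it into its peers' rings (see
 * process_packets() below).
 */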
static int
init_arp_ring(void)
{
    int i, j;
    char name_buf[RTE_RING_NAMESIZE];
    int nb_procs = ff_global_cfg.dpdk.nb_procs;
    int proc_id = ff_global_cfg.dpdk.proc_id;

    /* Allocate arp ring ptr according to eth dev count. */
    int nb_ports = rte_eth_dev_count();
    for (i = 0; i < nb_procs; ++i) {
        snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_%d_%d",
            proc_id, i);

        arp_ring[i] = rte_zmalloc(name_buf,
            sizeof(struct rte_ring *) * nb_ports,
            RTE_CACHE_LINE_SIZE);
        if (arp_ring[i] == NULL) {
            rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                "failed\n", name_buf);
        }
    }

    unsigned socketid = lcore_conf.socket_id;

    /* Create ring according to ports actually being used. */
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[j].port_id;

        for (i = 0; i < nb_procs; ++i) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_%d_%d", i, port_id);
            if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
                arp_ring[i][port_id] = rte_ring_create(name_buf,
                    ARP_RING_SIZE, socketid,
                    RING_F_SC_DEQ);
            } else {
                arp_ring[i][port_id] = rte_ring_lookup(name_buf);
            }

            if (arp_ring[i][port_id] == NULL)
                rte_panic("create arp ring:%s failed!\n", name_buf);

            if (rte_ring_lookup(name_buf) != arp_ring[i][port_id])
                rte_panic("lookup arp ring:%s failed!\n", name_buf);

            printf("create arp ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(arp_ring[i][port_id]));
        }
    }

    return 0;
}

static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[i].port_id;
        ff_kni_alloc(port_id, socket_id, mbuf_pool);
    }

    return 0;
}
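/*
 * init_port_start() probes each port's capabilities, masks out
 * txq_flags for offloads the hardware lacks (so the PMD can choose a
 * simpler tx path), and records what the NIC can do in hw_features
 * so the FreeBSD stack can skip checksum/TSO work in software.
 */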
static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = rte_lcore_to_socket_id(rte_lcore_id());
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socketid];
    uint16_t i;

    for (i = 0; i < nb_ports; i++) {
        uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[i].port_id;

        struct rte_eth_dev_info dev_info;
        rte_eth_dev_info_get(port_id, &dev_info);

        if (nb_procs > dev_info.max_rx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                nb_procs,
                dev_info.max_rx_queues);
        }

        if (nb_procs > dev_info.max_tx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                nb_procs,
                dev_info.max_tx_queues);
        }

        struct ether_addr addr;
        rte_eth_macaddr_get(port_id, &addr);
        printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
            " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
            (unsigned)port_id,
            addr.addr_bytes[0], addr.addr_bytes[1],
            addr.addr_bytes[2], addr.addr_bytes[3],
            addr.addr_bytes[4], addr.addr_bytes[5]);

        rte_memcpy(ff_global_cfg.dpdk.port_cfgs[i].mac,
            addr.addr_bytes, ETHER_ADDR_LEN);

        /* Clear txq_flags - we do not need multi-mempool and refcnt */
        dev_info.default_txconf.txq_flags = ETH_TXQ_FLAGS_NOMULTMEMP |
            ETH_TXQ_FLAGS_NOREFCOUNT;

        /* Disable features that are not supported by port's HW */
        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMUDP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMTCP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMSCTP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) &&
            !(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_TSO)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
        }

        struct rte_eth_conf port_conf = {0};

        /* Set RSS mode */
        port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
        port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK;
        port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
        port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;

        /* Set Rx VLAN stripping */
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
            port_conf.rxmode.hw_vlan_strip = 1;
        }

        /* Enable HW CRC stripping */
        port_conf.rxmode.hw_strip_crc = 1;

        /* FIXME: Enable TCP LRO ? */
#if 0
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
            printf("LRO is supported\n");
            port_conf.rxmode.enable_lro = 1;
            ff_global_cfg.dpdk.port_cfgs[i].hw_features.rx_lro = 1;
        }
#endif

        /* Set Rx checksum checking */
        if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
            printf("RX checksum offload supported\n");
            port_conf.rxmode.hw_ip_checksum = 1;
            ff_global_cfg.dpdk.port_cfgs[i].hw_features.rx_csum = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
            printf("TX ip checksum offload supported\n");
            ff_global_cfg.dpdk.port_cfgs[i].hw_features.tx_csum_ip = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            printf("TX TCP&UDP checksum offload supported\n");
            ff_global_cfg.dpdk.port_cfgs[i].hw_features.tx_csum_l4 = 1;
        }

        if (ff_global_cfg.dpdk.tso) {
            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                printf("TSO is supported\n");
                ff_global_cfg.dpdk.port_cfgs[i].hw_features.tx_tso = 1;
            }
        } else {
            printf("TSO is disabled\n");
        }

        if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
            return 0;
        }

        /* Currently, proc id 1:1 map to queue id per port. */
        int ret = rte_eth_dev_configure(port_id, nb_procs, nb_procs, &port_conf);
        if (ret != 0) {
            return ret;
        }

        uint16_t q;
        for (q = 0; q < nb_procs; q++) {
            ret = rte_eth_tx_queue_setup(port_id, q, TX_QUEUE_SIZE,
                socketid, &dev_info.default_txconf);
            if (ret < 0) {
                return ret;
            }

            ret = rte_eth_rx_queue_setup(port_id, q, RX_QUEUE_SIZE,
                socketid, &dev_info.default_rxconf, mbuf_pool);
            if (ret < 0) {
                return ret;
            }
        }

        ret = rte_eth_dev_start(port_id);
        if (ret < 0) {
            return ret;
        }
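/*
 * The FreeBSD hardclock is driven by a periodic DPDK timer whose
 * period is the configured tick length converted to TSC cycles:
 * intrs = MS_PER_S / freebsd.hz milliseconds per tick, and
 * tsc = ceil(timer_hz / MS_PER_S) * intrs cycles.  Illustrative
 * numbers: freebsd.hz = 100 and a 2.5 GHz timer give intrs = 10 ms
 * and tsc ~= 25,000,000 cycles.
 */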
        /* Enable RX in promiscuous mode for the Ethernet device. */
        if (ff_global_cfg.dpdk.promiscuous) {
            rte_eth_promiscuous_enable(port_id);
            ret = rte_eth_promiscuous_get(port_id);
            if (ret == 1) {
                printf("set port %u to promiscuous mode ok\n", port_id);
            } else {
                printf("set port %u to promiscuous mode error\n", port_id);
            }
        }

        /* Enable pcap dump */
        if (ff_global_cfg.dpdk.port_cfgs[i].pcap) {
            ff_enable_pcap(ff_global_cfg.dpdk.port_cfgs[i].pcap);
        }
    }

    return 0;
}

static int
init_freebsd_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &freebsd_hardclock_job, NULL);

    return 0;
}

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    init_lcore_conf();

    init_mem_pool();

    init_arp_ring();

    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    check_all_ports_link_status();

    init_freebsd_clock();

    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    /*
     * FIXME: should we save pkt->vlan_tci
     * if (pkt->ol_flags & PKT_RX_VLAN_PKT)
     */

    void *data = rte_pktmbuf_mtod(pkt, void *);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    pkt = pkt->next;
    void *prev = hdr;
    while (pkt != NULL) {
        data = rte_pktmbuf_mtod(pkt, void *);
        len = rte_pktmbuf_data_len(pkt);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            return;
        }
        pkt = pkt->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < sizeof(struct ether_hdr))
        return FILTER_UNKNOWN;

    const struct ether_hdr *hdr;
    hdr = (const struct ether_hdr *)data;

    if (ntohs(hdr->ether_type) == ETHER_TYPE_ARP)
        return FILTER_ARP;

    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ntohs(hdr->ether_type) != ETHER_TYPE_IPv4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data + sizeof(struct ether_hdr),
        len - sizeof(struct ether_hdr));
}
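/*
 * process_packets() dispatches a received burst three ways:
 *  - ARP frames go to the local stack and, unless they already came
 *    from a peer's ring (pkts_from_ring), are cloned to every other
 *    process's arp ring and to KNI, since every process must see ARP;
 *  - frames matching the KNI filter are handed to the kernel via KNI;
 *  - everything else enters the FreeBSD stack via ff_veth_input().
 */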
static inline void
process_packets(uint8_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(qconf->pcap[port_id] != NULL)) {
            ff_dump_packets(qconf->pcap[port_id], rtem);
        }

        void *data = rte_pktmbuf_mtod(rtem, void *);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        enum FilterReturn filter = protocol_filter(data, len);
        if (filter == FILTER_ARP) {
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (pkts_from_ring == 0) {
                uint16_t j;
                for (j = 0; j < qconf->nb_procs; ++j) {
                    if (j == queue_id)
                        continue;

                    mbuf_pool = pktmbuf_pool[rte_lcore_to_socket_id(qconf->lcore_proc[j])];
                    mbuf_clone = rte_pktmbuf_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(arp_ring[j][port_id], mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = rte_pktmbuf_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }

            ff_veth_input(ctx, rtem);
        } else if (enable_kni && ((filter == FILTER_KNI && kni_accept) ||
                   (filter == FILTER_UNKNOWN && !kni_accept))) {
            ff_kni_enqueue(port_id, rtem);
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_arp_ring(uint8_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* read packets from the arp ring and process them */
    uint16_t nb_rx;
    nb_rx = rte_ring_dequeue_burst(arp_ring[queue_id][port_id],
        (void **)pkts_burst, MAX_PKT_BURST);

    if (nb_rx > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rx, ctx, 1);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(qconf->pcap[port] != NULL)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(qconf->pcap[port], m_table[i]);
        }
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    if (unlikely(ret < n)) {
        /* free the mbufs the hardware could not accept */
        do {
            rte_pktmbuf_free(m_table[ret]);
        } while (++ret < n);
    }

    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}
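/*
 * ff_dpdk_if_send() linearizes a FreeBSD mbuf chain into a chain of
 * DPDK mbufs, copying up to RTE_MBUF_DEFAULT_DATAROOM bytes per
 * segment, then translates the stack's offload requests (IP/TCP/UDP
 * checksum, TSO) into ol_flags on the head mbuf before queueing it.
 */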
int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        void *data = rte_pktmbuf_mtod(cur, void *);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        prev = cur;

        cur->data_len = len;
        off += len;
        total -= len;
        head->nb_segs++;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    if (offload.ip_csum) {
        head->ol_flags |= PKT_TX_IP_CKSUM;
        head->l2_len = sizeof(struct ether_hdr);
        head->l3_len = sizeof(struct ipv4_hdr);
    }

    if (ctx->hw_features.tx_csum_l4) {
        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = sizeof(struct ether_hdr);
            head->l3_len = sizeof(struct ipv4_hdr);
        }

        if (offload.tso_seg_size) {
            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = sizeof(struct tcp_hdr);
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = sizeof(struct ether_hdr);
            head->l3_len = sizeof(struct ipv4_hdr);
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}
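/*
 * main_loop() is the per-process event loop: drain stale tx buffers,
 * poll KNI and the arp ring, then poll each rx queue.  drain_tsc is
 * BURST_TX_DRAIN_US converted to TSC cycles; e.g. (illustrative) a
 * 2.4 GHz TSC gives ceil(2.4e9 / 1e6) * 100 = 240,000 cycles ~ 100 us.
 */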
static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    unsigned lcore_id;
    uint64_t prev_tsc, diff_tsc, cur_tsc;
    int i, j, nb_rx;
    uint8_t port_id, queue_id;
    struct lcore_conf *qconf;
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
        US_PER_S * BURST_TX_DRAIN_US;
    struct ff_dpdk_if_context *ctx;

    prev_tsc = 0;

    lcore_id = rte_lcore_id();
    qconf = &lcore_conf;

    if (qconf->nb_rx_queue == 0) {
        printf("lcore %u has nothing to do\n", lcore_id);
        return 0;
    }

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc > drain_tsc)) {
            /*
             * This could be optimized (use queueid instead of
             * portid), but it is not called so often
             */
            for (port_id = 0; port_id < RTE_MAX_ETHPORTS; port_id++) {
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;
                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packet from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }

            process_arp_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                    pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                    j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        if (likely(lr->loop != NULL)) {
            lr->loop(lr->arg);
        }
    }
}

int
ff_dpdk_if_up(void) {
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[i].port_id;
        veth_ctx[port_id] = ff_veth_attach(ff_global_cfg.dpdk.port_cfgs + i);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed\n");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg) {
    struct loop_routine *lr = malloc(sizeof(struct loop_routine));
    if (lr == NULL) {
        rte_exit(EXIT_FAILURE, "malloc loop_routine failed\n");
    }
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    free(lr);
}

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0] << 24) + (key[1] << 16) + (key[2] << 8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1 << (7 - b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i + 4] & (1 << (7 - b))))
                v |= 1;
        }
    }
    return (hash);
}

int
ff_rss_check(uint32_t saddr, uint32_t daddr, uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;

    if (qconf->nb_procs == 1) {
        return 1;
    }

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
        default_rsskey_40bytes, datalen, data);

    return (hash % qconf->nb_procs) == qconf->proc_id;
}
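/*
 * ff_rss_check() recomputes the NIC's RSS hash in software with the
 * same default_rsskey_40bytes configured in init_port_start(), so a
 * process can test whether a locally chosen 4-tuple (e.g. the source
 * port of an outgoing connection) would be steered back to its own
 * rx queue.  The inputs must be in network byte order, matching the
 * wire bytes the NIC hashes.
 *
 * Usage sketch (hypothetical values):
 *
 *     if (!ff_rss_check(saddr, daddr, htons(sport), htons(dport))) {
 *         // not ours: pick another source port and try again
 *     }
 */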