/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <unistd.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"

#define MEMPOOL_CACHE_SIZE 256

#define DISPATCH_RING_SIZE 2048

#define MSG_RING_SIZE 32

/*
 * Configurable number of RX/TX ring descriptors
 */
#define RX_QUEUE_SIZE 512
#define TX_QUEUE_SIZE 512

#define MAX_PKT_BURST 32
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

/*
 * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send.
 */
#define MAX_TX_BURST (MAX_PKT_BURST / 2)

#define NB_SOCKETS 8

/* Configure how many packets ahead to prefetch when reading packets */
#define PREFETCH_OFFSET 3

#define MAX_RX_QUEUE_PER_LCORE 16
#define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
#define MAX_RX_QUEUE_PER_PORT 128

#ifdef FF_KNI
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

static int enable_kni;
static int kni_accept;
#endif

static int numa_on;

static unsigned idle_sleep;

static struct rte_timer freebsd_clock;

/* Default RSS key, taken from Mellanox's Linux driver. */
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

struct mbuf_table {
    uint16_t len;
    struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct lcore_rx_queue {
    uint16_t port_id;
    uint16_t queue_id;
} __rte_cache_aligned;

struct lcore_conf {
    uint16_t proc_id;
    uint16_t socket_id;
    uint16_t nb_queue_list[RTE_MAX_ETHPORTS];
    struct ff_port_cfg *port_cfgs;

    uint16_t nb_rx_queue;
    struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
    uint16_t nb_tx_port;
    uint16_t tx_port_id[RTE_MAX_ETHPORTS];
    uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
    struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
    char *pcap[RTE_MAX_ETHPORTS];
} __rte_cache_aligned;

static struct lcore_conf lcore_conf;

static struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

struct ff_msg_ring {
    char ring_name[2][RTE_RING_NAMESIZE];
    /* ring[0]: requests that other processes send and this lcore receives */
    /* ring[1]: replies that this lcore sends and other processes read */
    struct rte_ring *ring[2];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;

struct ff_dpdk_if_context {
    void *sc;
    void *ifp;
    uint16_t port_id;
    struct ff_hw_features hw_features;
} __rte_cache_aligned;

static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

static struct ff_top_args ff_top_status;
static struct ff_traffic_args ff_traffic;

extern void ff_hardclock(void);

static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
    ff_update_current_ts();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
#define CHECK_INTERVAL 100 /* 100ms */
#define MAX_CHECK_TIME 90 /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u "
                        "Mbps - %s\n", (int)portid,
                        (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

static int
init_lcore_conf(void)
{
    uint16_t nb_dev_ports = rte_eth_dev_count_avail();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
            ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    uint16_t proc_id;
    for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) {
        uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id];
        if (!lcore_config[lcore_id].detected) {
            rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
        }
    }

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %d\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        lcore_conf.pcap[port_id] = pconf->pcap;
        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}
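
/*
 * Worked example (added note, not in the original source): queue ids follow
 * an lcore's position in the port's lcore_list. With lcore_list = {4, 5, 6}
 * for port 0, the process pinned to lcore 5 owns RX/TX queue 1 of port 0,
 * and nb_queue_list[0] is 3.
 */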

static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;

    unsigned nb_mbuf = RTE_MAX(
        (nb_rx_queue*RX_QUEUE_SIZE +
        nb_ports*nb_lcores*MAX_PKT_BURST +
        nb_ports*nb_tx_queue*TX_QUEUE_SIZE +
        nb_lcores*MEMPOOL_CACHE_SIZE +
#ifdef FF_KNI
        nb_ports*KNI_MBUF_MAX +
        nb_ports*KNI_QUEUE_SIZE +
#endif
        nb_lcores*nb_ports*DISPATCH_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }
    }

    return 0;
}
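
/*
 * Worked sizing example (added note): with 1 port, 2 processes and the
 * defaults above, the sum is 2*512 (RX descriptors) + 1*2*32 (TX burst
 * staging) + 1*2*512 (TX descriptors) + 2*256 (mempool caches) +
 * 2*1*2048 (dispatch rings) = 6720 mbufs without KNI, so the RTE_MAX()
 * floor of 8192 is what actually takes effect.
 */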

static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL) {
        rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ring = rte_ring_create(name, count, socket_id, flags);
    } else {
        ring = rte_ring_lookup(name);
    }

    if (ring == NULL) {
        rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
    }

    return ring;
}

static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create rings according to the ports actually being used. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
}

static int
init_msg_ring(void)
{
    uint16_t i;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
            MSG_RING_SIZE * 2 * nb_procs,
            MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
            NULL, NULL, ff_msg_init, NULL,
            socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        snprintf(msg_ring[i].ring_name[1], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_OUT, i);

        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[0]);

        msg_ring[i].ring[1] = create_ring(msg_ring[i].ring_name[1],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[1] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[1]);
    }

    return 0;
}
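
/*
 * Note (added for clarity): each f-stack process i owns a request ring
 * (FF_MSG_RING_IN plus i, drained by process_msg_ring() below) and a reply
 * ring (FF_MSG_RING_OUT plus i, filled by handle_msg()). The pool therefore
 * holds MSG_RING_SIZE * 2 messages per process, enough to fill both rings
 * at once. A client-side usage sketch follows process_msg_ring() below.
 */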

#ifdef FF_KNI
static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count_avail();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
#endif

static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}
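
/*
 * Worked example (added note): with reta_size = 128 and nb_queues = 3 the
 * indirection table becomes 0,1,2,0,1,2,..., spreading hash values
 * round-robin across the queues. ff_rss_check() at the bottom of this file
 * mirrors the same mapping in software as (hash & (reta_size - 1)) % nb_queues.
 */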

static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        struct rte_eth_dev_info dev_info;
        rte_eth_dev_info_get(port_id, &dev_info);

        if (nb_queues > dev_info.max_rx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                nb_queues,
                dev_info.max_rx_queues);
        }

        if (nb_queues > dev_info.max_tx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                nb_queues,
                dev_info.max_tx_queues);
        }

        struct ether_addr addr;
        rte_eth_macaddr_get(port_id, &addr);
        printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
            " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
            (unsigned)port_id,
            addr.addr_bytes[0], addr.addr_bytes[1],
            addr.addr_bytes[2], addr.addr_bytes[3],
            addr.addr_bytes[4], addr.addr_bytes[5]);

        rte_memcpy(pconf->mac,
            addr.addr_bytes, ETHER_ADDR_LEN);

        struct rte_eth_conf port_conf = {0};

        /* Set RSS mode */
        uint64_t default_rss_hf = ETH_RSS_PROTO_MASK;
        port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
        port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf;
        port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
        port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;
        port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads;
        if (port_conf.rx_adv_conf.rss_conf.rss_hf !=
            ETH_RSS_PROTO_MASK) {
            printf("Port %u modified RSS hash function based on hardware support, "
                "requested:%#"PRIx64" configured:%#"PRIx64"\n",
                port_id, default_rss_hf,
                port_conf.rx_adv_conf.rss_conf.rss_hf);
        }

        if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
            port_conf.txmode.offloads |=
                DEV_TX_OFFLOAD_MBUF_FAST_FREE;

        /* Set Rx VLAN stripping */
        if (ff_global_cfg.dpdk.vlan_strip) {
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
            }
        }

        /* Enable HW CRC stripping */
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_KEEP_CRC) {
            port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_KEEP_CRC;
        }

        /* FIXME: Enable TCP LRO? */
#if 0
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
            printf("LRO is supported\n");
            port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
            pconf->hw_features.rx_lro = 1;
        }
#endif

        /* Set Rx checksum checking */
        if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
            printf("RX checksum offload supported\n");
            port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
            pconf->hw_features.rx_csum = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
            printf("TX ip checksum offload supported\n");
            port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
            pconf->hw_features.tx_csum_ip = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            printf("TX TCP&UDP checksum offload supported\n");
            port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM |
                DEV_TX_OFFLOAD_TCP_CKSUM;
            pconf->hw_features.tx_csum_l4 = 1;
        }

        if (ff_global_cfg.dpdk.tso) {
            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                printf("TSO is supported\n");
                port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
                pconf->hw_features.tx_tso = 1;
            }
        } else {
            printf("TSO is disabled\n");
        }

        if (dev_info.reta_size) {
            /* reta size must be power of 2 */
            assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

            rss_reta_size[port_id] = dev_info.reta_size;
            printf("port[%d]: rss table size: %d\n", port_id,
                dev_info.reta_size);
        }

        if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
            continue;
        }

        int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
        if (ret != 0) {
            return ret;
        }

        uint16_t q;
        for (q = 0; q < nb_queues; q++) {
            if (numa_on) {
                uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                socketid = rte_lcore_to_socket_id(lcore_id);
            }
            mbuf_pool = pktmbuf_pool[socketid];

            ret = rte_eth_tx_queue_setup(port_id, q, TX_QUEUE_SIZE,
                socketid, &dev_info.default_txconf);
            if (ret < 0) {
                return ret;
            }

            ret = rte_eth_rx_queue_setup(port_id, q, RX_QUEUE_SIZE,
                socketid, &dev_info.default_rxconf, mbuf_pool);
            if (ret < 0) {
                return ret;
            }
        }

        ret = rte_eth_dev_start(port_id);
        if (ret < 0) {
            return ret;
        }

        if (nb_queues > 1) {
            /* set HW rss hash function to Toeplitz. */
            if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                struct rte_eth_hash_filter_info info = {0};
                info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                    RTE_ETH_FILTER_SET, &info) < 0) {
                    rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                        port_id);
                }
            }

            set_rss_table(port_id, dev_info.reta_size, nb_queues);
        }

        /* Enable RX in promiscuous mode for the Ethernet device. */
        if (ff_global_cfg.dpdk.promiscuous) {
            rte_eth_promiscuous_enable(port_id);
            ret = rte_eth_promiscuous_get(port_id);
            if (ret == 1) {
                printf("set port %u to promiscuous mode ok\n", port_id);
            } else {
                printf("set port %u to promiscuous mode error\n", port_id);
            }
        }

        /* Enable pcap dump */
        if (pconf->pcap) {
            ff_enable_pcap(pconf->pcap);
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}
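
/*
 * Worked example (added note): with a 2.5 GHz TSC and freebsd.hz = 100,
 * intrs = 1000 / 100 = 10 ms, so tsc = 2,500,000 * 10 = 25,000,000 cycles
 * and ff_hardclock() fires 100 times per second, which is what the
 * FreeBSD stack's timekeeping expects.
 */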

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    idle_sleep = ff_global_cfg.dpdk.idle_sleep;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

#ifdef FF_KNI
    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }
#endif

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    /*
     * FIXME: should we save pkt->vlan_tci
     * if (pkt->ol_flags & PKT_RX_VLAN_PKT)
     */

    void *data = rte_pktmbuf_mtod(pkt, void *);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void *);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct ether_hdr *hdr;
    hdr = (const struct ether_hdr *)data;

    if (ntohs(hdr->ether_type) == ETHER_TYPE_ARP)
        return FILTER_ARP;

#ifndef FF_KNI
    return FILTER_UNKNOWN;
#else
    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ntohs(hdr->ether_type) != ETHER_TYPE_IPv4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data + ETHER_HDR_LEN,
        len - ETHER_HDR_LEN);
#endif
}

static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}

/* Modeled on rte_pktmbuf_clone, but performs a deep copy of every segment. */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of a new segment failed */
    if (unlikely(mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}
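
/*
 * Note (added for clarity; the original gives no rationale): a plain
 * rte_pktmbuf_clone() would create an indirect mbuf that shares the
 * original's data buffer. A deep copy is used instead, presumably because
 * the original segment is handed to the local stack (which may modify it
 * in place) while the copies are still queued for other lcores or KNI.
 */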

static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(qconf->pcap[port_id] != NULL)) {
            if (!pkts_from_ring) {
                ff_dump_packets(qconf->pcap[port_id], rtem);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void *);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring) {
            ff_traffic.rx_packets++;
            ff_traffic.rx_bytes += len;
        }

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, len, queue_id, nb_queues);
            if (ret < 0 || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
        if (filter == FILTER_ARP) {
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                /* Broadcast the ARP packet to every other queue. */
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif

            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni &&
            ((filter == FILTER_KNI && kni_accept) ||
            (filter == FILTER_UNKNOWN && !kni_accept))) {
            ff_kni_enqueue(port_id, rtem);
#endif
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* Read packets from the dispatch ring and process them. */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_DGRAM, 0);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_top_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_traffic_msg(struct ff_msg *msg)
{
    msg->traffic = ff_traffic;
    msg->result = 0;
}

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        case FF_TRAFFIC:
            handle_traffic_msg(msg);
            break;
        default:
            handle_default_msg(msg);
            break;
    }
    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}
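
/*
 * Illustrative sketch (disabled; not part of the original file): the client
 * side of the message rings above, as a secondary DPDK process might
 * implement it. Only names defined in this file or in DPDK are used; the
 * real helpers live in f-stack's ipc library. Error handling is trimmed
 * for brevity.
 */
#if 0
static int
request_top(uint16_t proc_id, struct ff_top_args *top)
{
    char name[RTE_RING_NAMESIZE];
    struct rte_mempool *mp = rte_mempool_lookup(FF_MSG_POOL);

    snprintf(name, RTE_RING_NAMESIZE, "%s%u", FF_MSG_RING_IN, proc_id);
    struct rte_ring *req = rte_ring_lookup(name);
    snprintf(name, RTE_RING_NAMESIZE, "%s%u", FF_MSG_RING_OUT, proc_id);
    struct rte_ring *rep = rte_ring_lookup(name);
    if (mp == NULL || req == NULL || rep == NULL)
        return -1;

    /* Grab a pre-initialized ff_msg from the shared pool. */
    void *obj;
    if (rte_mempool_get(mp, &obj) < 0)
        return -1;

    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_TOP;
    if (rte_ring_enqueue(req, msg) < 0) {
        rte_mempool_put(mp, obj);
        return -1;
    }

    /* Poll for the reply enqueued by handle_msg(). */
    void *reply;
    while (rte_ring_dequeue(rep, &reply) != 0)
        ;

    msg = (struct ff_msg *)reply;
    int ret = (msg->result == 0) ? 0 : -1;
    if (ret == 0)
        *top = msg->top;

    rte_mempool_put(mp, reply);
    return ret;
}
#endif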

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(qconf->pcap[port] != NULL)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(qconf->pcap[port], m_table[i]);
        }
    }

    ff_traffic.tx_packets += n;
    uint16_t i;
    for (i = 0; i < n; i++) {
        ff_traffic.tx_bytes += rte_pktmbuf_data_len(m_table[i]);
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    if (unlikely(ret < n)) {
        /* Free the packets the NIC could not accept. */
        do {
            rte_pktmbuf_free(m_table[ret]);
        } while (++ret < n);
    }

    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}
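
/*
 * Note (added for clarity): a packet buffered here leaves the host either
 * when the per-port table fills up to MAX_PKT_BURST (32) entries, or when
 * the drain branch at the top of main_loop() flushes it roughly every
 * BURST_TX_DRAIN_US (~100 us), so buffering adds only bounded latency even
 * on a quiet queue.
 */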

int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        head->nb_segs++;

        prev = cur;
        void *data = rte_pktmbuf_mtod(cur, void *);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ?
            RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        cur->data_len = len;
        off += len;
        total -= len;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void *);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         * TCP segmentation offload.
         *
         * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *   implies PKT_TX_TCP_CKSUM)
         * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         * - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *   write the IP checksum to 0 in the packet
         * - fill the mbuf offload information: l2_len,
         *   l3_len, l4_len, tso_segsz
         * - calculate the pseudo header checksum without taking ip_len
         *   in account, and set it in the TCP header. Refer to
         *   rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *   used as helpers.
         */
        if (offload.tso_seg_size) {
            struct tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc,
        sys_tsc, end_tsc, idle_sleep_tsc;
    int i, j, nb_rx, idle;
    uint16_t port_id, queue_id;
    struct lcore_conf *qconf;
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
        US_PER_S * BURST_TX_DRAIN_US;
    struct ff_dpdk_if_context *ctx;

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc > drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packet from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }
#endif

            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                    pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                    j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc > drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        idle_sleep_tsc = rte_rdtsc();
        if (likely(idle && idle_sleep)) {
            usleep(idle_sleep);
            end_tsc = rte_rdtsc();
        } else {
            end_tsc = idle_sleep_tsc;
        }

        if (usch_tsc == cur_tsc) {
            usr_tsc = idle_sleep_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_top_status.sys_tsc += sys_tsc;
        }

        ff_top_status.usr_tsc += usr_tsc;
        ff_top_status.work_tsc += end_tsc - cur_tsc;
        ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_top_status.loops++;
    }

    return 0;
}

int
ff_dpdk_if_up(void) {
    int i;
    struct lcore_conf *qconf = &lcore_conf;
    for (i = 0; i < qconf->nb_tx_port; i++) {
        uint16_t port_id = qconf->tx_port_id[i];

        struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
        veth_ctx[port_id] = ff_veth_attach(pconf);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg) {
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    if (lr == NULL) {
        rte_exit(EXIT_FAILURE, "rte_malloc loop_routine failed\n");
    }
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}
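
/*
 * Software Toeplitz hash (comment added for clarity), bit-compatible with
 * the NIC's RSS computation so ff_rss_check() can predict the queue a flow
 * will be steered to. The key is consumed as a sliding 32-bit window: for
 * each '1' bit of input, the current window is XORed into the hash, then
 * the window shifts left by one bit, pulling in the next key bit.
 */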
static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1<<(7-b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i+4] & (1<<(7-b))))
                v |= 1;
        }
    }
    return (hash);
}

int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;
    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];

    if (nb_queues <= 1) {
        return 1;
    }

    uint16_t reta_size = rss_reta_size[ctx->port_id];
    uint16_t queueid = qconf->tx_queue_id[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
        default_rsskey_40bytes, datalen, data);

    return ((hash & (reta_size - 1)) % nb_queues) == queueid;
}

void
ff_regist_packet_dispatcher(dispatch_func_t func)
{
    packet_dispatcher = func;
}

uint64_t
ff_get_tsc_ns(void)
{
    uint64_t cur_tsc = rte_rdtsc();
    uint64_t hz = rte_get_tsc_hz();
    return ((double)cur_tsc / (double)hz) * NS_PER_S;
}
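
/*
 * Illustrative sketch (disabled; not part of the original file): a custom
 * dispatcher for ff_regist_packet_dispatcher(). process_packets() expects
 * the callback to return a queue id in [0, nb_queues) to steer the packet,
 * or a negative value to drop it; returning the passed-in queue_id keeps
 * the packet on the local queue. The balancing rule below is a toy example,
 * not a recommendation.
 */
#if 0
#include <stddef.h>

static int
my_dispatcher(void *data, uint16_t len, uint16_t queue_id, uint16_t nb_queues)
{
    /* Keep runts and non-IP frames wherever RSS put them. */
    if (len < ETHER_HDR_LEN + sizeof(struct ipv4_hdr))
        return queue_id;

    /* Crude spread by the last byte of the IPv4 destination address. */
    const uint8_t *p = (const uint8_t *)data;
    return p[ETHER_HDR_LEN + offsetof(struct ipv4_hdr, dst_addr) + 3]
        % nb_queues;
}

/* Registered once before ff_dpdk_run():
 *     ff_regist_packet_dispatcher(my_dispatcher);
 */
#endif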