1 /* 2 * Copyright (C) 2017 THL A29 Limited, a Tencent company. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * 8 * 1. Redistributions of source code must retain the above copyright notice, this 9 * list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright notice, 11 * this list of conditions and the following disclaimer in the documentation 12 * and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 * 25 */ 26 #include <assert.h> 27 #include <unistd.h> 28 #include <sys/mman.h> 29 #include <errno.h> 30 31 #include <rte_common.h> 32 #include <rte_byteorder.h> 33 #include <rte_log.h> 34 #include <rte_memory.h> 35 #include <rte_memcpy.h> 36 #include <rte_memzone.h> 37 #include <rte_config.h> 38 #include <rte_eal.h> 39 #include <rte_pci.h> 40 #include <rte_mbuf.h> 41 #include <rte_memory.h> 42 #include <rte_lcore.h> 43 #include <rte_launch.h> 44 #include <rte_ethdev.h> 45 #include <rte_debug.h> 46 #include <rte_common.h> 47 #include <rte_ether.h> 48 #include <rte_malloc.h> 49 #include <rte_cycles.h> 50 #include <rte_timer.h> 51 #include <rte_thash.h> 52 #include <rte_ip.h> 53 #include <rte_tcp.h> 54 #include <rte_udp.h> 55 56 #include "ff_dpdk_if.h" 57 #include "ff_dpdk_pcap.h" 58 #include "ff_dpdk_kni.h" 59 #include "ff_config.h" 60 #include "ff_veth.h" 61 #include "ff_host_interface.h" 62 #include "ff_msg.h" 63 #include "ff_api.h" 64 #include "ff_memory.h" 65 66 #ifdef FF_KNI 67 #define KNI_MBUF_MAX 2048 68 #define KNI_QUEUE_SIZE 2048 69 70 static int enable_kni; 71 static int kni_accept; 72 #endif 73 74 static int numa_on; 75 76 static unsigned idle_sleep; 77 static unsigned pkt_tx_delay; 78 79 static struct rte_timer freebsd_clock; 80 81 // Mellanox Linux's driver key 82 static uint8_t default_rsskey_40bytes[40] = { 83 0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b, 84 0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb, 85 0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c, 86 0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9, 87 0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc 88 }; 89 90 struct lcore_conf lcore_conf; 91 92 struct rte_mempool *pktmbuf_pool[NB_SOCKETS]; 93 94 static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS]; 95 static dispatch_func_t packet_dispatcher; 96 97 static uint16_t rss_reta_size[RTE_MAX_ETHPORTS]; 98 99 static inline int send_single_packet(struct rte_mbuf *m, uint8_t port); 100 101 struct ff_msg_ring { 102 char ring_name[2][RTE_RING_NAMESIZE]; 103 /* ring[0] for lcore recv msg, other send */ 104 /* ring[1] for lcore send msg, other read */ 105 struct rte_ring *ring[2]; 106 } __rte_cache_aligned; 107 108 static struct ff_msg_ring msg_ring[RTE_MAX_LCORE]; 109 static struct rte_mempool *message_pool; 110 static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS]; 111 112 static struct ff_top_args ff_top_status; 113 static struct ff_traffic_args ff_traffic; 114 extern void ff_hardclock(void); 115 116 static void 117 ff_hardclock_job(__rte_unused struct rte_timer *timer, 118 __rte_unused void *arg) { 119 ff_hardclock(); 120 ff_update_current_ts(); 121 } 122 123 struct ff_dpdk_if_context * 124 ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg) 125 { 126 struct ff_dpdk_if_context *ctx; 127 128 ctx = calloc(1, sizeof(struct ff_dpdk_if_context)); 129 if (ctx == NULL) 130 return NULL; 131 132 ctx->sc = sc; 133 ctx->ifp = ifp; 134 ctx->port_id = cfg->port_id; 135 ctx->hw_features = cfg->hw_features; 136 137 return ctx; 138 } 139 140 void 141 ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx) 142 { 143 free(ctx); 144 } 145 146 static void 147 check_all_ports_link_status(void) 148 { 149 #define CHECK_INTERVAL 100 /* 100ms */ 150 #define MAX_CHECK_TIME 90 /* 9s (90 * 100ms) in total */ 151 152 uint16_t portid; 153 uint8_t count, all_ports_up, print_flag = 0; 154 struct rte_eth_link link; 155 156 printf("\nChecking link status"); 157 fflush(stdout); 158 159 int i, nb_ports; 160 nb_ports = ff_global_cfg.dpdk.nb_ports; 161 for (count = 0; count <= MAX_CHECK_TIME; count++) { 162 all_ports_up = 1; 163 for (i = 0; i < nb_ports; i++) { 164 uint16_t portid = ff_global_cfg.dpdk.portid_list[i]; 165 memset(&link, 0, sizeof(link)); 166 rte_eth_link_get_nowait(portid, &link); 167 168 /* print link status if flag set */ 169 if (print_flag == 1) { 170 if (link.link_status) { 171 printf("Port %d Link Up - speed %u " 172 "Mbps - %s\n", (int)portid, 173 (unsigned)link.link_speed, 174 (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? 175 ("full-duplex") : ("half-duplex\n")); 176 } else { 177 printf("Port %d Link Down\n", (int)portid); 178 } 179 continue; 180 } 181 /* clear all_ports_up flag if any link down */ 182 if (link.link_status == 0) { 183 all_ports_up = 0; 184 break; 185 } 186 } 187 188 /* after finally printing all link status, get out */ 189 if (print_flag == 1) 190 break; 191 192 if (all_ports_up == 0) { 193 printf("."); 194 fflush(stdout); 195 rte_delay_ms(CHECK_INTERVAL); 196 } 197 198 /* set the print_flag if all ports up or timeout */ 199 if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) { 200 print_flag = 1; 201 printf("done\n"); 202 } 203 } 204 } 205 206 static int 207 init_lcore_conf(void) 208 { 209 uint8_t nb_dev_ports = rte_eth_dev_count_avail(); 210 if (nb_dev_ports == 0) { 211 rte_exit(EXIT_FAILURE, "No probed ethernet devices\n"); 212 } 213 214 if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) { 215 rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n", 216 ff_global_cfg.dpdk.max_portid); 217 } 218 219 lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs; 220 lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id; 221 222 uint16_t proc_id; 223 for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) { 224 uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id]; 225 if (!lcore_config[lcore_id].detected) { 226 rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id); 227 } 228 } 229 230 uint16_t socket_id = 0; 231 if (numa_on) { 232 socket_id = rte_lcore_to_socket_id(rte_lcore_id()); 233 } 234 235 lcore_conf.socket_id = socket_id; 236 237 uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id]; 238 int j; 239 for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) { 240 uint16_t port_id = ff_global_cfg.dpdk.portid_list[j]; 241 struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id]; 242 243 int queueid = -1; 244 int i; 245 for (i = 0; i < pconf->nb_lcores; i++) { 246 if (pconf->lcore_list[i] == lcore_id) { 247 queueid = i; 248 } 249 } 250 if (queueid < 0) { 251 continue; 252 } 253 printf("lcore: %u, port: %u, queue: %u\n", lcore_id, port_id, queueid); 254 uint16_t nb_rx_queue = lcore_conf.nb_rx_queue; 255 lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id; 256 lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid; 257 lcore_conf.nb_rx_queue++; 258 259 lcore_conf.tx_queue_id[port_id] = queueid; 260 lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id; 261 lcore_conf.nb_tx_port++; 262 263 lcore_conf.pcap[port_id] = pconf->pcap; 264 lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores; 265 } 266 267 if (lcore_conf.nb_rx_queue == 0) { 268 rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id); 269 } 270 271 return 0; 272 } 273 274 static int 275 init_mem_pool(void) 276 { 277 uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports; 278 uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs; 279 uint32_t nb_tx_queue = nb_lcores; 280 uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores; 281 282 unsigned nb_mbuf = RTE_ALIGN_CEIL ( 283 (nb_rx_queue*RX_QUEUE_SIZE + 284 nb_ports*nb_lcores*MAX_PKT_BURST + 285 nb_ports*nb_tx_queue*TX_QUEUE_SIZE + 286 nb_lcores*MEMPOOL_CACHE_SIZE + 287 #ifdef FF_KNI 288 nb_ports*KNI_MBUF_MAX + 289 nb_ports*KNI_QUEUE_SIZE + 290 #endif 291 nb_lcores*nb_ports*DISPATCH_RING_SIZE), 292 (unsigned)8192); 293 294 unsigned socketid = 0; 295 uint16_t i, lcore_id; 296 char s[64]; 297 298 for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) { 299 lcore_id = ff_global_cfg.dpdk.proc_lcore[i]; 300 if (numa_on) { 301 socketid = rte_lcore_to_socket_id(lcore_id); 302 } 303 304 if (socketid >= NB_SOCKETS) { 305 rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n", 306 socketid, i, NB_SOCKETS); 307 } 308 309 if (pktmbuf_pool[socketid] != NULL) { 310 continue; 311 } 312 313 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 314 snprintf(s, sizeof(s), "mbuf_pool_%d", socketid); 315 pktmbuf_pool[socketid] = 316 rte_pktmbuf_pool_create(s, nb_mbuf, 317 MEMPOOL_CACHE_SIZE, 0, 318 RTE_MBUF_DEFAULT_BUF_SIZE, socketid); 319 } else { 320 snprintf(s, sizeof(s), "mbuf_pool_%d", socketid); 321 pktmbuf_pool[socketid] = rte_mempool_lookup(s); 322 } 323 324 if (pktmbuf_pool[socketid] == NULL) { 325 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid); 326 } else { 327 printf("create mbuf pool on socket %d\n", socketid); 328 } 329 330 #ifdef FF_USE_PAGE_ARRAY 331 nb_mbuf = RTE_ALIGN_CEIL ( 332 nb_ports*nb_lcores*MAX_PKT_BURST + 333 nb_ports*nb_tx_queue*TX_QUEUE_SIZE + 334 nb_lcores*MEMPOOL_CACHE_SIZE, 335 (unsigned)4096); 336 ff_init_ref_pool(nb_mbuf, socketid); 337 #endif 338 } 339 340 return 0; 341 } 342 343 static struct rte_ring * 344 create_ring(const char *name, unsigned count, int socket_id, unsigned flags) 345 { 346 struct rte_ring *ring; 347 348 if (name == NULL) { 349 rte_exit(EXIT_FAILURE, "create ring failed, no name!\n"); 350 } 351 352 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 353 ring = rte_ring_create(name, count, socket_id, flags); 354 } else { 355 ring = rte_ring_lookup(name); 356 } 357 358 if (ring == NULL) { 359 rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name); 360 } 361 362 return ring; 363 } 364 365 static int 366 init_dispatch_ring(void) 367 { 368 int j; 369 char name_buf[RTE_RING_NAMESIZE]; 370 int queueid; 371 372 unsigned socketid = lcore_conf.socket_id; 373 374 /* Create ring according to ports actually being used. */ 375 int nb_ports = ff_global_cfg.dpdk.nb_ports; 376 for (j = 0; j < nb_ports; j++) { 377 uint16_t portid = ff_global_cfg.dpdk.portid_list[j]; 378 struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid]; 379 int nb_queues = pconf->nb_lcores; 380 if (dispatch_ring[portid] == NULL) { 381 snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid); 382 383 dispatch_ring[portid] = rte_zmalloc(name_buf, 384 sizeof(struct rte_ring *) * nb_queues, 385 RTE_CACHE_LINE_SIZE); 386 if (dispatch_ring[portid] == NULL) { 387 rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) " 388 "failed\n", name_buf); 389 } 390 } 391 392 for(queueid = 0; queueid < nb_queues; ++queueid) { 393 snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d", 394 portid, queueid); 395 dispatch_ring[portid][queueid] = create_ring(name_buf, 396 DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ); 397 398 if (dispatch_ring[portid][queueid] == NULL) 399 rte_panic("create ring:%s failed!\n", name_buf); 400 401 printf("create ring:%s success, %u ring entries are now free!\n", 402 name_buf, rte_ring_free_count(dispatch_ring[portid][queueid])); 403 } 404 } 405 406 return 0; 407 } 408 409 static void 410 ff_msg_init(struct rte_mempool *mp, 411 __attribute__((unused)) void *opaque_arg, 412 void *obj, __attribute__((unused)) unsigned i) 413 { 414 struct ff_msg *msg = (struct ff_msg *)obj; 415 msg->msg_type = FF_UNKNOWN; 416 msg->buf_addr = (char *)msg + sizeof(struct ff_msg); 417 msg->buf_len = mp->elt_size - sizeof(struct ff_msg); 418 } 419 420 static int 421 init_msg_ring(void) 422 { 423 uint16_t i; 424 uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs; 425 unsigned socketid = lcore_conf.socket_id; 426 427 /* Create message buffer pool */ 428 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 429 message_pool = rte_mempool_create(FF_MSG_POOL, 430 MSG_RING_SIZE * 2 * nb_procs, 431 MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0, 432 NULL, NULL, ff_msg_init, NULL, 433 socketid, 0); 434 } else { 435 message_pool = rte_mempool_lookup(FF_MSG_POOL); 436 } 437 438 if (message_pool == NULL) { 439 rte_panic("Create msg mempool failed\n"); 440 } 441 442 for(i = 0; i < nb_procs; ++i) { 443 snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE, 444 "%s%u", FF_MSG_RING_IN, i); 445 snprintf(msg_ring[i].ring_name[1], RTE_RING_NAMESIZE, 446 "%s%u", FF_MSG_RING_OUT, i); 447 448 msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0], 449 MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ); 450 if (msg_ring[i].ring[0] == NULL) 451 rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[0]); 452 453 msg_ring[i].ring[1] = create_ring(msg_ring[i].ring_name[1], 454 MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ); 455 if (msg_ring[i].ring[1] == NULL) 456 rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[0]); 457 } 458 459 return 0; 460 } 461 462 #ifdef FF_KNI 463 static int 464 init_kni(void) 465 { 466 int nb_ports = rte_eth_dev_count_avail(); 467 kni_accept = 0; 468 if(strcasecmp(ff_global_cfg.kni.method, "accept") == 0) 469 kni_accept = 1; 470 471 ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port, 472 ff_global_cfg.kni.udp_port); 473 474 unsigned socket_id = lcore_conf.socket_id; 475 struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id]; 476 477 nb_ports = ff_global_cfg.dpdk.nb_ports; 478 int i, ret; 479 for (i = 0; i < nb_ports; i++) { 480 uint16_t port_id = ff_global_cfg.dpdk.portid_list[i]; 481 ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE); 482 } 483 484 return 0; 485 } 486 #endif 487 488 static void 489 set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues) 490 { 491 if (reta_size == 0) { 492 return; 493 } 494 495 int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE); 496 struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size]; 497 498 /* config HW indirection table */ 499 unsigned i, j, hash=0; 500 for (i = 0; i < reta_conf_size; i++) { 501 reta_conf[i].mask = ~0ULL; 502 for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) { 503 reta_conf[i].reta[j] = hash++ % nb_queues; 504 } 505 } 506 507 if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) { 508 rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n", 509 port_id); 510 } 511 } 512 513 static int 514 init_port_start(void) 515 { 516 int nb_ports = ff_global_cfg.dpdk.nb_ports; 517 unsigned socketid = 0; 518 struct rte_mempool *mbuf_pool; 519 uint16_t i; 520 521 for (i = 0; i < nb_ports; i++) { 522 uint16_t port_id = ff_global_cfg.dpdk.portid_list[i]; 523 struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id]; 524 uint16_t nb_queues = pconf->nb_lcores; 525 526 struct rte_eth_dev_info dev_info; 527 struct rte_eth_conf port_conf = {0}; 528 struct rte_eth_rxconf rxq_conf; 529 struct rte_eth_txconf txq_conf; 530 531 rte_eth_dev_info_get(port_id, &dev_info); 532 533 if (nb_queues > dev_info.max_rx_queues) { 534 rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n", 535 nb_queues, 536 dev_info.max_rx_queues); 537 } 538 539 if (nb_queues > dev_info.max_tx_queues) { 540 rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n", 541 nb_queues, 542 dev_info.max_tx_queues); 543 } 544 545 struct ether_addr addr; 546 rte_eth_macaddr_get(port_id, &addr); 547 printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8 548 " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n", 549 (unsigned)port_id, 550 addr.addr_bytes[0], addr.addr_bytes[1], 551 addr.addr_bytes[2], addr.addr_bytes[3], 552 addr.addr_bytes[4], addr.addr_bytes[5]); 553 554 rte_memcpy(pconf->mac, 555 addr.addr_bytes, ETHER_ADDR_LEN); 556 557 /* Set RSS mode */ 558 uint64_t default_rss_hf = ETH_RSS_PROTO_MASK; 559 port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS; 560 port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf; 561 port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes; 562 port_conf.rx_adv_conf.rss_conf.rss_key_len = 40; 563 port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads; 564 if (port_conf.rx_adv_conf.rss_conf.rss_hf != 565 ETH_RSS_PROTO_MASK) { 566 printf("Port %u modified RSS hash function based on hardware support," 567 "requested:%#"PRIx64" configured:%#"PRIx64"\n", 568 port_id, default_rss_hf, 569 port_conf.rx_adv_conf.rss_conf.rss_hf); 570 } 571 572 if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) { 573 port_conf.txmode.offloads |= 574 DEV_TX_OFFLOAD_MBUF_FAST_FREE; 575 } 576 577 /* Set Rx VLAN stripping */ 578 if (ff_global_cfg.dpdk.vlan_strip) { 579 if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) { 580 port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP; 581 } 582 } 583 584 /* Enable HW CRC stripping */ 585 if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_KEEP_CRC) { 586 port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_KEEP_CRC; 587 } 588 589 /* FIXME: Enable TCP LRO ?*/ 590 #if 0 591 if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) { 592 printf("LRO is supported\n"); 593 port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO; 594 pconf->hw_features.rx_lro = 1; 595 } 596 #endif 597 598 /* Set Rx checksum checking */ 599 if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) && 600 (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) && 601 (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) { 602 printf("RX checksum offload supported\n"); 603 port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM; 604 pconf->hw_features.rx_csum = 1; 605 } 606 607 if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) { 608 printf("TX ip checksum offload supported\n"); 609 port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM; 610 pconf->hw_features.tx_csum_ip = 1; 611 } 612 613 if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) && 614 (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) { 615 printf("TX TCP&UDP checksum offload supported\n"); 616 port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM; 617 pconf->hw_features.tx_csum_l4 = 1; 618 } 619 620 if (ff_global_cfg.dpdk.tso) { 621 if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) { 622 printf("TSO is supported\n"); 623 port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO; 624 pconf->hw_features.tx_tso = 1; 625 } 626 } else { 627 printf("TSO is disabled\n"); 628 } 629 630 if (dev_info.reta_size) { 631 /* reta size must be power of 2 */ 632 assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0); 633 634 rss_reta_size[port_id] = dev_info.reta_size; 635 printf("port[%d]: rss table size: %d\n", port_id, 636 dev_info.reta_size); 637 } 638 639 if (rte_eal_process_type() != RTE_PROC_PRIMARY) { 640 continue; 641 } 642 643 int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf); 644 if (ret != 0) { 645 return ret; 646 } 647 648 static uint16_t nb_rxd = RX_QUEUE_SIZE; 649 static uint16_t nb_txd = TX_QUEUE_SIZE; 650 ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd); 651 if (ret < 0) 652 printf("Could not adjust number of descriptors " 653 "for port%u (%d)\n", (unsigned)port_id, ret); 654 655 uint16_t q; 656 for (q = 0; q < nb_queues; q++) { 657 if (numa_on) { 658 uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q]; 659 socketid = rte_lcore_to_socket_id(lcore_id); 660 } 661 mbuf_pool = pktmbuf_pool[socketid]; 662 663 txq_conf = dev_info.default_txconf; 664 txq_conf.offloads = port_conf.txmode.offloads; 665 ret = rte_eth_tx_queue_setup(port_id, q, nb_txd, 666 socketid, &txq_conf); 667 if (ret < 0) { 668 return ret; 669 } 670 671 rxq_conf = dev_info.default_rxconf; 672 rxq_conf.offloads = port_conf.rxmode.offloads; 673 ret = rte_eth_rx_queue_setup(port_id, q, nb_rxd, 674 socketid, &rxq_conf, mbuf_pool); 675 if (ret < 0) { 676 return ret; 677 } 678 } 679 680 ret = rte_eth_dev_start(port_id); 681 if (ret < 0) { 682 return ret; 683 } 684 685 if (nb_queues > 1) { 686 /* set HW rss hash function to Toeplitz. */ 687 if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) { 688 struct rte_eth_hash_filter_info info = {0}; 689 info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG; 690 info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ; 691 692 if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH, 693 RTE_ETH_FILTER_SET, &info) < 0) { 694 rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n", 695 port_id); 696 } 697 } 698 699 set_rss_table(port_id, dev_info.reta_size, nb_queues); 700 } 701 702 /* Enable RX in promiscuous mode for the Ethernet device. */ 703 if (ff_global_cfg.dpdk.promiscuous) { 704 rte_eth_promiscuous_enable(port_id); 705 ret = rte_eth_promiscuous_get(port_id); 706 if (ret == 1) { 707 printf("set port %u to promiscuous mode ok\n", port_id); 708 } else { 709 printf("set port %u to promiscuous mode error\n", port_id); 710 } 711 } 712 713 /* Enable pcap dump */ 714 if (pconf->pcap) { 715 ff_enable_pcap(pconf->pcap); 716 } 717 } 718 719 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 720 check_all_ports_link_status(); 721 } 722 723 return 0; 724 } 725 726 static int 727 init_clock(void) 728 { 729 rte_timer_subsystem_init(); 730 uint64_t hz = rte_get_timer_hz(); 731 uint64_t intrs = MS_PER_S/ff_global_cfg.freebsd.hz; 732 uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S*intrs; 733 734 rte_timer_init(&freebsd_clock); 735 rte_timer_reset(&freebsd_clock, tsc, PERIODICAL, 736 rte_lcore_id(), &ff_hardclock_job, NULL); 737 738 ff_update_current_ts(); 739 740 return 0; 741 } 742 743 int 744 ff_dpdk_init(int argc, char **argv) 745 { 746 if (ff_global_cfg.dpdk.nb_procs < 1 || 747 ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE || 748 ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs || 749 ff_global_cfg.dpdk.proc_id < 0) { 750 printf("param num_procs[%d] or proc_id[%d] error!\n", 751 ff_global_cfg.dpdk.nb_procs, 752 ff_global_cfg.dpdk.proc_id); 753 exit(1); 754 } 755 756 int ret = rte_eal_init(argc, argv); 757 if (ret < 0) { 758 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 759 } 760 761 numa_on = ff_global_cfg.dpdk.numa_on; 762 763 idle_sleep = ff_global_cfg.dpdk.idle_sleep; 764 pkt_tx_delay = ff_global_cfg.dpdk.pkt_tx_delay > BURST_TX_DRAIN_US ? \ 765 BURST_TX_DRAIN_US : ff_global_cfg.dpdk.pkt_tx_delay; 766 767 init_lcore_conf(); 768 769 init_mem_pool(); 770 771 init_dispatch_ring(); 772 773 init_msg_ring(); 774 775 #ifdef FF_KNI 776 enable_kni = ff_global_cfg.kni.enable; 777 if (enable_kni) { 778 init_kni(); 779 } 780 #endif 781 782 #ifdef FF_USE_PAGE_ARRAY 783 ff_mmap_init(); 784 #endif 785 786 ret = init_port_start(); 787 if (ret < 0) { 788 rte_exit(EXIT_FAILURE, "init_port_start failed\n"); 789 } 790 791 init_clock(); 792 793 return 0; 794 } 795 796 static void 797 ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt) 798 { 799 uint8_t rx_csum = ctx->hw_features.rx_csum; 800 if (rx_csum) { 801 if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) { 802 rte_pktmbuf_free(pkt); 803 return; 804 } 805 } 806 807 /* 808 * FIXME: should we save pkt->vlan_tci 809 * if (pkt->ol_flags & PKT_RX_VLAN_PKT) 810 */ 811 812 void *data = rte_pktmbuf_mtod(pkt, void*); 813 uint16_t len = rte_pktmbuf_data_len(pkt); 814 815 void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum); 816 if (hdr == NULL) { 817 rte_pktmbuf_free(pkt); 818 return; 819 } 820 821 struct rte_mbuf *pn = pkt->next; 822 void *prev = hdr; 823 while(pn != NULL) { 824 data = rte_pktmbuf_mtod(pn, void*); 825 len = rte_pktmbuf_data_len(pn); 826 827 void *mb = ff_mbuf_get(prev, data, len); 828 if (mb == NULL) { 829 ff_mbuf_free(hdr); 830 rte_pktmbuf_free(pkt); 831 return; 832 } 833 pn = pn->next; 834 prev = mb; 835 } 836 837 ff_veth_process_packet(ctx->ifp, hdr); 838 } 839 840 static enum FilterReturn 841 protocol_filter(const void *data, uint16_t len) 842 { 843 if(len < ETHER_HDR_LEN) 844 return FILTER_UNKNOWN; 845 846 const struct ether_hdr *hdr; 847 hdr = (const struct ether_hdr *)data; 848 uint16_t eth_frame_type = rte_be_to_cpu_16(hdr->ether_type); 849 850 if(eth_frame_type == ETHER_TYPE_ARP) 851 return FILTER_ARP; 852 853 #ifndef FF_KNI 854 return FILTER_UNKNOWN; 855 #else 856 if (!enable_kni) { 857 return FILTER_UNKNOWN; 858 } 859 860 if(eth_frame_type != ETHER_TYPE_IPv4 861 #ifdef INET6 862 && eth_frame_type != ETHER_TYPE_IPv6 863 #endif 864 ) 865 return FILTER_UNKNOWN; 866 867 return ff_kni_proto_filter(data + ETHER_HDR_LEN, 868 len - ETHER_HDR_LEN, eth_frame_type); 869 #endif 870 } 871 872 static inline void 873 pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m) 874 { 875 struct rte_mbuf *md; 876 void *src, *dst; 877 878 dst = rte_pktmbuf_mtod(mi, void *); 879 src = rte_pktmbuf_mtod(m, void *); 880 881 mi->data_len = m->data_len; 882 rte_memcpy(dst, src, m->data_len); 883 884 mi->port = m->port; 885 mi->vlan_tci = m->vlan_tci; 886 mi->vlan_tci_outer = m->vlan_tci_outer; 887 mi->tx_offload = m->tx_offload; 888 mi->hash = m->hash; 889 mi->ol_flags = m->ol_flags; 890 mi->packet_type = m->packet_type; 891 } 892 893 /* copied from rte_pktmbuf_clone */ 894 static inline struct rte_mbuf * 895 pktmbuf_deep_clone(const struct rte_mbuf *md, 896 struct rte_mempool *mp) 897 { 898 struct rte_mbuf *mc, *mi, **prev; 899 uint32_t pktlen; 900 uint8_t nseg; 901 902 if (unlikely ((mc = rte_pktmbuf_alloc(mp)) == NULL)) 903 return NULL; 904 905 mi = mc; 906 prev = &mi->next; 907 pktlen = md->pkt_len; 908 nseg = 0; 909 910 do { 911 nseg++; 912 pktmbuf_deep_attach(mi, md); 913 *prev = mi; 914 prev = &mi->next; 915 } while ((md = md->next) != NULL && 916 (mi = rte_pktmbuf_alloc(mp)) != NULL); 917 918 *prev = NULL; 919 mc->nb_segs = nseg; 920 mc->pkt_len = pktlen; 921 922 /* Allocation of new indirect segment failed */ 923 if (unlikely (mi == NULL)) { 924 rte_pktmbuf_free(mc); 925 return NULL; 926 } 927 928 __rte_mbuf_sanity_check(mc, 1); 929 return mc; 930 } 931 932 static inline void 933 process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs, 934 uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring) 935 { 936 struct lcore_conf *qconf = &lcore_conf; 937 uint16_t nb_queues = qconf->nb_queue_list[port_id]; 938 939 uint16_t i; 940 for (i = 0; i < count; i++) { 941 struct rte_mbuf *rtem = bufs[i]; 942 943 if (unlikely(qconf->pcap[port_id] != NULL)) { 944 if (!pkts_from_ring) { 945 ff_dump_packets(qconf->pcap[port_id], rtem); 946 } 947 } 948 949 void *data = rte_pktmbuf_mtod(rtem, void*); 950 uint16_t len = rte_pktmbuf_data_len(rtem); 951 952 if (!pkts_from_ring) { 953 ff_traffic.rx_packets++; 954 ff_traffic.rx_bytes += len; 955 } 956 957 if (!pkts_from_ring && packet_dispatcher) { 958 int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues); 959 if (ret == FF_DISPATCH_RESPONSE) { 960 rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len; 961 send_single_packet(rtem, port_id); 962 continue; 963 } 964 965 if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) { 966 rte_pktmbuf_free(rtem); 967 continue; 968 } 969 970 if (ret != queue_id) { 971 ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem); 972 if (ret < 0) 973 rte_pktmbuf_free(rtem); 974 975 continue; 976 } 977 } 978 979 enum FilterReturn filter = protocol_filter(data, len); 980 if (filter == FILTER_ARP) { 981 struct rte_mempool *mbuf_pool; 982 struct rte_mbuf *mbuf_clone; 983 if (!pkts_from_ring) { 984 uint16_t j; 985 for(j = 0; j < nb_queues; ++j) { 986 if(j == queue_id) 987 continue; 988 989 unsigned socket_id = 0; 990 if (numa_on) { 991 uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j]; 992 socket_id = rte_lcore_to_socket_id(lcore_id); 993 } 994 mbuf_pool = pktmbuf_pool[socket_id]; 995 mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool); 996 if(mbuf_clone) { 997 int ret = rte_ring_enqueue(dispatch_ring[port_id][j], 998 mbuf_clone); 999 if (ret < 0) 1000 rte_pktmbuf_free(mbuf_clone); 1001 } 1002 } 1003 } 1004 1005 #ifdef FF_KNI 1006 if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) { 1007 mbuf_pool = pktmbuf_pool[qconf->socket_id]; 1008 mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool); 1009 if(mbuf_clone) { 1010 ff_kni_enqueue(port_id, mbuf_clone); 1011 } 1012 } 1013 #endif 1014 ff_veth_input(ctx, rtem); 1015 #ifdef FF_KNI 1016 } else if (enable_kni && 1017 ((filter == FILTER_KNI && kni_accept) || 1018 (filter == FILTER_UNKNOWN && !kni_accept)) ) { 1019 ff_kni_enqueue(port_id, rtem); 1020 #endif 1021 } else { 1022 ff_veth_input(ctx, rtem); 1023 } 1024 } 1025 } 1026 1027 static inline int 1028 process_dispatch_ring(uint16_t port_id, uint16_t queue_id, 1029 struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx) 1030 { 1031 /* read packet from ring buf and to process */ 1032 uint16_t nb_rb; 1033 nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id], 1034 (void **)pkts_burst, MAX_PKT_BURST, NULL); 1035 1036 if(nb_rb > 0) { 1037 process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1); 1038 } 1039 1040 return 0; 1041 } 1042 1043 static inline void 1044 handle_sysctl_msg(struct ff_msg *msg) 1045 { 1046 int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen, 1047 msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new, 1048 msg->sysctl.newlen); 1049 1050 if (ret < 0) { 1051 msg->result = errno; 1052 } else { 1053 msg->result = 0; 1054 } 1055 } 1056 1057 static inline void 1058 handle_ioctl_msg(struct ff_msg *msg) 1059 { 1060 int fd, ret; 1061 #ifdef INET6 1062 if (msg->msg_type == FF_IOCTL6) { 1063 fd = ff_socket(AF_INET6, SOCK_DGRAM, 0); 1064 } else 1065 #endif 1066 fd = ff_socket(AF_INET, SOCK_DGRAM, 0); 1067 1068 if (fd < 0) { 1069 ret = -1; 1070 goto done; 1071 } 1072 1073 ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data); 1074 1075 ff_close(fd); 1076 1077 done: 1078 if (ret < 0) { 1079 msg->result = errno; 1080 } else { 1081 msg->result = 0; 1082 } 1083 } 1084 1085 static inline void 1086 handle_route_msg(struct ff_msg *msg) 1087 { 1088 int ret = ff_rtioctl(msg->route.fib, msg->route.data, 1089 &msg->route.len, msg->route.maxlen); 1090 if (ret < 0) { 1091 msg->result = errno; 1092 } else { 1093 msg->result = 0; 1094 } 1095 } 1096 1097 static inline void 1098 handle_top_msg(struct ff_msg *msg) 1099 { 1100 msg->top = ff_top_status; 1101 msg->result = 0; 1102 } 1103 1104 #ifdef FF_NETGRAPH 1105 static inline void 1106 handle_ngctl_msg(struct ff_msg *msg) 1107 { 1108 int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data); 1109 if (ret < 0) { 1110 msg->result = errno; 1111 } else { 1112 msg->result = 0; 1113 msg->ngctl.ret = ret; 1114 } 1115 } 1116 #endif 1117 1118 #ifdef FF_IPFW 1119 static inline void 1120 handle_ipfw_msg(struct ff_msg *msg) 1121 { 1122 int fd, ret; 1123 fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW); 1124 if (fd < 0) { 1125 ret = -1; 1126 goto done; 1127 } 1128 1129 switch (msg->ipfw.cmd) { 1130 case FF_IPFW_GET: 1131 ret = ff_getsockopt_freebsd(fd, msg->ipfw.level, 1132 msg->ipfw.optname, msg->ipfw.optval, 1133 msg->ipfw.optlen); 1134 break; 1135 case FF_IPFW_SET: 1136 ret = ff_setsockopt_freebsd(fd, msg->ipfw.level, 1137 msg->ipfw.optname, msg->ipfw.optval, 1138 *(msg->ipfw.optlen)); 1139 break; 1140 default: 1141 ret = -1; 1142 errno = ENOTSUP; 1143 break; 1144 } 1145 1146 ff_close(fd); 1147 1148 done: 1149 if (ret < 0) { 1150 msg->result = errno; 1151 } else { 1152 msg->result = 0; 1153 } 1154 } 1155 #endif 1156 1157 static inline void 1158 handle_traffic_msg(struct ff_msg *msg) 1159 { 1160 msg->traffic = ff_traffic; 1161 msg->result = 0; 1162 } 1163 1164 static inline void 1165 handle_default_msg(struct ff_msg *msg) 1166 { 1167 msg->result = ENOTSUP; 1168 } 1169 1170 static inline void 1171 handle_msg(struct ff_msg *msg, uint16_t proc_id) 1172 { 1173 switch (msg->msg_type) { 1174 case FF_SYSCTL: 1175 handle_sysctl_msg(msg); 1176 break; 1177 case FF_IOCTL: 1178 #ifdef INET6 1179 case FF_IOCTL6: 1180 #endif 1181 handle_ioctl_msg(msg); 1182 break; 1183 case FF_ROUTE: 1184 handle_route_msg(msg); 1185 break; 1186 case FF_TOP: 1187 handle_top_msg(msg); 1188 break; 1189 #ifdef FF_NETGRAPH 1190 case FF_NGCTL: 1191 handle_ngctl_msg(msg); 1192 break; 1193 #endif 1194 #ifdef FF_IPFW 1195 case FF_IPFW_CTL: 1196 handle_ipfw_msg(msg); 1197 break; 1198 #endif 1199 case FF_TRAFFIC: 1200 handle_traffic_msg(msg); 1201 break; 1202 default: 1203 handle_default_msg(msg); 1204 break; 1205 } 1206 rte_ring_enqueue(msg_ring[proc_id].ring[1], msg); 1207 } 1208 1209 static inline int 1210 process_msg_ring(uint16_t proc_id) 1211 { 1212 void *msg; 1213 int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg); 1214 1215 if (unlikely(ret == 0)) { 1216 handle_msg((struct ff_msg *)msg, proc_id); 1217 } 1218 1219 return 0; 1220 } 1221 1222 /* Send burst of packets on an output interface */ 1223 static inline int 1224 send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port) 1225 { 1226 struct rte_mbuf **m_table; 1227 int ret; 1228 uint16_t queueid; 1229 1230 queueid = qconf->tx_queue_id[port]; 1231 m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table; 1232 1233 if (unlikely(qconf->pcap[port] != NULL)) { 1234 uint16_t i; 1235 for (i = 0; i < n; i++) { 1236 ff_dump_packets(qconf->pcap[port], m_table[i]); 1237 } 1238 } 1239 1240 ret = rte_eth_tx_burst(port, queueid, m_table, n); 1241 ff_traffic.tx_packets += ret; 1242 uint16_t i; 1243 for (i = 0; i < ret; i++) { 1244 ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]); 1245 #ifdef FF_USE_PAGE_ARRAY 1246 if (qconf->tx_mbufs[port].bsd_m_table[i]) 1247 ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs); 1248 #endif 1249 } 1250 if (unlikely(ret < n)) { 1251 do { 1252 rte_pktmbuf_free(m_table[ret]); 1253 #ifdef FF_USE_PAGE_ARRAY 1254 if ( qconf->tx_mbufs[port].bsd_m_table[ret] ) 1255 ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]); 1256 #endif 1257 } while (++ret < n); 1258 } 1259 return 0; 1260 } 1261 1262 /* Enqueue a single packet, and send burst if queue is filled */ 1263 static inline int 1264 send_single_packet(struct rte_mbuf *m, uint8_t port) 1265 { 1266 uint16_t len; 1267 struct lcore_conf *qconf; 1268 1269 qconf = &lcore_conf; 1270 len = qconf->tx_mbufs[port].len; 1271 qconf->tx_mbufs[port].m_table[len] = m; 1272 len++; 1273 1274 /* enough pkts to be sent */ 1275 if (unlikely(len == MAX_PKT_BURST)) { 1276 send_burst(qconf, MAX_PKT_BURST, port); 1277 len = 0; 1278 } 1279 1280 qconf->tx_mbufs[port].len = len; 1281 return 0; 1282 } 1283 1284 int 1285 ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m, 1286 int total) 1287 { 1288 #ifdef FF_USE_PAGE_ARRAY 1289 struct lcore_conf *qconf = &lcore_conf; 1290 int len = 0; 1291 1292 len = ff_if_send_onepkt(ctx, m,total); 1293 if (unlikely(len == MAX_PKT_BURST)) { 1294 send_burst(qconf, MAX_PKT_BURST, ctx->port_id); 1295 len = 0; 1296 } 1297 qconf->tx_mbufs[ctx->port_id].len = len; 1298 return 0; 1299 #endif 1300 struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id]; 1301 struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool); 1302 if (head == NULL) { 1303 ff_mbuf_free(m); 1304 return -1; 1305 } 1306 1307 head->pkt_len = total; 1308 head->nb_segs = 0; 1309 1310 int off = 0; 1311 struct rte_mbuf *cur = head, *prev = NULL; 1312 while(total > 0) { 1313 if (cur == NULL) { 1314 cur = rte_pktmbuf_alloc(mbuf_pool); 1315 if (cur == NULL) { 1316 rte_pktmbuf_free(head); 1317 ff_mbuf_free(m); 1318 return -1; 1319 } 1320 } 1321 1322 if (prev != NULL) { 1323 prev->next = cur; 1324 } 1325 head->nb_segs++; 1326 1327 prev = cur; 1328 void *data = rte_pktmbuf_mtod(cur, void*); 1329 int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total; 1330 int ret = ff_mbuf_copydata(m, data, off, len); 1331 if (ret < 0) { 1332 rte_pktmbuf_free(head); 1333 ff_mbuf_free(m); 1334 return -1; 1335 } 1336 1337 1338 cur->data_len = len; 1339 off += len; 1340 total -= len; 1341 cur = NULL; 1342 } 1343 1344 struct ff_tx_offload offload = {0}; 1345 ff_mbuf_tx_offload(m, &offload); 1346 1347 void *data = rte_pktmbuf_mtod(head, void*); 1348 1349 if (offload.ip_csum) { 1350 /* ipv6 not supported yet */ 1351 struct ipv4_hdr *iph; 1352 int iph_len; 1353 iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN); 1354 iph_len = (iph->version_ihl & 0x0f) << 2; 1355 1356 head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4; 1357 head->l2_len = ETHER_HDR_LEN; 1358 head->l3_len = iph_len; 1359 } 1360 1361 if (ctx->hw_features.tx_csum_l4) { 1362 struct ipv4_hdr *iph; 1363 int iph_len; 1364 iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN); 1365 iph_len = (iph->version_ihl & 0x0f) << 2; 1366 1367 if (offload.tcp_csum) { 1368 head->ol_flags |= PKT_TX_TCP_CKSUM; 1369 head->l2_len = ETHER_HDR_LEN; 1370 head->l3_len = iph_len; 1371 } 1372 1373 /* 1374 * TCP segmentation offload. 1375 * 1376 * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag 1377 * implies PKT_TX_TCP_CKSUM) 1378 * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6 1379 * - if it's IPv4, set the PKT_TX_IP_CKSUM flag and 1380 * write the IP checksum to 0 in the packet 1381 * - fill the mbuf offload information: l2_len, 1382 * l3_len, l4_len, tso_segsz 1383 * - calculate the pseudo header checksum without taking ip_len 1384 * in account, and set it in the TCP header. Refer to 1385 * rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be 1386 * used as helpers. 1387 */ 1388 if (offload.tso_seg_size) { 1389 struct tcp_hdr *tcph; 1390 int tcph_len; 1391 tcph = (struct tcp_hdr *)((char *)iph + iph_len); 1392 tcph_len = (tcph->data_off & 0xf0) >> 2; 1393 tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG); 1394 1395 head->ol_flags |= PKT_TX_TCP_SEG; 1396 head->l4_len = tcph_len; 1397 head->tso_segsz = offload.tso_seg_size; 1398 } 1399 1400 if (offload.udp_csum) { 1401 head->ol_flags |= PKT_TX_UDP_CKSUM; 1402 head->l2_len = ETHER_HDR_LEN; 1403 head->l3_len = iph_len; 1404 } 1405 } 1406 1407 ff_mbuf_free(m); 1408 1409 return send_single_packet(head, ctx->port_id); 1410 } 1411 1412 static int 1413 main_loop(void *arg) 1414 { 1415 struct loop_routine *lr = (struct loop_routine *)arg; 1416 1417 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1418 uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc; 1419 int i, j, nb_rx, idle; 1420 uint16_t port_id, queue_id; 1421 struct lcore_conf *qconf; 1422 uint64_t drain_tsc = 0; 1423 struct ff_dpdk_if_context *ctx; 1424 1425 if (pkt_tx_delay) { 1426 drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * pkt_tx_delay; 1427 } 1428 1429 prev_tsc = 0; 1430 usch_tsc = 0; 1431 1432 qconf = &lcore_conf; 1433 1434 while (1) { 1435 cur_tsc = rte_rdtsc(); 1436 if (unlikely(freebsd_clock.expire < cur_tsc)) { 1437 rte_timer_manage(); 1438 } 1439 1440 idle = 1; 1441 sys_tsc = 0; 1442 usr_tsc = 0; 1443 1444 /* 1445 * TX burst queue drain 1446 */ 1447 diff_tsc = cur_tsc - prev_tsc; 1448 if (unlikely(diff_tsc >= drain_tsc)) { 1449 for (i = 0; i < qconf->nb_tx_port; i++) { 1450 port_id = qconf->tx_port_id[i]; 1451 if (qconf->tx_mbufs[port_id].len == 0) 1452 continue; 1453 1454 idle = 0; 1455 1456 send_burst(qconf, 1457 qconf->tx_mbufs[port_id].len, 1458 port_id); 1459 qconf->tx_mbufs[port_id].len = 0; 1460 } 1461 1462 prev_tsc = cur_tsc; 1463 } 1464 1465 /* 1466 * Read packet from RX queues 1467 */ 1468 for (i = 0; i < qconf->nb_rx_queue; ++i) { 1469 port_id = qconf->rx_queue_list[i].port_id; 1470 queue_id = qconf->rx_queue_list[i].queue_id; 1471 ctx = veth_ctx[port_id]; 1472 1473 #ifdef FF_KNI 1474 if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) { 1475 ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST); 1476 } 1477 #endif 1478 1479 process_dispatch_ring(port_id, queue_id, pkts_burst, ctx); 1480 1481 nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst, 1482 MAX_PKT_BURST); 1483 if (nb_rx == 0) 1484 continue; 1485 1486 idle = 0; 1487 1488 /* Prefetch first packets */ 1489 for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) { 1490 rte_prefetch0(rte_pktmbuf_mtod( 1491 pkts_burst[j], void *)); 1492 } 1493 1494 /* Prefetch and handle already prefetched packets */ 1495 for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) { 1496 rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[ 1497 j + PREFETCH_OFFSET], void *)); 1498 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0); 1499 } 1500 1501 /* Handle remaining prefetched packets */ 1502 for (; j < nb_rx; j++) { 1503 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0); 1504 } 1505 } 1506 1507 process_msg_ring(qconf->proc_id); 1508 1509 div_tsc = rte_rdtsc(); 1510 1511 if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc >= drain_tsc))) { 1512 usch_tsc = cur_tsc; 1513 lr->loop(lr->arg); 1514 } 1515 1516 idle_sleep_tsc = rte_rdtsc(); 1517 if (likely(idle && idle_sleep)) { 1518 usleep(idle_sleep); 1519 end_tsc = rte_rdtsc(); 1520 } else { 1521 end_tsc = idle_sleep_tsc; 1522 } 1523 1524 if (usch_tsc == cur_tsc) { 1525 usr_tsc = idle_sleep_tsc - div_tsc; 1526 } 1527 1528 if (!idle) { 1529 sys_tsc = div_tsc - cur_tsc; 1530 ff_top_status.sys_tsc += sys_tsc; 1531 } 1532 1533 ff_top_status.usr_tsc += usr_tsc; 1534 ff_top_status.work_tsc += end_tsc - cur_tsc; 1535 ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc; 1536 1537 ff_top_status.loops++; 1538 } 1539 1540 return 0; 1541 } 1542 1543 int 1544 ff_dpdk_if_up(void) { 1545 int i; 1546 struct lcore_conf *qconf = &lcore_conf; 1547 for (i = 0; i < qconf->nb_tx_port; i++) { 1548 uint16_t port_id = qconf->tx_port_id[i]; 1549 1550 struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id]; 1551 veth_ctx[port_id] = ff_veth_attach(pconf); 1552 if (veth_ctx[port_id] == NULL) { 1553 rte_exit(EXIT_FAILURE, "ff_veth_attach failed"); 1554 } 1555 } 1556 1557 return 0; 1558 } 1559 1560 void 1561 ff_dpdk_run(loop_func_t loop, void *arg) { 1562 struct loop_routine *lr = rte_malloc(NULL, 1563 sizeof(struct loop_routine), 0); 1564 lr->loop = loop; 1565 lr->arg = arg; 1566 rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER); 1567 rte_eal_mp_wait_lcore(); 1568 rte_free(lr); 1569 } 1570 1571 void 1572 ff_dpdk_pktmbuf_free(void *m) 1573 { 1574 rte_pktmbuf_free((struct rte_mbuf *)m); 1575 } 1576 1577 static uint32_t 1578 toeplitz_hash(unsigned keylen, const uint8_t *key, 1579 unsigned datalen, const uint8_t *data) 1580 { 1581 uint32_t hash = 0, v; 1582 u_int i, b; 1583 1584 /* XXXRW: Perhaps an assertion about key length vs. data length? */ 1585 1586 v = (key[0]<<24) + (key[1]<<16) + (key[2] <<8) + key[3]; 1587 for (i = 0; i < datalen; i++) { 1588 for (b = 0; b < 8; b++) { 1589 if (data[i] & (1<<(7-b))) 1590 hash ^= v; 1591 v <<= 1; 1592 if ((i + 4) < keylen && 1593 (key[i+4] & (1<<(7-b)))) 1594 v |= 1; 1595 } 1596 } 1597 return (hash); 1598 } 1599 1600 int 1601 ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr, 1602 uint16_t sport, uint16_t dport) 1603 { 1604 struct lcore_conf *qconf = &lcore_conf; 1605 struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc); 1606 uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id]; 1607 1608 if (nb_queues <= 1) { 1609 return 1; 1610 } 1611 1612 uint16_t reta_size = rss_reta_size[ctx->port_id]; 1613 uint16_t queueid = qconf->tx_queue_id[ctx->port_id]; 1614 1615 uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) + 1616 sizeof(dport)]; 1617 1618 unsigned datalen = 0; 1619 1620 bcopy(&saddr, &data[datalen], sizeof(saddr)); 1621 datalen += sizeof(saddr); 1622 1623 bcopy(&daddr, &data[datalen], sizeof(daddr)); 1624 datalen += sizeof(daddr); 1625 1626 bcopy(&sport, &data[datalen], sizeof(sport)); 1627 datalen += sizeof(sport); 1628 1629 bcopy(&dport, &data[datalen], sizeof(dport)); 1630 datalen += sizeof(dport); 1631 1632 uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes), 1633 default_rsskey_40bytes, datalen, data); 1634 1635 return ((hash & (reta_size - 1)) % nb_queues) == queueid; 1636 } 1637 1638 void 1639 ff_regist_packet_dispatcher(dispatch_func_t func) 1640 { 1641 packet_dispatcher = func; 1642 } 1643 1644 uint64_t 1645 ff_get_tsc_ns() 1646 { 1647 uint64_t cur_tsc = rte_rdtsc(); 1648 uint64_t hz = rte_get_tsc_hz(); 1649 return ((double)cur_tsc/(double)hz) * NS_PER_S; 1650 } 1651 1652