/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_memory.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_common.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"
#include "ff_memory.h"

#ifdef FF_KNI
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

static int enable_kni;
static int kni_accept;
#endif

static int numa_on;

static unsigned idle_sleep;

static struct rte_timer freebsd_clock;

// Mellanox Linux's driver key
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

struct lcore_conf lcore_conf;

struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

static inline int send_single_packet(struct rte_mbuf *m, uint8_t port);

struct ff_msg_ring {
    char ring_name[2][RTE_RING_NAMESIZE];
    /* ring[0] for lcore recv msg, other send */
    /* ring[1] for lcore send msg, other read */
    struct rte_ring *ring[2];
} __rte_cache_aligned;
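
/*
 * One ff_msg_ring per ff process: ring[0] carries requests from management
 * tools into the stack's main loop, and ring[1] carries the replies back
 * (see process_msg_ring() and handle_msg() below).
 */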
static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;
static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

static struct ff_top_args ff_top_status;
static struct ff_traffic_args ff_traffic;
extern void ff_hardclock(void);

static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
    ff_update_current_ts();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
#define CHECK_INTERVAL 100 /* 100ms */
#define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u "
                        "Mbps - %s\n", (int)portid,
                        (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
174 ("full-duplex") : ("half-duplex\n")); 175 } else { 176 printf("Port %d Link Down\n", (int)portid); 177 } 178 continue; 179 } 180 /* clear all_ports_up flag if any link down */ 181 if (link.link_status == 0) { 182 all_ports_up = 0; 183 break; 184 } 185 } 186 187 /* after finally printing all link status, get out */ 188 if (print_flag == 1) 189 break; 190 191 if (all_ports_up == 0) { 192 printf("."); 193 fflush(stdout); 194 rte_delay_ms(CHECK_INTERVAL); 195 } 196 197 /* set the print_flag if all ports up or timeout */ 198 if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) { 199 print_flag = 1; 200 printf("done\n"); 201 } 202 } 203 } 204 205 static int 206 init_lcore_conf(void) 207 { 208 uint8_t nb_dev_ports = rte_eth_dev_count_avail(); 209 if (nb_dev_ports == 0) { 210 rte_exit(EXIT_FAILURE, "No probed ethernet devices\n"); 211 } 212 213 if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) { 214 rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n", 215 ff_global_cfg.dpdk.max_portid); 216 } 217 218 lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs; 219 lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id; 220 221 uint16_t proc_id; 222 for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) { 223 uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id]; 224 if (!lcore_config[lcore_id].detected) { 225 rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id); 226 } 227 } 228 229 uint16_t socket_id = 0; 230 if (numa_on) { 231 socket_id = rte_lcore_to_socket_id(rte_lcore_id()); 232 } 233 234 lcore_conf.socket_id = socket_id; 235 236 uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id]; 237 int j; 238 for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) { 239 uint16_t port_id = ff_global_cfg.dpdk.portid_list[j]; 240 struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id]; 241 242 int queueid = -1; 243 int i; 244 for (i = 0; i < pconf->nb_lcores; i++) { 245 if (pconf->lcore_list[i] == lcore_id) { 246 queueid = i; 247 } 248 } 249 if (queueid < 0) { 250 continue; 251 } 252 printf("lcore: %u, port: %u, queue: %u\n", lcore_id, port_id, queueid); 253 uint16_t nb_rx_queue = lcore_conf.nb_rx_queue; 254 lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id; 255 lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid; 256 lcore_conf.nb_rx_queue++; 257 258 lcore_conf.tx_queue_id[port_id] = queueid; 259 lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id; 260 lcore_conf.nb_tx_port++; 261 262 lcore_conf.pcap[port_id] = pconf->pcap; 263 lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores; 264 } 265 266 if (lcore_conf.nb_rx_queue == 0) { 267 rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id); 268 } 269 270 return 0; 271 } 272 273 static int 274 init_mem_pool(void) 275 { 276 uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports; 277 uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs; 278 uint32_t nb_tx_queue = nb_lcores; 279 uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores; 280 281 unsigned nb_mbuf = RTE_MAX ( 282 (nb_rx_queue*RX_QUEUE_SIZE + 283 nb_ports*nb_lcores*MAX_PKT_BURST + 284 nb_ports*nb_tx_queue*TX_QUEUE_SIZE + 285 nb_lcores*MEMPOOL_CACHE_SIZE + 286 #ifdef FF_KNI 287 nb_ports*KNI_MBUF_MAX + 288 nb_ports*KNI_QUEUE_SIZE + 289 #endif 290 nb_lcores*nb_ports*DISPATCH_RING_SIZE), 291 (unsigned)8192); 292 293 unsigned socketid = 0; 294 uint16_t i, lcore_id; 295 char s[64]; 296 297 for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) { 298 lcore_id = ff_global_cfg.dpdk.proc_lcore[i]; 299 if (numa_on) { 300 socketid = 
                rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
                socketid, i, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }

#ifdef FF_USE_PAGE_ARRAY
        nb_mbuf = RTE_MAX(
            nb_ports * nb_lcores * MAX_PKT_BURST +
            nb_ports * nb_tx_queue * TX_QUEUE_SIZE +
            nb_lcores * MEMPOOL_CACHE_SIZE,
            (unsigned)4096);
        ff_init_ref_pool(nb_mbuf, socketid);
#endif
    }

    return 0;
}

static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL) {
        rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ring = rte_ring_create(name, count, socket_id, flags);
    } else {
        ring = rte_ring_lookup(name);
    }

    if (ring == NULL) {
        rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
    }

    return ring;
}

static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create ring according to ports actually being used.
     */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
}

static int
init_msg_ring(void)
{
    uint16_t i;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
            MSG_RING_SIZE * 2 * nb_procs,
            MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
            NULL, NULL, ff_msg_init, NULL,
            socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        snprintf(msg_ring[i].ring_name[1], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_OUT, i);

        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[0]);

        msg_ring[i].ring[1] = create_ring(msg_ring[i].ring_name[1],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[1] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[1]);
    }

    return 0;
}

#ifdef FF_KNI
static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count_avail();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i, ret;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
#endif

static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}

static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        struct rte_eth_dev_info dev_info;
        struct rte_eth_conf port_conf = {0};
        struct rte_eth_rxconf rxq_conf;
        struct rte_eth_txconf txq_conf;

        rte_eth_dev_info_get(port_id, &dev_info);

        if (nb_queues > dev_info.max_rx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                nb_queues,
                dev_info.max_rx_queues);
        }

        if (nb_queues > dev_info.max_tx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                nb_queues,
                dev_info.max_tx_queues);
        }

        struct ether_addr addr;
        rte_eth_macaddr_get(port_id, &addr);
        printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
            " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
            (unsigned)port_id,
            addr.addr_bytes[0], addr.addr_bytes[1],
            addr.addr_bytes[2], addr.addr_bytes[3],
            addr.addr_bytes[4], addr.addr_bytes[5]);

        rte_memcpy(pconf->mac,
            addr.addr_bytes, ETHER_ADDR_LEN);

        /* Set RSS mode */
        uint64_t default_rss_hf = ETH_RSS_PROTO_MASK;
        port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
        port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf;
        port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
        port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;
        port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads;
        if (port_conf.rx_adv_conf.rss_conf.rss_hf !=
            ETH_RSS_PROTO_MASK) {
            printf("Port %u modified RSS hash function based on hardware support, "
                "requested:%#"PRIx64" configured:%#"PRIx64"\n",
                port_id, default_rss_hf,
                port_conf.rx_adv_conf.rss_conf.rss_hf);
        }

        if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) {
            port_conf.txmode.offloads |=
                DEV_TX_OFFLOAD_MBUF_FAST_FREE;
        }

        /* Set Rx VLAN stripping */
        if (ff_global_cfg.dpdk.vlan_strip) {
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
            }
        }

        /* Keep the Ethernet CRC (i.e. disable HW CRC stripping) if supported */
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_KEEP_CRC) {
            port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_KEEP_CRC;
        }

        /* FIXME: Enable TCP LRO? */
#if 0
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
            printf("LRO is supported\n");
            port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
            pconf->hw_features.rx_lro = 1;
        }
#endif

        /* Set Rx checksum checking */
        if ((dev_info.rx_offload_capa &
            DEV_RX_OFFLOAD_IPV4_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
            printf("RX checksum offload supported\n");
            port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
            pconf->hw_features.rx_csum = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
            printf("TX ip checksum offload supported\n");
            port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
            pconf->hw_features.tx_csum_ip = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            printf("TX TCP&UDP checksum offload supported\n");
            port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM;
            pconf->hw_features.tx_csum_l4 = 1;
        }

        if (ff_global_cfg.dpdk.tso) {
            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                printf("TSO is supported\n");
                port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
                pconf->hw_features.tx_tso = 1;
            }
        } else {
            printf("TSO is disabled\n");
        }

        if (dev_info.reta_size) {
            /* reta size must be power of 2 */
            assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

            rss_reta_size[port_id] = dev_info.reta_size;
            printf("port[%d]: rss table size: %d\n", port_id,
                dev_info.reta_size);
        }

        if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
            continue;
        }

        int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
        if (ret != 0) {
            return ret;
        }

        static uint16_t nb_rxd = RX_QUEUE_SIZE;
        static uint16_t nb_txd = TX_QUEUE_SIZE;
        ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
        if (ret < 0)
            printf("Could not adjust number of descriptors "
                "for port%u (%d)\n", (unsigned)port_id, ret);

        uint16_t q;
        for (q = 0; q < nb_queues; q++) {
            if (numa_on) {
                uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                socketid = rte_lcore_to_socket_id(lcore_id);
            }
            mbuf_pool = pktmbuf_pool[socketid];

            txq_conf = dev_info.default_txconf;
            txq_conf.offloads = port_conf.txmode.offloads;
            ret = rte_eth_tx_queue_setup(port_id, q, nb_txd,
                socketid, &txq_conf);
            if (ret < 0) {
                return ret;
            }

            rxq_conf = dev_info.default_rxconf;
            rxq_conf.offloads = port_conf.rxmode.offloads;
            ret = rte_eth_rx_queue_setup(port_id, q, nb_rxd,
                socketid, &rxq_conf, mbuf_pool);
            if (ret < 0) {
                return ret;
            }
        }

        ret = rte_eth_dev_start(port_id);
        if (ret < 0) {
            return ret;
        }

        if (nb_queues > 1) {
            /* set HW rss hash function to Toeplitz. */
            if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                struct rte_eth_hash_filter_info info = {0};
                info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                    RTE_ETH_FILTER_SET, &info) < 0) {
                    rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                        port_id);
                }
            }

            set_rss_table(port_id, dev_info.reta_size, nb_queues);
        }

        /* Enable RX in promiscuous mode for the Ethernet device.
         */
        if (ff_global_cfg.dpdk.promiscuous) {
            rte_eth_promiscuous_enable(port_id);
            ret = rte_eth_promiscuous_get(port_id);
            if (ret == 1) {
                printf("set port %u to promiscuous mode ok\n", port_id);
            } else {
                printf("set port %u to promiscuous mode error\n", port_id);
            }
        }

        /* Enable pcap dump */
        if (pconf->pcap) {
            ff_enable_pcap(pconf->pcap);
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    idle_sleep = ff_global_cfg.dpdk.idle_sleep;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

#ifdef FF_KNI
    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }
#endif

#ifdef FF_USE_PAGE_ARRAY
    ff_mmap_init();
#endif

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    /*
     * FIXME: should we save pkt->vlan_tci
     * if (pkt->ol_flags & PKT_RX_VLAN_PKT)
     */

    void *data = rte_pktmbuf_mtod(pkt, void*);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void*);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct ether_hdr *hdr;
    hdr = (const struct ether_hdr *)data;

    if (ntohs(hdr->ether_type) == ETHER_TYPE_ARP)
        return FILTER_ARP;

#ifndef FF_KNI
    return FILTER_UNKNOWN;
#else
    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ntohs(hdr->ether_type) != ETHER_TYPE_IPv4)
        return FILTER_UNKNOWN;
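
    /*
     * IPv4 frame with KNI enabled: hand the payload to the KNI protocol
     * filter, which decides (e.g. from the configured kni tcp/udp ports)
     * whether the packet belongs to the kernel path or to F-Stack.
     */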
    return ff_kni_proto_filter(data + ETHER_HDR_LEN,
        len - ETHER_HDR_LEN);
#endif
}

static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    struct rte_mbuf *md;
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}

/* copied from rte_pktmbuf_clone */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of new indirect segment failed */
    if (unlikely(mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}

static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(qconf->pcap[port_id] != NULL)) {
            if (!pkts_from_ring) {
                ff_dump_packets(qconf->pcap[port_id], rtem);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void*);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring) {
            ff_traffic.rx_packets++;
            ff_traffic.rx_bytes += len;
        }

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues);
            if (ret == FF_DISPATCH_RESPONSE) {
                rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len;
                send_single_packet(rtem, port_id);
                continue;
            }

            if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
        if (filter == FILTER_ARP) {
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            if (enable_kni &&
                rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni &&
            ((filter == FILTER_KNI && kni_accept) ||
            (filter == FILTER_UNKNOWN && !kni_accept))) {
            ff_kni_enqueue(port_id, rtem);
#endif
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* read packets from the dispatch ring and process them */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_DGRAM, 0);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_top_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_traffic_msg(struct ff_msg *msg)
{
    msg->traffic = ff_traffic;
    msg->result = 0;
}

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        case FF_TRAFFIC:
            handle_traffic_msg(msg);
            break;
        default:
            handle_default_msg(msg);
            break;
    }
    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(qconf->pcap[port] != NULL)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(qconf->pcap[port], m_table[i]);
        }
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    ff_traffic.tx_packets += ret;
    uint16_t i;
    for (i = 0; i < ret; i++) {
        ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]);
#ifdef FF_USE_PAGE_ARRAY
        if (qconf->tx_mbufs[port].bsd_m_table[i])
            ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs);
#endif
    }
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
#ifdef FF_USE_PAGE_ARRAY
            if (qconf->tx_mbufs[port].bsd_m_table[ret])
                ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]);
#endif
        } while (++ret < n);
    }
    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}

int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
#ifdef FF_USE_PAGE_ARRAY
    struct lcore_conf *qconf = &lcore_conf;
    int len = 0;

    len = ff_if_send_onepkt(ctx, m, total);
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, ctx->port_id);
        len = 0;
    }
    qconf->tx_mbufs[ctx->port_id].len = len;
    return 0;
#endif
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur =
                rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        head->nb_segs++;

        prev = cur;
        void *data = rte_pktmbuf_mtod(cur, void*);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        cur->data_len = len;
        off += len;
        total -= len;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void*);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         * TCP segmentation offload.
         *
         * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *   implies PKT_TX_TCP_CKSUM)
         * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         * - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *   write the IP checksum to 0 in the packet
         * - fill the mbuf offload information: l2_len,
         *   l3_len, l4_len, tso_segsz
         * - calculate the pseudo header checksum without taking ip_len
         *   in account, and set it in the TCP header. Refer to
         *   rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *   used as helpers.
         */
        if (offload.tso_seg_size) {
            struct tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc;
    int i, j, nb_rx, idle;
    uint16_t port_id, queue_id;
    struct lcore_conf *qconf;
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
        US_PER_S * BURST_TX_DRAIN_US;
    struct ff_dpdk_if_context *ctx;

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc > drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packet from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }
#endif

            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                    pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                    j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc > drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        idle_sleep_tsc = rte_rdtsc();
        if (likely(idle && idle_sleep)) {
            usleep(idle_sleep);
            end_tsc = rte_rdtsc();
        } else {
            end_tsc = idle_sleep_tsc;
        }

        end_tsc = rte_rdtsc();

        if (usch_tsc == cur_tsc) {
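            /*
             * The user loop callback ran this iteration: the span from
             * div_tsc to idle_sleep_tsc is attributed to user time.
             */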
            usr_tsc = idle_sleep_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_top_status.sys_tsc += sys_tsc;
        }

        ff_top_status.usr_tsc += usr_tsc;
        ff_top_status.work_tsc += end_tsc - cur_tsc;
        ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_top_status.loops++;
    }

    return 0;
}

int
ff_dpdk_if_up(void) {
    int i;
    struct lcore_conf *qconf = &lcore_conf;
    for (i = 0; i < qconf->nb_tx_port; i++) {
        uint16_t port_id = qconf->tx_port_id[i];

        struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
        veth_ctx[port_id] = ff_veth_attach(pconf);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg) {
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1<<(7-b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i+4] & (1<<(7-b))))
                v |= 1;
        }
    }
    return (hash);
}

int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;
    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];

    if (nb_queues <= 1) {
        return 1;
    }

    uint16_t reta_size = rss_reta_size[ctx->port_id];
    uint16_t queueid = qconf->tx_queue_id[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
        default_rsskey_40bytes, datalen, data);

    return ((hash & (reta_size - 1)) % nb_queues) == queueid;
}

void
ff_regist_packet_dispatcher(dispatch_func_t func)
{
    packet_dispatcher = func;
}

uint64_t
ff_get_tsc_ns()
{
    uint64_t cur_tsc = rte_rdtsc();
    uint64_t hz = rte_get_tsc_hz();
    return ((double)cur_tsc / (double)hz) * NS_PER_S;
}