/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_memory.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_common.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"
#include "ff_memory.h"

#ifdef FF_KNI
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

int enable_kni;
static int kni_accept;
#endif

static int numa_on;

static unsigned idle_sleep;
static unsigned pkt_tx_delay;

static struct rte_timer freebsd_clock;

// Mellanox Linux's driver key
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

static int use_rsskey_52bytes = 0;
static uint8_t default_rsskey_52bytes[52] = {
    0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23,
    0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30,
    0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02,
    0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c,
    0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55,
    0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e,
    0x81, 0x15, 0x03, 0x66
};

struct lcore_conf lcore_conf;

struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

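/*
 * Per-port arrays of per-queue rings. When a received packet actually
 * belongs to another lcore's queue (see the packet dispatcher and the
 * ARP/NDP broadcast in process_packets()), it is enqueued here and later
 * drained by that lcore in process_dispatch_ring().
 */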
static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

static inline int send_single_packet(struct rte_mbuf *m, uint8_t port);

struct ff_msg_ring {
    char ring_name[FF_MSG_NUM][RTE_RING_NAMESIZE];
    /* ring[0] for lcore recv msg, other send */
    /* ring[1] for lcore send msg, other read */
    struct rte_ring *ring[FF_MSG_NUM];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;
static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

static struct ff_top_args ff_top_status;
static struct ff_traffic_args ff_traffic;
extern void ff_hardclock(void);

static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
    ff_update_current_ts();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

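/* Poll all configured ports until their links come up, or ~9s elapse. */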
static void
check_all_ports_link_status(void)
{
#define CHECK_INTERVAL 100 /* 100ms */
#define MAX_CHECK_TIME 90 /* 9s (90 * 100ms) in total */

    uint16_t portid;
    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u "
                        "Mbps - %s\n", (int)portid,
                        (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

static int
init_lcore_conf(void)
{
    uint8_t nb_dev_ports = rte_eth_dev_count_avail();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
            ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    uint16_t proc_id;
    for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) {
        uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id];
        if (!lcore_config[lcore_id].detected) {
            rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
        }
    }

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %u\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        lcore_conf.pcap[port_id] = pconf->pcap;
        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}

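/*
 * Create (or, in secondary processes, look up) one mbuf pool per NUMA
 * socket in use. The pool is sized from the RX/TX descriptor counts,
 * burst size, per-lcore cache and dispatch ring depth, rounded up.
 */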
static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;

    unsigned nb_mbuf = RTE_ALIGN_CEIL(
        (nb_rx_queue * RX_QUEUE_SIZE +
        nb_ports * nb_lcores * MAX_PKT_BURST +
        nb_ports * nb_tx_queue * TX_QUEUE_SIZE +
        nb_lcores * MEMPOOL_CACHE_SIZE +
#ifdef FF_KNI
        nb_ports * KNI_MBUF_MAX +
        nb_ports * KNI_QUEUE_SIZE +
#endif
        nb_lcores * nb_ports * DISPATCH_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
                socketid, i, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }

#ifdef FF_USE_PAGE_ARRAY
        nb_mbuf = RTE_ALIGN_CEIL(
            nb_ports * nb_lcores * MAX_PKT_BURST +
            nb_ports * nb_tx_queue * TX_QUEUE_SIZE +
            nb_lcores * MEMPOOL_CACHE_SIZE,
            (unsigned)4096);
        ff_init_ref_pool(nb_mbuf, socketid);
#endif
    }

    return 0;
}

static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL) {
        rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ring = rte_ring_create(name, count, socket_id, flags);
    } else {
        ring = rte_ring_lookup(name);
    }

    if (ring == NULL) {
        rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
    }

    return ring;
}

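/* Create the per-port, per-queue dispatch rings declared above. */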
static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create rings according to the ports actually being used. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
}

static int
init_msg_ring(void)
{
    uint16_t i, j;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
            MSG_RING_SIZE * 2 * nb_procs,
            MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
            NULL, NULL, ff_msg_init, NULL,
            socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[0]);

        for (j = FF_SYSCTL; j < FF_MSG_NUM; j++) {
            snprintf(msg_ring[i].ring_name[j], RTE_RING_NAMESIZE,
                "%s%u_%u", FF_MSG_RING_OUT, i, j);
            msg_ring[i].ring[j] = create_ring(msg_ring[i].ring_name[j],
                MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
            if (msg_ring[i].ring[j] == NULL)
                rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[j]);
        }
    }

    return 0;
}

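/*
 * KNI setup: one kernel interface per port, so traffic that F-Stack does
 * not consume can be handed over to the Linux kernel stack.
 */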
#ifdef FF_KNI
static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count_avail();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i, ret;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
#endif

static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}

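/*
 * Configure and start every port: RSS (Toeplitz key and hash mask),
 * RX/TX offloads, one RX/TX queue pair per assigned lcore, and the
 * RSS redirection table; finally enable promiscuous mode if requested.
 */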
static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        struct rte_eth_dev_info dev_info;
        struct rte_eth_conf port_conf = {0};
        struct rte_eth_rxconf rxq_conf;
        struct rte_eth_txconf txq_conf;

        rte_eth_dev_info_get(port_id, &dev_info);

        if (nb_queues > dev_info.max_rx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                nb_queues,
                dev_info.max_rx_queues);
        }

        if (nb_queues > dev_info.max_tx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                nb_queues,
                dev_info.max_tx_queues);
        }

        struct ether_addr addr;
        rte_eth_macaddr_get(port_id, &addr);
        printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
            " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
            (unsigned)port_id,
            addr.addr_bytes[0], addr.addr_bytes[1],
            addr.addr_bytes[2], addr.addr_bytes[3],
            addr.addr_bytes[4], addr.addr_bytes[5]);

        rte_memcpy(pconf->mac,
            addr.addr_bytes, ETHER_ADDR_LEN);

        /* Set RSS mode */
        uint64_t default_rss_hf = ETH_RSS_PROTO_MASK;
        port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
        port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf;
        if (dev_info.hash_key_size == 52) {
            port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_52bytes;
            port_conf.rx_adv_conf.rss_conf.rss_key_len = 52;
            use_rsskey_52bytes = 1;
        } else {
            port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
            port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;
        }
        port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads;
        if (port_conf.rx_adv_conf.rss_conf.rss_hf !=
            ETH_RSS_PROTO_MASK) {
            printf("Port %u modified RSS hash function based on hardware support, "
                "requested:%#"PRIx64" configured:%#"PRIx64"\n",
                port_id, default_rss_hf,
                port_conf.rx_adv_conf.rss_conf.rss_hf);
        }

        if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) {
            port_conf.txmode.offloads |=
                DEV_TX_OFFLOAD_MBUF_FAST_FREE;
        }

        /* Set Rx VLAN stripping */
        if (ff_global_cfg.dpdk.vlan_strip) {
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
            }
        }

        /* Enable HW CRC stripping */
        port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_KEEP_CRC;

        /* FIXME: Enable TCP LRO? */
#if 0
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
            printf("LRO is supported\n");
            port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
            pconf->hw_features.rx_lro = 1;
        }
#endif

        /* Set Rx checksum checking */
        if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
            printf("RX checksum offload supported\n");
            port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
            pconf->hw_features.rx_csum = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
            printf("TX ip checksum offload supported\n");
            port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
            pconf->hw_features.tx_csum_ip = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            printf("TX TCP&UDP checksum offload supported\n");
            port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM;
            pconf->hw_features.tx_csum_l4 = 1;
        }

        if (ff_global_cfg.dpdk.tso) {
            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                printf("TSO is supported\n");
                port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
                pconf->hw_features.tx_tso = 1;
            }
        } else {
            printf("TSO is disabled\n");
        }

        if (dev_info.reta_size) {
            /* reta size must be power of 2 */
            assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

            rss_reta_size[port_id] = dev_info.reta_size;
            printf("port[%d]: rss table size: %d\n", port_id,
                dev_info.reta_size);
        }

        if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
            continue;
        }

        int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
        if (ret != 0) {
            return ret;
        }

        static uint16_t nb_rxd = RX_QUEUE_SIZE;
        static uint16_t nb_txd = TX_QUEUE_SIZE;
        ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
        if (ret < 0)
            printf("Could not adjust number of descriptors "
                "for port%u (%d)\n", (unsigned)port_id, ret);

        uint16_t q;
        for (q = 0; q < nb_queues; q++) {
            if (numa_on) {
                uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                socketid = rte_lcore_to_socket_id(lcore_id);
            }
            mbuf_pool = pktmbuf_pool[socketid];

            txq_conf = dev_info.default_txconf;
            txq_conf.offloads = port_conf.txmode.offloads;
            ret = rte_eth_tx_queue_setup(port_id, q, nb_txd,
                socketid, &txq_conf);
            if (ret < 0) {
                return ret;
            }

            rxq_conf = dev_info.default_rxconf;
            rxq_conf.offloads = port_conf.rxmode.offloads;
            ret = rte_eth_rx_queue_setup(port_id, q, nb_rxd,
                socketid, &rxq_conf, mbuf_pool);
            if (ret < 0) {
                return ret;
            }
        }

        ret = rte_eth_dev_start(port_id);
        if (ret < 0) {
            return ret;
        }

        if (nb_queues > 1) {
            /* set HW rss hash function to Toeplitz. */
            if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                struct rte_eth_hash_filter_info info = {0};
                info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                    RTE_ETH_FILTER_SET, &info) < 0) {
                    rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                        port_id);
                }
            }

            set_rss_table(port_id, dev_info.reta_size, nb_queues);
        }

        /* Enable RX in promiscuous mode for the Ethernet device. */
        if (ff_global_cfg.dpdk.promiscuous) {
            rte_eth_promiscuous_enable(port_id);
            ret = rte_eth_promiscuous_get(port_id);
            if (ret == 1) {
                printf("set port %u to promiscuous mode ok\n", port_id);
            } else {
                printf("set port %u to promiscuous mode error\n", port_id);
            }
        }

        /* Enable pcap dump */
        if (pconf->pcap) {
            ff_enable_pcap(pconf->pcap);
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    idle_sleep = ff_global_cfg.dpdk.idle_sleep;
    pkt_tx_delay = ff_global_cfg.dpdk.pkt_tx_delay > BURST_TX_DRAIN_US ?
        BURST_TX_DRAIN_US : ff_global_cfg.dpdk.pkt_tx_delay;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

#ifdef FF_KNI
    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }
#endif

#ifdef FF_USE_PAGE_ARRAY
    ff_mmap_init();
#endif

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    void *data = rte_pktmbuf_mtod(pkt, void *);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    if (pkt->ol_flags & PKT_RX_VLAN_STRIPPED) {
        ff_mbuf_set_vlan_info(hdr, pkt->vlan_tci);
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void *);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

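/*
 * Classify an incoming frame (skipping a VLAN header if present) so the
 * caller can decide whether to broadcast it (ARP/NDP), keep it for
 * F-Stack, or divert it to the kernel via KNI.
 */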
static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct ether_hdr *hdr;
    const struct vlan_hdr *vlanhdr;
    hdr = (const struct ether_hdr *)data;
    uint16_t ether_type = rte_be_to_cpu_16(hdr->ether_type);
    data += ETHER_HDR_LEN;
    len -= ETHER_HDR_LEN;

    if (ether_type == ETHER_TYPE_VLAN) {
        vlanhdr = (struct vlan_hdr *)data;
        ether_type = rte_be_to_cpu_16(vlanhdr->eth_proto);
        data += sizeof(struct vlan_hdr);
        len -= sizeof(struct vlan_hdr);
    }

    if (ether_type == ETHER_TYPE_ARP)
        return FILTER_ARP;

#ifdef INET6
    if (ether_type == ETHER_TYPE_IPv6) {
        return ff_kni_proto_filter(data,
            len, ether_type);
    }
#endif

#ifndef FF_KNI
    return FILTER_UNKNOWN;
#else
    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ether_type != ETHER_TYPE_IPv4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data,
        len, ether_type);
#endif
}

static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    struct rte_mbuf *md;
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}

/* copied from rte_pktmbuf_clone */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of new indirect segment failed */
    if (unlikely(mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}

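/*
 * Main RX path: optionally run the user-registered dispatcher, replicate
 * ARP/NDP frames to all other queues (and KNI), and hand everything else
 * to the FreeBSD stack through ff_veth_input().
 */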
static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(qconf->pcap[port_id] != NULL)) {
            if (!pkts_from_ring) {
                ff_dump_packets(qconf->pcap[port_id], rtem);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void *);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring) {
            ff_traffic.rx_packets++;
            ff_traffic.rx_bytes += len;
        }

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues);
            if (ret == FF_DISPATCH_RESPONSE) {
                rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len;

                /*
                 * VLAN TX insertion is not offloaded yet, so re-insert
                 * the VLAN header by hand before sending the response.
                 */
                if (rtem->vlan_tci) {
                    data = rte_pktmbuf_prepend(rtem, sizeof(struct vlan_hdr));
                    if (data != NULL) {
                        memmove(data, data + sizeof(struct vlan_hdr), ETHER_HDR_LEN);
                        struct ether_hdr *etherhdr = (struct ether_hdr *)data;
                        struct vlan_hdr *vlanhdr = (struct vlan_hdr *)(data + ETHER_HDR_LEN);
                        vlanhdr->vlan_tci = rte_cpu_to_be_16(rtem->vlan_tci);
                        vlanhdr->eth_proto = etherhdr->ether_type;
                        etherhdr->ether_type = rte_cpu_to_be_16(ETHER_TYPE_VLAN);
                    }
                }
                send_single_packet(rtem, port_id);
                continue;
            }

            if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
#ifdef INET6
        if (filter == FILTER_ARP || filter == FILTER_NDP) {
#else
        if (filter == FILTER_ARP) {
#endif
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni &&
            ((filter == FILTER_KNI && kni_accept) ||
            (filter == FILTER_UNKNOWN && !kni_accept))) {
            ff_kni_enqueue(port_id, rtem);
#endif
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* read packets from the dispatch ring and process them */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

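/*
 * Handlers for control messages (sysctl, ioctl, route, top, traffic, ...)
 * sent by the ff tools; results are returned on the per-process msg rings.
 */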
static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
#ifdef INET6
    if (msg->msg_type == FF_IOCTL6) {
        fd = ff_socket(AF_INET6, SOCK_DGRAM, 0);
    } else
#endif
    fd = ff_socket(AF_INET, SOCK_DGRAM, 0);

    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_top_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_traffic_msg(struct ff_msg *msg)
{
    msg->traffic = ff_traffic;
    msg->result = 0;
}

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
#ifdef INET6
        case FF_IOCTL6:
#endif
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        case FF_TRAFFIC:
            handle_traffic_msg(msg);
            break;
        default:
            handle_default_msg(msg);
            break;
    }
    rte_ring_enqueue(msg_ring[proc_id].ring[msg->msg_type], msg);
}

static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(qconf->pcap[port] != NULL)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(qconf->pcap[port], m_table[i]);
        }
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    ff_traffic.tx_packets += ret;
    uint16_t i;
    for (i = 0; i < ret; i++) {
        ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]);
#ifdef FF_USE_PAGE_ARRAY
        if (qconf->tx_mbufs[port].bsd_m_table[i])
            ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs);
#endif
    }
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
#ifdef FF_USE_PAGE_ARRAY
            if (qconf->tx_mbufs[port].bsd_m_table[ret])
                ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]);
#endif
        } while (++ret < n);
    }
    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}

int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
#ifdef FF_USE_PAGE_ARRAY
    struct lcore_conf *qconf = &lcore_conf;
    int len = 0;

    len = ff_if_send_onepkt(ctx, m, total);
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, ctx->port_id);
        len = 0;
    }
    qconf->tx_mbufs[ctx->port_id].len = len;
    return 0;
#endif
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        head->nb_segs++;

        prev = cur;
        void *data = rte_pktmbuf_mtod(cur, void *);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        cur->data_len = len;
        off += len;
        total -= len;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void *);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         * TCP segmentation offload.
         *
         * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *   implies PKT_TX_TCP_CKSUM)
         * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         * - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *   write the IP checksum to 0 in the packet
         * - fill the mbuf offload information: l2_len,
         *   l3_len, l4_len, tso_segsz
         * - calculate the pseudo header checksum without taking ip_len
         *   in account, and set it in the TCP header. Refer to
         *   rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *   used as helpers.
         */
        if (offload.tso_seg_size) {
            struct tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

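/*
 * Per-lcore event loop: run the hardclock timer, drain pending TX bursts,
 * poll the dispatch ring and the NIC RX queues, service the message ring,
 * call the user loop, and account busy/idle cycles for ff top.
 */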
static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc;
    int i, j, nb_rx, idle;
    uint16_t port_id, queue_id;
    struct lcore_conf *qconf;
    uint64_t drain_tsc = 0;
    struct ff_dpdk_if_context *ctx;

    if (pkt_tx_delay) {
        drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * pkt_tx_delay;
    }

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc >= drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packet from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }
#endif

            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                    pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                    j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc >= drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        idle_sleep_tsc = rte_rdtsc();
        if (likely(idle && idle_sleep)) {
            usleep(idle_sleep);
            end_tsc = rte_rdtsc();
        } else {
            end_tsc = idle_sleep_tsc;
        }

        if (usch_tsc == cur_tsc) {
            usr_tsc = idle_sleep_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_top_status.sys_tsc += sys_tsc;
        }

        ff_top_status.usr_tsc += usr_tsc;
        ff_top_status.work_tsc += end_tsc - cur_tsc;
        ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_top_status.loops++;
    }

    return 0;
}

int
ff_dpdk_if_up(void) {
    int i;
    struct lcore_conf *qconf = &lcore_conf;
    for (i = 0; i < qconf->nb_tx_port; i++) {
        uint16_t port_id = qconf->tx_port_id[i];

        struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
        veth_ctx[port_id] = ff_veth_attach(pconf);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg) {
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1<<(7-b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i+4] & (1<<(7-b))))
                v |= 1;
        }
    }
    return (hash);
}

int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;
    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];

    if (nb_queues <= 1) {
        return 1;
    }

    uint16_t reta_size = rss_reta_size[ctx->port_id];
    uint16_t queueid = qconf->tx_queue_id[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = 0;
    if (!use_rsskey_52bytes)
        hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
            default_rsskey_40bytes, datalen, data);
    else
        hash = toeplitz_hash(sizeof(default_rsskey_52bytes),
            default_rsskey_52bytes, datalen, data);
    return ((hash & (reta_size - 1)) % nb_queues) == queueid;
}

void
ff_regist_packet_dispatcher(dispatch_func_t func)
{
    packet_dispatcher = func;
}

uint64_t
ff_get_tsc_ns()
{
    uint64_t cur_tsc = rte_rdtsc();
    uint64_t hz = rte_get_tsc_hz();
    return ((double)cur_tsc / (double)hz) * NS_PER_S;
}