/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <unistd.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"

#define MEMPOOL_CACHE_SIZE 256

#define DISPATCH_RING_SIZE 2048

#define MSG_RING_SIZE 32

/*
 * Configurable number of RX/TX ring descriptors
 */
#define RX_QUEUE_SIZE 512
#define TX_QUEUE_SIZE 512

#define MAX_PKT_BURST 32
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

/*
 * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send.
 */
#define MAX_TX_BURST (MAX_PKT_BURST / 2)

#define NB_SOCKETS 8

/* Configure how many packets ahead to prefetch, when reading packets */
#define PREFETCH_OFFSET 3

#define MAX_RX_QUEUE_PER_LCORE 16
#define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
#define MAX_RX_QUEUE_PER_PORT 128

#ifdef FF_KNI
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

static int enable_kni;
static int kni_accept;
#endif

static int numa_on;

static unsigned idle_sleep;

static struct rte_timer freebsd_clock;

/* Default RSS key used by the Mellanox Linux driver */
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

static struct rte_eth_conf default_port_conf = {
    .rxmode = {
        .mq_mode = ETH_MQ_RX_RSS,
        .max_rx_pkt_len = ETHER_MAX_LEN,
        .split_hdr_size = 0,    /**< hdr buf size */
        .header_split   = 0,    /**< Header Split disabled */
        .hw_ip_checksum = 0,    /**< IP checksum offload disabled */
        .hw_vlan_filter = 0,    /**< VLAN filtering disabled */
        .hw_vlan_strip  = 0,    /**< VLAN strip disabled. */
        .hw_vlan_extend = 0,    /**< Extended VLAN disabled. */
        .jumbo_frame    = 0,    /**< Jumbo Frame Support disabled */
        .hw_strip_crc   = 0,    /**< HW CRC stripping disabled */
        .enable_lro     = 0,    /**< LRO disabled */
    },
    .rx_adv_conf = {
        .rss_conf = {
            .rss_key = default_rsskey_40bytes,
            .rss_key_len = 40,
            .rss_hf = ETH_RSS_PROTO_MASK,
        },
    },
    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
};

struct mbuf_table {
    uint16_t len;
    struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct lcore_rx_queue {
    uint16_t port_id;
    uint16_t queue_id;
} __rte_cache_aligned;

struct lcore_conf {
    uint16_t proc_id;
    uint16_t socket_id;
    uint16_t nb_queue_list[RTE_MAX_ETHPORTS];
    struct ff_port_cfg *port_cfgs;

    uint16_t nb_rx_queue;
    struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
    uint16_t nb_tx_port;
    uint16_t tx_port_id[RTE_MAX_ETHPORTS];
    uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
    struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
    char *pcap[RTE_MAX_ETHPORTS];
} __rte_cache_aligned;

static struct lcore_conf lcore_conf;

static struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

static inline int send_single_packet(struct rte_mbuf *m, uint8_t port);

struct ff_msg_ring {
    char ring_name[2][RTE_RING_NAMESIZE];
    /* ring[0]: the lcore receives msgs, other processes send */
    /* ring[1]: the lcore sends msgs, other processes read */
    struct rte_ring *ring[2];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;

struct ff_dpdk_if_context {
    void *sc;
    void *ifp;
    uint16_t port_id;
    struct ff_hw_features hw_features;
} __rte_cache_aligned;

static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

static struct ff_top_args ff_top_status;
static struct ff_traffic_args ff_traffic;

extern void ff_hardclock(void);

static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg)
{
    ff_hardclock();
    ff_update_current_ts();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
#define CHECK_INTERVAL 100 /* 100ms */
#define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u Mbps - %s\n",
                        (int)portid, (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        "full-duplex" : "half-duplex");
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

static int
init_lcore_conf(void)
{
    uint8_t nb_dev_ports = rte_eth_dev_count();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
            ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    uint16_t proc_id;
    for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) {
        uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id];
        if (!lcore_config[lcore_id].detected) {
            rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
        }
    }

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %u\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        lcore_conf.pcap[port_id] = pconf->pcap;
        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}

static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;

    unsigned nb_mbuf = RTE_MAX(
        (nb_rx_queue * RX_QUEUE_SIZE +
        nb_ports * nb_lcores * MAX_PKT_BURST +
        nb_ports * nb_tx_queue * TX_QUEUE_SIZE +
        nb_lcores * MEMPOOL_CACHE_SIZE +
#ifdef FF_KNI
        nb_ports * KNI_MBUF_MAX +
        nb_ports * KNI_QUEUE_SIZE +
#endif
        nb_lcores * nb_ports * DISPATCH_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }
    }

    return 0;
}

static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL) {
        rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ring = rte_ring_create(name, count, socket_id, flags);
    } else {
        ring = rte_ring_lookup(name);
    }

    if (ring == NULL) {
        rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
    }

    return ring;
}

static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create rings according to the ports actually being used. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
}

static int
init_msg_ring(void)
{
    uint16_t i;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
            MSG_RING_SIZE * 2 * nb_procs,
            MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
            NULL, NULL, ff_msg_init, NULL,
            socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        snprintf(msg_ring[i].ring_name[1], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_OUT, i);

        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[0]);

        msg_ring[i].ring[1] = create_ring(msg_ring[i].ring_name[1],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[1] == NULL)
            rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[1]);
    }

    return 0;
}

#ifdef FF_KNI
static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
#endif

static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}

static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        struct rte_eth_dev_info dev_info;
        rte_eth_dev_info_get(port_id, &dev_info);

        if (nb_queues > dev_info.max_rx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                nb_queues,
                dev_info.max_rx_queues);
        }

        if (nb_queues > dev_info.max_tx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                nb_queues,
                dev_info.max_tx_queues);
        }

        struct ether_addr addr;
        rte_eth_macaddr_get(port_id, &addr);
        printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
            " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
            (unsigned)port_id,
            addr.addr_bytes[0], addr.addr_bytes[1],
            addr.addr_bytes[2], addr.addr_bytes[3],
            addr.addr_bytes[4], addr.addr_bytes[5]);

        rte_memcpy(pconf->mac,
            addr.addr_bytes, ETHER_ADDR_LEN);

        /* Clear txq_flags - we do not need multi-mempool and refcnt */
        dev_info.default_txconf.txq_flags = ETH_TXQ_FLAGS_NOMULTMEMP |
            ETH_TXQ_FLAGS_NOREFCOUNT;

        /* Disable features that are not supported by port's HW */
        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMUDP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMTCP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMSCTP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) &&
            !(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_TSO)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
        }

        struct rte_eth_conf port_conf = {0};

        /* Set RSS mode */
        port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
        port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK;
        port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
        port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;

        /* Set Rx VLAN stripping */
        if (ff_global_cfg.dpdk.vlan_strip) {
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                port_conf.rxmode.hw_vlan_strip = 1;
            }
        }

        /* Enable HW CRC stripping */
        port_conf.rxmode.hw_strip_crc = 1;

        /* FIXME: Enable TCP LRO? */
#if 0
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
            printf("LRO is supported\n");
            port_conf.rxmode.enable_lro = 1;
            pconf->hw_features.rx_lro = 1;
        }
#endif

        /* Set Rx checksum checking */
        if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
            printf("RX checksum offload supported\n");
            port_conf.rxmode.hw_ip_checksum = 1;
            pconf->hw_features.rx_csum = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
            printf("TX ip checksum offload supported\n");
            pconf->hw_features.tx_csum_ip = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            printf("TX TCP&UDP checksum offload supported\n");
            pconf->hw_features.tx_csum_l4 = 1;
        }

        if (ff_global_cfg.dpdk.tso) {
            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                printf("TSO is supported\n");
                pconf->hw_features.tx_tso = 1;
            }
        } else {
            printf("TSO is disabled\n");
        }

        if (dev_info.reta_size) {
            /* reta size must be power of 2 */
            assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

            rss_reta_size[port_id] = dev_info.reta_size;
            printf("port[%d]: rss table size: %d\n", port_id,
                dev_info.reta_size);
        }

        if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
            continue;
        }

        int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
        if (ret != 0) {
            return ret;
        }
        uint16_t q;
        for (q = 0; q < nb_queues; q++) {
            if (numa_on) {
                uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                socketid = rte_lcore_to_socket_id(lcore_id);
            }
            mbuf_pool = pktmbuf_pool[socketid];

            ret = rte_eth_tx_queue_setup(port_id, q, TX_QUEUE_SIZE,
                socketid, &dev_info.default_txconf);
            if (ret < 0) {
                return ret;
            }

            ret = rte_eth_rx_queue_setup(port_id, q, RX_QUEUE_SIZE,
                socketid, &dev_info.default_rxconf, mbuf_pool);
            if (ret < 0) {
                return ret;
            }
        }

        ret = rte_eth_dev_start(port_id);
        if (ret < 0) {
            return ret;
        }

        if (nb_queues > 1) {
            /* set HW rss hash function to Toeplitz. */
            if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                struct rte_eth_hash_filter_info info = {0};
                info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                    RTE_ETH_FILTER_SET, &info) < 0) {
                    rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                        port_id);
                }
            }

            set_rss_table(port_id, dev_info.reta_size, nb_queues);
        }

        /* Enable RX in promiscuous mode for the Ethernet device. */
        if (ff_global_cfg.dpdk.promiscuous) {
            rte_eth_promiscuous_enable(port_id);
            ret = rte_eth_promiscuous_get(port_id);
            if (ret == 1) {
                printf("set port %u to promiscuous mode ok\n", port_id);
            } else {
                printf("set port %u to promiscuous mode error\n", port_id);
            }
        }

        /* Enable pcap dump */
        if (pconf->pcap) {
            ff_enable_pcap(pconf->pcap);
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    idle_sleep = ff_global_cfg.dpdk.idle_sleep;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

#ifdef FF_KNI
    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }
#endif

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    /*
     * FIXME: should we save pkt->vlan_tci
     * if (pkt->ol_flags & PKT_RX_VLAN_PKT)?
     */

    void *data = rte_pktmbuf_mtod(pkt, void *);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void *);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct ether_hdr *hdr;
    hdr = (const struct ether_hdr *)data;

    if (ntohs(hdr->ether_type) == ETHER_TYPE_ARP)
        return FILTER_ARP;

#ifndef FF_KNI
    return FILTER_UNKNOWN;
#else
    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ntohs(hdr->ether_type) != ETHER_TYPE_IPv4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data + ETHER_HDR_LEN,
        len - ETHER_HDR_LEN);
#endif
}

static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}

/* copied from rte_pktmbuf_clone */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of new indirect segment failed */
    if (unlikely(mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}

static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(qconf->pcap[port_id] != NULL)) {
            if (!pkts_from_ring) {
                ff_dump_packets(qconf->pcap[port_id], rtem);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void *);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring) {
            ff_traffic.rx_packets++;
            ff_traffic.rx_bytes += len;
        }

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues);
            if (ret == FF_DISPATCH_RESPONSE) {
                rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len;
                send_single_packet(rtem, port_id);
                continue;
            }

            if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
        if (filter == FILTER_ARP) {
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni &&
            ((filter == FILTER_KNI && kni_accept) ||
            (filter == FILTER_UNKNOWN && !kni_accept))) {
            ff_kni_enqueue(port_id, rtem);
#endif
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* read packets from the dispatch ring and process them */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_DGRAM, 0);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_top_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_traffic_msg(struct ff_msg *msg)
{
    msg->traffic = ff_traffic;
    msg->result = 0;
}

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        case FF_TRAFFIC:
            handle_traffic_msg(msg);
            break;
        default:
            handle_default_msg(msg);
            break;
    }
    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(qconf->pcap[port] != NULL)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(qconf->pcap[port], m_table[i]);
        }
    }

    ff_traffic.tx_packets += n;
    uint16_t i;
    for (i = 0; i < n; i++) {
        ff_traffic.tx_bytes += rte_pktmbuf_data_len(m_table[i]);
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
        } while (++ret < n);
    }

    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}

int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        head->nb_segs++;

        prev = cur;
        void *data = rte_pktmbuf_mtod(cur, void *);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ?
            RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        cur->data_len = len;
        off += len;
        total -= len;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void *);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         * TCP segmentation offload.
         *
         * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *   implies PKT_TX_TCP_CKSUM)
         * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         * - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *   write the IP checksum to 0 in the packet
         * - fill the mbuf offload information: l2_len,
         *   l3_len, l4_len, tso_segsz
         * - calculate the pseudo header checksum without taking ip_len
         *   in account, and set it in the TCP header. Refer to
         *   rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *   used as helpers.
         */
        if (offload.tso_seg_size) {
            struct tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc,
        end_tsc, idle_sleep_tsc;
    int i, j, nb_rx, idle;
    uint16_t port_id, queue_id;
    struct lcore_conf *qconf;
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
        US_PER_S * BURST_TX_DRAIN_US;
    struct ff_dpdk_if_context *ctx;

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc > drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packets from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }
#endif

            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                    pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                    j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc > drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        idle_sleep_tsc = rte_rdtsc();
        if (likely(idle && idle_sleep)) {
            usleep(idle_sleep);
            end_tsc = rte_rdtsc();
        } else {
            end_tsc = idle_sleep_tsc;
        }

        end_tsc = rte_rdtsc();

        if (usch_tsc == cur_tsc) {
            usr_tsc = idle_sleep_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_top_status.sys_tsc += sys_tsc;
        }

        ff_top_status.usr_tsc += usr_tsc;
        ff_top_status.work_tsc += end_tsc - cur_tsc;
        ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_top_status.loops++;
    }

    return 0;
}

int
ff_dpdk_if_up(void)
{
    int i;
    struct lcore_conf *qconf = &lcore_conf;
    for (i = 0; i < qconf->nb_tx_port; i++) {
        uint16_t port_id = qconf->tx_port_id[i];

        struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
        veth_ctx[port_id] = ff_veth_attach(pconf);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg)
{
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0] << 24) + (key[1] << 16) + (key[2] << 8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1 << (7 - b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i + 4] & (1 << (7 - b))))
                v |= 1;
        }
    }
    return (hash);
}

int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;
    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];

    if (nb_queues <= 1) {
        return 1;
    }

    uint16_t reta_size = rss_reta_size[ctx->port_id];
    uint16_t queueid = qconf->tx_queue_id[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
        default_rsskey_40bytes, datalen, data);

    return ((hash & (reta_size - 1)) % nb_queues) == queueid;
}

void
ff_regist_packet_dispatcher(dispatch_func_t func)
{
    packet_dispatcher = func;
}

uint64_t
ff_get_tsc_ns(void)
{
    uint64_t cur_tsc = rte_rdtsc();
    uint64_t hz = rte_get_tsc_hz();
    return ((double)cur_tsc / (double)hz) * NS_PER_S;
}