1 /* 2 * Copyright (C) 2017 THL A29 Limited, a Tencent company. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * 8 * 1. Redistributions of source code must retain the above copyright notice, this 9 * list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright notice, 11 * this list of conditions and the following disclaimer in the documentation 12 * and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 * 25 */ 26 #include <assert.h> 27 #include <unistd.h> 28 29 #include <rte_common.h> 30 #include <rte_byteorder.h> 31 #include <rte_log.h> 32 #include <rte_memory.h> 33 #include <rte_memcpy.h> 34 #include <rte_memzone.h> 35 #include <rte_config.h> 36 #include <rte_eal.h> 37 #include <rte_pci.h> 38 #include <rte_mbuf.h> 39 #include <rte_memory.h> 40 #include <rte_lcore.h> 41 #include <rte_launch.h> 42 #include <rte_ethdev.h> 43 #include <rte_debug.h> 44 #include <rte_common.h> 45 #include <rte_ether.h> 46 #include <rte_malloc.h> 47 #include <rte_cycles.h> 48 #include <rte_timer.h> 49 #include <rte_thash.h> 50 #include <rte_ip.h> 51 #include <rte_tcp.h> 52 #include <rte_udp.h> 53 54 #include "ff_dpdk_if.h" 55 #include "ff_dpdk_pcap.h" 56 #include "ff_dpdk_kni.h" 57 #include "ff_config.h" 58 #include "ff_veth.h" 59 #include "ff_host_interface.h" 60 #include "ff_msg.h" 61 #include "ff_api.h" 62 63 #define MEMPOOL_CACHE_SIZE 256 64 65 #define DISPATCH_RING_SIZE 2048 66 67 #define MSG_RING_SIZE 32 68 69 /* 70 * Configurable number of RX/TX ring descriptors 71 */ 72 #define RX_QUEUE_SIZE 512 73 #define TX_QUEUE_SIZE 512 74 75 #define MAX_PKT_BURST 32 76 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */ 77 78 /* 79 * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send. 80 */ 81 #define MAX_TX_BURST (MAX_PKT_BURST / 2) 82 83 #define NB_SOCKETS 8 84 85 /* Configure how many packets ahead to prefetch, when reading packets */ 86 #define PREFETCH_OFFSET 3 87 88 #define MAX_RX_QUEUE_PER_LCORE 16 89 #define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS 90 #define MAX_RX_QUEUE_PER_PORT 128 91 92 #ifdef FF_KNI 93 #define KNI_MBUF_MAX 2048 94 #define KNI_QUEUE_SIZE 2048 95 96 static int enable_kni; 97 static int kni_accept; 98 #endif 99 100 static int numa_on; 101 102 static unsigned idle_sleep; 103 104 static struct rte_timer freebsd_clock; 105 106 // Mellanox Linux's driver key 107 static uint8_t default_rsskey_40bytes[40] = { 108 0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b, 109 0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb, 110 0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c, 111 0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9, 112 0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc 113 }; 114 115 static struct rte_eth_conf default_port_conf = { 116 .rxmode = { 117 .mq_mode = ETH_MQ_RX_RSS, 118 .max_rx_pkt_len = ETHER_MAX_LEN, 119 .split_hdr_size = 0, /**< hdr buf size */ 120 .header_split = 0, /**< Header Split disabled */ 121 .hw_ip_checksum = 0, /**< IP checksum offload disabled */ 122 .hw_vlan_filter = 0, /**< VLAN filtering disabled */ 123 .hw_vlan_strip = 0, /**< VLAN strip disabled. */ 124 .hw_vlan_extend = 0, /**< Extended VLAN disabled. */ 125 .jumbo_frame = 0, /**< Jumbo Frame Support disabled */ 126 .hw_strip_crc = 0, /**< CRC stripped by hardware */ 127 .enable_lro = 0, /**< LRO disabled */ 128 }, 129 .rx_adv_conf = { 130 .rss_conf = { 131 .rss_key = default_rsskey_40bytes, 132 .rss_key_len = 40, 133 .rss_hf = ETH_RSS_PROTO_MASK, 134 }, 135 }, 136 .txmode = { 137 .mq_mode = ETH_MQ_TX_NONE, 138 }, 139 }; 140 141 struct mbuf_table { 142 uint16_t len; 143 struct rte_mbuf *m_table[MAX_PKT_BURST]; 144 }; 145 146 struct lcore_rx_queue { 147 uint16_t port_id; 148 uint16_t queue_id; 149 } __rte_cache_aligned; 150 151 struct lcore_conf { 152 uint16_t proc_id; 153 uint16_t socket_id; 154 uint16_t nb_queue_list[RTE_MAX_ETHPORTS]; 155 struct ff_port_cfg *port_cfgs; 156 157 uint16_t nb_rx_queue; 158 struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE]; 159 uint16_t nb_tx_port; 160 uint16_t tx_port_id[RTE_MAX_ETHPORTS]; 161 uint16_t tx_queue_id[RTE_MAX_ETHPORTS]; 162 struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS]; 163 char *pcap[RTE_MAX_ETHPORTS]; 164 } __rte_cache_aligned; 165 166 static struct lcore_conf lcore_conf; 167 168 static struct rte_mempool *pktmbuf_pool[NB_SOCKETS]; 169 170 static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS]; 171 static dispatch_func_t packet_dispatcher; 172 173 static uint16_t rss_reta_size[RTE_MAX_ETHPORTS]; 174 175 static inline int send_single_packet(struct rte_mbuf *m, uint8_t port); 176 177 struct ff_msg_ring { 178 char ring_name[2][RTE_RING_NAMESIZE]; 179 /* ring[0] for lcore recv msg, other send */ 180 /* ring[1] for lcore send msg, other read */ 181 struct rte_ring *ring[2]; 182 } __rte_cache_aligned; 183 184 static struct ff_msg_ring msg_ring[RTE_MAX_LCORE]; 185 static struct rte_mempool *message_pool; 186 187 struct ff_dpdk_if_context { 188 void *sc; 189 void *ifp; 190 uint16_t port_id; 191 struct ff_hw_features hw_features; 192 } __rte_cache_aligned; 193 194 static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS]; 195 196 static struct ff_top_args ff_top_status; 197 static struct ff_traffic_args ff_traffic; 198 199 extern void ff_hardclock(void); 200 201 static void 202 ff_hardclock_job(__rte_unused struct rte_timer *timer, 203 __rte_unused void *arg) { 204 ff_hardclock(); 205 ff_update_current_ts(); 206 } 207 208 struct ff_dpdk_if_context * 209 ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg) 210 { 211 struct ff_dpdk_if_context *ctx; 212 213 ctx = calloc(1, sizeof(struct ff_dpdk_if_context)); 214 if (ctx == NULL) 215 return NULL; 216 217 ctx->sc = sc; 218 ctx->ifp = ifp; 219 ctx->port_id = cfg->port_id; 220 ctx->hw_features = cfg->hw_features; 221 222 return ctx; 223 } 224 225 void 226 ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx) 227 { 228 free(ctx); 229 } 230 231 static void 232 check_all_ports_link_status(void) 233 { 234 #define CHECK_INTERVAL 100 /* 100ms */ 235 #define MAX_CHECK_TIME 90 /* 9s (90 * 100ms) in total */ 236 237 uint16_t portid; 238 uint8_t count, all_ports_up, print_flag = 0; 239 struct rte_eth_link link; 240 241 printf("\nChecking link status"); 242 fflush(stdout); 243 244 int i, nb_ports; 245 nb_ports = ff_global_cfg.dpdk.nb_ports; 246 for (count = 0; count <= MAX_CHECK_TIME; count++) { 247 all_ports_up = 1; 248 for (i = 0; i < nb_ports; i++) { 249 uint16_t portid = ff_global_cfg.dpdk.portid_list[i]; 250 memset(&link, 0, sizeof(link)); 251 rte_eth_link_get_nowait(portid, &link); 252 253 /* print link status if flag set */ 254 if (print_flag == 1) { 255 if (link.link_status) { 256 printf("Port %d Link Up - speed %u " 257 "Mbps - %s\n", (int)portid, 258 (unsigned)link.link_speed, 259 (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? 260 ("full-duplex") : ("half-duplex\n")); 261 } else { 262 printf("Port %d Link Down\n", (int)portid); 263 } 264 continue; 265 } 266 /* clear all_ports_up flag if any link down */ 267 if (link.link_status == 0) { 268 all_ports_up = 0; 269 break; 270 } 271 } 272 273 /* after finally printing all link status, get out */ 274 if (print_flag == 1) 275 break; 276 277 if (all_ports_up == 0) { 278 printf("."); 279 fflush(stdout); 280 rte_delay_ms(CHECK_INTERVAL); 281 } 282 283 /* set the print_flag if all ports up or timeout */ 284 if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) { 285 print_flag = 1; 286 printf("done\n"); 287 } 288 } 289 } 290 291 static int 292 init_lcore_conf(void) 293 { 294 uint8_t nb_dev_ports = rte_eth_dev_count(); 295 if (nb_dev_ports == 0) { 296 rte_exit(EXIT_FAILURE, "No probed ethernet devices\n"); 297 } 298 299 if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) { 300 rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n", 301 ff_global_cfg.dpdk.max_portid); 302 } 303 304 lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs; 305 lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id; 306 307 uint16_t proc_id; 308 for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) { 309 uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id]; 310 if (!lcore_config[lcore_id].detected) { 311 rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id); 312 } 313 } 314 315 uint16_t socket_id = 0; 316 if (numa_on) { 317 socket_id = rte_lcore_to_socket_id(rte_lcore_id()); 318 } 319 320 lcore_conf.socket_id = socket_id; 321 322 uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id]; 323 int j; 324 for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) { 325 uint16_t port_id = ff_global_cfg.dpdk.portid_list[j]; 326 struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id]; 327 328 int queueid = -1; 329 int i; 330 for (i = 0; i < pconf->nb_lcores; i++) { 331 if (pconf->lcore_list[i] == lcore_id) { 332 queueid = i; 333 } 334 } 335 if (queueid < 0) { 336 continue; 337 } 338 printf("lcore: %u, port: %u, queue: %u\n", lcore_id, port_id, queueid); 339 uint16_t nb_rx_queue = lcore_conf.nb_rx_queue; 340 lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id; 341 lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid; 342 lcore_conf.nb_rx_queue++; 343 344 lcore_conf.tx_queue_id[port_id] = queueid; 345 lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id; 346 lcore_conf.nb_tx_port++; 347 348 lcore_conf.pcap[port_id] = pconf->pcap; 349 lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores; 350 } 351 352 if (lcore_conf.nb_rx_queue == 0) { 353 rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id); 354 } 355 356 return 0; 357 } 358 359 static int 360 init_mem_pool(void) 361 { 362 uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports; 363 uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs; 364 uint32_t nb_tx_queue = nb_lcores; 365 uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores; 366 367 unsigned nb_mbuf = RTE_MAX ( 368 (nb_rx_queue*RX_QUEUE_SIZE + 369 nb_ports*nb_lcores*MAX_PKT_BURST + 370 nb_ports*nb_tx_queue*TX_QUEUE_SIZE + 371 nb_lcores*MEMPOOL_CACHE_SIZE + 372 #ifdef FF_KNI 373 nb_ports*KNI_MBUF_MAX + 374 nb_ports*KNI_QUEUE_SIZE + 375 #endif 376 nb_lcores*nb_ports*DISPATCH_RING_SIZE), 377 (unsigned)8192); 378 379 unsigned socketid = 0; 380 uint16_t i, lcore_id; 381 char s[64]; 382 383 for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) { 384 lcore_id = ff_global_cfg.dpdk.proc_lcore[i]; 385 if (numa_on) { 386 socketid = rte_lcore_to_socket_id(lcore_id); 387 } 388 389 if (socketid >= NB_SOCKETS) { 390 rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n", 391 socketid, i, NB_SOCKETS); 392 } 393 394 if (pktmbuf_pool[socketid] != NULL) { 395 continue; 396 } 397 398 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 399 snprintf(s, sizeof(s), "mbuf_pool_%d", socketid); 400 pktmbuf_pool[socketid] = 401 rte_pktmbuf_pool_create(s, nb_mbuf, 402 MEMPOOL_CACHE_SIZE, 0, 403 RTE_MBUF_DEFAULT_BUF_SIZE, socketid); 404 } else { 405 snprintf(s, sizeof(s), "mbuf_pool_%d", socketid); 406 pktmbuf_pool[socketid] = rte_mempool_lookup(s); 407 } 408 409 if (pktmbuf_pool[socketid] == NULL) { 410 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid); 411 } else { 412 printf("create mbuf pool on socket %d\n", socketid); 413 } 414 } 415 416 return 0; 417 } 418 419 static struct rte_ring * 420 create_ring(const char *name, unsigned count, int socket_id, unsigned flags) 421 { 422 struct rte_ring *ring; 423 424 if (name == NULL) { 425 rte_exit(EXIT_FAILURE, "create ring failed, no name!\n"); 426 } 427 428 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 429 ring = rte_ring_create(name, count, socket_id, flags); 430 } else { 431 ring = rte_ring_lookup(name); 432 } 433 434 if (ring == NULL) { 435 rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name); 436 } 437 438 return ring; 439 } 440 441 static int 442 init_dispatch_ring(void) 443 { 444 int j; 445 char name_buf[RTE_RING_NAMESIZE]; 446 int queueid; 447 448 unsigned socketid = lcore_conf.socket_id; 449 450 /* Create ring according to ports actually being used. */ 451 int nb_ports = ff_global_cfg.dpdk.nb_ports; 452 for (j = 0; j < nb_ports; j++) { 453 uint16_t portid = ff_global_cfg.dpdk.portid_list[j]; 454 struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid]; 455 int nb_queues = pconf->nb_lcores; 456 if (dispatch_ring[portid] == NULL) { 457 snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid); 458 459 dispatch_ring[portid] = rte_zmalloc(name_buf, 460 sizeof(struct rte_ring *) * nb_queues, 461 RTE_CACHE_LINE_SIZE); 462 if (dispatch_ring[portid] == NULL) { 463 rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) " 464 "failed\n", name_buf); 465 } 466 } 467 468 for(queueid = 0; queueid < nb_queues; ++queueid) { 469 snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d", 470 portid, queueid); 471 dispatch_ring[portid][queueid] = create_ring(name_buf, 472 DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ); 473 474 if (dispatch_ring[portid][queueid] == NULL) 475 rte_panic("create ring:%s failed!\n", name_buf); 476 477 printf("create ring:%s success, %u ring entries are now free!\n", 478 name_buf, rte_ring_free_count(dispatch_ring[portid][queueid])); 479 } 480 } 481 482 return 0; 483 } 484 485 static void 486 ff_msg_init(struct rte_mempool *mp, 487 __attribute__((unused)) void *opaque_arg, 488 void *obj, __attribute__((unused)) unsigned i) 489 { 490 struct ff_msg *msg = (struct ff_msg *)obj; 491 msg->msg_type = FF_UNKNOWN; 492 msg->buf_addr = (char *)msg + sizeof(struct ff_msg); 493 msg->buf_len = mp->elt_size - sizeof(struct ff_msg); 494 } 495 496 static int 497 init_msg_ring(void) 498 { 499 uint16_t i; 500 uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs; 501 unsigned socketid = lcore_conf.socket_id; 502 503 /* Create message buffer pool */ 504 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 505 message_pool = rte_mempool_create(FF_MSG_POOL, 506 MSG_RING_SIZE * 2 * nb_procs, 507 MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0, 508 NULL, NULL, ff_msg_init, NULL, 509 socketid, 0); 510 } else { 511 message_pool = rte_mempool_lookup(FF_MSG_POOL); 512 } 513 514 if (message_pool == NULL) { 515 rte_panic("Create msg mempool failed\n"); 516 } 517 518 for(i = 0; i < nb_procs; ++i) { 519 snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE, 520 "%s%u", FF_MSG_RING_IN, i); 521 snprintf(msg_ring[i].ring_name[1], RTE_RING_NAMESIZE, 522 "%s%u", FF_MSG_RING_OUT, i); 523 524 msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0], 525 MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ); 526 if (msg_ring[i].ring[0] == NULL) 527 rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[0]); 528 529 msg_ring[i].ring[1] = create_ring(msg_ring[i].ring_name[1], 530 MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ); 531 if (msg_ring[i].ring[1] == NULL) 532 rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[0]); 533 } 534 535 return 0; 536 } 537 538 #ifdef FF_KNI 539 static int 540 init_kni(void) 541 { 542 int nb_ports = rte_eth_dev_count(); 543 kni_accept = 0; 544 if(strcasecmp(ff_global_cfg.kni.method, "accept") == 0) 545 kni_accept = 1; 546 547 ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port, 548 ff_global_cfg.kni.udp_port); 549 550 unsigned socket_id = lcore_conf.socket_id; 551 struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id]; 552 553 nb_ports = ff_global_cfg.dpdk.nb_ports; 554 int i, ret; 555 for (i = 0; i < nb_ports; i++) { 556 uint16_t port_id = ff_global_cfg.dpdk.portid_list[i]; 557 ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE); 558 } 559 560 return 0; 561 } 562 #endif 563 564 static void 565 set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues) 566 { 567 if (reta_size == 0) { 568 return; 569 } 570 571 int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE); 572 struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size]; 573 574 /* config HW indirection table */ 575 unsigned i, j, hash=0; 576 for (i = 0; i < reta_conf_size; i++) { 577 reta_conf[i].mask = ~0ULL; 578 for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) { 579 reta_conf[i].reta[j] = hash++ % nb_queues; 580 } 581 } 582 583 if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) { 584 rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n", 585 port_id); 586 } 587 } 588 589 static int 590 init_port_start(void) 591 { 592 int nb_ports = ff_global_cfg.dpdk.nb_ports; 593 unsigned socketid = 0; 594 struct rte_mempool *mbuf_pool; 595 uint16_t i; 596 597 for (i = 0; i < nb_ports; i++) { 598 uint16_t port_id = ff_global_cfg.dpdk.portid_list[i]; 599 struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id]; 600 uint16_t nb_queues = pconf->nb_lcores; 601 602 struct rte_eth_dev_info dev_info; 603 rte_eth_dev_info_get(port_id, &dev_info); 604 605 if (nb_queues > dev_info.max_rx_queues) { 606 rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n", 607 nb_queues, 608 dev_info.max_rx_queues); 609 } 610 611 if (nb_queues > dev_info.max_tx_queues) { 612 rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n", 613 nb_queues, 614 dev_info.max_tx_queues); 615 } 616 617 struct ether_addr addr; 618 rte_eth_macaddr_get(port_id, &addr); 619 printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8 620 " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n", 621 (unsigned)port_id, 622 addr.addr_bytes[0], addr.addr_bytes[1], 623 addr.addr_bytes[2], addr.addr_bytes[3], 624 addr.addr_bytes[4], addr.addr_bytes[5]); 625 626 rte_memcpy(pconf->mac, 627 addr.addr_bytes, ETHER_ADDR_LEN); 628 629 /* Clear txq_flags - we do not need multi-mempool and refcnt */ 630 dev_info.default_txconf.txq_flags = ETH_TXQ_FLAGS_NOMULTMEMP | 631 ETH_TXQ_FLAGS_NOREFCOUNT; 632 633 /* Disable features that are not supported by port's HW */ 634 if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM)) { 635 dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMUDP; 636 } 637 638 if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) { 639 dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMTCP; 640 } 641 642 if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM)) { 643 dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMSCTP; 644 } 645 646 if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) { 647 dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL; 648 } 649 650 if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) && 651 !(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_TSO)) { 652 dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS; 653 } 654 655 struct rte_eth_conf port_conf = {0}; 656 657 /* Set RSS mode */ 658 port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS; 659 port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK; 660 port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes; 661 port_conf.rx_adv_conf.rss_conf.rss_key_len = 40; 662 663 /* Set Rx VLAN stripping */ 664 if (ff_global_cfg.dpdk.vlan_strip) { 665 if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) { 666 port_conf.rxmode.hw_vlan_strip = 1; 667 } 668 } 669 670 /* Enable HW CRC stripping */ 671 port_conf.rxmode.hw_strip_crc = 1; 672 673 /* FIXME: Enable TCP LRO ?*/ 674 #if 0 675 if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) { 676 printf("LRO is supported\n"); 677 port_conf.rxmode.enable_lro = 1; 678 pconf->hw_features.rx_lro = 1; 679 } 680 #endif 681 682 /* Set Rx checksum checking */ 683 if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) && 684 (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) && 685 (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) { 686 printf("RX checksum offload supported\n"); 687 port_conf.rxmode.hw_ip_checksum = 1; 688 pconf->hw_features.rx_csum = 1; 689 } 690 691 if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) { 692 printf("TX ip checksum offload supported\n"); 693 pconf->hw_features.tx_csum_ip = 1; 694 } 695 696 if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) && 697 (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) { 698 printf("TX TCP&UDP checksum offload supported\n"); 699 pconf->hw_features.tx_csum_l4 = 1; 700 } 701 702 if (ff_global_cfg.dpdk.tso) { 703 if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) { 704 printf("TSO is supported\n"); 705 pconf->hw_features.tx_tso = 1; 706 } 707 } else { 708 printf("TSO is disabled\n"); 709 } 710 711 if (dev_info.reta_size) { 712 /* reta size must be power of 2 */ 713 assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0); 714 715 rss_reta_size[port_id] = dev_info.reta_size; 716 printf("port[%d]: rss table size: %d\n", port_id, 717 dev_info.reta_size); 718 } 719 720 if (rte_eal_process_type() != RTE_PROC_PRIMARY) { 721 continue; 722 } 723 724 int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf); 725 if (ret != 0) { 726 return ret; 727 } 728 uint16_t q; 729 for (q = 0; q < nb_queues; q++) { 730 if (numa_on) { 731 uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q]; 732 socketid = rte_lcore_to_socket_id(lcore_id); 733 } 734 mbuf_pool = pktmbuf_pool[socketid]; 735 736 ret = rte_eth_tx_queue_setup(port_id, q, TX_QUEUE_SIZE, 737 socketid, &dev_info.default_txconf); 738 if (ret < 0) { 739 return ret; 740 } 741 742 ret = rte_eth_rx_queue_setup(port_id, q, RX_QUEUE_SIZE, 743 socketid, &dev_info.default_rxconf, mbuf_pool); 744 if (ret < 0) { 745 return ret; 746 } 747 } 748 749 ret = rte_eth_dev_start(port_id); 750 if (ret < 0) { 751 return ret; 752 } 753 754 if (nb_queues > 1) { 755 /* set HW rss hash function to Toeplitz. */ 756 if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) { 757 struct rte_eth_hash_filter_info info = {0}; 758 info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG; 759 info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ; 760 761 if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH, 762 RTE_ETH_FILTER_SET, &info) < 0) { 763 rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n", 764 port_id); 765 } 766 } 767 768 set_rss_table(port_id, dev_info.reta_size, nb_queues); 769 } 770 771 /* Enable RX in promiscuous mode for the Ethernet device. */ 772 if (ff_global_cfg.dpdk.promiscuous) { 773 rte_eth_promiscuous_enable(port_id); 774 ret = rte_eth_promiscuous_get(port_id); 775 if (ret == 1) { 776 printf("set port %u to promiscuous mode ok\n", port_id); 777 } else { 778 printf("set port %u to promiscuous mode error\n", port_id); 779 } 780 } 781 782 /* Enable pcap dump */ 783 if (pconf->pcap) { 784 ff_enable_pcap(pconf->pcap); 785 } 786 } 787 788 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 789 check_all_ports_link_status(); 790 } 791 792 return 0; 793 } 794 795 static int 796 init_clock(void) 797 { 798 rte_timer_subsystem_init(); 799 uint64_t hz = rte_get_timer_hz(); 800 uint64_t intrs = MS_PER_S/ff_global_cfg.freebsd.hz; 801 uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S*intrs; 802 803 rte_timer_init(&freebsd_clock); 804 rte_timer_reset(&freebsd_clock, tsc, PERIODICAL, 805 rte_lcore_id(), &ff_hardclock_job, NULL); 806 807 ff_update_current_ts(); 808 809 return 0; 810 } 811 812 int 813 ff_dpdk_init(int argc, char **argv) 814 { 815 if (ff_global_cfg.dpdk.nb_procs < 1 || 816 ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE || 817 ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs || 818 ff_global_cfg.dpdk.proc_id < 0) { 819 printf("param num_procs[%d] or proc_id[%d] error!\n", 820 ff_global_cfg.dpdk.nb_procs, 821 ff_global_cfg.dpdk.proc_id); 822 exit(1); 823 } 824 825 int ret = rte_eal_init(argc, argv); 826 if (ret < 0) { 827 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 828 } 829 830 numa_on = ff_global_cfg.dpdk.numa_on; 831 832 idle_sleep = ff_global_cfg.dpdk.idle_sleep; 833 834 init_lcore_conf(); 835 836 init_mem_pool(); 837 838 init_dispatch_ring(); 839 840 init_msg_ring(); 841 842 #ifdef FF_KNI 843 enable_kni = ff_global_cfg.kni.enable; 844 if (enable_kni) { 845 init_kni(); 846 } 847 #endif 848 849 ret = init_port_start(); 850 if (ret < 0) { 851 rte_exit(EXIT_FAILURE, "init_port_start failed\n"); 852 } 853 854 init_clock(); 855 856 return 0; 857 } 858 859 static void 860 ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt) 861 { 862 uint8_t rx_csum = ctx->hw_features.rx_csum; 863 if (rx_csum) { 864 if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) { 865 rte_pktmbuf_free(pkt); 866 return; 867 } 868 } 869 870 /* 871 * FIXME: should we save pkt->vlan_tci 872 * if (pkt->ol_flags & PKT_RX_VLAN_PKT) 873 */ 874 875 void *data = rte_pktmbuf_mtod(pkt, void*); 876 uint16_t len = rte_pktmbuf_data_len(pkt); 877 878 void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum); 879 if (hdr == NULL) { 880 rte_pktmbuf_free(pkt); 881 return; 882 } 883 884 struct rte_mbuf *pn = pkt->next; 885 void *prev = hdr; 886 while(pn != NULL) { 887 data = rte_pktmbuf_mtod(pn, void*); 888 len = rte_pktmbuf_data_len(pn); 889 890 void *mb = ff_mbuf_get(prev, data, len); 891 if (mb == NULL) { 892 ff_mbuf_free(hdr); 893 rte_pktmbuf_free(pkt); 894 return; 895 } 896 pn = pn->next; 897 prev = mb; 898 } 899 900 ff_veth_process_packet(ctx->ifp, hdr); 901 } 902 903 static enum FilterReturn 904 protocol_filter(const void *data, uint16_t len) 905 { 906 if(len < ETHER_HDR_LEN) 907 return FILTER_UNKNOWN; 908 909 const struct ether_hdr *hdr; 910 hdr = (const struct ether_hdr *)data; 911 912 if(ntohs(hdr->ether_type) == ETHER_TYPE_ARP) 913 return FILTER_ARP; 914 915 #ifndef FF_KNI 916 return FILTER_UNKNOWN; 917 #else 918 if (!enable_kni) { 919 return FILTER_UNKNOWN; 920 } 921 922 if(ntohs(hdr->ether_type) != ETHER_TYPE_IPv4) 923 return FILTER_UNKNOWN; 924 925 return ff_kni_proto_filter(data + ETHER_HDR_LEN, 926 len - ETHER_HDR_LEN); 927 #endif 928 } 929 930 static inline void 931 pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m) 932 { 933 struct rte_mbuf *md; 934 void *src, *dst; 935 936 dst = rte_pktmbuf_mtod(mi, void *); 937 src = rte_pktmbuf_mtod(m, void *); 938 939 mi->data_len = m->data_len; 940 rte_memcpy(dst, src, m->data_len); 941 942 mi->port = m->port; 943 mi->vlan_tci = m->vlan_tci; 944 mi->vlan_tci_outer = m->vlan_tci_outer; 945 mi->tx_offload = m->tx_offload; 946 mi->hash = m->hash; 947 mi->ol_flags = m->ol_flags; 948 mi->packet_type = m->packet_type; 949 } 950 951 /* copied from rte_pktmbuf_clone */ 952 static inline struct rte_mbuf * 953 pktmbuf_deep_clone(const struct rte_mbuf *md, 954 struct rte_mempool *mp) 955 { 956 struct rte_mbuf *mc, *mi, **prev; 957 uint32_t pktlen; 958 uint8_t nseg; 959 960 if (unlikely ((mc = rte_pktmbuf_alloc(mp)) == NULL)) 961 return NULL; 962 963 mi = mc; 964 prev = &mi->next; 965 pktlen = md->pkt_len; 966 nseg = 0; 967 968 do { 969 nseg++; 970 pktmbuf_deep_attach(mi, md); 971 *prev = mi; 972 prev = &mi->next; 973 } while ((md = md->next) != NULL && 974 (mi = rte_pktmbuf_alloc(mp)) != NULL); 975 976 *prev = NULL; 977 mc->nb_segs = nseg; 978 mc->pkt_len = pktlen; 979 980 /* Allocation of new indirect segment failed */ 981 if (unlikely (mi == NULL)) { 982 rte_pktmbuf_free(mc); 983 return NULL; 984 } 985 986 __rte_mbuf_sanity_check(mc, 1); 987 return mc; 988 } 989 990 static inline void 991 process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs, 992 uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring) 993 { 994 struct lcore_conf *qconf = &lcore_conf; 995 uint16_t nb_queues = qconf->nb_queue_list[port_id]; 996 997 uint16_t i; 998 for (i = 0; i < count; i++) { 999 struct rte_mbuf *rtem = bufs[i]; 1000 1001 if (unlikely(qconf->pcap[port_id] != NULL)) { 1002 if (!pkts_from_ring) { 1003 ff_dump_packets(qconf->pcap[port_id], rtem); 1004 } 1005 } 1006 1007 void *data = rte_pktmbuf_mtod(rtem, void*); 1008 uint16_t len = rte_pktmbuf_data_len(rtem); 1009 1010 if (!pkts_from_ring) { 1011 ff_traffic.rx_packets++; 1012 ff_traffic.rx_bytes += len; 1013 } 1014 1015 if (!pkts_from_ring && packet_dispatcher) { 1016 int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues); 1017 if (ret == FF_DISPATCH_RESPONSE) { 1018 rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len; 1019 send_single_packet(rtem, port_id); 1020 continue; 1021 } 1022 1023 if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) { 1024 rte_pktmbuf_free(rtem); 1025 continue; 1026 } 1027 1028 if (ret != queue_id) { 1029 ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem); 1030 if (ret < 0) 1031 rte_pktmbuf_free(rtem); 1032 1033 continue; 1034 } 1035 } 1036 1037 enum FilterReturn filter = protocol_filter(data, len); 1038 if (filter == FILTER_ARP) { 1039 struct rte_mempool *mbuf_pool; 1040 struct rte_mbuf *mbuf_clone; 1041 if (!pkts_from_ring) { 1042 uint16_t j; 1043 for(j = 0; j < nb_queues; ++j) { 1044 if(j == queue_id) 1045 continue; 1046 1047 unsigned socket_id = 0; 1048 if (numa_on) { 1049 uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j]; 1050 socket_id = rte_lcore_to_socket_id(lcore_id); 1051 } 1052 mbuf_pool = pktmbuf_pool[socket_id]; 1053 mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool); 1054 if(mbuf_clone) { 1055 int ret = rte_ring_enqueue(dispatch_ring[port_id][j], 1056 mbuf_clone); 1057 if (ret < 0) 1058 rte_pktmbuf_free(mbuf_clone); 1059 } 1060 } 1061 } 1062 1063 #ifdef FF_KNI 1064 if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) { 1065 mbuf_pool = pktmbuf_pool[qconf->socket_id]; 1066 mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool); 1067 if(mbuf_clone) { 1068 ff_kni_enqueue(port_id, mbuf_clone); 1069 } 1070 } 1071 #endif 1072 ff_veth_input(ctx, rtem); 1073 #ifdef FF_KNI 1074 } else if (enable_kni && 1075 ((filter == FILTER_KNI && kni_accept) || 1076 (filter == FILTER_UNKNOWN && !kni_accept)) ) { 1077 ff_kni_enqueue(port_id, rtem); 1078 #endif 1079 } else { 1080 ff_veth_input(ctx, rtem); 1081 } 1082 } 1083 } 1084 1085 static inline int 1086 process_dispatch_ring(uint16_t port_id, uint16_t queue_id, 1087 struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx) 1088 { 1089 /* read packet from ring buf and to process */ 1090 uint16_t nb_rb; 1091 nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id], 1092 (void **)pkts_burst, MAX_PKT_BURST, NULL); 1093 1094 if(nb_rb > 0) { 1095 process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1); 1096 } 1097 1098 return 0; 1099 } 1100 1101 static inline void 1102 handle_sysctl_msg(struct ff_msg *msg) 1103 { 1104 int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen, 1105 msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new, 1106 msg->sysctl.newlen); 1107 1108 if (ret < 0) { 1109 msg->result = errno; 1110 } else { 1111 msg->result = 0; 1112 } 1113 } 1114 1115 static inline void 1116 handle_ioctl_msg(struct ff_msg *msg) 1117 { 1118 int fd, ret; 1119 fd = ff_socket(AF_INET, SOCK_DGRAM, 0); 1120 if (fd < 0) { 1121 ret = -1; 1122 goto done; 1123 } 1124 1125 ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data); 1126 1127 ff_close(fd); 1128 1129 done: 1130 if (ret < 0) { 1131 msg->result = errno; 1132 } else { 1133 msg->result = 0; 1134 } 1135 } 1136 1137 static inline void 1138 handle_route_msg(struct ff_msg *msg) 1139 { 1140 int ret = ff_rtioctl(msg->route.fib, msg->route.data, 1141 &msg->route.len, msg->route.maxlen); 1142 if (ret < 0) { 1143 msg->result = errno; 1144 } else { 1145 msg->result = 0; 1146 } 1147 } 1148 1149 static inline void 1150 handle_top_msg(struct ff_msg *msg) 1151 { 1152 msg->top = ff_top_status; 1153 msg->result = 0; 1154 } 1155 1156 #ifdef FF_NETGRAPH 1157 static inline void 1158 handle_ngctl_msg(struct ff_msg *msg) 1159 { 1160 int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data); 1161 if (ret < 0) { 1162 msg->result = errno; 1163 } else { 1164 msg->result = 0; 1165 msg->ngctl.ret = ret; 1166 } 1167 } 1168 #endif 1169 1170 #ifdef FF_IPFW 1171 static inline void 1172 handle_ipfw_msg(struct ff_msg *msg) 1173 { 1174 int fd, ret; 1175 fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW); 1176 if (fd < 0) { 1177 ret = -1; 1178 goto done; 1179 } 1180 1181 switch (msg->ipfw.cmd) { 1182 case FF_IPFW_GET: 1183 ret = ff_getsockopt_freebsd(fd, msg->ipfw.level, 1184 msg->ipfw.optname, msg->ipfw.optval, 1185 msg->ipfw.optlen); 1186 break; 1187 case FF_IPFW_SET: 1188 ret = ff_setsockopt_freebsd(fd, msg->ipfw.level, 1189 msg->ipfw.optname, msg->ipfw.optval, 1190 *(msg->ipfw.optlen)); 1191 break; 1192 default: 1193 ret = -1; 1194 errno = ENOTSUP; 1195 break; 1196 } 1197 1198 ff_close(fd); 1199 1200 done: 1201 if (ret < 0) { 1202 msg->result = errno; 1203 } else { 1204 msg->result = 0; 1205 } 1206 } 1207 #endif 1208 1209 static inline void 1210 handle_traffic_msg(struct ff_msg *msg) 1211 { 1212 msg->traffic = ff_traffic; 1213 msg->result = 0; 1214 } 1215 1216 static inline void 1217 handle_default_msg(struct ff_msg *msg) 1218 { 1219 msg->result = ENOTSUP; 1220 } 1221 1222 static inline void 1223 handle_msg(struct ff_msg *msg, uint16_t proc_id) 1224 { 1225 switch (msg->msg_type) { 1226 case FF_SYSCTL: 1227 handle_sysctl_msg(msg); 1228 break; 1229 case FF_IOCTL: 1230 handle_ioctl_msg(msg); 1231 break; 1232 case FF_ROUTE: 1233 handle_route_msg(msg); 1234 break; 1235 case FF_TOP: 1236 handle_top_msg(msg); 1237 break; 1238 #ifdef FF_NETGRAPH 1239 case FF_NGCTL: 1240 handle_ngctl_msg(msg); 1241 break; 1242 #endif 1243 #ifdef FF_IPFW 1244 case FF_IPFW_CTL: 1245 handle_ipfw_msg(msg); 1246 break; 1247 #endif 1248 case FF_TRAFFIC: 1249 handle_traffic_msg(msg); 1250 break; 1251 default: 1252 handle_default_msg(msg); 1253 break; 1254 } 1255 rte_ring_enqueue(msg_ring[proc_id].ring[1], msg); 1256 } 1257 1258 static inline int 1259 process_msg_ring(uint16_t proc_id) 1260 { 1261 void *msg; 1262 int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg); 1263 1264 if (unlikely(ret == 0)) { 1265 handle_msg((struct ff_msg *)msg, proc_id); 1266 } 1267 1268 return 0; 1269 } 1270 1271 /* Send burst of packets on an output interface */ 1272 static inline int 1273 send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port) 1274 { 1275 struct rte_mbuf **m_table; 1276 int ret; 1277 uint16_t queueid; 1278 1279 queueid = qconf->tx_queue_id[port]; 1280 m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table; 1281 1282 if (unlikely(qconf->pcap[port] != NULL)) { 1283 uint16_t i; 1284 for (i = 0; i < n; i++) { 1285 ff_dump_packets(qconf->pcap[port], m_table[i]); 1286 } 1287 } 1288 1289 ret = rte_eth_tx_burst(port, queueid, m_table, n); 1290 ff_traffic.tx_packets += ret; 1291 uint16_t i; 1292 for (i = 0; i < ret; i++) { 1293 ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]); 1294 } 1295 if (unlikely(ret < n)) { 1296 do { 1297 rte_pktmbuf_free(m_table[ret]); 1298 } while (++ret < n); 1299 } 1300 return 0; 1301 } 1302 1303 /* Enqueue a single packet, and send burst if queue is filled */ 1304 static inline int 1305 send_single_packet(struct rte_mbuf *m, uint8_t port) 1306 { 1307 uint16_t len; 1308 struct lcore_conf *qconf; 1309 1310 qconf = &lcore_conf; 1311 len = qconf->tx_mbufs[port].len; 1312 qconf->tx_mbufs[port].m_table[len] = m; 1313 len++; 1314 1315 /* enough pkts to be sent */ 1316 if (unlikely(len == MAX_PKT_BURST)) { 1317 send_burst(qconf, MAX_PKT_BURST, port); 1318 len = 0; 1319 } 1320 1321 qconf->tx_mbufs[port].len = len; 1322 return 0; 1323 } 1324 1325 int 1326 ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m, 1327 int total) 1328 { 1329 struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id]; 1330 struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool); 1331 if (head == NULL) { 1332 ff_mbuf_free(m); 1333 return -1; 1334 } 1335 1336 head->pkt_len = total; 1337 head->nb_segs = 0; 1338 1339 int off = 0; 1340 struct rte_mbuf *cur = head, *prev = NULL; 1341 while(total > 0) { 1342 if (cur == NULL) { 1343 cur = rte_pktmbuf_alloc(mbuf_pool); 1344 if (cur == NULL) { 1345 rte_pktmbuf_free(head); 1346 ff_mbuf_free(m); 1347 return -1; 1348 } 1349 } 1350 1351 if (prev != NULL) { 1352 prev->next = cur; 1353 } 1354 head->nb_segs++; 1355 1356 prev = cur; 1357 void *data = rte_pktmbuf_mtod(cur, void*); 1358 int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total; 1359 int ret = ff_mbuf_copydata(m, data, off, len); 1360 if (ret < 0) { 1361 rte_pktmbuf_free(head); 1362 ff_mbuf_free(m); 1363 return -1; 1364 } 1365 1366 1367 cur->data_len = len; 1368 off += len; 1369 total -= len; 1370 cur = NULL; 1371 } 1372 1373 struct ff_tx_offload offload = {0}; 1374 ff_mbuf_tx_offload(m, &offload); 1375 1376 void *data = rte_pktmbuf_mtod(head, void*); 1377 1378 if (offload.ip_csum) { 1379 /* ipv6 not supported yet */ 1380 struct ipv4_hdr *iph; 1381 int iph_len; 1382 iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN); 1383 iph_len = (iph->version_ihl & 0x0f) << 2; 1384 1385 head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4; 1386 head->l2_len = ETHER_HDR_LEN; 1387 head->l3_len = iph_len; 1388 } 1389 1390 if (ctx->hw_features.tx_csum_l4) { 1391 struct ipv4_hdr *iph; 1392 int iph_len; 1393 iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN); 1394 iph_len = (iph->version_ihl & 0x0f) << 2; 1395 1396 if (offload.tcp_csum) { 1397 head->ol_flags |= PKT_TX_TCP_CKSUM; 1398 head->l2_len = ETHER_HDR_LEN; 1399 head->l3_len = iph_len; 1400 } 1401 1402 /* 1403 * TCP segmentation offload. 1404 * 1405 * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag 1406 * implies PKT_TX_TCP_CKSUM) 1407 * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6 1408 * - if it's IPv4, set the PKT_TX_IP_CKSUM flag and 1409 * write the IP checksum to 0 in the packet 1410 * - fill the mbuf offload information: l2_len, 1411 * l3_len, l4_len, tso_segsz 1412 * - calculate the pseudo header checksum without taking ip_len 1413 * in account, and set it in the TCP header. Refer to 1414 * rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be 1415 * used as helpers. 1416 */ 1417 if (offload.tso_seg_size) { 1418 struct tcp_hdr *tcph; 1419 int tcph_len; 1420 tcph = (struct tcp_hdr *)((char *)iph + iph_len); 1421 tcph_len = (tcph->data_off & 0xf0) >> 2; 1422 tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG); 1423 1424 head->ol_flags |= PKT_TX_TCP_SEG; 1425 head->l4_len = tcph_len; 1426 head->tso_segsz = offload.tso_seg_size; 1427 } 1428 1429 if (offload.udp_csum) { 1430 head->ol_flags |= PKT_TX_UDP_CKSUM; 1431 head->l2_len = ETHER_HDR_LEN; 1432 head->l3_len = iph_len; 1433 } 1434 } 1435 1436 ff_mbuf_free(m); 1437 1438 return send_single_packet(head, ctx->port_id); 1439 } 1440 1441 static int 1442 main_loop(void *arg) 1443 { 1444 struct loop_routine *lr = (struct loop_routine *)arg; 1445 1446 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1447 uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc; 1448 int i, j, nb_rx, idle; 1449 uint16_t port_id, queue_id; 1450 struct lcore_conf *qconf; 1451 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / 1452 US_PER_S * BURST_TX_DRAIN_US; 1453 struct ff_dpdk_if_context *ctx; 1454 1455 prev_tsc = 0; 1456 usch_tsc = 0; 1457 1458 qconf = &lcore_conf; 1459 1460 while (1) { 1461 cur_tsc = rte_rdtsc(); 1462 if (unlikely(freebsd_clock.expire < cur_tsc)) { 1463 rte_timer_manage(); 1464 } 1465 1466 idle = 1; 1467 sys_tsc = 0; 1468 usr_tsc = 0; 1469 1470 /* 1471 * TX burst queue drain 1472 */ 1473 diff_tsc = cur_tsc - prev_tsc; 1474 if (unlikely(diff_tsc > drain_tsc)) { 1475 for (i = 0; i < qconf->nb_tx_port; i++) { 1476 port_id = qconf->tx_port_id[i]; 1477 if (qconf->tx_mbufs[port_id].len == 0) 1478 continue; 1479 1480 idle = 0; 1481 1482 send_burst(qconf, 1483 qconf->tx_mbufs[port_id].len, 1484 port_id); 1485 qconf->tx_mbufs[port_id].len = 0; 1486 } 1487 1488 prev_tsc = cur_tsc; 1489 } 1490 1491 /* 1492 * Read packet from RX queues 1493 */ 1494 for (i = 0; i < qconf->nb_rx_queue; ++i) { 1495 port_id = qconf->rx_queue_list[i].port_id; 1496 queue_id = qconf->rx_queue_list[i].queue_id; 1497 ctx = veth_ctx[port_id]; 1498 1499 #ifdef FF_KNI 1500 if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) { 1501 ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST); 1502 } 1503 #endif 1504 1505 process_dispatch_ring(port_id, queue_id, pkts_burst, ctx); 1506 1507 nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst, 1508 MAX_PKT_BURST); 1509 if (nb_rx == 0) 1510 continue; 1511 1512 idle = 0; 1513 1514 /* Prefetch first packets */ 1515 for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) { 1516 rte_prefetch0(rte_pktmbuf_mtod( 1517 pkts_burst[j], void *)); 1518 } 1519 1520 /* Prefetch and handle already prefetched packets */ 1521 for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) { 1522 rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[ 1523 j + PREFETCH_OFFSET], void *)); 1524 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0); 1525 } 1526 1527 /* Handle remaining prefetched packets */ 1528 for (; j < nb_rx; j++) { 1529 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0); 1530 } 1531 } 1532 1533 process_msg_ring(qconf->proc_id); 1534 1535 div_tsc = rte_rdtsc(); 1536 1537 if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc > drain_tsc))) { 1538 usch_tsc = cur_tsc; 1539 lr->loop(lr->arg); 1540 } 1541 1542 idle_sleep_tsc = rte_rdtsc(); 1543 if (likely(idle && idle_sleep)) { 1544 usleep(idle_sleep); 1545 end_tsc = rte_rdtsc(); 1546 } else { 1547 end_tsc = idle_sleep_tsc; 1548 } 1549 1550 end_tsc = rte_rdtsc(); 1551 1552 if (usch_tsc == cur_tsc) { 1553 usr_tsc = idle_sleep_tsc - div_tsc; 1554 } 1555 1556 if (!idle) { 1557 sys_tsc = div_tsc - cur_tsc; 1558 ff_top_status.sys_tsc += sys_tsc; 1559 } 1560 1561 ff_top_status.usr_tsc += usr_tsc; 1562 ff_top_status.work_tsc += end_tsc - cur_tsc; 1563 ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc; 1564 1565 ff_top_status.loops++; 1566 } 1567 1568 return 0; 1569 } 1570 1571 int 1572 ff_dpdk_if_up(void) { 1573 int i; 1574 struct lcore_conf *qconf = &lcore_conf; 1575 for (i = 0; i < qconf->nb_tx_port; i++) { 1576 uint16_t port_id = qconf->tx_port_id[i]; 1577 1578 struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id]; 1579 veth_ctx[port_id] = ff_veth_attach(pconf); 1580 if (veth_ctx[port_id] == NULL) { 1581 rte_exit(EXIT_FAILURE, "ff_veth_attach failed"); 1582 } 1583 } 1584 1585 return 0; 1586 } 1587 1588 void 1589 ff_dpdk_run(loop_func_t loop, void *arg) { 1590 struct loop_routine *lr = rte_malloc(NULL, 1591 sizeof(struct loop_routine), 0); 1592 lr->loop = loop; 1593 lr->arg = arg; 1594 rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER); 1595 rte_eal_mp_wait_lcore(); 1596 rte_free(lr); 1597 } 1598 1599 void 1600 ff_dpdk_pktmbuf_free(void *m) 1601 { 1602 rte_pktmbuf_free((struct rte_mbuf *)m); 1603 } 1604 1605 static uint32_t 1606 toeplitz_hash(unsigned keylen, const uint8_t *key, 1607 unsigned datalen, const uint8_t *data) 1608 { 1609 uint32_t hash = 0, v; 1610 u_int i, b; 1611 1612 /* XXXRW: Perhaps an assertion about key length vs. data length? */ 1613 1614 v = (key[0]<<24) + (key[1]<<16) + (key[2] <<8) + key[3]; 1615 for (i = 0; i < datalen; i++) { 1616 for (b = 0; b < 8; b++) { 1617 if (data[i] & (1<<(7-b))) 1618 hash ^= v; 1619 v <<= 1; 1620 if ((i + 4) < keylen && 1621 (key[i+4] & (1<<(7-b)))) 1622 v |= 1; 1623 } 1624 } 1625 return (hash); 1626 } 1627 1628 int 1629 ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr, 1630 uint16_t sport, uint16_t dport) 1631 { 1632 struct lcore_conf *qconf = &lcore_conf; 1633 struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc); 1634 uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id]; 1635 1636 if (nb_queues <= 1) { 1637 return 1; 1638 } 1639 1640 uint16_t reta_size = rss_reta_size[ctx->port_id]; 1641 uint16_t queueid = qconf->tx_queue_id[ctx->port_id]; 1642 1643 uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) + 1644 sizeof(dport)]; 1645 1646 unsigned datalen = 0; 1647 1648 bcopy(&saddr, &data[datalen], sizeof(saddr)); 1649 datalen += sizeof(saddr); 1650 1651 bcopy(&daddr, &data[datalen], sizeof(daddr)); 1652 datalen += sizeof(daddr); 1653 1654 bcopy(&sport, &data[datalen], sizeof(sport)); 1655 datalen += sizeof(sport); 1656 1657 bcopy(&dport, &data[datalen], sizeof(dport)); 1658 datalen += sizeof(dport); 1659 1660 uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes), 1661 default_rsskey_40bytes, datalen, data); 1662 1663 return ((hash & (reta_size - 1)) % nb_queues) == queueid; 1664 } 1665 1666 void 1667 ff_regist_packet_dispatcher(dispatch_func_t func) 1668 { 1669 packet_dispatcher = func; 1670 } 1671 1672 uint64_t 1673 ff_get_tsc_ns() 1674 { 1675 uint64_t cur_tsc = rte_rdtsc(); 1676 uint64_t hz = rte_get_tsc_hz(); 1677 return ((double)cur_tsc/(double)hz) * NS_PER_S; 1678 } 1679 1680