/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <unistd.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_memory.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_common.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"

#define MEMPOOL_CACHE_SIZE 256

#define DISPATCH_RING_SIZE 2048

#define MSG_RING_SIZE 32

/*
 * Configurable number of RX/TX ring descriptors
 */
#define RX_QUEUE_SIZE 512
#define TX_QUEUE_SIZE 512

#define MAX_PKT_BURST 32
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

/*
 * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send.
 */
#define MAX_TX_BURST (MAX_PKT_BURST / 2)

#define NB_SOCKETS 8

/* Configure how many packets ahead to prefetch, when reading packets */
#define PREFETCH_OFFSET 3

#define MAX_RX_QUEUE_PER_LCORE 16
#define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
#define MAX_RX_QUEUE_PER_PORT 128

#ifdef FF_KNI
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

static int enable_kni;
static int kni_accept;
#endif

static int numa_on;

static unsigned idle_sleep;

static struct rte_timer freebsd_clock;

/* Default RSS key, the same key used by the Mellanox Linux driver */
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

static struct rte_eth_conf default_port_conf = {
    .rxmode = {
        .mq_mode = ETH_MQ_RX_RSS,
        .max_rx_pkt_len = ETHER_MAX_LEN,
        .split_hdr_size = 0, /**< hdr buf size */
        .header_split   = 0, /**< Header Split disabled */
        .hw_ip_checksum = 0, /**< IP checksum offload disabled */
        .hw_vlan_filter = 0, /**< VLAN filtering disabled */
        .hw_vlan_strip  = 0, /**< VLAN strip disabled. */
        .hw_vlan_extend = 0, /**< Extended VLAN disabled. */
        .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
        .hw_strip_crc   = 0, /**< CRC stripped by hardware */
        .enable_lro     = 0, /**< LRO disabled */
    },
    .rx_adv_conf = {
        .rss_conf = {
            .rss_key = default_rsskey_40bytes,
            .rss_key_len = 40,
            .rss_hf = ETH_RSS_PROTO_MASK,
        },
    },
    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
};

struct mbuf_table {
    uint16_t len;
    struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct lcore_rx_queue {
    uint16_t port_id;
    uint16_t queue_id;
} __rte_cache_aligned;

struct lcore_conf {
    uint16_t proc_id;
    uint16_t socket_id;
    uint16_t nb_queue_list[RTE_MAX_ETHPORTS];
    struct ff_port_cfg *port_cfgs;

    uint16_t nb_rx_queue;
    struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
    uint16_t nb_tx_port;
    uint16_t tx_port_id[RTE_MAX_ETHPORTS];
    uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
    struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
    char *pcap[RTE_MAX_ETHPORTS];
} __rte_cache_aligned;

static struct lcore_conf lcore_conf;

static struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

static inline int send_single_packet(struct rte_mbuf *m, uint8_t port);

struct ff_msg_ring {
    char ring_name[2][RTE_RING_NAMESIZE];
    /* ring[0]: the lcore reads requests that other processes send */
    /* ring[1]: the lcore writes replies that other processes read */
    struct rte_ring *ring[2];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;

struct ff_dpdk_if_context {
    void *sc;
    void *ifp;
    uint16_t port_id;
    struct ff_hw_features hw_features;
} __rte_cache_aligned;

static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

static struct ff_top_args ff_top_status;
static struct ff_traffic_args ff_traffic;

extern void ff_hardclock(void);

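/*
 * Periodic rte_timer callback that drives the FreeBSD hardclock tick and
 * refreshes the cached timestamp; init_clock() below arms it to fire every
 * 1000/ff_global_cfg.freebsd.hz milliseconds on this lcore.
 */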
static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
    ff_update_current_ts();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
#define CHECK_INTERVAL 100 /* 100ms */
#define MAX_CHECK_TIME 90 /* 9s (90 * 100ms) in total */

    uint16_t portid;
    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u "
                        "Mbps - %s\n", (int)portid,
                        (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

static int
init_lcore_conf(void)
{
    uint8_t nb_dev_ports = rte_eth_dev_count();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
            ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    uint16_t proc_id;
    for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) {
        uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id];
        if (!lcore_config[lcore_id].detected) {
            rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
        }
    }

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %u\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        lcore_conf.pcap[port_id] = pconf->pcap;
        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}

static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;

    unsigned nb_mbuf = RTE_MAX(
        (nb_rx_queue*RX_QUEUE_SIZE +
        nb_ports*nb_lcores*MAX_PKT_BURST +
        nb_ports*nb_tx_queue*TX_QUEUE_SIZE +
        nb_lcores*MEMPOOL_CACHE_SIZE +
#ifdef FF_KNI
        nb_ports*KNI_MBUF_MAX +
        nb_ports*KNI_QUEUE_SIZE +
#endif
        nb_lcores*nb_ports*DISPATCH_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }
    }

    return 0;
}

static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL) {
        rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ring = rte_ring_create(name, count, socket_id, flags);
    } else {
        ring = rte_ring_lookup(name);
    }

    if (ring == NULL) {
        rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
    }

    return ring;
}

static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create rings according to the ports actually in use. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
}

static int
init_msg_ring(void)
{
    uint16_t i;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
            MSG_RING_SIZE * 2 * nb_procs,
            MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
            NULL, NULL, ff_msg_init, NULL,
            socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        snprintf(msg_ring[i].ring_name[1], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_OUT, i);

        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[0]);

        msg_ring[i].ring[1] = create_ring(msg_ring[i].ring_name[1],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[1] == NULL)
            rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[1]);
    }

    return 0;
}

#ifdef FF_KNI
static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i, ret;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
#endif

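/*
 * Fill the NIC's RSS redirection table (RETA) round-robin over the RX queues:
 * with reta_size 128 and nb_queues 4, for example, the table holds
 * 0,1,2,3,0,1,2,3,... so a packet whose RSS hash is h is steered to queue
 * (h & (reta_size - 1)) % nb_queues, matching the software check in
 * ff_rss_check() below.
 */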
static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}

static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        struct rte_eth_dev_info dev_info;
        rte_eth_dev_info_get(port_id, &dev_info);

        if (nb_queues > dev_info.max_rx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                nb_queues,
                dev_info.max_rx_queues);
        }

        if (nb_queues > dev_info.max_tx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                nb_queues,
                dev_info.max_tx_queues);
        }

        struct ether_addr addr;
        rte_eth_macaddr_get(port_id, &addr);
        printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
            " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
            (unsigned)port_id,
            addr.addr_bytes[0], addr.addr_bytes[1],
            addr.addr_bytes[2], addr.addr_bytes[3],
            addr.addr_bytes[4], addr.addr_bytes[5]);

        rte_memcpy(pconf->mac,
            addr.addr_bytes, ETHER_ADDR_LEN);

        /* Clear txq_flags - we do not need multi-mempool and refcnt */
        dev_info.default_txconf.txq_flags = ETH_TXQ_FLAGS_NOMULTMEMP |
            ETH_TXQ_FLAGS_NOREFCOUNT;

        /* Disable features that are not supported by port's HW */
        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMUDP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMTCP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMSCTP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) &&
            !(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_TSO)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
        }

        struct rte_eth_conf port_conf = {0};

        /* Set RSS mode */
        port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
        port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK;
        port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
        port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;

        /* Set Rx VLAN stripping */
        if (ff_global_cfg.dpdk.vlan_strip) {
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                port_conf.rxmode.hw_vlan_strip = 1;
            }
        }

        /* Enable HW CRC stripping */
        port_conf.rxmode.hw_strip_crc = 1;

        /* FIXME: Enable TCP LRO ? */
#if 0
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
            printf("LRO is supported\n");
            port_conf.rxmode.enable_lro = 1;
            pconf->hw_features.rx_lro = 1;
        }
#endif

        /* Set Rx checksum checking */
        if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
            printf("RX checksum offload supported\n");
            port_conf.rxmode.hw_ip_checksum = 1;
            pconf->hw_features.rx_csum = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
            printf("TX ip checksum offload supported\n");
            pconf->hw_features.tx_csum_ip = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            printf("TX TCP&UDP checksum offload supported\n");
            pconf->hw_features.tx_csum_l4 = 1;
        }

        if (ff_global_cfg.dpdk.tso) {
            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                printf("TSO is supported\n");
                pconf->hw_features.tx_tso = 1;
            }
        } else {
            printf("TSO is disabled\n");
        }

        if (dev_info.reta_size) {
            /* reta size must be power of 2 */
            assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

            rss_reta_size[port_id] = dev_info.reta_size;
            printf("port[%d]: rss table size: %d\n", port_id,
                dev_info.reta_size);
        }

        if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
            continue;
        }

        int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
        if (ret != 0) {
            return ret;
        }
        uint16_t q;
        for (q = 0; q < nb_queues; q++) {
            if (numa_on) {
                uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                socketid = rte_lcore_to_socket_id(lcore_id);
            }
            mbuf_pool = pktmbuf_pool[socketid];

            ret = rte_eth_tx_queue_setup(port_id, q, TX_QUEUE_SIZE,
                socketid, &dev_info.default_txconf);
            if (ret < 0) {
                return ret;
            }

            ret = rte_eth_rx_queue_setup(port_id, q, RX_QUEUE_SIZE,
                socketid, &dev_info.default_rxconf, mbuf_pool);
            if (ret < 0) {
                return ret;
            }
        }

        ret = rte_eth_dev_start(port_id);
        if (ret < 0) {
            return ret;
        }

        if (nb_queues > 1) {
            /* set HW rss hash function to Toeplitz. */
            if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                struct rte_eth_hash_filter_info info = {0};
                info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                    RTE_ETH_FILTER_SET, &info) < 0) {
                    rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                        port_id);
                }
            }

            set_rss_table(port_id, dev_info.reta_size, nb_queues);
        }

        /* Enable RX in promiscuous mode for the Ethernet device. */
        if (ff_global_cfg.dpdk.promiscuous) {
            rte_eth_promiscuous_enable(port_id);
            ret = rte_eth_promiscuous_get(port_id);
            if (ret == 1) {
                printf("set port %u to promiscuous mode ok\n", port_id);
            } else {
                printf("set port %u to promiscuous mode error\n", port_id);
            }
        }

        /* Enable pcap dump */
        if (pconf->pcap) {
            ff_enable_pcap(pconf->pcap);
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    idle_sleep = ff_global_cfg.dpdk.idle_sleep;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

#ifdef FF_KNI
    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }
#endif

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    void *data = rte_pktmbuf_mtod(pkt, void*);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    if (pkt->ol_flags & PKT_RX_VLAN_STRIPPED) {
        ff_mbuf_set_vlan_info(hdr, pkt->vlan_tci);
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void*);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct ether_hdr *hdr;
    const struct vlan_hdr *vlanhdr;
    hdr = (const struct ether_hdr *)data;
    uint16_t ether_type = rte_be_to_cpu_16(hdr->ether_type);
    data += ETHER_HDR_LEN;
    len -= ETHER_HDR_LEN;

    if (ether_type == ETHER_TYPE_VLAN) {
        vlanhdr = (struct vlan_hdr *)data;
        ether_type = rte_be_to_cpu_16(vlanhdr->eth_proto);
        data += sizeof(struct vlan_hdr);
        len -= sizeof(struct vlan_hdr);
    }

    if (ether_type == ETHER_TYPE_ARP)
        return FILTER_ARP;

#ifndef FF_KNI
    return FILTER_UNKNOWN;
#else
    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ether_type != ETHER_TYPE_IPv4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data, len);
#endif
}

static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}

/* copied from rte_pktmbuf_clone */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of new indirect segment failed */
    if (unlikely(mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}

static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(qconf->pcap[port_id] != NULL)) {
            if (!pkts_from_ring) {
                ff_dump_packets(qconf->pcap[port_id], rtem);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void*);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring) {
            ff_traffic.rx_packets++;
            ff_traffic.rx_bytes += len;
        }

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues);
            if (ret == FF_DISPATCH_RESPONSE) {
                rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len;
                send_single_packet(rtem, port_id);
                continue;
            }

            if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
        if (filter == FILTER_ARP) {
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni &&
            ((filter == FILTER_KNI && kni_accept) ||
            (filter == FILTER_UNKNOWN && !kni_accept))) {
            ff_kni_enqueue(port_id, rtem);
#endif
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* read packets from the dispatch ring and process them */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_DGRAM, 0);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_top_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_traffic_msg(struct ff_msg *msg)
{
    msg->traffic = ff_traffic;
    msg->result = 0;
}

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        case FF_TRAFFIC:
            handle_traffic_msg(msg);
            break;
        default:
            handle_default_msg(msg);
            break;
    }
    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(qconf->pcap[port] != NULL)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(qconf->pcap[port], m_table[i]);
        }
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    ff_traffic.tx_packets += ret;
    uint16_t i;
    for (i = 0; i < ret; i++) {
        ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]);
    }
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
        } while (++ret < n);
    }
    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}

int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        head->nb_segs++;

        prev = cur;
        void *data = rte_pktmbuf_mtod(cur, void*);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ?
            RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        cur->data_len = len;
        off += len;
        total -= len;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void*);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         * TCP segmentation offload.
         *
         * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *   implies PKT_TX_TCP_CKSUM)
         * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         * - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *   write the IP checksum to 0 in the packet
         * - fill the mbuf offload information: l2_len,
         *   l3_len, l4_len, tso_segsz
         * - calculate the pseudo header checksum without taking ip_len
         *   in account, and set it in the TCP header. Refer to
         *   rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *   used as helpers.
         */
        if (offload.tso_seg_size) {
            struct tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc;
    int i, j, nb_rx, idle;
    uint16_t port_id, queue_id;
    struct lcore_conf *qconf;
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
        US_PER_S * BURST_TX_DRAIN_US;
    struct ff_dpdk_if_context *ctx;

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc > drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packets from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }
#endif

            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                    pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                    j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc > drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        idle_sleep_tsc = rte_rdtsc();
        if (likely(idle && idle_sleep)) {
            usleep(idle_sleep);
            end_tsc = rte_rdtsc();
        } else {
            end_tsc = idle_sleep_tsc;
        }

        if (usch_tsc == cur_tsc) {
            usr_tsc = idle_sleep_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_top_status.sys_tsc += sys_tsc;
        }

        ff_top_status.usr_tsc += usr_tsc;
        ff_top_status.work_tsc += end_tsc - cur_tsc;
        ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_top_status.loops++;
    }

    return 0;
}

int
ff_dpdk_if_up(void) {
    int i;
    struct lcore_conf *qconf = &lcore_conf;
    for (i = 0; i < qconf->nb_tx_port; i++) {
        uint16_t port_id = qconf->tx_port_id[i];

        struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
        veth_ctx[port_id] = ff_veth_attach(pconf);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg) {
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */
    v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1<<(7-b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i+4] & (1<<(7-b))))
                v |= 1;
        }
    }
    return (hash);
}

int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;
    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];

    if (nb_queues <= 1) {
        return 1;
    }

    uint16_t reta_size = rss_reta_size[ctx->port_id];
    uint16_t queueid = qconf->tx_queue_id[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
        default_rsskey_40bytes, datalen, data);

    return ((hash & (reta_size - 1)) % nb_queues) == queueid;
}

void
ff_regist_packet_dispatcher(dispatch_func_t func)
{
    packet_dispatcher = func;
}

uint64_t
ff_get_tsc_ns()
{
    uint64_t cur_tsc = rte_rdtsc();
    uint64_t hz = rte_get_tsc_hz();
    return ((double)cur_tsc/(double)hz) * NS_PER_S;
}
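
/*
 * Illustrative sketch only (kept out of the build by the #if 0 guard): a
 * minimal user-supplied dispatch callback that could be registered through
 * ff_regist_packet_dispatcher() above.  The signature mirrors the call made
 * in process_packets(); the return value is either a target queue index or
 * FF_DISPATCH_ERROR to drop the packet.  The assumed frame layout (untagged
 * Ethernet + IPv4) and steering by source address are examples only.
 */
#if 0
static int
example_dispatcher(void *data, uint16_t *len, uint16_t queue_id,
    uint16_t nb_queues)
{
    struct ether_hdr *eth = (struct ether_hdr *)data;

    if (rte_be_to_cpu_16(eth->ether_type) != ETHER_TYPE_IPv4)
        return FF_DISPATCH_ERROR;

    struct ipv4_hdr *ip = (struct ipv4_hdr *)(eth + 1);

    /* steer by source address; any field of the packet could be used */
    return rte_be_to_cpu_32(ip->src_addr) % nb_queues;
}
#endif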