/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"

#define MEMPOOL_CACHE_SIZE 256

#define DISPATCH_RING_SIZE 2048

#define MSG_RING_SIZE 32

/*
 * Configurable number of RX/TX ring descriptors
 */
#define RX_QUEUE_SIZE 512
#define TX_QUEUE_SIZE 512

#define MAX_PKT_BURST 32
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

/*
 * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send.
 */
#define MAX_TX_BURST (MAX_PKT_BURST / 2)

#define NB_SOCKETS 8

/* Configure how many packets ahead to prefetch, when reading packets */
#define PREFETCH_OFFSET 3

#define MAX_RX_QUEUE_PER_LCORE 16
#define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
#define MAX_RX_QUEUE_PER_PORT 128

#ifdef FF_KNI
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

static int enable_kni;
static int kni_accept;
#endif

static int numa_on;

static struct rte_timer freebsd_clock;

/* Default RSS hash key, taken from the Mellanox Linux driver. */
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

static struct rte_eth_conf default_port_conf = {
    .rxmode = {
        .mq_mode = ETH_MQ_RX_RSS,
        .max_rx_pkt_len = ETHER_MAX_LEN,
        .split_hdr_size = 0, /**< Header buffer size */
        .header_split = 0,   /**< Header Split disabled */
        .hw_ip_checksum = 0, /**< IP checksum offload disabled */
        .hw_vlan_filter = 0, /**< VLAN filtering disabled */
        .hw_vlan_strip = 0,  /**< VLAN strip disabled */
        .hw_vlan_extend = 0, /**< Extended VLAN disabled */
        .jumbo_frame = 0,    /**< Jumbo Frame Support disabled */
        .hw_strip_crc = 0,   /**< CRC stripping by hardware disabled */
        .enable_lro = 0,     /**< LRO disabled */
    },
    .rx_adv_conf = {
        .rss_conf = {
            .rss_key = default_rsskey_40bytes,
            .rss_key_len = 40,
            .rss_hf = ETH_RSS_PROTO_MASK,
        },
    },
    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
};

struct mbuf_table {
    uint16_t len;
    struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct lcore_rx_queue {
    uint16_t port_id;
    uint16_t queue_id;
} __rte_cache_aligned;

struct lcore_conf {
    uint16_t proc_id;
    uint16_t socket_id;
    uint16_t nb_queue_list[RTE_MAX_ETHPORTS];
    struct ff_port_cfg *port_cfgs;

    uint16_t nb_rx_queue;
    struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
    uint16_t nb_tx_port;
    uint16_t tx_port_id[RTE_MAX_ETHPORTS];
    uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
    struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
    char *pcap[RTE_MAX_ETHPORTS];
} __rte_cache_aligned;

static struct lcore_conf lcore_conf;

static struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

struct ff_msg_ring {
    char ring_name[2][RTE_RING_NAMESIZE];
    /* ring[0]: lcore receives messages, the other process sends them */
    /* ring[1]: lcore sends messages, the other process reads them */
    struct rte_ring *ring[2];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;

struct ff_dpdk_if_context {
    void *sc;
    void *ifp;
    uint16_t port_id;
    struct ff_hw_features hw_features;
} __rte_cache_aligned;

static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

static struct ff_top_args ff_top_status;
static struct ff_traffic_args ff_traffic;

extern void ff_hardclock(void);

static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
    ff_update_current_ts();
}

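/*
 * Register a FreeBSD network interface with its DPDK port.  The returned
 * context ties the stack's softc/ifp pointers to the DPDK port id and the
 * hardware offload features detected for that port.
 */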
struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
#define CHECK_INTERVAL 100 /* 100ms */
#define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u Mbps - %s\n",
                        (int)portid, (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

static int
init_lcore_conf(void)
{
    uint8_t nb_dev_ports = rte_eth_dev_count();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
            ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    uint16_t proc_id;
    for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) {
        uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id];
        if (!lcore_config[lcore_id].detected) {
            rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
        }
    }

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %d\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        lcore_conf.pcap[port_id] = pconf->pcap;
        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}

static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;

    unsigned nb_mbuf = RTE_MAX(
        (nb_rx_queue * RX_QUEUE_SIZE +
        nb_ports * nb_lcores * MAX_PKT_BURST +
        nb_ports * nb_tx_queue * TX_QUEUE_SIZE +
        nb_lcores * MEMPOOL_CACHE_SIZE +
#ifdef FF_KNI
        nb_ports * KNI_MBUF_MAX +
        nb_ports * KNI_QUEUE_SIZE +
#endif
        nb_lcores * nb_ports * DISPATCH_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }
    }

    return 0;
}

static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL)
        return NULL;

    /* If the ring already exists, just attach to it */
    if (likely((ring = rte_ring_lookup(name)) != NULL))
        return ring;

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        return rte_ring_create(name, count, socket_id, flags);
    } else {
        return rte_ring_lookup(name);
    }
}

static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;
    /* Create a ring for each queue on each port that is actually in use. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
}

static int
init_msg_ring(void)
{
    uint16_t i;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create the message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
            MSG_RING_SIZE * 2 * nb_procs,
            MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
            NULL, NULL, ff_msg_init, NULL,
            socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        snprintf(msg_ring[i].ring_name[1], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_OUT, i);

        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[0]);

        msg_ring[i].ring[1] = create_ring(msg_ring[i].ring_name[1],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[1] == NULL)
            rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[1]);
    }

    return 0;
}

#ifdef FF_KNI
static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
#endif

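/*
 * Program the NIC's RSS redirection table (RETA) so that hash buckets are
 * distributed round-robin across the configured RX queues.
 */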
static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}

static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        struct rte_eth_dev_info dev_info;
        rte_eth_dev_info_get(port_id, &dev_info);

        if (nb_queues > dev_info.max_rx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                nb_queues,
                dev_info.max_rx_queues);
        }

        if (nb_queues > dev_info.max_tx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                nb_queues,
                dev_info.max_tx_queues);
        }

        struct ether_addr addr;
        rte_eth_macaddr_get(port_id, &addr);
        printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
            " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
            (unsigned)port_id,
            addr.addr_bytes[0], addr.addr_bytes[1],
            addr.addr_bytes[2], addr.addr_bytes[3],
            addr.addr_bytes[4], addr.addr_bytes[5]);

        rte_memcpy(pconf->mac,
            addr.addr_bytes, ETHER_ADDR_LEN);

        /* Clear txq_flags - we do not need multi-mempool and refcnt */
        dev_info.default_txconf.txq_flags = ETH_TXQ_FLAGS_NOMULTMEMP |
            ETH_TXQ_FLAGS_NOREFCOUNT;

        /* Disable features that are not supported by the port's HW */
        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMUDP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMTCP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMSCTP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) &&
            !(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_TSO)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
        }

        struct rte_eth_conf port_conf = {0};

        /* Set RSS mode */
        port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
        port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK;
        port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
        port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;

        /* Set Rx VLAN stripping */
        if (ff_global_cfg.dpdk.vlan_strip) {
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                port_conf.rxmode.hw_vlan_strip = 1;
            }
        }

        /* Enable HW CRC stripping */
        port_conf.rxmode.hw_strip_crc = 1;

        /* FIXME: Enable TCP LRO ? */
#if 0
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
            printf("LRO is supported\n");
            port_conf.rxmode.enable_lro = 1;
            pconf->hw_features.rx_lro = 1;
        }
#endif

        /* Set Rx checksum checking */
        if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
            printf("RX checksum offload supported\n");
            port_conf.rxmode.hw_ip_checksum = 1;
            pconf->hw_features.rx_csum = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
            printf("TX ip checksum offload supported\n");
            pconf->hw_features.tx_csum_ip = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            printf("TX TCP&UDP checksum offload supported\n");
            pconf->hw_features.tx_csum_l4 = 1;
        }

        if (ff_global_cfg.dpdk.tso) {
            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                printf("TSO is supported\n");
                pconf->hw_features.tx_tso = 1;
            }
        } else {
            printf("TSO is disabled\n");
        }

        if (dev_info.reta_size) {
            /* reta size must be power of 2 */
            assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

            rss_reta_size[port_id] = dev_info.reta_size;
            printf("port[%d]: rss table size: %d\n", port_id,
                dev_info.reta_size);
        }

        if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
            continue;
        }

        int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
        if (ret != 0) {
            return ret;
        }
        uint16_t q;
        for (q = 0; q < nb_queues; q++) {
            if (numa_on) {
                uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                socketid = rte_lcore_to_socket_id(lcore_id);
            }
            mbuf_pool = pktmbuf_pool[socketid];

            ret = rte_eth_tx_queue_setup(port_id, q, TX_QUEUE_SIZE,
                socketid, &dev_info.default_txconf);
            if (ret < 0) {
                return ret;
            }

            ret = rte_eth_rx_queue_setup(port_id, q, RX_QUEUE_SIZE,
                socketid, &dev_info.default_rxconf, mbuf_pool);
            if (ret < 0) {
                return ret;
            }
        }

        ret = rte_eth_dev_start(port_id);
        if (ret < 0) {
            return ret;
        }

        if (nb_queues > 1) {
            /* set the HW RSS hash function to Toeplitz */
            if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                struct rte_eth_hash_filter_info info = {0};
                info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                    RTE_ETH_FILTER_SET, &info) < 0) {
                    rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                        port_id);
                }
            }

            set_rss_table(port_id, dev_info.reta_size, nb_queues);
        }

        /* Enable RX in promiscuous mode for the Ethernet device */
        if (ff_global_cfg.dpdk.promiscuous) {
            rte_eth_promiscuous_enable(port_id);
            ret = rte_eth_promiscuous_get(port_id);
            if (ret == 1) {
                printf("set port %u to promiscuous mode ok\n", port_id);
            } else {
                printf("set port %u to promiscuous mode error\n", port_id);
            }
        }

        /* Enable pcap dump */
        if (pconf->pcap) {
            ff_enable_pcap(pconf->pcap);
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

#ifdef FF_KNI
    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }
#endif

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    /*
     * FIXME: should we save pkt->vlan_tci
     * if (pkt->ol_flags & PKT_RX_VLAN_PKT)
     */

    void *data = rte_pktmbuf_mtod(pkt, void *);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void *);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct ether_hdr *hdr;
    hdr = (const struct ether_hdr *)data;

    if (ntohs(hdr->ether_type) == ETHER_TYPE_ARP)
        return FILTER_ARP;

#ifndef FF_KNI
    return FILTER_UNKNOWN;
#else
    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ntohs(hdr->ether_type) != ETHER_TYPE_IPv4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data + ETHER_HDR_LEN,
        len - ETHER_HDR_LEN);
#endif
}

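/*
 * Deep-copy one mbuf segment: copy the payload and the relevant metadata
 * (port, VLAN tags, offload fields, hash, flags, packet type) from m into mi.
 * Used by pktmbuf_deep_clone() below when an ARP packet has to be duplicated
 * for every queue and for KNI.
 */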
static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}

/* copied from rte_pktmbuf_clone, but performs a deep copy of every segment */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of a new segment failed */
    if (unlikely(mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}

static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(qconf->pcap[port_id] != NULL)) {
            if (!pkts_from_ring) {
                ff_dump_packets(qconf->pcap[port_id], rtem);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void *);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring) {
            ff_traffic.rx_packets++;
            ff_traffic.rx_bytes += len;
        }

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, len, queue_id, nb_queues);
            if (ret < 0 || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
        if (filter == FILTER_ARP) {
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                /* Broadcast the ARP packet to every other queue of this port */
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni &&
            ((filter == FILTER_KNI && kni_accept) ||
            (filter == FILTER_UNKNOWN && !kni_accept))) {
            ff_kni_enqueue(port_id, rtem);
#endif
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* read packets from the dispatch ring and process them */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_DGRAM, 0);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static struct ff_top_args ff_status;
static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_traffic_msg(struct ff_msg *msg)
{
    msg->traffic = ff_traffic;
    msg->result = 0;
}

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        case FF_TRAFFIC:
            handle_traffic_msg(msg);
            break;
        default:
            handle_default_msg(msg);
            break;
    }
    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send a burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(qconf->pcap[port] != NULL)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(qconf->pcap[port], m_table[i]);
        }
    }

    ff_traffic.tx_packets += n;
    uint16_t i;
    for (i = 0; i < n; i++) {
        ff_traffic.tx_bytes += rte_pktmbuf_data_len(m_table[i]);
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    if (unlikely(ret < n)) {
        /* Free the packets the NIC could not accept */
        do {
            rte_pktmbuf_free(m_table[ret]);
        } while (++ret < n);
    }

    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}

int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        void *data = rte_pktmbuf_mtod(cur, void *);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ?
            RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        prev = cur;

        cur->data_len = len;
        off += len;
        total -= len;
        head->nb_segs++;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void *);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         * TCP segmentation offload.
         *
         * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *   implies PKT_TX_TCP_CKSUM)
         * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         * - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *   write the IP checksum to 0 in the packet
         * - fill the mbuf offload information: l2_len,
         *   l3_len, l4_len, tso_segsz
         * - calculate the pseudo header checksum without taking ip_len
         *   in account, and set it in the TCP header. Refer to
         *   rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *   used as helpers.
         */
        if (offload.tso_seg_size) {
            struct tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc;
    int i, j, nb_rx, idle;
    uint16_t port_id, queue_id;
    struct lcore_conf *qconf;
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
        US_PER_S * BURST_TX_DRAIN_US;
    struct ff_dpdk_if_context *ctx;

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc > drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packets from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }
#endif

            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                    pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                    j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc > drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        end_tsc = rte_rdtsc();

        if (usch_tsc == cur_tsc) {
            usr_tsc = end_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_top_status.sys_tsc += sys_tsc;
        }

        ff_top_status.usr_tsc += usr_tsc;
        ff_top_status.work_tsc += end_tsc - cur_tsc;
        ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_top_status.loops++;
    }

    return 0;
}

int
ff_dpdk_if_up(void) {
    int i;
    struct lcore_conf *qconf = &lcore_conf;
    for (i = 0; i < qconf->nb_tx_port; i++) {
        uint16_t port_id = qconf->tx_port_id[i];

        struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
        veth_ctx[port_id] = ff_veth_attach(pconf);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg) {
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1<<(7-b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i+4] & (1<<(7-b))))
                v |= 1;
        }
    }
    return (hash);
}

int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;
    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];

    if (nb_queues <= 1) {
        return 1;
    }

    uint16_t reta_size = rss_reta_size[ctx->port_id];
    uint16_t queueid = qconf->tx_queue_id[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
        default_rsskey_40bytes, datalen, data);

    return ((hash & (reta_size - 1)) % nb_queues) == queueid;
}

void
ff_regist_packet_dispatcher(dispatch_func_t func)
{
    packet_dispatcher = func;
}

uint64_t
ff_get_tsc_ns(void)
{
    uint64_t cur_tsc = rte_rdtsc();
    uint64_t hz = rte_get_tsc_hz();
    return ((double)cur_tsc / (double)hz) * NS_PER_S;
}