/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"

#define MEMPOOL_CACHE_SIZE 256

#define DISPATCH_RING_SIZE 2048

#define MSG_RING_SIZE 32

/*
 * Configurable number of RX/TX ring descriptors
 */
#define RX_QUEUE_SIZE 512
#define TX_QUEUE_SIZE 512

#define MAX_PKT_BURST 32
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

/*
 * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send.
 */
#define MAX_TX_BURST (MAX_PKT_BURST / 2)

#define NB_SOCKETS 8

/* Configure how many packets ahead to prefetch, when reading packets */
#define PREFETCH_OFFSET 3

#define MAX_RX_QUEUE_PER_LCORE 16
#define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
#define MAX_RX_QUEUE_PER_PORT 128

#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

static int enable_kni;
static int kni_accept;

static int numa_on;

static struct rte_timer freebsd_clock;

// Mellanox Linux's driver key
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

static struct rte_eth_conf default_port_conf = {
    .rxmode = {
        .mq_mode = ETH_MQ_RX_RSS,
        .max_rx_pkt_len = ETHER_MAX_LEN,
        .split_hdr_size = 0, /**< hdr buf size */
        .header_split   = 0, /**< Header Split disabled */
        .hw_ip_checksum = 0, /**< IP checksum offload disabled */
        .hw_vlan_filter = 0, /**< VLAN filtering disabled */
        .hw_vlan_strip  = 0, /**< VLAN strip disabled. */
        .hw_vlan_extend = 0, /**< Extended VLAN disabled. */
        .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
        .hw_strip_crc   = 0, /**< CRC stripping by hardware disabled */
        .enable_lro     = 0, /**< LRO disabled */
    },
    .rx_adv_conf = {
        .rss_conf = {
            .rss_key = default_rsskey_40bytes,
            .rss_key_len = 40,
            .rss_hf = ETH_RSS_PROTO_MASK,
        },
    },
    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
};

struct mbuf_table {
    uint16_t len;
    struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct lcore_rx_queue {
    uint8_t port_id;
    uint8_t queue_id;
} __rte_cache_aligned;

struct lcore_conf {
    uint16_t proc_id;
    uint16_t socket_id;
    uint16_t nb_queue_list[RTE_MAX_ETHPORTS];
    struct ff_port_cfg *port_cfgs;

    uint16_t nb_rx_queue;
    struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
    uint16_t nb_tx_port;
    uint16_t tx_port_id[RTE_MAX_ETHPORTS];
    uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
    struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
    char *pcap[RTE_MAX_ETHPORTS];
} __rte_cache_aligned;

static struct lcore_conf lcore_conf;

static struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

struct ff_msg_ring {
    char ring_name[2][RTE_RING_NAMESIZE];
    /* ring[0]: other processes enqueue requests, this lcore dequeues them */
    /* ring[1]: this lcore enqueues replies, other processes dequeue them */
    struct rte_ring *ring[2];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;

struct ff_dpdk_if_context {
    void *sc;
    void *ifp;
    uint16_t port_id;
    struct ff_hw_features hw_features;
} __rte_cache_aligned;

static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

extern void ff_hardclock(void);

static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
    ff_update_current_ts();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;
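    /* One context per registered port: it records the FreeBSD ifnet/softc
     * pointers and the hardware features negotiated for that port. */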
    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
#define CHECK_INTERVAL 100 /* 100ms */
#define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint8_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u Mbps - %s\n",
                        (int)portid, (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

static int
init_lcore_conf(void)
{
    uint8_t nb_dev_ports = rte_eth_dev_count();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
            ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    uint16_t proc_id;
    for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) {
        uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id];
        if (!lcore_config[lcore_id].detected) {
            rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
        }
    }

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %u\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

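        /* This lcore's position in the port's lcore_list doubles as its
         * RX and TX queue index on that port. */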
        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        lcore_conf.pcap[port_id] = pconf->pcap;
        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}

static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;

    unsigned nb_mbuf = RTE_MAX(
        (nb_rx_queue * RX_QUEUE_SIZE +
        nb_ports * nb_lcores * MAX_PKT_BURST +
        nb_ports * nb_tx_queue * TX_QUEUE_SIZE +
        nb_lcores * MEMPOOL_CACHE_SIZE +
        nb_ports * KNI_MBUF_MAX +
        nb_ports * KNI_QUEUE_SIZE +
        nb_lcores * nb_ports * DISPATCH_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }
    }

    return 0;
}

static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL)
        return NULL;

    /* If it already exists, just attach to it */
    if (likely((ring = rte_ring_lookup(name)) != NULL))
        return ring;

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        return rte_ring_create(name, count, socket_id, flags);
    } else {
        return rte_ring_lookup(name);
    }
}

static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create rings according to the ports actually being used.
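     * Rings are looked up by name, so a secondary process attaches to the
     * rings created by the primary process (see create_ring() above).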
     */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
}

static int
init_msg_ring(void)
{
    uint16_t i;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
            MSG_RING_SIZE * 2 * nb_procs,
            MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
            NULL, NULL, ff_msg_init, NULL,
            socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        snprintf(msg_ring[i].ring_name[1], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_OUT, i);

        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring: %s failed!\n", msg_ring[i].ring_name[0]);

        msg_ring[i].ring[1] = create_ring(msg_ring[i].ring_name[1],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[1] == NULL)
            rte_panic("create ring: %s failed!\n", msg_ring[i].ring_name[1]);
    }

    return 0;
}

static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}

static void
set_rss_table(uint8_t port_id,
    uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}

static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = rte_lcore_to_socket_id(rte_lcore_id());
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socketid];
    uint16_t i;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        struct rte_eth_dev_info dev_info;
        rte_eth_dev_info_get(port_id, &dev_info);

        if (nb_queues > dev_info.max_rx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] is larger than max_rx_queues[%d]\n",
                nb_queues,
                dev_info.max_rx_queues);
        }

        if (nb_queues > dev_info.max_tx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] is larger than max_tx_queues[%d]\n",
                nb_queues,
                dev_info.max_tx_queues);
        }

        struct ether_addr addr;
        rte_eth_macaddr_get(port_id, &addr);
        printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
            " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
            (unsigned)port_id,
            addr.addr_bytes[0], addr.addr_bytes[1],
            addr.addr_bytes[2], addr.addr_bytes[3],
            addr.addr_bytes[4], addr.addr_bytes[5]);

        rte_memcpy(pconf->mac,
            addr.addr_bytes, ETHER_ADDR_LEN);

        /* Clear txq_flags - we do not need multi-mempool and refcnt */
        dev_info.default_txconf.txq_flags = ETH_TXQ_FLAGS_NOMULTMEMP |
            ETH_TXQ_FLAGS_NOREFCOUNT;

        /* Disable features that are not supported by port's HW */
        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMUDP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMTCP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMSCTP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) &&
            !(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_TSO)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
        }

        struct rte_eth_conf port_conf = {0};

        /* Set RSS mode */
        port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
        port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK;
        port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
        port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;

        /* Set Rx VLAN stripping */
        if (ff_global_cfg.dpdk.vlan_strip) {
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
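                /* The NIC strips the 802.1Q tag on receive; the tag value
                 * remains available to the stack in mbuf->vlan_tci. */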
                port_conf.rxmode.hw_vlan_strip = 1;
            }
        }

        /* Enable HW CRC stripping */
        port_conf.rxmode.hw_strip_crc = 1;

        /* FIXME: Enable TCP LRO? */
#if 0
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
            printf("LRO is supported\n");
            port_conf.rxmode.enable_lro = 1;
            pconf->hw_features.rx_lro = 1;
        }
#endif

        /* Set Rx checksum checking */
        if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
            printf("RX checksum offload supported\n");
            port_conf.rxmode.hw_ip_checksum = 1;
            pconf->hw_features.rx_csum = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
            printf("TX ip checksum offload supported\n");
            pconf->hw_features.tx_csum_ip = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            printf("TX TCP&UDP checksum offload supported\n");
            pconf->hw_features.tx_csum_l4 = 1;
        }

        if (ff_global_cfg.dpdk.tso) {
            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                printf("TSO is supported\n");
                pconf->hw_features.tx_tso = 1;
            }
        } else {
            printf("TSO is disabled\n");
        }

        if (dev_info.reta_size) {
            /* reta size must be power of 2 */
            assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

            rss_reta_size[port_id] = dev_info.reta_size;
            printf("port[%d]: rss table size: %d\n", port_id,
                dev_info.reta_size);
        }

        if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
            continue;
        }

        int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
        if (ret != 0) {
            return ret;
        }
        uint16_t q;
        for (q = 0; q < nb_queues; q++) {
            ret = rte_eth_tx_queue_setup(port_id, q, TX_QUEUE_SIZE,
                socketid, &dev_info.default_txconf);
            if (ret < 0) {
                return ret;
            }

            ret = rte_eth_rx_queue_setup(port_id, q, RX_QUEUE_SIZE,
                socketid, &dev_info.default_rxconf, mbuf_pool);
            if (ret < 0) {
                return ret;
            }
        }

        ret = rte_eth_dev_start(port_id);
        if (ret < 0) {
            return ret;
        }

        if (nb_queues > 1) {
            /* set HW rss hash function to Toeplitz. */
            if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                struct rte_eth_hash_filter_info info = {0};
                info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                    RTE_ETH_FILTER_SET, &info) < 0) {
                    rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                        port_id);
                }
            }

            set_rss_table(port_id, dev_info.reta_size, nb_queues);
        }

        /* Enable RX in promiscuous mode for the Ethernet device. */
        if (ff_global_cfg.dpdk.promiscuous) {
            rte_eth_promiscuous_enable(port_id);
            ret = rte_eth_promiscuous_get(port_id);
            if (ret == 1) {
                printf("set port %u to promiscuous mode ok\n", port_id);
            } else {
                printf("set port %u to promiscuous mode error\n", port_id);
            }
        }

        /* Enable pcap dump */
        if (pconf->pcap) {
            ff_enable_pcap(pconf->pcap);
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            /* drop packets with a bad hardware-verified checksum */
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    /*
     * FIXME: should we save pkt->vlan_tci
     * if (pkt->ol_flags & PKT_RX_VLAN_PKT)
     */

    void *data = rte_pktmbuf_mtod(pkt, void *);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void *);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct ether_hdr *hdr;
    hdr = (const struct ether_hdr *)data;

    if (ntohs(hdr->ether_type) == ETHER_TYPE_ARP)
        return FILTER_ARP;

    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ntohs(hdr->ether_type) != ETHER_TYPE_IPv4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data + ETHER_HDR_LEN,
        len - ETHER_HDR_LEN);
}

static inline void
process_packets(uint8_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(qconf->pcap[port_id] != NULL)) {
            if (!pkts_from_ring) {
                ff_dump_packets(qconf->pcap[port_id], rtem);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void *);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, len, queue_id, nb_queues);
            if (ret < 0 || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
        if (filter == FILTER_ARP) {
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = rte_pktmbuf_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = rte_pktmbuf_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }

            ff_veth_input(ctx, rtem);
        } else if (enable_kni &&
                   ((filter == FILTER_KNI && kni_accept) ||
                    (filter == FILTER_UNKNOWN && !kni_accept))) {
            ff_kni_enqueue(port_id, rtem);
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_dispatch_ring(uint8_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* read packets from the dispatch ring and process them */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_DGRAM, 0);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static struct ff_top_args ff_status;
static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        default:
            handle_default_msg(msg);
            break;
    }
    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(qconf->pcap[port] != NULL)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(qconf->pcap[port], m_table[i]);
        }
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
        } while (++ret < n);
    }

    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}

int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        void *data = rte_pktmbuf_mtod(cur, void *);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        prev = cur;

        cur->data_len = len;
        off += len;
        total -= len;
        head->nb_segs++;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void *);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         * TCP segmentation offload.
         *
         * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *   implies PKT_TX_TCP_CKSUM)
         * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         * - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *   write the IP checksum to 0 in the packet
         * - fill the mbuf offload information: l2_len,
         *   l3_len, l4_len, tso_segsz
         * - calculate the pseudo header checksum without taking ip_len
         *   in account, and set it in the TCP header. Refer to
         *   rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *   used as helpers.
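         *
         * Only the TCP pseudo-header checksum is pre-computed below; the
         * NIC fills in the per-segment checksums when it splits the payload.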
         */
        if (offload.tso_seg_size) {
            struct tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc;
    int i, j, nb_rx, idle;
    uint8_t port_id, queue_id;
    struct lcore_conf *qconf;
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
        US_PER_S * BURST_TX_DRAIN_US;
    struct ff_dpdk_if_context *ctx;

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc > drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packet from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }

            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                    pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                    j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc > drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        end_tsc = rte_rdtsc();

        if (usch_tsc == cur_tsc) {
            usr_tsc = end_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_status.sys_tsc += sys_tsc;
        }

        ff_status.usr_tsc += usr_tsc;
        ff_status.work_tsc += end_tsc - cur_tsc;
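        /* Whatever remains of this iteration after the stack (sys) and
         * user loop (usr) time is accounted as idle time. */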
        ff_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_status.loops++;
    }

    return 0;
}

int
ff_dpdk_if_up(void) {
    int i;
    struct lcore_conf *qconf = &lcore_conf;
    for (i = 0; i < qconf->nb_tx_port; i++) {
        uint16_t port_id = qconf->tx_port_id[i];

        struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
        veth_ctx[port_id] = ff_veth_attach(pconf);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg) {
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1<<(7-b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i+4] & (1<<(7-b))))
                v |= 1;
        }
    }
    return (hash);
}

int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;
    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];

    if (nb_queues <= 1) {
        return 1;
    }

    uint16_t reta_size = rss_reta_size[ctx->port_id];
    uint16_t queueid = qconf->tx_queue_id[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
        default_rsskey_40bytes, datalen, data);

    return ((hash & (reta_size - 1)) % nb_queues) == queueid;
}

void
ff_regist_packet_dispatcher(dispatch_func_t func)
{
    packet_dispatcher = func;
}