/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"

#define MEMPOOL_CACHE_SIZE 256

#define DISPATCH_RING_SIZE 2048

#define MSG_RING_SIZE 32

/*
 * Configurable number of RX/TX ring descriptors
 */
#define RX_QUEUE_SIZE 512
#define TX_QUEUE_SIZE 512

#define MAX_PKT_BURST 32
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
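
/*
 * Each lcore buffers outgoing mbufs per port (struct mbuf_table, up to
 * MAX_PKT_BURST entries) and flushes them either when the table fills
 * (see send_single_packet) or when BURST_TX_DRAIN_US has elapsed in
 * main_loop(). The drain interval is converted to TSC cycles there,
 * e.g. with a 2 GHz TSC: (2e9 + 1e6 - 1) / 1e6 * 100 ~= 200,000 cycles.
 */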
/*
 * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send.
 */
#define MAX_TX_BURST (MAX_PKT_BURST / 2)

#define NB_SOCKETS 8

/* Configure how many packets ahead to prefetch, when reading packets */
#define PREFETCH_OFFSET 3

#define MAX_RX_QUEUE_PER_LCORE 16
#define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
#define MAX_RX_QUEUE_PER_PORT 128

#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

static int enable_kni;
static int kni_accept;

static int numa_on;

static struct rte_timer freebsd_clock;

// Mellanox Linux's driver key
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

static struct rte_eth_conf default_port_conf = {
    .rxmode = {
        .mq_mode = ETH_MQ_RX_RSS,
        .max_rx_pkt_len = ETHER_MAX_LEN,
        .split_hdr_size = 0, /**< hdr buf size */
        .header_split   = 0, /**< Header Split disabled */
        .hw_ip_checksum = 0, /**< IP checksum offload disabled */
        .hw_vlan_filter = 0, /**< VLAN filtering disabled */
        .hw_vlan_strip  = 0, /**< VLAN strip disabled. */
        .hw_vlan_extend = 0, /**< Extended VLAN disabled. */
        .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
        .hw_strip_crc   = 0, /**< CRC stripped by hardware */
        .enable_lro     = 0, /**< LRO disabled */
    },
    .rx_adv_conf = {
        .rss_conf = {
            .rss_key = default_rsskey_40bytes,
            .rss_key_len = 40,
            .rss_hf = ETH_RSS_PROTO_MASK,
        },
    },
    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
};

struct mbuf_table {
    uint16_t len;
    struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct lcore_rx_queue {
    uint16_t port_id;
    uint16_t queue_id;
} __rte_cache_aligned;

struct lcore_conf {
    uint16_t proc_id;
    uint16_t socket_id;
    uint16_t nb_queue_list[RTE_MAX_ETHPORTS];
    struct ff_port_cfg *port_cfgs;

    uint16_t nb_rx_queue;
    struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
    uint16_t nb_tx_port;
    uint16_t tx_port_id[RTE_MAX_ETHPORTS];
    uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
    struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
    char *pcap[RTE_MAX_ETHPORTS];
} __rte_cache_aligned;

static struct lcore_conf lcore_conf;

static struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

struct ff_msg_ring {
    char ring_name[2][RTE_RING_NAMESIZE];
    /* ring[0] for lcore recv msg, other send */
    /* ring[1] for lcore send msg, other read */
    struct rte_ring *ring[2];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;

struct ff_dpdk_if_context {
    void *sc;
    void *ifp;
    uint16_t port_id;
    struct ff_hw_features hw_features;
} __rte_cache_aligned;

static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

extern void ff_hardclock(void);

static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
    ff_update_current_ts();
}
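
/*
 * Bind an f-stack ifnet created by ff_veth to its backing DPDK port.
 * The returned context carries the port id and the hardware offload
 * capabilities probed in init_port_start(); it is consulted on every
 * RX/TX packet, so it is kept small and cache aligned.
 */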
struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
#define CHECK_INTERVAL 100 /* 100ms */
#define MAX_CHECK_TIME 90 /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u Mbps - %s\n",
                        (int)portid, (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}
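
/*
 * Work out which RX/TX queues this process owns. Every port lists the
 * lcores that serve it (port_cfgs[].lcore_list); the index of this
 * process's lcore in that list doubles as its queue id on the port, so
 * each queue of a port is polled by exactly one lcore.
 */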
static int
init_lcore_conf(void)
{
    uint8_t nb_dev_ports = rte_eth_dev_count();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
            ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    uint16_t proc_id;
    for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) {
        uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id];
        if (!lcore_config[lcore_id].detected) {
            rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
        }
    }

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %d\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        lcore_conf.pcap[port_id] = pconf->pcap;
        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}

static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;

    unsigned nb_mbuf = RTE_MAX(
        (nb_rx_queue*RX_QUEUE_SIZE +
        nb_ports*nb_lcores*MAX_PKT_BURST +
        nb_ports*nb_tx_queue*TX_QUEUE_SIZE +
        nb_lcores*MEMPOOL_CACHE_SIZE +
        nb_ports*KNI_MBUF_MAX +
        nb_ports*KNI_QUEUE_SIZE +
        nb_lcores*nb_ports*DISPATCH_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }
    }

    return 0;
}

static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL)
        return NULL;

    /* If the ring already exists, just attach to it */
    if (likely((ring = rte_ring_lookup(name)) != NULL))
        return ring;

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        return rte_ring_create(name, count, socket_id, flags);
    } else {
        return rte_ring_lookup(name);
    }
}
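
/*
 * Per port there is one rte_ring per RX queue. When the registered
 * dispatch callback (or the ARP broadcast in process_packets) decides a
 * packet belongs to another queue, the mbuf is enqueued on that queue's
 * ring and picked up by its owning lcore in process_dispatch_ring().
 */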
static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create rings according to the ports actually being used. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
}

static int
init_msg_ring(void)
{
    uint16_t i;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
            MSG_RING_SIZE * 2 * nb_procs,
            MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
            NULL, NULL, ff_msg_init, NULL,
            socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        snprintf(msg_ring[i].ring_name[1], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_OUT, i);

        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[0]);

        msg_ring[i].ring[1] = create_ring(msg_ring[i].ring_name[1],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[1] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[1]);
    }

    return 0;
}

static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i, ret;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
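
/*
 * Program the NIC's RSS redirection table so that hash values spread
 * round-robin over the configured queues. For example, with
 * reta_size = 128 and nb_queues = 4 the table becomes
 * 0,1,2,3,0,1,2,3,... so (rss_hash % 128) maps evenly onto queues 0-3.
 * ff_rss_check() below relies on exactly this layout.
 */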
static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}
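
/*
 * Per-port bring-up: probe device capabilities, mask out TX offloads the
 * hardware lacks, enable RSS with the default 40-byte key, record the
 * offloads f-stack may use (hw_features), then configure and start one
 * RX/TX queue pair per serving lcore. Only the primary process actually
 * configures and starts the ports; secondaries just read the capabilities.
 */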
static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = rte_lcore_to_socket_id(rte_lcore_id());
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socketid];
    uint16_t i;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        struct rte_eth_dev_info dev_info;
        rte_eth_dev_info_get(port_id, &dev_info);

        if (nb_queues > dev_info.max_rx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                nb_queues,
                dev_info.max_rx_queues);
        }

        if (nb_queues > dev_info.max_tx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                nb_queues,
                dev_info.max_tx_queues);
        }

        struct ether_addr addr;
        rte_eth_macaddr_get(port_id, &addr);
        printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
            " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
            (unsigned)port_id,
            addr.addr_bytes[0], addr.addr_bytes[1],
            addr.addr_bytes[2], addr.addr_bytes[3],
            addr.addr_bytes[4], addr.addr_bytes[5]);

        rte_memcpy(pconf->mac,
            addr.addr_bytes, ETHER_ADDR_LEN);

        /* Clear txq_flags - we do not need multi-mempool and refcnt */
        dev_info.default_txconf.txq_flags = ETH_TXQ_FLAGS_NOMULTMEMP |
            ETH_TXQ_FLAGS_NOREFCOUNT;

        /* Disable features that are not supported by port's HW */
        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMUDP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMTCP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMSCTP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) &&
            !(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_TSO)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
        }

        struct rte_eth_conf port_conf = {0};

        /* Set RSS mode */
        port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
        port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK;
        port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
        port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;

        /* Set Rx VLAN stripping */
        if (ff_global_cfg.dpdk.vlan_strip) {
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                port_conf.rxmode.hw_vlan_strip = 1;
            }
        }

        /* Enable HW CRC stripping */
        port_conf.rxmode.hw_strip_crc = 1;

        /* FIXME: Enable TCP LRO ? */
#if 0
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
            printf("LRO is supported\n");
            port_conf.rxmode.enable_lro = 1;
            pconf->hw_features.rx_lro = 1;
        }
#endif

        /* Set Rx checksum checking */
        if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
            printf("RX checksum offload supported\n");
            port_conf.rxmode.hw_ip_checksum = 1;
            pconf->hw_features.rx_csum = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
            printf("TX ip checksum offload supported\n");
            pconf->hw_features.tx_csum_ip = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            printf("TX TCP&UDP checksum offload supported\n");
            pconf->hw_features.tx_csum_l4 = 1;
        }

        if (ff_global_cfg.dpdk.tso) {
            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                printf("TSO is supported\n");
                pconf->hw_features.tx_tso = 1;
            }
        } else {
            printf("TSO is disabled\n");
        }

        if (dev_info.reta_size) {
            /* reta size must be power of 2 */
            assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

            rss_reta_size[port_id] = dev_info.reta_size;
            printf("port[%d]: rss table size: %d\n", port_id,
                dev_info.reta_size);
        }

        if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
            continue;
        }

        int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
        if (ret != 0) {
            return ret;
        }

        uint16_t q;
        for (q = 0; q < nb_queues; q++) {
            ret = rte_eth_tx_queue_setup(port_id, q, TX_QUEUE_SIZE,
                socketid, &dev_info.default_txconf);
            if (ret < 0) {
                return ret;
            }

            ret = rte_eth_rx_queue_setup(port_id, q, RX_QUEUE_SIZE,
                socketid, &dev_info.default_rxconf, mbuf_pool);
            if (ret < 0) {
                return ret;
            }
        }

        ret = rte_eth_dev_start(port_id);
        if (ret < 0) {
            return ret;
        }

        if (nb_queues > 1) {
            /* set HW rss hash function to Toeplitz. */
            if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                struct rte_eth_hash_filter_info info = {0};
                info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                    RTE_ETH_FILTER_SET, &info) < 0) {
                    rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                        port_id);
                }
            }

            set_rss_table(port_id, dev_info.reta_size, nb_queues);
        }
        /* Enable RX in promiscuous mode for the Ethernet device. */
        if (ff_global_cfg.dpdk.promiscuous) {
            rte_eth_promiscuous_enable(port_id);
            ret = rte_eth_promiscuous_get(port_id);
            if (ret == 1) {
                printf("set port %u to promiscuous mode ok\n", port_id);
            } else {
                printf("set port %u to promiscuous mode error\n", port_id);
            }
        }

        /* Enable pcap dump */
        if (pconf->pcap) {
            ff_enable_pcap(pconf->pcap);
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    /*
     * FIXME: should we save pkt->vlan_tci
     * if (pkt->ol_flags & PKT_RX_VLAN_PKT)
     */

    void *data = rte_pktmbuf_mtod(pkt, void*);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void*);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct ether_hdr *hdr;
    hdr = (const struct ether_hdr *)data;

    if (ntohs(hdr->ether_type) == ETHER_TYPE_ARP)
        return FILTER_ARP;

    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ntohs(hdr->ether_type) != ETHER_TYPE_IPv4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data + ETHER_HDR_LEN,
        len - ETHER_HDR_LEN);
}
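
/*
 * ARP frames are duplicated to every other queue and to KNI (see
 * process_packets), and each receiver frees its copy independently while
 * the original keeps flowing into the FreeBSD stack. A full deep copy of
 * every segment is therefore made instead of the reference-counted
 * indirect clone that rte_pktmbuf_clone() would produce.
 */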
static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}

/* copied from rte_pktmbuf_clone */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of a new segment failed */
    if (unlikely(mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}

static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(qconf->pcap[port_id] != NULL)) {
            if (!pkts_from_ring) {
                ff_dump_packets(qconf->pcap[port_id], rtem);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void*);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, len, queue_id, nb_queues);
            if (ret < 0 || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
        if (filter == FILTER_ARP) {
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }

            ff_veth_input(ctx, rtem);
        } else if (enable_kni &&
            ((filter == FILTER_KNI && kni_accept) ||
            (filter == FILTER_UNKNOWN && !kni_accept))) {
            ff_kni_enqueue(port_id, rtem);
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}
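
/*
 * Drain mbufs that other lcores redirected to this queue. They are run
 * through process_packets() like freshly received packets, but with
 * pkts_from_ring set so they are not dispatched or pcap-dumped again.
 */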
static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* dequeue packets from the dispatch ring and process them */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_DGRAM, 0);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static struct ff_top_args ff_status;

static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        default:
            handle_default_msg(msg);
            break;
    }
    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}
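
/*
 * The ff tools (sysctl, route, top, ...) run as separate processes and
 * talk to this lcore over a pair of rings: requests arrive on ring[0]
 * and the same ff_msg object is returned on ring[1] with msg->result
 * filled in. At most one message is handled per loop iteration.
 */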
static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(qconf->pcap[port] != NULL)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(qconf->pcap[port], m_table[i]);
        }
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
        } while (++ret < n);
    }

    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}
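
/*
 * Transmit path from the FreeBSD stack: the ff mbuf chain is copied into
 * a freshly allocated chain of rte_mbufs (one segment per
 * RTE_MBUF_DEFAULT_DATAROOM bytes), the requested checksum/TSO offloads
 * are translated into ol_flags on the head segment, and the packet is
 * queued on this lcore's per-port TX buffer via send_single_packet().
 */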
int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        void *data = rte_pktmbuf_mtod(cur, void*);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        prev = cur;

        cur->data_len = len;
        off += len;
        total -= len;
        head->nb_segs++;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void*);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         * TCP segmentation offload.
         *
         * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *   implies PKT_TX_TCP_CKSUM)
         * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         * - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *   write the IP checksum to 0 in the packet
         * - fill the mbuf offload information: l2_len,
         *   l3_len, l4_len, tso_segsz
         * - calculate the pseudo header checksum without taking ip_len
         *   in account, and set it in the TCP header. Refer to
         *   rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *   used as helpers.
         */
        if (offload.tso_seg_size) {
            struct tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}
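
/*
 * Per-lcore event loop. Each iteration: run expired rte_timers (the
 * FreeBSD hardclock), drain per-port TX buffers that have been pending
 * longer than BURST_TX_DRAIN_US, poll KNI (primary process only), drain
 * the dispatch ring, receive up to MAX_PKT_BURST packets per owned queue
 * with prefetching, service the message ring, and finally invoke the
 * user's loop callback. The rdtsc samples split each iteration into
 * sys/usr/idle time that feeds ff_status for the ff top tool.
 */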
static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc;
    int i, j, nb_rx, idle;
    uint16_t port_id, queue_id;
    struct lcore_conf *qconf;
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
        US_PER_S * BURST_TX_DRAIN_US;
    struct ff_dpdk_if_context *ctx;

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc > drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packet from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }

            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                    pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                    j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc > drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        end_tsc = rte_rdtsc();

        if (usch_tsc == cur_tsc) {
            usr_tsc = end_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_status.sys_tsc += sys_tsc;
        }

        ff_status.usr_tsc += usr_tsc;
        ff_status.work_tsc += end_tsc - cur_tsc;
        ff_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_status.loops++;
    }

    return 0;
}

int
ff_dpdk_if_up(void) {
    int i;
    struct lcore_conf *qconf = &lcore_conf;
    for (i = 0; i < qconf->nb_tx_port; i++) {
        uint16_t port_id = qconf->tx_port_id[i];

        struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
        veth_ctx[port_id] = ff_veth_attach(pconf);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed\n");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg) {
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

/*
 * Software Toeplitz hash, computed with the same 40-byte key that is
 * programmed into the NIC, so the result can be compared against the
 * RSS redirection table written by set_rss_table().
 */
static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1<<(7-b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i+4] & (1<<(7-b))))
                v |= 1;
        }
    }
    return (hash);
}

int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;
    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];

    if (nb_queues <= 1) {
        return 1;
    }

    uint16_t reta_size = rss_reta_size[ctx->port_id];
    uint16_t queueid = qconf->tx_queue_id[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
        default_rsskey_40bytes, datalen, data);

    /* Does the 4-tuple hash back to the queue this lcore serves? */
    return ((hash & (reta_size - 1)) % nb_queues) == queueid;
}

void
ff_regist_packet_dispatcher(dispatch_func_t func)
{
    packet_dispatcher = func;
}

uint64_t
ff_get_tsc_ns(void)
{
    uint64_t cur_tsc = rte_rdtsc();
    uint64_t hz = rte_get_tsc_hz();
    return ((double)cur_tsc / (double)hz) * NS_PER_S;
}
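
/*
 * Illustrative sketch of a user-supplied dispatch callback, assuming
 * dispatch_func_t matches the call made in process_packets(), i.e.
 * (data, len, queue_id, nb_queues) returning the target queue id or a
 * negative value to drop the packet:
 *
 *   static int
 *   my_dispatcher(void *data, uint16_t len, uint16_t queue_id, uint16_t nb_queues)
 *   {
 *       (void)data; (void)len; (void)queue_id; (void)nb_queues;
 *       return 0;    // steer everything to queue 0
 *   }
 *
 *   // registered once before ff_dpdk_run():
 *   // ff_regist_packet_dispatcher(my_dispatcher);
 */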