/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"

#define MEMPOOL_CACHE_SIZE 256

#define DISPATCH_RING_SIZE 2048

#define MSG_RING_SIZE 32

/*
 * Configurable number of RX/TX ring descriptors
 */
#define RX_QUEUE_SIZE 512
#define TX_QUEUE_SIZE 512

#define MAX_PKT_BURST 32
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
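/*
 * TX packets are buffered per port in lcore_conf.tx_mbufs and flushed either
 * when a buffer reaches MAX_PKT_BURST packets (see send_single_packet()) or
 * when the main loop notices that more than BURST_TX_DRAIN_US microseconds
 * have elapsed since the last drain (see main_loop()).
 */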
/*
 * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send.
 */
#define MAX_TX_BURST (MAX_PKT_BURST / 2)

#define NB_SOCKETS 8

/* Configure how many packets ahead to prefetch, when reading packets */
#define PREFETCH_OFFSET 3

#define MAX_RX_QUEUE_PER_LCORE 16
#define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
#define MAX_RX_QUEUE_PER_PORT 128

#ifdef FF_KNI
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

static int enable_kni;
static int kni_accept;
#endif

static int numa_on;

static struct rte_timer freebsd_clock;

/* Default RSS hash key (the key used by Mellanox's Linux driver). */
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

static struct rte_eth_conf default_port_conf = {
    .rxmode = {
        .mq_mode = ETH_MQ_RX_RSS,
        .max_rx_pkt_len = ETHER_MAX_LEN,
        .split_hdr_size = 0,    /**< Header buffer size */
        .header_split   = 0,    /**< Header Split disabled */
        .hw_ip_checksum = 0,    /**< IP checksum offload disabled */
        .hw_vlan_filter = 0,    /**< VLAN filtering disabled */
        .hw_vlan_strip  = 0,    /**< VLAN strip disabled */
        .hw_vlan_extend = 0,    /**< Extended VLAN disabled */
        .jumbo_frame    = 0,    /**< Jumbo Frame support disabled */
        .hw_strip_crc   = 0,    /**< CRC stripping by hardware disabled */
        .enable_lro     = 0,    /**< LRO disabled */
    },
    .rx_adv_conf = {
        .rss_conf = {
            .rss_key = default_rsskey_40bytes,
            .rss_key_len = 40,
            .rss_hf = ETH_RSS_PROTO_MASK,
        },
    },
    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
};

struct mbuf_table {
    uint16_t len;
    struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct lcore_rx_queue {
    uint16_t port_id;
    uint16_t queue_id;
} __rte_cache_aligned;

struct lcore_conf {
    uint16_t proc_id;
    uint16_t socket_id;
    uint16_t nb_queue_list[RTE_MAX_ETHPORTS];
    struct ff_port_cfg *port_cfgs;

    uint16_t nb_rx_queue;
    struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
    uint16_t nb_tx_port;
    uint16_t tx_port_id[RTE_MAX_ETHPORTS];
    uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
    struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
    char *pcap[RTE_MAX_ETHPORTS];
} __rte_cache_aligned;

static struct lcore_conf lcore_conf;

static struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

struct ff_msg_ring {
    char ring_name[2][RTE_RING_NAMESIZE];
    /* ring[0]: this lcore receives messages, the peer process sends them. */
    /* ring[1]: this lcore sends replies, the peer process reads them. */
    struct rte_ring *ring[2];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;

struct ff_dpdk_if_context {
    void *sc;
    void *ifp;
    uint16_t port_id;
    struct ff_hw_features hw_features;
} __rte_cache_aligned;

static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

extern void ff_hardclock(void);

static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
    ff_update_current_ts();
}
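/*
 * Create the per-interface context that ties an ff_veth ifnet to its DPDK
 * port: the driver softc/ifnet pointers, the port id, and the hardware
 * features detected in init_port_start().  The polling path later uses this
 * context to deliver received packets to the right ifnet.
 */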
struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
#define CHECK_INTERVAL 100 /* 100ms */
#define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u Mbps - %s\n",
                        (int)portid, (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}
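/*
 * Each f-stack process is pinned to exactly one lcore.  For every port this
 * lcore appears on in the port's lcore_list, its index in that list becomes
 * the RX/TX queue id this process polls, so the number of queues configured
 * per port equals the number of lcores sharing that port.
 */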
static int
init_lcore_conf(void)
{
    uint8_t nb_dev_ports = rte_eth_dev_count();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
            ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    uint16_t proc_id;
    for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) {
        uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id];
        if (!lcore_config[lcore_id].detected) {
            rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
        }
    }

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %u\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        lcore_conf.pcap[port_id] = pconf->pcap;
        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}

static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;

    unsigned nb_mbuf = RTE_MAX(
        (nb_rx_queue * RX_QUEUE_SIZE +
        nb_ports * nb_lcores * MAX_PKT_BURST +
        nb_ports * nb_tx_queue * TX_QUEUE_SIZE +
        nb_lcores * MEMPOOL_CACHE_SIZE +
#ifdef FF_KNI
        nb_ports * KNI_MBUF_MAX +
        nb_ports * KNI_QUEUE_SIZE +
#endif
        nb_lcores * nb_ports * DISPATCH_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }
    }

    return 0;
}

static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL)
        return NULL;

    /* If the ring already exists, just attach to it. */
    if (likely((ring = rte_ring_lookup(name)) != NULL))
        return ring;

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        return rte_ring_create(name, count, socket_id, flags);
    } else {
        return rte_ring_lookup(name);
    }
}
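/*
 * One dispatch ring exists per (port, queue) pair.  When a packet polled by
 * one queue actually belongs to another queue (a packet_dispatcher decision,
 * or an ARP broadcast clone), it is enqueued to that queue's dispatch ring
 * and picked up by the owning process in process_dispatch_ring().
 */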
static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create rings according to the ports actually being used. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
}

static int
init_msg_ring(void)
{
    uint16_t i;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create the message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
            MSG_RING_SIZE * 2 * nb_procs,
            MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
            NULL, NULL, ff_msg_init, NULL,
            socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        snprintf(msg_ring[i].ring_name[1], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_OUT, i);

        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[0]);

        msg_ring[i].ring[1] = create_ring(msg_ring[i].ring_name[1],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[1] == NULL)
            rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[1]);
    }

    return 0;
}

#ifdef FF_KNI
static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
#endif
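/*
 * Program the NIC's RSS redirection table so its reta_size entries are spread
 * round-robin over the configured queues.  Together with the shared Toeplitz
 * key this keeps hardware queue selection consistent with ff_rss_check().
 */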
static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}

static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        struct rte_eth_dev_info dev_info;
        rte_eth_dev_info_get(port_id, &dev_info);

        if (nb_queues > dev_info.max_rx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                nb_queues,
                dev_info.max_rx_queues);
        }

        if (nb_queues > dev_info.max_tx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                nb_queues,
                dev_info.max_tx_queues);
        }

        struct ether_addr addr;
        rte_eth_macaddr_get(port_id, &addr);
        printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
            " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
            (unsigned)port_id,
            addr.addr_bytes[0], addr.addr_bytes[1],
            addr.addr_bytes[2], addr.addr_bytes[3],
            addr.addr_bytes[4], addr.addr_bytes[5]);

        rte_memcpy(pconf->mac,
            addr.addr_bytes, ETHER_ADDR_LEN);

        /* Clear txq_flags - we do not need multi-mempool and refcnt */
        dev_info.default_txconf.txq_flags = ETH_TXQ_FLAGS_NOMULTMEMP |
            ETH_TXQ_FLAGS_NOREFCOUNT;

        /* Disable features that are not supported by port's HW */
        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMUDP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMTCP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMSCTP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) &&
            !(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_TSO)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
        }

        struct rte_eth_conf port_conf = {0};

        /* Set RSS mode */
        port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
        port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK;
        port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
        port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;

        /* Set Rx VLAN stripping */
        if (ff_global_cfg.dpdk.vlan_strip) {
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                port_conf.rxmode.hw_vlan_strip = 1;
            }
        }

        /* Enable HW CRC stripping */
        port_conf.rxmode.hw_strip_crc = 1;
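        /*
         * The hw_features flags filled in below are consumed elsewhere:
         * ff_veth_input() drops packets with bad hardware checksums only when
         * rx_csum is set, and ff_dpdk_if_send() requests L4 checksum/TSO
         * offload only when tx_csum_l4 is set; the full set is also recorded
         * in the interface context by ff_dpdk_register_if().
         */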
        /* FIXME: Enable TCP LRO ? */
#if 0
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
            printf("LRO is supported\n");
            port_conf.rxmode.enable_lro = 1;
            pconf->hw_features.rx_lro = 1;
        }
#endif

        /* Set Rx checksum checking */
        if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
            printf("RX checksum offload supported\n");
            port_conf.rxmode.hw_ip_checksum = 1;
            pconf->hw_features.rx_csum = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
            printf("TX ip checksum offload supported\n");
            pconf->hw_features.tx_csum_ip = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            printf("TX TCP&UDP checksum offload supported\n");
            pconf->hw_features.tx_csum_l4 = 1;
        }

        if (ff_global_cfg.dpdk.tso) {
            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                printf("TSO is supported\n");
                pconf->hw_features.tx_tso = 1;
            }
        } else {
            printf("TSO is disabled\n");
        }

        if (dev_info.reta_size) {
            /* reta size must be power of 2 */
            assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

            rss_reta_size[port_id] = dev_info.reta_size;
            printf("port[%d]: rss table size: %d\n", port_id,
                dev_info.reta_size);
        }

        if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
            continue;
        }

        int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
        if (ret != 0) {
            return ret;
        }

        uint16_t q;
        for (q = 0; q < nb_queues; q++) {
            if (numa_on) {
                uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                socketid = rte_lcore_to_socket_id(lcore_id);
            }
            mbuf_pool = pktmbuf_pool[socketid];

            ret = rte_eth_tx_queue_setup(port_id, q, TX_QUEUE_SIZE,
                socketid, &dev_info.default_txconf);
            if (ret < 0) {
                return ret;
            }

            ret = rte_eth_rx_queue_setup(port_id, q, RX_QUEUE_SIZE,
                socketid, &dev_info.default_rxconf, mbuf_pool);
            if (ret < 0) {
                return ret;
            }
        }

        ret = rte_eth_dev_start(port_id);
        if (ret < 0) {
            return ret;
        }

        if (nb_queues > 1) {
            /* set HW rss hash function to Toeplitz. */
            if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                struct rte_eth_hash_filter_info info = {0};
                info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                    RTE_ETH_FILTER_SET, &info) < 0) {
                    rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                        port_id);
                }
            }

            set_rss_table(port_id, dev_info.reta_size, nb_queues);
        }
        /* Enable RX in promiscuous mode for the Ethernet device. */
        if (ff_global_cfg.dpdk.promiscuous) {
            rte_eth_promiscuous_enable(port_id);
            ret = rte_eth_promiscuous_get(port_id);
            if (ret == 1) {
                printf("set port %u to promiscuous mode ok\n", port_id);
            } else {
                printf("set port %u to promiscuous mode error\n", port_id);
            }
        }

        /* Enable pcap dump */
        if (pconf->pcap) {
            ff_enable_pcap(pconf->pcap);
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

#ifdef FF_KNI
    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }
#endif

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    /*
     * FIXME: should we save pkt->vlan_tci
     * if (pkt->ol_flags & PKT_RX_VLAN_PKT)
     */

    void *data = rte_pktmbuf_mtod(pkt, void *);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void *);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct ether_hdr *hdr;
    hdr = (const struct ether_hdr *)data;

    if (ntohs(hdr->ether_type) == ETHER_TYPE_ARP)
        return FILTER_ARP;

#ifndef FF_KNI
    return FILTER_UNKNOWN;
#else
    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ntohs(hdr->ether_type) != ETHER_TYPE_IPv4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data + ETHER_HDR_LEN,
        len - ETHER_HDR_LEN);
#endif
}
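/*
 * Unlike rte_pktmbuf_clone(), which creates indirect mbufs that share the
 * original data buffers, the deep-clone helpers below copy every segment's
 * payload and metadata into freshly allocated mbufs, so the clone can be
 * handed to another lcore's dispatch ring or to KNI while the original mbuf
 * is consumed independently.
 */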
static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}

/* copied from rte_pktmbuf_clone */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of a new segment failed */
    if (unlikely(mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}
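/*
 * Per-burst RX handling.  For every packet: run the user-registered
 * packet_dispatcher (if any) and forward the packet to another queue's
 * dispatch ring when it belongs elsewhere; broadcast deep clones of ARP
 * packets to every other queue's dispatch ring (and to KNI) while still
 * delivering the original locally; hand KNI-filtered traffic to the kernel;
 * and deliver the rest to the FreeBSD stack through ff_veth_input().
 * pkts_from_ring marks packets that already went through a dispatch ring,
 * so they are neither re-dispatched nor re-broadcast.
 */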
static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(qconf->pcap[port_id] != NULL)) {
            if (!pkts_from_ring) {
                ff_dump_packets(qconf->pcap[port_id], rtem);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void *);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, len, queue_id, nb_queues);
            if (ret < 0 || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
        if (filter == FILTER_ARP) {
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni &&
            ((filter == FILTER_KNI && kni_accept) ||
            (filter == FILTER_UNKNOWN && !kni_accept))) {
            ff_kni_enqueue(port_id, rtem);
#endif
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* read packets from the dispatch ring and process them */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_DGRAM, 0);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static struct ff_top_args ff_status;

static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}
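/*
 * Control-plane messages (sysctl/ioctl/route/top/...) arrive from companion
 * tools over msg_ring[proc_id].ring[0]; the reply, carried in the same
 * ff_msg buffer with msg->result filled in, is pushed back on ring[1].
 */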
static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        default:
            handle_default_msg(msg);
            break;
    }
    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(qconf->pcap[port] != NULL)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(qconf->pcap[port], m_table[i]);
        }
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
        } while (++ret < n);
    }

    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}
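/*
 * Transmit path from the FreeBSD stack: copy the ff mbuf chain into a chain
 * of DPDK mbuf segments (at most RTE_MBUF_DEFAULT_DATAROOM bytes each),
 * translate the requested checksum/TSO offloads into ol_flags, then queue
 * the packet on this lcore's TX buffer via send_single_packet().
 */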
int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        void *data = rte_pktmbuf_mtod(cur, void *);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ?
            RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        prev = cur;

        cur->data_len = len;
        off += len;
        total -= len;
        head->nb_segs++;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void *);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
        /*
         * TCP segmentation offload.
         *
         * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *   implies PKT_TX_TCP_CKSUM)
         * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         * - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *   write the IP checksum to 0 in the packet
         * - fill the mbuf offload information: l2_len,
         *   l3_len, l4_len, tso_segsz
         * - calculate the pseudo header checksum without taking ip_len
         *   in account, and set it in the TCP header. Refer to
         *   rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *   used as helpers.
         */
        if (offload.tso_seg_size) {
            struct tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}
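/*
 * Busy-polling loop executed on every lcore.  Per iteration it: runs expired
 * rte_timer callbacks (the FreeBSD hardclock), drains per-port TX buffers
 * once BURST_TX_DRAIN_US has elapsed, polls KNI, the dispatch ring and the
 * NIC RX queues this process owns, services the control message ring, and
 * invokes the user loop callback.  The rdtsc samples taken around these
 * phases feed the sys/usr/idle accounting reported via the "top" message
 * (ff_status).
 */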
static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc;
    int i, j, nb_rx, idle;
    uint16_t port_id, queue_id;
    struct lcore_conf *qconf;
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
        US_PER_S * BURST_TX_DRAIN_US;
    struct ff_dpdk_if_context *ctx;

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc > drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packets from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }
#endif

            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                    pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                    j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc > drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        end_tsc = rte_rdtsc();

        if (usch_tsc == cur_tsc) {
            usr_tsc = end_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_status.sys_tsc += sys_tsc;
        }

        ff_status.usr_tsc += usr_tsc;
        ff_status.work_tsc += end_tsc - cur_tsc;
        ff_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_status.loops++;
    }

    return 0;
}

int
ff_dpdk_if_up(void) {
    int i;
    struct lcore_conf *qconf = &lcore_conf;
    for (i = 0; i < qconf->nb_tx_port; i++) {
        uint16_t port_id = qconf->tx_port_id[i];

        struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
        veth_ctx[port_id] = ff_veth_attach(pconf);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg) {
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0] << 24) + (key[1] << 16) + (key[2] << 8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1 << (7 - b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i + 4] & (1 << (7 - b))))
                v |= 1;
        }
    }
    return (hash);
}

/*
 * Check whether the given 4-tuple hashes, via the shared Toeplitz key and the
 * port's redirection table, to the queue owned by this process.
 */
int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;
    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];

    if (nb_queues <= 1) {
        return 1;
    }

    uint16_t reta_size = rss_reta_size[ctx->port_id];
    uint16_t queueid = qconf->tx_queue_id[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
        default_rsskey_40bytes, datalen, data);

    return ((hash & (reta_size - 1)) % nb_queues) == queueid;
}

void
ff_regist_packet_dispatcher(dispatch_func_t func)
{
    packet_dispatcher = func;
}

uint64_t
ff_get_tsc_ns(void)
{
    uint64_t cur_tsc = rte_rdtsc();
    uint64_t hz = rte_get_tsc_hz();
    return ((double)cur_tsc / (double)hz) * NS_PER_S;
}