/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"

#define MEMPOOL_CACHE_SIZE 256

#define ARP_RING_SIZE 2048

#define MSG_RING_SIZE 32

/*
 * Configurable number of RX/TX ring descriptors
 */
#define RX_QUEUE_SIZE 512
#define TX_QUEUE_SIZE 512

#define MAX_PKT_BURST 32
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

/*
 * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send.
 */
#define MAX_TX_BURST    (MAX_PKT_BURST / 2)

#define NB_SOCKETS 8

/* Configure how many packets ahead to prefetch, when reading packets */
#define PREFETCH_OFFSET 3

#define MAX_RX_QUEUE_PER_LCORE 16
#define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
#define MAX_RX_QUEUE_PER_PORT 128

#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

static int enable_kni;
static int kni_accept;

static struct rte_timer freebsd_clock;

/* Default RSS hash key, taken from the Mellanox Linux driver */
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

static struct rte_eth_conf default_port_conf = {
    .rxmode = {
        .mq_mode = ETH_MQ_RX_RSS,
        .max_rx_pkt_len = ETHER_MAX_LEN,
        .split_hdr_size = 0, /**< hdr buf size */
        .header_split   = 0, /**< Header Split disabled */
        .hw_ip_checksum = 0, /**< IP checksum offload disabled */
        .hw_vlan_filter = 0, /**< VLAN filtering disabled */
        .hw_vlan_strip  = 0, /**< VLAN strip disabled. */
        .hw_vlan_extend = 0, /**< Extended VLAN disabled. */
        .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
        .hw_strip_crc   = 0, /**< CRC stripping disabled */
        .enable_lro     = 0, /**< LRO disabled */
    },
    .rx_adv_conf = {
        .rss_conf = {
            .rss_key = default_rsskey_40bytes,
            .rss_key_len = 40,
            .rss_hf = ETH_RSS_PROTO_MASK,
        },
    },
    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
};

struct mbuf_table {
    uint16_t len;
    struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct lcore_rx_queue {
    uint8_t port_id;
    uint8_t queue_id;
} __rte_cache_aligned;

struct lcore_conf {
    uint16_t proc_id;
    uint16_t nb_procs;
    uint16_t socket_id;
    uint16_t nb_rx_queue;
    uint16_t *proc_lcore;
    struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
    uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
    struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
    char *pcap[RTE_MAX_ETHPORTS];
} __rte_cache_aligned;

static struct lcore_conf lcore_conf;

static struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **arp_ring[RTE_MAX_LCORE];

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

struct ff_msg_ring {
    char ring_name[2][RTE_RING_NAMESIZE];
    /* ring[0]: requests sent to this lcore by other processes */
    /* ring[1]: replies sent back by this lcore for other processes to read */
    struct rte_ring *ring[2];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;

struct ff_dpdk_if_context {
    void *sc;
    void *ifp;
    uint16_t port_id;
    struct ff_hw_features hw_features;
} __rte_cache_aligned;

static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];
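
/*
 * Descriptive overview of the process-shared data declared above (added note,
 * derived from how the objects are used below):
 *
 *   pktmbuf_pool[socket]   - one mbuf pool per NUMA socket, created by the
 *                            primary process and looked up by secondaries.
 *   arp_ring[proc][port]   - per-process, per-port rings; received ARP frames
 *                            are cloned into every other process's ring so
 *                            each stack instance learns the neighbour.
 *   msg_ring[proc]         - request/reply ring pair (FF_MSG_RING_IN/OUT)
 *                            used by external control tools to talk to a
 *                            given f-stack process.
 *   veth_ctx[port]         - glue between a DPDK port and its FreeBSD-side
 *                            interface, filled in by ff_dpdk_if_up().
 */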

extern void ff_hardclock(void);

static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
    ff_update_current_ts();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
#define CHECK_INTERVAL 100 /* 100ms */
#define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint8_t portid = ff_global_cfg.dpdk.port_cfgs[i].port_id;
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u Mbps - %s\n",
                        (int)portid, (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

static int
init_lcore_conf(void)
{
    uint8_t nb_ports = rte_eth_dev_count();
    if (nb_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;
    lcore_conf.nb_procs = ff_global_cfg.dpdk.nb_procs;

    lcore_conf.proc_lcore = rte_zmalloc(NULL,
        sizeof(uint16_t) * lcore_conf.nb_procs, 0);
    if (lcore_conf.proc_lcore == NULL) {
        rte_exit(EXIT_FAILURE, "rte_zmalloc proc_lcore failed\n");
    }
    rte_memcpy(lcore_conf.proc_lcore, ff_global_cfg.dpdk.proc_lcore,
        sizeof(uint16_t) * lcore_conf.nb_procs);

    uint16_t proc_id;
    for (proc_id = 0; proc_id < lcore_conf.nb_procs; proc_id++) {
        uint16_t lcore_id = lcore_conf.proc_lcore[proc_id];
        if (!lcore_config[lcore_id].detected) {
            rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
        }
    }

    uint16_t socket_id = 0;
    if (ff_global_cfg.dpdk.numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    /* Currently, proc_id maps 1:1 to the rx/tx queue id on each port. */
    uint8_t port_id, enabled_ports = 0;
    for (port_id = 0; port_id < nb_ports; port_id++) {
        if (ff_global_cfg.dpdk.port_mask &&
            (ff_global_cfg.dpdk.port_mask & (1 << port_id)) == 0) {
            printf("\nSkipping disabled port %d\n", port_id);
            continue;
        }

        if (port_id >= ff_global_cfg.dpdk.nb_ports) {
            printf("\nSkipping non-configured port %d\n", port_id);
            break;
        }

        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = lcore_conf.proc_id;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = lcore_conf.proc_id;
        lcore_conf.pcap[port_id] = ff_global_cfg.dpdk.port_cfgs[enabled_ports].pcap;

        ff_global_cfg.dpdk.port_cfgs[enabled_ports].port_id = port_id;

        enabled_ports++;
    }

    ff_global_cfg.dpdk.nb_ports = enabled_ports;

    return 0;
}

static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;

    unsigned nb_mbuf = RTE_MAX(
        (nb_rx_queue * RX_QUEUE_SIZE +
        nb_ports * nb_lcores * MAX_PKT_BURST +
        nb_ports * nb_tx_queue * TX_QUEUE_SIZE +
        nb_lcores * MEMPOOL_CACHE_SIZE +
        nb_ports * KNI_MBUF_MAX +
        nb_ports * KNI_QUEUE_SIZE +
        nb_lcores * nb_ports * ARP_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];
    int numa_on = ff_global_cfg.dpdk.numa_on;

    for (i = 0; i < lcore_conf.nb_procs; i++) {
        lcore_id = lcore_conf.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }
    }

    return 0;
}

static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL)
        return NULL;

    /* If the ring already exists, just attach to it */
    if (likely((ring = rte_ring_lookup(name)) != NULL))
        return ring;

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        return rte_ring_create(name, count, socket_id, flags);
    } else {
        return rte_ring_lookup(name);
    }
}

static int
init_arp_ring(void)
{
    int i, j;
    char name_buf[RTE_RING_NAMESIZE];
    int nb_procs = ff_global_cfg.dpdk.nb_procs;
    int proc_id = ff_global_cfg.dpdk.proc_id;

    /* Allocate the per-process arp ring pointer arrays according to the eth device count. */
    int nb_ports = rte_eth_dev_count();
    for (i = 0; i < nb_procs; ++i) {
        snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_%d_%d",
            proc_id, i);

        arp_ring[i] = rte_zmalloc(name_buf,
            sizeof(struct rte_ring *) * nb_ports,
            RTE_CACHE_LINE_SIZE);
        if (arp_ring[i] == NULL) {
            rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                "failed\n", name_buf);
        }
    }

    unsigned socketid = lcore_conf.socket_id;

    /* Create the rings according to the ports actually in use. */
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[j].port_id;

        for (i = 0; i < nb_procs; ++i) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "arp_ring_%d_%d", i, port_id);
            arp_ring[i][port_id] = create_ring(name_buf, ARP_RING_SIZE,
                socketid, RING_F_SC_DEQ);

            if (arp_ring[i][port_id] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(arp_ring[i][port_id]));
        }
    }

    return 0;
}

static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
}

static int
init_msg_ring(void)
{
    uint16_t i;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create the message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
            MSG_RING_SIZE * 2 * nb_procs,
            MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
            NULL, NULL, ff_msg_init, NULL,
            socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        snprintf(msg_ring[i].ring_name[1], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_OUT, i);

        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[0]);

        msg_ring[i].ring[1] = create_ring(msg_ring[i].ring_name[1],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[1] == NULL)
            rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[1]);
    }

    return 0;
}
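
/*
 * Minimal sketch of the client side of the message rings above, i.e. what an
 * external control tool would do to query one f-stack process.  This is an
 * illustration of the protocol implied by init_msg_ring(), process_msg_ring()
 * and handle_top_msg(); it is not taken from such a tool.  It assumes the
 * caller runs as a DPDK secondary process (rte_eal_init() already done) and
 * the example_query_top name is hypothetical, so it is kept out of the build.
 */
#if 0
static int
example_query_top(uint16_t proc_id, struct ff_top_args *top)
{
    char name[RTE_RING_NAMESIZE];
    struct rte_mempool *pool = rte_mempool_lookup(FF_MSG_POOL);

    snprintf(name, sizeof(name), "%s%u", FF_MSG_RING_IN, proc_id);
    struct rte_ring *to_fstack = rte_ring_lookup(name);

    snprintf(name, sizeof(name), "%s%u", FF_MSG_RING_OUT, proc_id);
    struct rte_ring *from_fstack = rte_ring_lookup(name);

    if (pool == NULL || to_fstack == NULL || from_fstack == NULL) {
        return -1;
    }

    void *obj;
    if (rte_mempool_get(pool, &obj) < 0) {
        return -1;
    }

    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_TOP;

    /* process_msg_ring() picks the request up from ring[0]... */
    if (rte_ring_enqueue(to_fstack, msg) < 0) {
        rte_mempool_put(pool, obj);
        return -1;
    }

    /* ...and handle_top_msg() posts the reply on ring[1]. */
    void *reply;
    while (rte_ring_dequeue(from_fstack, &reply) != 0) {
        ;   /* busy-wait for the reply; a real tool would time out */
    }

    struct ff_msg *resp = (struct ff_msg *)reply;
    int ret = (resp->result == 0) ? 0 : -1;
    *top = resp->top;
    rte_mempool_put(pool, reply);

    return ret;
}
#endif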

static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[i].port_id;
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}

static void
set_rss_table(uint8_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}

static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = rte_lcore_to_socket_id(rte_lcore_id());
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socketid];
    uint16_t i;

    for (i = 0; i < nb_ports; i++) {
        uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[i].port_id;

        struct rte_eth_dev_info dev_info;
        rte_eth_dev_info_get(port_id, &dev_info);

        if (nb_procs > dev_info.max_rx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                nb_procs,
                dev_info.max_rx_queues);
        }

        if (nb_procs > dev_info.max_tx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                nb_procs,
                dev_info.max_tx_queues);
        }

        struct ether_addr addr;
        rte_eth_macaddr_get(port_id, &addr);
        printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
            " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
            (unsigned)port_id,
            addr.addr_bytes[0], addr.addr_bytes[1],
            addr.addr_bytes[2], addr.addr_bytes[3],
            addr.addr_bytes[4], addr.addr_bytes[5]);

        rte_memcpy(ff_global_cfg.dpdk.port_cfgs[i].mac,
            addr.addr_bytes, ETHER_ADDR_LEN);

        /* Clear txq_flags - we do not need multi-mempool and refcnt */
        dev_info.default_txconf.txq_flags = ETH_TXQ_FLAGS_NOMULTMEMP |
            ETH_TXQ_FLAGS_NOREFCOUNT;

        /* Disable features that are not supported by the port's HW */
        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMUDP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMTCP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMSCTP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) &&
            !(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_TSO)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
        }

        struct rte_eth_conf port_conf = {0};

        /* Set RSS mode */
        port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
        port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK;
        port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
        port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;

        /* Set Rx VLAN stripping */
        if (ff_global_cfg.dpdk.vlan_strip) {
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                port_conf.rxmode.hw_vlan_strip = 1;
            }
        }

        /* Enable HW CRC stripping */
        port_conf.rxmode.hw_strip_crc = 1;

        /* FIXME: enable TCP LRO? */
#if 0
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
            printf("LRO is supported\n");
            port_conf.rxmode.enable_lro = 1;
            ff_global_cfg.dpdk.port_cfgs[i].hw_features.rx_lro = 1;
        }
#endif

        /* Set Rx checksum checking */
        if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
            printf("RX checksum offload supported\n");
            port_conf.rxmode.hw_ip_checksum = 1;
            ff_global_cfg.dpdk.port_cfgs[i].hw_features.rx_csum = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
            printf("TX ip checksum offload supported\n");
            ff_global_cfg.dpdk.port_cfgs[i].hw_features.tx_csum_ip = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            printf("TX TCP&UDP checksum offload supported\n");
            ff_global_cfg.dpdk.port_cfgs[i].hw_features.tx_csum_l4 = 1;
        }

        if (ff_global_cfg.dpdk.tso) {
            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                printf("TSO is supported\n");
                ff_global_cfg.dpdk.port_cfgs[i].hw_features.tx_tso = 1;
            }
        } else {
            printf("TSO is disabled\n");
        }

        if (dev_info.reta_size) {
            /* reta size must be a power of 2 */
            assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

            rss_reta_size[port_id] = dev_info.reta_size;
            printf("port[%d]: rss table size: %d\n", port_id,
                dev_info.reta_size);
        }

        if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
            continue;
        }

        /* Currently, proc_id maps 1:1 to the queue id on each port. */
        int ret = rte_eth_dev_configure(port_id, nb_procs, nb_procs, &port_conf);
        if (ret != 0) {
            return ret;
        }

        uint16_t q;
        for (q = 0; q < nb_procs; q++) {
            ret = rte_eth_tx_queue_setup(port_id, q, TX_QUEUE_SIZE,
                socketid, &dev_info.default_txconf);
            if (ret < 0) {
                return ret;
            }

            ret = rte_eth_rx_queue_setup(port_id, q, RX_QUEUE_SIZE,
                socketid, &dev_info.default_rxconf, mbuf_pool);
            if (ret < 0) {
                return ret;
            }
        }

        ret = rte_eth_dev_start(port_id);
        if (ret < 0) {
            return ret;
        }

        if (nb_procs > 1) {
            /* set the HW RSS hash function to Toeplitz */
            if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                struct rte_eth_hash_filter_info info = {0};
                info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                    RTE_ETH_FILTER_SET, &info) < 0) {
                    rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                        port_id);
                }
            }

            set_rss_table(port_id, dev_info.reta_size, nb_procs);
        }

        /* Enable RX in promiscuous mode for the Ethernet device. */
        if (ff_global_cfg.dpdk.promiscuous) {
            rte_eth_promiscuous_enable(port_id);
            ret = rte_eth_promiscuous_get(port_id);
            if (ret == 1) {
                printf("set port %u to promiscuous mode ok\n", port_id);
            } else {
                printf("set port %u to promiscuous mode error\n", port_id);
            }
        }

        /* Enable pcap dump */
        if (ff_global_cfg.dpdk.port_cfgs[i].pcap) {
            ff_enable_pcap(ff_global_cfg.dpdk.port_cfgs[i].pcap);
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    /* hardclock period, converted to TSC cycles */
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    init_lcore_conf();

    init_mem_pool();

    init_arp_ring();

    init_msg_ring();

    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            /* drop packets whose HW-verified checksum is bad */
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    /*
     * FIXME: should we save pkt->vlan_tci
     * if (pkt->ol_flags & PKT_RX_VLAN_PKT)
     */

    void *data = rte_pktmbuf_mtod(pkt, void *);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void *);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < sizeof(struct ether_hdr))
        return FILTER_UNKNOWN;

    const struct ether_hdr *hdr;
    hdr = (const struct ether_hdr *)data;

    if (ntohs(hdr->ether_type) == ETHER_TYPE_ARP)
        return FILTER_ARP;

    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ntohs(hdr->ether_type) != ETHER_TYPE_IPv4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data + sizeof(struct ether_hdr),
        len - sizeof(struct ether_hdr));
}

static inline void
process_packets(uint8_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(qconf->pcap[port_id] != NULL)) {
            ff_dump_packets(qconf->pcap[port_id], rtem);
        }

        void *data = rte_pktmbuf_mtod(rtem, void *);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        enum FilterReturn filter = protocol_filter(data, len);
        if (filter == FILTER_ARP) {
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (pkts_from_ring == 0) {
                /* clone the ARP frame into every other process's ring */
                uint16_t j;
                for (j = 0; j < qconf->nb_procs; ++j) {
                    if (j == queue_id)
                        continue;

                    mbuf_pool = pktmbuf_pool[rte_lcore_to_socket_id(qconf->proc_lcore[j])];
                    mbuf_clone = rte_pktmbuf_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(arp_ring[j][port_id], mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = rte_pktmbuf_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }

            ff_veth_input(ctx, rtem);
        } else if (enable_kni &&
                   ((filter == FILTER_KNI && kni_accept) ||
                    (filter == FILTER_UNKNOWN && !kni_accept))) {
            ff_kni_enqueue(port_id, rtem);
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_arp_ring(uint8_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* read packets from the ring buffer and process them */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(arp_ring[queue_id][port_id],
        (void **)pkts_burst, MAX_PKT_BURST);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg, uint16_t proc_id)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }

    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

static inline void
handle_ioctl_msg(struct ff_msg *msg, uint16_t proc_id)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_DGRAM, 0);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }

    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

static inline void
handle_route_msg(struct ff_msg *msg, uint16_t proc_id)
{
    msg->result = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);

    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

static struct ff_top_args ff_status;

static inline void
handle_top_msg(struct ff_msg *msg, uint16_t proc_id)
{
    msg->top = ff_status;
    msg->result = 0;

    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

static inline void
handle_default_msg(struct ff_msg *msg, uint16_t proc_id)
{
    msg->result = EINVAL;
    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg, proc_id);
            break;
        case FF_IOCTL:
            handle_ioctl_msg(msg, proc_id);
            break;
        case FF_ROUTE:
            handle_route_msg(msg, proc_id);
            break;
        case FF_TOP:
            handle_top_msg(msg, proc_id);
            break;
        default:
            handle_default_msg(msg, proc_id);
            break;
    }
}

static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(qconf->pcap[port] != NULL)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(qconf->pcap[port], m_table[i]);
        }
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
        } while (++ret < n);
    }

    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}

int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        void *data = rte_pktmbuf_mtod(cur, void *);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ?
            RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        prev = cur;

        cur->data_len = len;
        off += len;
        total -= len;
        head->nb_segs++;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    if (offload.ip_csum) {
        head->ol_flags |= PKT_TX_IP_CKSUM;
        head->l2_len = sizeof(struct ether_hdr);
        head->l3_len = sizeof(struct ipv4_hdr);
    }

    if (ctx->hw_features.tx_csum_l4) {
        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = sizeof(struct ether_hdr);
            head->l3_len = sizeof(struct ipv4_hdr);
        }

        if (offload.tso_seg_size) {
            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = sizeof(struct tcp_hdr);
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = sizeof(struct ether_hdr);
            head->l3_len = sizeof(struct ipv4_hdr);
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    unsigned lcore_id;
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc;
    int i, j, nb_rx, idle;
    uint8_t port_id, queue_id;
    struct lcore_conf *qconf;
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
        US_PER_S * BURST_TX_DRAIN_US;
    struct ff_dpdk_if_context *ctx;

    prev_tsc = 0;
    usch_tsc = 0;

    lcore_id = rte_lcore_id();
    qconf = &lcore_conf;

    if (qconf->nb_rx_queue == 0) {
        printf("lcore %u has nothing to do\n", lcore_id);
        return 0;
    }

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc > drain_tsc)) {
            /*
             * This could be optimized (use queueid instead of
             * portid), but it is not called so often
             */
            for (port_id = 0; port_id < RTE_MAX_ETHPORTS; port_id++) {
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;
                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packets from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }

            process_arp_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                        pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                        j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc > drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        end_tsc = rte_rdtsc();

        if (usch_tsc == cur_tsc) {
            usr_tsc = end_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_status.sys_tsc += sys_tsc;
        }

        ff_status.usr_tsc += usr_tsc;
        ff_status.work_tsc += end_tsc - cur_tsc;
        ff_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_status.loops++;
    }
}

int
ff_dpdk_if_up(void) {
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[i].port_id;
        veth_ctx[port_id] = ff_veth_attach(ff_global_cfg.dpdk.port_cfgs + i);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg) {
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1<<(7-b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i+4] & (1<<(7-b))))
                v |= 1;
        }
    }
    return (hash);
}

/*
 * Return non-zero if the RSS hash of the given 4-tuple maps to this
 * process's queue, i.e. if this process should own the flow.
 */
int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;

    if (qconf->nb_procs == 1) {
        return 1;
    }

    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t reta_size = rss_reta_size[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
        default_rsskey_40bytes, datalen, data);

    /* reta entry index first, then the queue that set_rss_table() programmed for it */
    return ((hash & (reta_size - 1)) % qconf->nb_procs) == qconf->proc_id;
}
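
/*
 * Minimal usage sketch of the public entry points defined in this file, from
 * an application's point of view.  The example_* names are hypothetical and
 * the snippet is kept out of the build; it assumes loop_func_t is an
 * int (*)(void *) callback and that ff_global_cfg has already been populated
 * by the f-stack config loader before ff_dpdk_init() runs, since
 * init_lcore_conf() and friends read it.  In the full library the FreeBSD
 * stack is brought up between ff_dpdk_init() and ff_dpdk_if_up(); that part
 * lives outside this file.
 */
#if 0
static int
example_loop(void *arg)
{
    /* Per-iteration application work, driven from main_loop(). */
    (void)arg;
    return 0;
}

static int
example_start(int argc, char **argv)
{
    if (ff_dpdk_init(argc, argv) < 0) {
        return -1;
    }

    /* Attach every configured port to its FreeBSD-side interface. */
    ff_dpdk_if_up();

    /* Does not return: polls RX/ARP/msg rings and calls example_loop(). */
    ff_dpdk_run(example_loop, NULL);

    return 0;
}
#endif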