/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <unistd.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"

#define MEMPOOL_CACHE_SIZE 256

#define DISPATCH_RING_SIZE 2048

#define MSG_RING_SIZE 32

/*
 * Configurable number of RX/TX ring descriptors
 */
#define RX_QUEUE_SIZE 512
#define TX_QUEUE_SIZE 512

#define MAX_PKT_BURST 32
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

/*
 * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send.
 */
#define MAX_TX_BURST (MAX_PKT_BURST / 2)

#define NB_SOCKETS 8

/* Configure how many packets ahead to prefetch when reading packets */
#define PREFETCH_OFFSET 3

#define MAX_RX_QUEUE_PER_LCORE 16
#define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
#define MAX_RX_QUEUE_PER_PORT 128

#ifdef FF_KNI
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

static int enable_kni;
static int kni_accept;
#endif

static int numa_on;

static unsigned idle_sleep;

static struct rte_timer freebsd_clock;

/* RSS hash key used by Mellanox's Linux driver */
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

static struct rte_eth_conf default_port_conf = {
    .rxmode = {
        .mq_mode = ETH_MQ_RX_RSS,
        .max_rx_pkt_len = ETHER_MAX_LEN,
        .split_hdr_size = 0, /**< Header buffer size */
        .header_split = 0,   /**< Header Split disabled */
        .hw_ip_checksum = 0, /**< IP checksum offload disabled */
        .hw_vlan_filter = 0, /**< VLAN filtering disabled */
        .hw_vlan_strip = 0,  /**< VLAN strip disabled */
        .hw_vlan_extend = 0, /**< Extended VLAN disabled */
        .jumbo_frame = 0,    /**< Jumbo frame support disabled */
        .hw_strip_crc = 0,   /**< CRC stripping by hardware disabled */
        .enable_lro = 0,     /**< LRO disabled */
    },
    .rx_adv_conf = {
        .rss_conf = {
            .rss_key = default_rsskey_40bytes,
            .rss_key_len = 40,
            .rss_hf = ETH_RSS_PROTO_MASK,
        },
    },
    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
};

struct mbuf_table {
    uint16_t len;
    struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct lcore_rx_queue {
    uint16_t port_id;
    uint16_t queue_id;
} __rte_cache_aligned;

struct lcore_conf {
    uint16_t proc_id;
    uint16_t socket_id;
    uint16_t nb_queue_list[RTE_MAX_ETHPORTS];
    struct ff_port_cfg *port_cfgs;

    uint16_t nb_rx_queue;
    struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
    uint16_t nb_tx_port;
    uint16_t tx_port_id[RTE_MAX_ETHPORTS];
    uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
    struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
    char *pcap[RTE_MAX_ETHPORTS];
} __rte_cache_aligned;

static struct lcore_conf lcore_conf;

static struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

struct ff_msg_ring {
    char ring_name[2][RTE_RING_NAMESIZE];
    /* ring[0]: lcore receives messages that other processes send */
    /* ring[1]: lcore sends messages that other processes read */
    struct rte_ring *ring[2];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;

struct ff_dpdk_if_context {
    void *sc;
    void *ifp;
    uint16_t port_id;
    struct ff_hw_features hw_features;
} __rte_cache_aligned;

static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

static struct ff_top_args ff_top_status;
static struct ff_traffic_args ff_traffic;

extern void ff_hardclock(void);

static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
    ff_update_current_ts();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
#define CHECK_INTERVAL 100 /* 100ms */
#define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u "
                        "Mbps - %s\n", (int)portid,
                        (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

static int
init_lcore_conf(void)
{
    uint8_t nb_dev_ports = rte_eth_dev_count();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
            ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    uint16_t proc_id;
    for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) {
        uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id];
        if (!lcore_config[lcore_id].detected) {
            rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
        }
    }

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %u\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        lcore_conf.pcap[port_id] = pconf->pcap;
        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}

static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;

    unsigned nb_mbuf = RTE_MAX(
        (nb_rx_queue*RX_QUEUE_SIZE +
        nb_ports*nb_lcores*MAX_PKT_BURST +
        nb_ports*nb_tx_queue*TX_QUEUE_SIZE +
        nb_lcores*MEMPOOL_CACHE_SIZE +
#ifdef FF_KNI
        nb_ports*KNI_MBUF_MAX +
        nb_ports*KNI_QUEUE_SIZE +
#endif
        nb_lcores*nb_ports*DISPATCH_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }
    }

    return 0;
}

static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL) {
        rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ring = rte_ring_create(name, count, socket_id, flags);
    } else {
        ring = rte_ring_lookup(name);
    }

    if (ring == NULL) {
        rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
    }

    return ring;
}

static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create rings according to the ports actually being used. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
}

static int
init_msg_ring(void)
{
    uint16_t i;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
            MSG_RING_SIZE * 2 * nb_procs,
            MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
            NULL, NULL, ff_msg_init, NULL,
            socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        snprintf(msg_ring[i].ring_name[1], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_OUT, i);

        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[0]);

        msg_ring[i].ring[1] = create_ring(msg_ring[i].ring_name[1],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[1] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[1]);
    }

    return 0;
}

#ifdef FF_KNI
static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
#endif

static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}

static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        struct rte_eth_dev_info dev_info;
        rte_eth_dev_info_get(port_id, &dev_info);

        if (nb_queues > dev_info.max_rx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                nb_queues,
                dev_info.max_rx_queues);
        }

        if (nb_queues > dev_info.max_tx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                nb_queues,
                dev_info.max_tx_queues);
        }

        struct ether_addr addr;
        rte_eth_macaddr_get(port_id, &addr);
        printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
            " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
            (unsigned)port_id,
            addr.addr_bytes[0], addr.addr_bytes[1],
            addr.addr_bytes[2], addr.addr_bytes[3],
            addr.addr_bytes[4], addr.addr_bytes[5]);

        rte_memcpy(pconf->mac,
            addr.addr_bytes, ETHER_ADDR_LEN);

        /* Clear txq_flags - we do not need multi-mempool and refcnt */
        dev_info.default_txconf.txq_flags = ETH_TXQ_FLAGS_NOMULTMEMP |
            ETH_TXQ_FLAGS_NOREFCOUNT;

        /* Disable features that are not supported by port's HW */
        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMUDP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMTCP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMSCTP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) &&
            !(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_TSO)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
        }

        struct rte_eth_conf port_conf = {0};

        /* Set RSS mode */
        port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
        port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK;
        port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
        port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;

        /* Set Rx VLAN stripping */
        if (ff_global_cfg.dpdk.vlan_strip) {
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                port_conf.rxmode.hw_vlan_strip = 1;
            }
        }

        /* Enable HW CRC stripping */
        port_conf.rxmode.hw_strip_crc = 1;

        /* FIXME: Enable TCP LRO? */
#if 0
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
            printf("LRO is supported\n");
            port_conf.rxmode.enable_lro = 1;
            pconf->hw_features.rx_lro = 1;
        }
#endif

        /* Set Rx checksum checking */
        if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
            printf("RX checksum offload supported\n");
            port_conf.rxmode.hw_ip_checksum = 1;
            pconf->hw_features.rx_csum = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
            printf("TX ip checksum offload supported\n");
            pconf->hw_features.tx_csum_ip = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            printf("TX TCP&UDP checksum offload supported\n");
            pconf->hw_features.tx_csum_l4 = 1;
        }

        if (ff_global_cfg.dpdk.tso) {
            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                printf("TSO is supported\n");
                pconf->hw_features.tx_tso = 1;
            }
        } else {
            printf("TSO is disabled\n");
        }

        if (dev_info.reta_size) {
            /* reta size must be power of 2 */
            assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

            rss_reta_size[port_id] = dev_info.reta_size;
            printf("port[%d]: rss table size: %d\n", port_id,
                dev_info.reta_size);
        }

        if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
            continue;
        }

        int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
        if (ret != 0) {
            return ret;
        }
        uint16_t q;
        for (q = 0; q < nb_queues; q++) {
            if (numa_on) {
                uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                socketid = rte_lcore_to_socket_id(lcore_id);
            }
            mbuf_pool = pktmbuf_pool[socketid];

            ret = rte_eth_tx_queue_setup(port_id, q, TX_QUEUE_SIZE,
                socketid, &dev_info.default_txconf);
            if (ret < 0) {
                return ret;
            }

            ret = rte_eth_rx_queue_setup(port_id, q, RX_QUEUE_SIZE,
                socketid, &dev_info.default_rxconf, mbuf_pool);
            if (ret < 0) {
                return ret;
            }
        }

        ret = rte_eth_dev_start(port_id);
        if (ret < 0) {
            return ret;
        }

        if (nb_queues > 1) {
            /* set HW rss hash function to Toeplitz. */
            if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                struct rte_eth_hash_filter_info info = {0};
                info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                    RTE_ETH_FILTER_SET, &info) < 0) {
                    rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                        port_id);
                }
            }

            set_rss_table(port_id, dev_info.reta_size, nb_queues);
        }

        /* Enable RX in promiscuous mode for the Ethernet device. */
        if (ff_global_cfg.dpdk.promiscuous) {
            rte_eth_promiscuous_enable(port_id);
            ret = rte_eth_promiscuous_get(port_id);
            if (ret == 1) {
                printf("set port %u to promiscuous mode ok\n", port_id);
            } else {
                printf("set port %u to promiscuous mode error\n", port_id);
            }
        }

        /* Enable pcap dump */
        if (pconf->pcap) {
            ff_enable_pcap(pconf->pcap);
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    idle_sleep = ff_global_cfg.dpdk.idle_sleep;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

#ifdef FF_KNI
    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }
#endif

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    /*
     * FIXME: should we save pkt->vlan_tci
     * if (pkt->ol_flags & PKT_RX_VLAN_PKT)?
     */

    void *data = rte_pktmbuf_mtod(pkt, void*);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void*);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct ether_hdr *hdr;
    hdr = (const struct ether_hdr *)data;

    if (ntohs(hdr->ether_type) == ETHER_TYPE_ARP)
        return FILTER_ARP;

#ifndef FF_KNI
    return FILTER_UNKNOWN;
#else
    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ntohs(hdr->ether_type) != ETHER_TYPE_IPv4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data + ETHER_HDR_LEN,
        len - ETHER_HDR_LEN);
#endif
}

static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}

/* copied from rte_pktmbuf_clone, but makes a deep copy of every segment */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of a new segment failed */
    if (unlikely(mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}

static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(qconf->pcap[port_id] != NULL)) {
            if (!pkts_from_ring) {
                ff_dump_packets(qconf->pcap[port_id], rtem);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void*);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring) {
            ff_traffic.rx_packets++;
            ff_traffic.rx_bytes += len;
        }

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, len, queue_id, nb_queues);
            if (ret < 0 || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
        if (filter == FILTER_ARP) {
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni &&
            ((filter == FILTER_KNI && kni_accept) ||
            (filter == FILTER_UNKNOWN && !kni_accept))) {
            ff_kni_enqueue(port_id, rtem);
#endif
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* read packets from the dispatch ring and process them */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_DGRAM, 0);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_top_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_traffic_msg(struct ff_msg *msg)
{
    msg->traffic = ff_traffic;
    msg->result = 0;
}

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        case FF_TRAFFIC:
            handle_traffic_msg(msg);
            break;
        default:
            handle_default_msg(msg);
            break;
    }
    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(qconf->pcap[port] != NULL)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(qconf->pcap[port], m_table[i]);
        }
    }

    ff_traffic.tx_packets += n;
    uint16_t i;
    for (i = 0; i < n; i++) {
        ff_traffic.tx_bytes += rte_pktmbuf_data_len(m_table[i]);
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
        } while (++ret < n);
    }

    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}

int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        head->nb_segs++;

        prev = cur;
        void *data = rte_pktmbuf_mtod(cur, void*);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ?
            RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        cur->data_len = len;
        off += len;
        total -= len;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void*);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         * TCP segmentation offload.
         *
         * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *   implies PKT_TX_TCP_CKSUM)
         * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         * - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *   write the IP checksum to 0 in the packet
         * - fill the mbuf offload information: l2_len,
         *   l3_len, l4_len, tso_segsz
         * - calculate the pseudo header checksum without taking ip_len
         *   in account, and set it in the TCP header. Refer to
         *   rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *   used as helpers.
         */
        if (offload.tso_seg_size) {
            struct tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc;
    int i, j, nb_rx, idle;
    uint16_t port_id, queue_id;
    struct lcore_conf *qconf;
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
        US_PER_S * BURST_TX_DRAIN_US;
    struct ff_dpdk_if_context *ctx;

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc > drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packets from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }
#endif

            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                    pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                    j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc > drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        idle_sleep_tsc = rte_rdtsc();
        if (likely(idle && idle_sleep)) {
            usleep(idle_sleep);
            end_tsc = rte_rdtsc();
        } else {
            end_tsc = idle_sleep_tsc;
        }

        if (usch_tsc == cur_tsc) {
            usr_tsc = idle_sleep_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_top_status.sys_tsc += sys_tsc;
        }

        ff_top_status.usr_tsc += usr_tsc;
        ff_top_status.work_tsc += end_tsc - cur_tsc;
        ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_top_status.loops++;
    }

    return 0;
}

int
ff_dpdk_if_up(void) {
    int i;
    struct lcore_conf *qconf = &lcore_conf;
    for (i = 0; i < qconf->nb_tx_port; i++) {
        uint16_t port_id = qconf->tx_port_id[i];

        struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
        veth_ctx[port_id] = ff_veth_attach(pconf);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed\n");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg) {
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1<<(7-b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i+4] & (1<<(7-b))))
                v |= 1;
        }
    }
    return (hash);
}

int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;
    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];

    if (nb_queues <= 1) {
        return 1;
    }

    uint16_t reta_size = rss_reta_size[ctx->port_id];
    uint16_t queueid = qconf->tx_queue_id[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
        default_rsskey_40bytes, datalen, data);

    return ((hash & (reta_size - 1)) % nb_queues) == queueid;
}

void
ff_regist_packet_dispatcher(dispatch_func_t func)
{
    packet_dispatcher = func;
}

uint64_t
ff_get_tsc_ns(void)
{
    uint64_t cur_tsc = rte_rdtsc();
    uint64_t hz = rte_get_tsc_hz();
    return ((double)cur_tsc / (double)hz) * NS_PER_S;
}