1 /* 2 * Copyright (C) 2017 THL A29 Limited, a Tencent company. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * 8 * 1. Redistributions of source code must retain the above copyright notice, this 9 * list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright notice, 11 * this list of conditions and the following disclaimer in the documentation 12 * and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 * 25 */ 26 #include <assert.h> 27 #include <unistd.h> 28 #include <sys/mman.h> 29 #include <errno.h> 30 31 #include <rte_common.h> 32 #include <rte_byteorder.h> 33 #include <rte_log.h> 34 #include <rte_memory.h> 35 #include <rte_memcpy.h> 36 #include <rte_memzone.h> 37 #include <rte_config.h> 38 #include <rte_eal.h> 39 #include <rte_pci.h> 40 #include <rte_mbuf.h> 41 #include <rte_memory.h> 42 #include <rte_lcore.h> 43 #include <rte_launch.h> 44 #include <rte_ethdev.h> 45 #include <rte_debug.h> 46 #include <rte_common.h> 47 #include <rte_ether.h> 48 #include <rte_malloc.h> 49 #include <rte_cycles.h> 50 #include <rte_timer.h> 51 #include <rte_thash.h> 52 #include <rte_ip.h> 53 #include <rte_tcp.h> 54 #include <rte_udp.h> 55 #include <rte_eth_bond.h> 56 57 #include "ff_dpdk_if.h" 58 #include "ff_dpdk_pcap.h" 59 #include "ff_dpdk_kni.h" 60 #include "ff_config.h" 61 #include "ff_veth.h" 62 #include "ff_host_interface.h" 63 #include "ff_msg.h" 64 #include "ff_api.h" 65 #include "ff_memory.h" 66 67 #ifdef FF_KNI 68 #define KNI_MBUF_MAX 2048 69 #define KNI_QUEUE_SIZE 2048 70 71 int enable_kni; 72 static int kni_accept; 73 static int knictl_action = FF_KNICTL_ACTION_DEFAULT; 74 #endif 75 76 static int numa_on; 77 78 static unsigned idle_sleep; 79 static unsigned pkt_tx_delay; 80 81 static struct rte_timer freebsd_clock; 82 83 // Mellanox Linux's driver key 84 static uint8_t default_rsskey_40bytes[40] = { 85 0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b, 86 0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb, 87 0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c, 88 0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9, 89 0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc 90 }; 91 92 static uint8_t default_rsskey_52bytes[52] = { 93 0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23, 94 0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30, 95 0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02, 96 0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c, 97 0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55, 98 0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e, 99 0x81, 0x15, 0x03, 0x66 100 }; 101 102 static uint8_t symmetric_rsskey[52] = { 103 0x6d, 0x5a, 
0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 104 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 105 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 106 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 107 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 108 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 109 0x6d, 0x5a, 0x6d, 0x5a 110 }; 111 112 static int rsskey_len = sizeof(default_rsskey_40bytes); 113 static uint8_t *rsskey = default_rsskey_40bytes; 114 115 struct lcore_conf lcore_conf; 116 117 struct rte_mempool *pktmbuf_pool[NB_SOCKETS]; 118 119 static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS]; 120 static dispatch_func_t packet_dispatcher; 121 122 static uint16_t rss_reta_size[RTE_MAX_ETHPORTS]; 123 124 #define BOND_DRIVER_NAME "net_bonding" 125 126 static inline int send_single_packet(struct rte_mbuf *m, uint8_t port); 127 128 struct ff_msg_ring { 129 char ring_name[FF_MSG_NUM][RTE_RING_NAMESIZE]; 130 /* ring[0] for lcore recv msg, other send */ 131 /* ring[1] for lcore send msg, other read */ 132 struct rte_ring *ring[FF_MSG_NUM]; 133 } __rte_cache_aligned; 134 135 static struct ff_msg_ring msg_ring[RTE_MAX_LCORE]; 136 static struct rte_mempool *message_pool; 137 static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS]; 138 139 static struct ff_top_args ff_top_status; 140 static struct ff_traffic_args ff_traffic; 141 extern void ff_hardclock(void); 142 143 static void 144 ff_hardclock_job(__rte_unused struct rte_timer *timer, 145 __rte_unused void *arg) { 146 ff_hardclock(); 147 ff_update_current_ts(); 148 } 149 150 struct ff_dpdk_if_context * 151 ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg) 152 { 153 struct ff_dpdk_if_context *ctx; 154 155 ctx = calloc(1, sizeof(struct ff_dpdk_if_context)); 156 if (ctx == NULL) 157 return NULL; 158 159 ctx->sc = sc; 160 ctx->ifp = ifp; 161 ctx->port_id = cfg->port_id; 162 ctx->hw_features = cfg->hw_features; 163 164 return ctx; 165 } 166 167 void 168 ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx) 169 { 170 free(ctx); 171 } 172 173 static void 174 check_all_ports_link_status(void) 175 { 176 #define CHECK_INTERVAL 100 /* 100ms */ 177 #define MAX_CHECK_TIME 90 /* 9s (90 * 100ms) in total */ 178 179 uint16_t portid; 180 uint8_t count, all_ports_up, print_flag = 0; 181 struct rte_eth_link link; 182 183 printf("\nChecking link status"); 184 fflush(stdout); 185 186 int i, nb_ports; 187 nb_ports = ff_global_cfg.dpdk.nb_ports; 188 for (count = 0; count <= MAX_CHECK_TIME; count++) { 189 all_ports_up = 1; 190 for (i = 0; i < nb_ports; i++) { 191 uint16_t portid = ff_global_cfg.dpdk.portid_list[i]; 192 memset(&link, 0, sizeof(link)); 193 rte_eth_link_get_nowait(portid, &link); 194 195 /* print link status if flag set */ 196 if (print_flag == 1) { 197 if (link.link_status) { 198 printf("Port %d Link Up - speed %u " 199 "Mbps - %s\n", (int)portid, 200 (unsigned)link.link_speed, 201 (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? 
202 ("full-duplex") : ("half-duplex\n")); 203 } else { 204 printf("Port %d Link Down\n", (int)portid); 205 } 206 continue; 207 } 208 /* clear all_ports_up flag if any link down */ 209 if (link.link_status == 0) { 210 all_ports_up = 0; 211 break; 212 } 213 } 214 215 /* after finally printing all link status, get out */ 216 if (print_flag == 1) 217 break; 218 219 if (all_ports_up == 0) { 220 printf("."); 221 fflush(stdout); 222 rte_delay_ms(CHECK_INTERVAL); 223 } 224 225 /* set the print_flag if all ports up or timeout */ 226 if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) { 227 print_flag = 1; 228 printf("done\n"); 229 } 230 } 231 } 232 233 static int 234 init_lcore_conf(void) 235 { 236 uint8_t nb_dev_ports = rte_eth_dev_count_avail(); 237 if (nb_dev_ports == 0) { 238 rte_exit(EXIT_FAILURE, "No probed ethernet devices\n"); 239 } 240 241 if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) { 242 rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n", 243 ff_global_cfg.dpdk.max_portid); 244 } 245 246 lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs; 247 lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id; 248 249 uint16_t socket_id = 0; 250 if (numa_on) { 251 socket_id = rte_lcore_to_socket_id(rte_lcore_id()); 252 } 253 254 lcore_conf.socket_id = socket_id; 255 256 uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id]; 257 if (!rte_lcore_is_enabled(lcore_id)) { 258 rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id); 259 } 260 261 int j; 262 for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) { 263 uint16_t port_id = ff_global_cfg.dpdk.portid_list[j]; 264 struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id]; 265 266 int queueid = -1; 267 int i; 268 for (i = 0; i < pconf->nb_lcores; i++) { 269 if (pconf->lcore_list[i] == lcore_id) { 270 queueid = i; 271 } 272 } 273 if (queueid < 0) { 274 continue; 275 } 276 printf("lcore: %u, port: %u, queue: %u\n", lcore_id, port_id, queueid); 277 uint16_t nb_rx_queue = lcore_conf.nb_rx_queue; 278 lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id; 279 lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid; 280 lcore_conf.nb_rx_queue++; 281 282 lcore_conf.tx_queue_id[port_id] = queueid; 283 lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id; 284 lcore_conf.nb_tx_port++; 285 286 /* Enable pcap dump */ 287 if (ff_global_cfg.pcap.enable) { 288 ff_enable_pcap(ff_global_cfg.pcap.save_path, ff_global_cfg.pcap.snap_len); 289 } 290 291 lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores; 292 } 293 294 if (lcore_conf.nb_rx_queue == 0) { 295 rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id); 296 } 297 298 return 0; 299 } 300 301 static int 302 init_mem_pool(void) 303 { 304 uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports; 305 uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs; 306 uint32_t nb_tx_queue = nb_lcores; 307 uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores; 308 uint16_t max_portid = ff_global_cfg.dpdk.max_portid; 309 310 unsigned nb_mbuf = RTE_ALIGN_CEIL ( 311 (nb_rx_queue * (max_portid + 1) * 2 * RX_QUEUE_SIZE + 312 nb_ports * (max_portid + 1) * 2 * nb_lcores * MAX_PKT_BURST + 313 nb_ports * (max_portid + 1) * 2 * nb_tx_queue * TX_QUEUE_SIZE + 314 nb_lcores * MEMPOOL_CACHE_SIZE + 315 #ifdef FF_KNI 316 nb_ports * KNI_MBUF_MAX + 317 nb_ports * KNI_QUEUE_SIZE + 318 #endif 319 nb_lcores * nb_ports * DISPATCH_RING_SIZE), 320 (unsigned)8192); 321 322 unsigned socketid = 0; 323 uint16_t i, lcore_id; 324 char s[64]; 325 326 for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) { 327 lcore_id 
= ff_global_cfg.dpdk.proc_lcore[i]; 328 if (numa_on) { 329 socketid = rte_lcore_to_socket_id(lcore_id); 330 } 331 332 if (socketid >= NB_SOCKETS) { 333 rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n", 334 socketid, i, NB_SOCKETS); 335 } 336 337 if (pktmbuf_pool[socketid] != NULL) { 338 continue; 339 } 340 341 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 342 snprintf(s, sizeof(s), "mbuf_pool_%d", socketid); 343 pktmbuf_pool[socketid] = 344 rte_pktmbuf_pool_create(s, nb_mbuf, 345 MEMPOOL_CACHE_SIZE, 0, 346 RTE_MBUF_DEFAULT_BUF_SIZE, socketid); 347 } else { 348 snprintf(s, sizeof(s), "mbuf_pool_%d", socketid); 349 pktmbuf_pool[socketid] = rte_mempool_lookup(s); 350 } 351 352 if (pktmbuf_pool[socketid] == NULL) { 353 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid); 354 } else { 355 printf("create mbuf pool on socket %d\n", socketid); 356 } 357 358 #ifdef FF_USE_PAGE_ARRAY 359 nb_mbuf = RTE_ALIGN_CEIL ( 360 nb_ports*nb_lcores*MAX_PKT_BURST + 361 nb_ports*nb_tx_queue*TX_QUEUE_SIZE + 362 nb_lcores*MEMPOOL_CACHE_SIZE, 363 (unsigned)4096); 364 ff_init_ref_pool(nb_mbuf, socketid); 365 #endif 366 } 367 368 return 0; 369 } 370 371 static struct rte_ring * 372 create_ring(const char *name, unsigned count, int socket_id, unsigned flags) 373 { 374 struct rte_ring *ring; 375 376 if (name == NULL) { 377 rte_exit(EXIT_FAILURE, "create ring failed, no name!\n"); 378 } 379 380 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 381 ring = rte_ring_create(name, count, socket_id, flags); 382 } else { 383 ring = rte_ring_lookup(name); 384 } 385 386 if (ring == NULL) { 387 rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name); 388 } 389 390 return ring; 391 } 392 393 static int 394 init_dispatch_ring(void) 395 { 396 int j; 397 char name_buf[RTE_RING_NAMESIZE]; 398 int queueid; 399 400 unsigned socketid = lcore_conf.socket_id; 401 402 /* Create ring according to ports actually being used. 
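 * One pointer array is allocated per port, holding one single-consumer ring per configured queue, so any lcore can hand a packet to the lcore that owns the target queue (used by the packet dispatcher and when broadcasting ARP to every queue).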
*/ 403 int nb_ports = ff_global_cfg.dpdk.nb_ports; 404 for (j = 0; j < nb_ports; j++) { 405 uint16_t portid = ff_global_cfg.dpdk.portid_list[j]; 406 struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid]; 407 int nb_queues = pconf->nb_lcores; 408 if (dispatch_ring[portid] == NULL) { 409 snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid); 410 411 dispatch_ring[portid] = rte_zmalloc(name_buf, 412 sizeof(struct rte_ring *) * nb_queues, 413 RTE_CACHE_LINE_SIZE); 414 if (dispatch_ring[portid] == NULL) { 415 rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) " 416 "failed\n", name_buf); 417 } 418 } 419 420 for(queueid = 0; queueid < nb_queues; ++queueid) { 421 snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d", 422 portid, queueid); 423 dispatch_ring[portid][queueid] = create_ring(name_buf, 424 DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ); 425 426 if (dispatch_ring[portid][queueid] == NULL) 427 rte_panic("create ring:%s failed!\n", name_buf); 428 429 printf("create ring:%s success, %u ring entries are now free!\n", 430 name_buf, rte_ring_free_count(dispatch_ring[portid][queueid])); 431 } 432 } 433 434 return 0; 435 } 436 437 static void 438 ff_msg_init(struct rte_mempool *mp, 439 __attribute__((unused)) void *opaque_arg, 440 void *obj, __attribute__((unused)) unsigned i) 441 { 442 struct ff_msg *msg = (struct ff_msg *)obj; 443 msg->msg_type = FF_UNKNOWN; 444 msg->buf_addr = (char *)msg + sizeof(struct ff_msg); 445 msg->buf_len = mp->elt_size - sizeof(struct ff_msg); 446 msg->original_buf = NULL; 447 msg->original_buf_len = 0; 448 } 449 450 static int 451 init_msg_ring(void) 452 { 453 uint16_t i, j; 454 uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs; 455 unsigned socketid = lcore_conf.socket_id; 456 457 /* Create message buffer pool */ 458 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 459 message_pool = rte_mempool_create(FF_MSG_POOL, 460 MSG_RING_SIZE * 2 * nb_procs, 461 MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0, 462 NULL, NULL, ff_msg_init, NULL, 463 socketid, 0); 464 } else { 465 message_pool = rte_mempool_lookup(FF_MSG_POOL); 466 } 467 468 if (message_pool == NULL) { 469 rte_panic("Create msg mempool failed\n"); 470 } 471 472 for(i = 0; i < nb_procs; ++i) { 473 snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE, 474 "%s%u", FF_MSG_RING_IN, i); 475 msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0], 476 MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ); 477 if (msg_ring[i].ring[0] == NULL) 478 rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[0]); 479 480 for (j = FF_SYSCTL; j < FF_MSG_NUM; j++) { 481 snprintf(msg_ring[i].ring_name[j], RTE_RING_NAMESIZE, 482 "%s%u_%u", FF_MSG_RING_OUT, i, j); 483 msg_ring[i].ring[j] = create_ring(msg_ring[i].ring_name[j], 484 MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ); 485 if (msg_ring[i].ring[j] == NULL) 486 rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[j]); 487 } 488 } 489 490 return 0; 491 } 492 493 #ifdef FF_KNI 494 495 static enum FF_KNICTL_CMD get_kni_action(const char *c){ 496 if (!c) 497 return FF_KNICTL_ACTION_DEFAULT; 498 if (0 == strcasecmp(c, "alltokni")){ 499 return FF_KNICTL_ACTION_ALL_TO_KNI; 500 } else if (0 == strcasecmp(c, "alltoff")){ 501 return FF_KNICTL_ACTION_ALL_TO_FF; 502 } else if (0 == strcasecmp(c, "default")){ 503 return FF_KNICTL_ACTION_DEFAULT; 504 } else { 505 return FF_KNICTL_ACTION_DEFAULT; 506 } 507 } 508 509 static int 510 init_kni(void) 511 { 512 int nb_ports = rte_eth_dev_count_avail(); 513 kni_accept = 0; 514 
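/* kni.method == "accept" (kni_accept = 1) hands packets classified FILTER_KNI to the kernel via KNI; any other setting (kni_accept = 0) hands FILTER_UNKNOWN packets to the kernel instead. See the FF_KNICTL_ACTION_DEFAULT branch in process_packets(). */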
if(strcasecmp(ff_global_cfg.kni.method, "accept") == 0) 515 kni_accept = 1; 516 517 knictl_action = get_kni_action(ff_global_cfg.kni.kni_action); 518 519 ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port, 520 ff_global_cfg.kni.udp_port); 521 522 unsigned socket_id = lcore_conf.socket_id; 523 struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id]; 524 525 nb_ports = ff_global_cfg.dpdk.nb_ports; 526 int i, ret; 527 for (i = 0; i < nb_ports; i++) { 528 uint16_t port_id = ff_global_cfg.dpdk.portid_list[i]; 529 ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE); 530 } 531 532 return 0; 533 } 534 #endif 535 536 //RSS reta update will failed when enable flow isolate 537 #ifndef FF_FLOW_ISOLATE 538 static void 539 set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues) 540 { 541 if (reta_size == 0) { 542 return; 543 } 544 545 int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE); 546 struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size]; 547 548 /* config HW indirection table */ 549 unsigned i, j, hash=0; 550 for (i = 0; i < reta_conf_size; i++) { 551 reta_conf[i].mask = ~0ULL; 552 for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) { 553 reta_conf[i].reta[j] = hash++ % nb_queues; 554 } 555 } 556 557 if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) { 558 rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n", 559 port_id); 560 } 561 } 562 #endif 563 564 static int 565 init_port_start(void) 566 { 567 int nb_ports = ff_global_cfg.dpdk.nb_ports; 568 unsigned socketid = 0; 569 struct rte_mempool *mbuf_pool; 570 uint16_t i, j; 571 572 for (i = 0; i < nb_ports; i++) { 573 uint16_t port_id, u_port_id = ff_global_cfg.dpdk.portid_list[i]; 574 struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[u_port_id]; 575 uint16_t nb_queues = pconf->nb_lcores; 576 577 for (j=0; j<=pconf->nb_slaves; j++) { 578 if (j < pconf->nb_slaves) { 579 port_id = pconf->slave_portid_list[j]; 580 printf("To init %s's %d'st slave port[%d]\n", 581 ff_global_cfg.dpdk.bond_cfgs->name, 582 j, port_id); 583 } else { 584 port_id = u_port_id; 585 } 586 587 struct rte_eth_dev_info dev_info; 588 struct rte_eth_conf port_conf = {0}; 589 struct rte_eth_rxconf rxq_conf; 590 struct rte_eth_txconf txq_conf; 591 592 int ret = rte_eth_dev_info_get(port_id, &dev_info); 593 if (ret != 0) 594 rte_exit(EXIT_FAILURE, 595 "Error during getting device (port %u) info: %s\n", 596 port_id, strerror(-ret)); 597 598 if (nb_queues > dev_info.max_rx_queues) { 599 rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n", 600 nb_queues, 601 dev_info.max_rx_queues); 602 } 603 604 if (nb_queues > dev_info.max_tx_queues) { 605 rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n", 606 nb_queues, 607 dev_info.max_tx_queues); 608 } 609 610 struct rte_ether_addr addr; 611 rte_eth_macaddr_get(port_id, &addr); 612 printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8 613 " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n", 614 (unsigned)port_id, 615 addr.addr_bytes[0], addr.addr_bytes[1], 616 addr.addr_bytes[2], addr.addr_bytes[3], 617 addr.addr_bytes[4], addr.addr_bytes[5]); 618 619 rte_memcpy(pconf->mac, 620 addr.addr_bytes, RTE_ETHER_ADDR_LEN); 621 622 /* Set RSS mode */ 623 uint64_t default_rss_hf = ETH_RSS_PROTO_MASK; 624 port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS; 625 port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf; 626 if (dev_info.hash_key_size == 52) { 627 rsskey = default_rsskey_52bytes; 628 rsskey_len = 52; 629 } 630 if (ff_global_cfg.dpdk.symmetric_rss) { 631 printf("Use 
symmetric Receive-side Scaling(RSS) key\n"); 632 rsskey = symmetric_rsskey; 633 } 634 port_conf.rx_adv_conf.rss_conf.rss_key = rsskey; 635 port_conf.rx_adv_conf.rss_conf.rss_key_len = rsskey_len; 636 port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads; 637 if (port_conf.rx_adv_conf.rss_conf.rss_hf != 638 ETH_RSS_PROTO_MASK) { 639 printf("Port %u modified RSS hash function based on hardware support," 640 "requested:%#"PRIx64" configured:%#"PRIx64"\n", 641 port_id, default_rss_hf, 642 port_conf.rx_adv_conf.rss_conf.rss_hf); 643 } 644 645 if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) { 646 port_conf.txmode.offloads |= 647 DEV_TX_OFFLOAD_MBUF_FAST_FREE; 648 } 649 650 /* Set Rx VLAN stripping */ 651 if (ff_global_cfg.dpdk.vlan_strip) { 652 if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) { 653 port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP; 654 } 655 } 656 657 /* Enable HW CRC stripping */ 658 port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_KEEP_CRC; 659 660 /* FIXME: Enable TCP LRO ?*/ 661 #if 0 662 if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) { 663 printf("LRO is supported\n"); 664 port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO; 665 pconf->hw_features.rx_lro = 1; 666 } 667 #endif 668 669 /* Set Rx checksum checking */ 670 if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) && 671 (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) && 672 (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) { 673 printf("RX checksum offload supported\n"); 674 port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM; 675 pconf->hw_features.rx_csum = 1; 676 } 677 678 if (ff_global_cfg.dpdk.tx_csum_offoad_skip == 0) { 679 if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) { 680 printf("TX ip checksum offload supported\n"); 681 port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM; 682 pconf->hw_features.tx_csum_ip = 1; 683 } 684 685 if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) && 686 (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) { 687 printf("TX TCP&UDP checksum offload supported\n"); 688 port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM; 689 pconf->hw_features.tx_csum_l4 = 1; 690 } 691 } else { 692 printf("TX checksum offoad is disabled\n"); 693 } 694 695 if (ff_global_cfg.dpdk.tso) { 696 if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) { 697 printf("TSO is supported\n"); 698 port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO; 699 pconf->hw_features.tx_tso = 1; 700 } 701 } else { 702 printf("TSO is disabled\n"); 703 } 704 705 if (dev_info.reta_size) { 706 /* reta size must be power of 2 */ 707 assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0); 708 709 rss_reta_size[port_id] = dev_info.reta_size; 710 printf("port[%d]: rss table size: %d\n", port_id, 711 dev_info.reta_size); 712 } 713 714 if (rte_eal_process_type() != RTE_PROC_PRIMARY) { 715 continue; 716 } 717 718 ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf); 719 if (ret != 0) { 720 return ret; 721 } 722 723 static uint16_t nb_rxd = RX_QUEUE_SIZE; 724 static uint16_t nb_txd = TX_QUEUE_SIZE; 725 ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd); 726 if (ret < 0) 727 printf("Could not adjust number of descriptors " 728 "for port%u (%d)\n", (unsigned)port_id, ret); 729 730 uint16_t q; 731 for (q = 0; q < nb_queues; q++) { 732 if (numa_on) { 733 uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q]; 734 socketid = rte_lcore_to_socket_id(lcore_id); 735 } 736 
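/* With NUMA enabled, each queue's mbuf pool and descriptor rings are allocated on the socket of the lcore that will poll this queue, keeping RX buffers local to the consuming core. */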
mbuf_pool = pktmbuf_pool[socketid]; 737 738 txq_conf = dev_info.default_txconf; 739 txq_conf.offloads = port_conf.txmode.offloads; 740 ret = rte_eth_tx_queue_setup(port_id, q, nb_txd, 741 socketid, &txq_conf); 742 if (ret < 0) { 743 return ret; 744 } 745 746 rxq_conf = dev_info.default_rxconf; 747 rxq_conf.offloads = port_conf.rxmode.offloads; 748 ret = rte_eth_rx_queue_setup(port_id, q, nb_rxd, 749 socketid, &rxq_conf, mbuf_pool); 750 if (ret < 0) { 751 return ret; 752 } 753 } 754 755 756 if (strncmp(dev_info.driver_name, BOND_DRIVER_NAME, 757 strlen(dev_info.driver_name)) == 0) { 758 759 rte_eth_macaddr_get(port_id, &addr); 760 printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8 761 " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n", 762 (unsigned)port_id, 763 addr.addr_bytes[0], addr.addr_bytes[1], 764 addr.addr_bytes[2], addr.addr_bytes[3], 765 addr.addr_bytes[4], addr.addr_bytes[5]); 766 767 rte_memcpy(pconf->mac, 768 addr.addr_bytes, RTE_ETHER_ADDR_LEN); 769 770 int mode, count, x; 771 uint16_t slaves[RTE_MAX_ETHPORTS], len = RTE_MAX_ETHPORTS; 772 773 mode = rte_eth_bond_mode_get(port_id); 774 printf("Port %u, bond mode:%d\n", port_id, mode); 775 776 count = rte_eth_bond_slaves_get(port_id, slaves, len); 777 printf("Port %u, %s's slave ports count:%d\n", port_id, 778 ff_global_cfg.dpdk.bond_cfgs->name, count); 779 for (x=0; x<count; x++) { 780 printf("Port %u, %s's slave port[%u]\n", port_id, 781 ff_global_cfg.dpdk.bond_cfgs->name, slaves[x]); 782 } 783 } 784 785 ret = rte_eth_dev_start(port_id); 786 if (ret < 0) { 787 return ret; 788 } 789 //RSS reta update will failed when enable flow isolate 790 #ifndef FF_FLOW_ISOLATE 791 if (nb_queues > 1) { 792 /* set HW rss hash function to Toeplitz. */ 793 if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) { 794 struct rte_eth_hash_filter_info info = {0}; 795 info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG; 796 info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ; 797 798 if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH, 799 RTE_ETH_FILTER_SET, &info) < 0) { 800 rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n", 801 port_id); 802 } 803 } 804 805 set_rss_table(port_id, dev_info.reta_size, nb_queues); 806 } 807 #endif 808 809 /* Enable RX in promiscuous mode for the Ethernet device. */ 810 if (ff_global_cfg.dpdk.promiscuous) { 811 ret = rte_eth_promiscuous_enable(port_id); 812 if (ret == 0) { 813 printf("set port %u to promiscuous mode ok\n", port_id); 814 } else { 815 printf("set port %u to promiscuous mode error\n", port_id); 816 } 817 } 818 } 819 } 820 821 if (rte_eal_process_type() == RTE_PROC_PRIMARY) { 822 check_all_ports_link_status(); 823 } 824 825 return 0; 826 } 827 828 static int 829 init_clock(void) 830 { 831 rte_timer_subsystem_init(); 832 uint64_t hz = rte_get_timer_hz(); 833 uint64_t intrs = MS_PER_S/ff_global_cfg.freebsd.hz; 834 uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S*intrs; 835 836 rte_timer_init(&freebsd_clock); 837 rte_timer_reset(&freebsd_clock, tsc, PERIODICAL, 838 rte_lcore_id(), &ff_hardclock_job, NULL); 839 840 ff_update_current_ts(); 841 842 return 0; 843 } 844 845 #ifdef FF_FLOW_ISOLATE 846 /** Print a message out of a flow error. 
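 * Captures rte_errno and returns its negative value so the caller can propagate the failure code.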
*/ 847 static int 848 port_flow_complain(struct rte_flow_error *error) 849 { 850 static const char *const errstrlist[] = { 851 [RTE_FLOW_ERROR_TYPE_NONE] = "no error", 852 [RTE_FLOW_ERROR_TYPE_UNSPECIFIED] = "cause unspecified", 853 [RTE_FLOW_ERROR_TYPE_HANDLE] = "flow rule (handle)", 854 [RTE_FLOW_ERROR_TYPE_ATTR_GROUP] = "group field", 855 [RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY] = "priority field", 856 [RTE_FLOW_ERROR_TYPE_ATTR_INGRESS] = "ingress field", 857 [RTE_FLOW_ERROR_TYPE_ATTR_EGRESS] = "egress field", 858 [RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER] = "transfer field", 859 [RTE_FLOW_ERROR_TYPE_ATTR] = "attributes structure", 860 [RTE_FLOW_ERROR_TYPE_ITEM_NUM] = "pattern length", 861 [RTE_FLOW_ERROR_TYPE_ITEM_SPEC] = "item specification", 862 [RTE_FLOW_ERROR_TYPE_ITEM_LAST] = "item specification range", 863 [RTE_FLOW_ERROR_TYPE_ITEM_MASK] = "item specification mask", 864 [RTE_FLOW_ERROR_TYPE_ITEM] = "specific pattern item", 865 [RTE_FLOW_ERROR_TYPE_ACTION_NUM] = "number of actions", 866 [RTE_FLOW_ERROR_TYPE_ACTION_CONF] = "action configuration", 867 [RTE_FLOW_ERROR_TYPE_ACTION] = "specific action", 868 }; 869 const char *errstr; 870 char buf[32]; 871 int err = rte_errno; 872 873 if ((unsigned int)error->type >= RTE_DIM(errstrlist) || 874 !errstrlist[error->type]) 875 errstr = "unknown type"; 876 else 877 errstr = errstrlist[error->type]; 878 printf("Caught error type %d (%s): %s%s: %s\n", 879 error->type, errstr, 880 error->cause ? (snprintf(buf, sizeof(buf), "cause: %p, ", 881 error->cause), buf) : "", 882 error->message ? error->message : "(no stated reason)", 883 rte_strerror(err)); 884 return -err; 885 } 886 887 static int 888 port_flow_isolate(uint16_t port_id, int set) 889 { 890 struct rte_flow_error error; 891 892 /* Poisoning to make sure PMDs update it in case of error. */ 893 memset(&error, 0x66, sizeof(error)); 894 if (rte_flow_isolate(port_id, set, &error)) 895 return port_flow_complain(&error); 896 printf("Ingress traffic on port %u is %s to the defined flow rules\n", 897 port_id, 898 set ? 
"now restricted" : "not restricted anymore"); 899 return 0; 900 } 901 902 static int 903 create_tcp_flow(uint16_t port_id, uint16_t tcp_port) { 904 struct rte_flow_attr attr = {.ingress = 1}; 905 struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id]; 906 int nb_queues = pconf->nb_lcores; 907 uint16_t queue[RTE_MAX_QUEUES_PER_PORT]; 908 int i = 0, j = 0; 909 for (i = 0, j = 0; i < nb_queues; ++i) 910 queue[j++] = i; 911 struct rte_flow_action_rss rss = { 912 .types = ETH_RSS_NONFRAG_IPV4_TCP, 913 .key_len = rsskey_len, 914 .key = rsskey, 915 .queue_num = j, 916 .queue = queue, 917 }; 918 919 struct rte_eth_dev_info dev_info; 920 int ret = rte_eth_dev_info_get(port_id, &dev_info); 921 if (ret != 0) 922 rte_exit(EXIT_FAILURE, "Error during getting device (port %u) info: %s\n", port_id, strerror(-ret)); 923 924 struct rte_flow_item pattern[3]; 925 struct rte_flow_action action[2]; 926 struct rte_flow_item_tcp tcp_spec; 927 struct rte_flow_item_tcp tcp_mask = { 928 .hdr = { 929 .src_port = RTE_BE16(0x0000), 930 .dst_port = RTE_BE16(0xffff), 931 }, 932 }; 933 struct rte_flow_error error; 934 935 memset(pattern, 0, sizeof(pattern)); 936 memset(action, 0, sizeof(action)); 937 938 /* set the dst ipv4 packet to the required value */ 939 pattern[0].type = RTE_FLOW_ITEM_TYPE_IPV4; 940 941 memset(&tcp_spec, 0, sizeof(struct rte_flow_item_tcp)); 942 tcp_spec.hdr.dst_port = rte_cpu_to_be_16(tcp_port); 943 pattern[1].type = RTE_FLOW_ITEM_TYPE_TCP; 944 pattern[1].spec = &tcp_spec; 945 pattern[1].mask = &tcp_mask; 946 947 /* end the pattern array */ 948 pattern[2].type = RTE_FLOW_ITEM_TYPE_END; 949 950 /* create the action */ 951 action[0].type = RTE_FLOW_ACTION_TYPE_RSS; 952 action[0].conf = &rss; 953 action[1].type = RTE_FLOW_ACTION_TYPE_END; 954 955 struct rte_flow *flow; 956 /* validate and create the flow rule */ 957 if (!rte_flow_validate(port_id, &attr, pattern, action, &error)) { 958 flow = rte_flow_create(port_id, &attr, pattern, action, &error); 959 if (!flow) { 960 return port_flow_complain(&error); 961 } 962 } 963 964 memset(pattern, 0, sizeof(pattern)); 965 966 /* set the dst ipv4 packet to the required value */ 967 pattern[0].type = RTE_FLOW_ITEM_TYPE_IPV4; 968 969 struct rte_flow_item_tcp tcp_src_mask = { 970 .hdr = { 971 .src_port = RTE_BE16(0xffff), 972 .dst_port = RTE_BE16(0x0000), 973 }, 974 }; 975 976 memset(&tcp_spec, 0, sizeof(struct rte_flow_item_tcp)); 977 tcp_spec.hdr.src_port = rte_cpu_to_be_16(tcp_port); 978 pattern[1].type = RTE_FLOW_ITEM_TYPE_TCP; 979 pattern[1].spec = &tcp_spec; 980 pattern[1].mask = &tcp_src_mask; 981 982 /* end the pattern array */ 983 pattern[2].type = RTE_FLOW_ITEM_TYPE_END; 984 985 /* validate and create the flow rule */ 986 if (!rte_flow_validate(port_id, &attr, pattern, action, &error)) { 987 flow = rte_flow_create(port_id, &attr, pattern, action, &error); 988 if (!flow) { 989 return port_flow_complain(&error); 990 } 991 } 992 993 return 1; 994 } 995 996 static int 997 init_flow(uint16_t port_id, uint16_t tcp_port) { 998 // struct ff_flow_cfg fcfg = ff_global_cfg.dpdk.flow_cfgs[0]; 999 1000 // int i; 1001 // for (i = 0; i < fcfg.nb_port; i++) { 1002 // if(!create_tcp_flow(fcfg.port_id, fcfg.tcp_ports[i])) { 1003 // return 0; 1004 // } 1005 // } 1006 1007 if(!create_tcp_flow(port_id, tcp_port)) { 1008 rte_exit(EXIT_FAILURE, "create tcp flow failed\n"); 1009 return -1; 1010 } 1011 1012 /* ARP rule */ 1013 struct rte_flow_attr attr = {.ingress = 1}; 1014 struct rte_flow_action_queue queue = {.index = 0}; 1015 1016 struct rte_flow_item 
pattern_[2]; 1017 struct rte_flow_action action[2]; 1018 struct rte_flow_item_eth eth_type = {.type = RTE_BE16(0x0806)}; 1019 struct rte_flow_item_eth eth_mask = { 1020 .type = RTE_BE16(0xffff) 1021 }; 1022 1023 memset(pattern_, 0, sizeof(pattern_)); 1024 memset(action, 0, sizeof(action)); 1025 1026 pattern_[0].type = RTE_FLOW_ITEM_TYPE_ETH; 1027 pattern_[0].spec = &eth_type; 1028 pattern_[0].mask = &eth_mask; 1029 1030 pattern_[1].type = RTE_FLOW_ITEM_TYPE_END; 1031 1032 /* create the action */ 1033 action[0].type = RTE_FLOW_ACTION_TYPE_QUEUE; 1034 action[0].conf = &queue; 1035 action[1].type = RTE_FLOW_ACTION_TYPE_END; 1036 1037 struct rte_flow *flow; 1038 struct rte_flow_error error; 1039 /* validate and create the flow rule */ 1040 if (!rte_flow_validate(port_id, &attr, pattern_, action, &error)) { 1041 flow = rte_flow_create(port_id, &attr, pattern_, action, &error); 1042 if (!flow) { 1043 return port_flow_complain(&error); 1044 } 1045 } 1046 1047 return 1; 1048 } 1049 1050 #endif 1051 1052 int 1053 ff_dpdk_init(int argc, char **argv) 1054 { 1055 if (ff_global_cfg.dpdk.nb_procs < 1 || 1056 ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE || 1057 ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs || 1058 ff_global_cfg.dpdk.proc_id < 0) { 1059 printf("param num_procs[%d] or proc_id[%d] error!\n", 1060 ff_global_cfg.dpdk.nb_procs, 1061 ff_global_cfg.dpdk.proc_id); 1062 exit(1); 1063 } 1064 1065 int ret = rte_eal_init(argc, argv); 1066 if (ret < 0) { 1067 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); 1068 } 1069 1070 numa_on = ff_global_cfg.dpdk.numa_on; 1071 1072 idle_sleep = ff_global_cfg.dpdk.idle_sleep; 1073 pkt_tx_delay = ff_global_cfg.dpdk.pkt_tx_delay > BURST_TX_DRAIN_US ? \ 1074 BURST_TX_DRAIN_US : ff_global_cfg.dpdk.pkt_tx_delay; 1075 1076 init_lcore_conf(); 1077 1078 init_mem_pool(); 1079 1080 init_dispatch_ring(); 1081 1082 init_msg_ring(); 1083 1084 #ifdef FF_KNI 1085 enable_kni = ff_global_cfg.kni.enable; 1086 if (enable_kni) { 1087 init_kni(); 1088 } 1089 #endif 1090 1091 #ifdef FF_USE_PAGE_ARRAY 1092 ff_mmap_init(); 1093 #endif 1094 1095 #ifdef FF_FLOW_ISOLATE 1096 // run once in primary process 1097 if (0 == lcore_conf.tx_queue_id[0]){ 1098 ret = port_flow_isolate(0, 1); 1099 if (ret < 0) 1100 rte_exit(EXIT_FAILURE, "init_port_isolate failed\n"); 1101 } 1102 #endif 1103 1104 ret = init_port_start(); 1105 if (ret < 0) { 1106 rte_exit(EXIT_FAILURE, "init_port_start failed\n"); 1107 } 1108 1109 init_clock(); 1110 #ifdef FF_FLOW_ISOLATE 1111 // This is only an example usage: port_id=0, tcp_port=80. 1112 // Recommended improvements: 1113 // 1. init_flow should replace `set_rss_table` in the `init_port_start` loop, so that every port in port_id_list is configured instead of only device 0 (port_id). 1114 //2.
using config options `tcp_port` replace magic number of 80 1115 ret = init_flow(0, 80); 1116 if (ret < 0) { 1117 rte_exit(EXIT_FAILURE, "init_port_flow failed\n"); 1118 } 1119 #endif 1120 return 0; 1121 } 1122 1123 static void 1124 ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt) 1125 { 1126 uint8_t rx_csum = ctx->hw_features.rx_csum; 1127 if (rx_csum) { 1128 if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) { 1129 rte_pktmbuf_free(pkt); 1130 return; 1131 } 1132 } 1133 1134 void *data = rte_pktmbuf_mtod(pkt, void*); 1135 uint16_t len = rte_pktmbuf_data_len(pkt); 1136 1137 void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum); 1138 if (hdr == NULL) { 1139 rte_pktmbuf_free(pkt); 1140 return; 1141 } 1142 1143 if (pkt->ol_flags & PKT_RX_VLAN_STRIPPED) { 1144 ff_mbuf_set_vlan_info(hdr, pkt->vlan_tci); 1145 } 1146 1147 struct rte_mbuf *pn = pkt->next; 1148 void *prev = hdr; 1149 while(pn != NULL) { 1150 data = rte_pktmbuf_mtod(pn, void*); 1151 len = rte_pktmbuf_data_len(pn); 1152 1153 void *mb = ff_mbuf_get(prev, pn, data, len); 1154 if (mb == NULL) { 1155 ff_mbuf_free(hdr); 1156 rte_pktmbuf_free(pkt); 1157 return; 1158 } 1159 pn = pn->next; 1160 prev = mb; 1161 } 1162 1163 ff_veth_process_packet(ctx->ifp, hdr); 1164 } 1165 1166 static enum FilterReturn 1167 protocol_filter(const void *data, uint16_t len) 1168 { 1169 if(len < RTE_ETHER_ADDR_LEN) 1170 return FILTER_UNKNOWN; 1171 1172 const struct rte_ether_hdr *hdr; 1173 const struct rte_vlan_hdr *vlanhdr; 1174 hdr = (const struct rte_ether_hdr *)data; 1175 uint16_t ether_type = rte_be_to_cpu_16(hdr->ether_type); 1176 data += RTE_ETHER_HDR_LEN; 1177 len -= RTE_ETHER_HDR_LEN; 1178 1179 if (ether_type == RTE_ETHER_TYPE_VLAN) { 1180 vlanhdr = (struct rte_vlan_hdr *)data; 1181 ether_type = rte_be_to_cpu_16(vlanhdr->eth_proto); 1182 data += sizeof(struct rte_vlan_hdr); 1183 len -= sizeof(struct rte_vlan_hdr); 1184 } 1185 1186 if(ether_type == RTE_ETHER_TYPE_ARP) 1187 return FILTER_ARP; 1188 1189 #ifdef INET6 1190 if (ether_type == RTE_ETHER_TYPE_IPV6) { 1191 return ff_kni_proto_filter(data, 1192 len, ether_type); 1193 } 1194 #endif 1195 1196 #ifndef FF_KNI 1197 return FILTER_UNKNOWN; 1198 #else 1199 if (!enable_kni) { 1200 return FILTER_UNKNOWN; 1201 } 1202 1203 if(ether_type != RTE_ETHER_TYPE_IPV4) 1204 return FILTER_UNKNOWN; 1205 1206 return ff_kni_proto_filter(data, 1207 len, ether_type); 1208 #endif 1209 } 1210 1211 static inline void 1212 pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m) 1213 { 1214 struct rte_mbuf *md; 1215 void *src, *dst; 1216 1217 dst = rte_pktmbuf_mtod(mi, void *); 1218 src = rte_pktmbuf_mtod(m, void *); 1219 1220 mi->data_len = m->data_len; 1221 rte_memcpy(dst, src, m->data_len); 1222 1223 mi->port = m->port; 1224 mi->vlan_tci = m->vlan_tci; 1225 mi->vlan_tci_outer = m->vlan_tci_outer; 1226 mi->tx_offload = m->tx_offload; 1227 mi->hash = m->hash; 1228 mi->ol_flags = m->ol_flags; 1229 mi->packet_type = m->packet_type; 1230 } 1231 1232 /* copied from rte_pktmbuf_clone */ 1233 static inline struct rte_mbuf * 1234 pktmbuf_deep_clone(const struct rte_mbuf *md, 1235 struct rte_mempool *mp) 1236 { 1237 struct rte_mbuf *mc, *mi, **prev; 1238 uint32_t pktlen; 1239 uint8_t nseg; 1240 1241 if (unlikely ((mc = rte_pktmbuf_alloc(mp)) == NULL)) 1242 return NULL; 1243 1244 mi = mc; 1245 prev = &mi->next; 1246 pktlen = md->pkt_len; 1247 nseg = 0; 1248 1249 do { 1250 nseg++; 1251 pktmbuf_deep_attach(mi, md); 1252 *prev = mi; 1253 prev = &mi->next; 1254 } while ((md = 
md->next) != NULL && 1255 (mi = rte_pktmbuf_alloc(mp)) != NULL); 1256 1257 *prev = NULL; 1258 mc->nb_segs = nseg; 1259 mc->pkt_len = pktlen; 1260 1261 /* Allocation of new indirect segment failed */ 1262 if (unlikely (mi == NULL)) { 1263 rte_pktmbuf_free(mc); 1264 return NULL; 1265 } 1266 1267 __rte_mbuf_sanity_check(mc, 1); 1268 return mc; 1269 } 1270 1271 static inline void 1272 process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs, 1273 uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring) 1274 { 1275 struct lcore_conf *qconf = &lcore_conf; 1276 uint16_t nb_queues = qconf->nb_queue_list[port_id]; 1277 1278 uint16_t i; 1279 for (i = 0; i < count; i++) { 1280 struct rte_mbuf *rtem = bufs[i]; 1281 1282 if (unlikely( ff_global_cfg.pcap.enable)) { 1283 if (!pkts_from_ring) { 1284 ff_dump_packets( ff_global_cfg.pcap.save_path, rtem, ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len); 1285 } 1286 } 1287 1288 void *data = rte_pktmbuf_mtod(rtem, void*); 1289 uint16_t len = rte_pktmbuf_data_len(rtem); 1290 1291 if (!pkts_from_ring) { 1292 ff_traffic.rx_packets++; 1293 ff_traffic.rx_bytes += len; 1294 } 1295 1296 if (!pkts_from_ring && packet_dispatcher) { 1297 int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues); 1298 if (ret == FF_DISPATCH_RESPONSE) { 1299 rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len; 1300 1301 /* 1302 * We have not support vlan out strip 1303 */ 1304 if (rtem->vlan_tci) { 1305 data = rte_pktmbuf_prepend(rtem, sizeof(struct rte_vlan_hdr)); 1306 if (data != NULL) { 1307 memmove(data, data + sizeof(struct rte_vlan_hdr), RTE_ETHER_HDR_LEN); 1308 struct rte_ether_hdr *etherhdr = (struct rte_ether_hdr *)data; 1309 struct rte_vlan_hdr *vlanhdr = (struct rte_vlan_hdr *)(data + RTE_ETHER_HDR_LEN); 1310 vlanhdr->vlan_tci = rte_cpu_to_be_16(rtem->vlan_tci); 1311 vlanhdr->eth_proto = etherhdr->ether_type; 1312 etherhdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN); 1313 } 1314 } 1315 send_single_packet(rtem, port_id); 1316 continue; 1317 } 1318 1319 if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) { 1320 rte_pktmbuf_free(rtem); 1321 continue; 1322 } 1323 1324 if (ret != queue_id) { 1325 ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem); 1326 if (ret < 0) 1327 rte_pktmbuf_free(rtem); 1328 1329 continue; 1330 } 1331 } 1332 1333 enum FilterReturn filter = protocol_filter(data, len); 1334 #ifdef INET6 1335 if (filter == FILTER_ARP || filter == FILTER_NDP) { 1336 #else 1337 if (filter == FILTER_ARP) { 1338 #endif 1339 struct rte_mempool *mbuf_pool; 1340 struct rte_mbuf *mbuf_clone; 1341 if (!pkts_from_ring) { 1342 uint16_t j; 1343 for(j = 0; j < nb_queues; ++j) { 1344 if(j == queue_id) 1345 continue; 1346 1347 unsigned socket_id = 0; 1348 if (numa_on) { 1349 uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j]; 1350 socket_id = rte_lcore_to_socket_id(lcore_id); 1351 } 1352 mbuf_pool = pktmbuf_pool[socket_id]; 1353 mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool); 1354 if(mbuf_clone) { 1355 int ret = rte_ring_enqueue(dispatch_ring[port_id][j], 1356 mbuf_clone); 1357 if (ret < 0) 1358 rte_pktmbuf_free(mbuf_clone); 1359 } 1360 } 1361 } 1362 1363 #ifdef FF_KNI 1364 if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) { 1365 mbuf_pool = pktmbuf_pool[qconf->socket_id]; 1366 mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool); 1367 if(mbuf_clone) { 1368 ff_kni_enqueue(port_id, mbuf_clone); 1369 } 1370 } 1371 #endif 1372 ff_veth_input(ctx, rtem); 1373 #ifdef FF_KNI 1374 } else if (enable_kni) { 1375 
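/* With KNI enabled, non-ARP traffic is steered by the runtime knictl action: ALL_TO_KNI forces everything to the kernel, ALL_TO_FF forces everything into F-Stack, and DEFAULT falls back to the kni_accept/protocol-filter decision below. */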
if (knictl_action == FF_KNICTL_ACTION_ALL_TO_KNI){ 1376 ff_kni_enqueue(port_id, rtem); 1377 } else if (knictl_action == FF_KNICTL_ACTION_ALL_TO_FF){ 1378 ff_veth_input(ctx, rtem); 1379 } else if (knictl_action == FF_KNICTL_ACTION_DEFAULT){ 1380 if (enable_kni && 1381 ((filter == FILTER_KNI && kni_accept) || 1382 (filter == FILTER_UNKNOWN && !kni_accept)) ) { 1383 ff_kni_enqueue(port_id, rtem); 1384 } else { 1385 ff_veth_input(ctx, rtem); 1386 } 1387 } else { 1388 ff_veth_input(ctx, rtem); 1389 } 1390 #endif 1391 } else { 1392 ff_veth_input(ctx, rtem); 1393 } 1394 } 1395 } 1396 1397 static inline int 1398 process_dispatch_ring(uint16_t port_id, uint16_t queue_id, 1399 struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx) 1400 { 1401 /* read packet from ring buf and to process */ 1402 uint16_t nb_rb; 1403 nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id], 1404 (void **)pkts_burst, MAX_PKT_BURST, NULL); 1405 1406 if(nb_rb > 0) { 1407 process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1); 1408 } 1409 1410 return 0; 1411 } 1412 1413 static inline void 1414 handle_sysctl_msg(struct ff_msg *msg) 1415 { 1416 int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen, 1417 msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new, 1418 msg->sysctl.newlen); 1419 1420 if (ret < 0) { 1421 msg->result = errno; 1422 } else { 1423 msg->result = 0; 1424 } 1425 } 1426 1427 static inline void 1428 handle_ioctl_msg(struct ff_msg *msg) 1429 { 1430 int fd, ret; 1431 #ifdef INET6 1432 if (msg->msg_type == FF_IOCTL6) { 1433 fd = ff_socket(AF_INET6, SOCK_DGRAM, 0); 1434 } else 1435 #endif 1436 fd = ff_socket(AF_INET, SOCK_DGRAM, 0); 1437 1438 if (fd < 0) { 1439 ret = -1; 1440 goto done; 1441 } 1442 1443 ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data); 1444 1445 ff_close(fd); 1446 1447 done: 1448 if (ret < 0) { 1449 msg->result = errno; 1450 } else { 1451 msg->result = 0; 1452 } 1453 } 1454 1455 static inline void 1456 handle_route_msg(struct ff_msg *msg) 1457 { 1458 int ret = ff_rtioctl(msg->route.fib, msg->route.data, 1459 &msg->route.len, msg->route.maxlen); 1460 if (ret < 0) { 1461 msg->result = errno; 1462 } else { 1463 msg->result = 0; 1464 } 1465 } 1466 1467 static inline void 1468 handle_top_msg(struct ff_msg *msg) 1469 { 1470 msg->top = ff_top_status; 1471 msg->result = 0; 1472 } 1473 1474 #ifdef FF_NETGRAPH 1475 static inline void 1476 handle_ngctl_msg(struct ff_msg *msg) 1477 { 1478 int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data); 1479 if (ret < 0) { 1480 msg->result = errno; 1481 } else { 1482 msg->result = 0; 1483 msg->ngctl.ret = ret; 1484 } 1485 } 1486 #endif 1487 1488 #ifdef FF_IPFW 1489 static inline void 1490 handle_ipfw_msg(struct ff_msg *msg) 1491 { 1492 int fd, ret; 1493 fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW); 1494 if (fd < 0) { 1495 ret = -1; 1496 goto done; 1497 } 1498 1499 switch (msg->ipfw.cmd) { 1500 case FF_IPFW_GET: 1501 ret = ff_getsockopt_freebsd(fd, msg->ipfw.level, 1502 msg->ipfw.optname, msg->ipfw.optval, 1503 msg->ipfw.optlen); 1504 break; 1505 case FF_IPFW_SET: 1506 ret = ff_setsockopt_freebsd(fd, msg->ipfw.level, 1507 msg->ipfw.optname, msg->ipfw.optval, 1508 *(msg->ipfw.optlen)); 1509 break; 1510 default: 1511 ret = -1; 1512 errno = ENOTSUP; 1513 break; 1514 } 1515 1516 ff_close(fd); 1517 1518 done: 1519 if (ret < 0) { 1520 msg->result = errno; 1521 } else { 1522 msg->result = 0; 1523 } 1524 } 1525 #endif 1526 1527 static inline void 1528 handle_traffic_msg(struct ff_msg *msg) 1529 { 1530 msg->traffic = ff_traffic; 1531 
msg->result = 0; 1532 } 1533 1534 #ifdef FF_KNI 1535 static inline void 1536 handle_knictl_msg(struct ff_msg *msg) 1537 { 1538 if (msg->knictl.kni_cmd == FF_KNICTL_CMD_SET){ 1539 switch (msg->knictl.kni_action){ 1540 case FF_KNICTL_ACTION_ALL_TO_FF: knictl_action = FF_KNICTL_ACTION_ALL_TO_FF; msg->result = 0; printf("new kni action: alltoff\n"); break; 1541 case FF_KNICTL_ACTION_ALL_TO_KNI: knictl_action = FF_KNICTL_ACTION_ALL_TO_KNI; msg->result = 0; printf("new kni action: alltokni\n"); break; 1542 case FF_KNICTL_ACTION_DEFAULT: knictl_action = FF_KNICTL_ACTION_DEFAULT; msg->result = 0; printf("new kni action: default\n"); break; 1543 default: msg->result = -1; 1544 } 1545 } 1546 else if (msg->knictl.kni_cmd == FF_KNICTL_CMD_GET){ 1547 msg->knictl.kni_action = knictl_action; 1548 } else { 1549 msg->result = -2; 1550 } 1551 } 1552 #endif 1553 1554 static inline void 1555 handle_default_msg(struct ff_msg *msg) 1556 { 1557 msg->result = ENOTSUP; 1558 } 1559 1560 static inline void 1561 handle_msg(struct ff_msg *msg, uint16_t proc_id) 1562 { 1563 switch (msg->msg_type) { 1564 case FF_SYSCTL: 1565 handle_sysctl_msg(msg); 1566 break; 1567 case FF_IOCTL: 1568 #ifdef INET6 1569 case FF_IOCTL6: 1570 #endif 1571 handle_ioctl_msg(msg); 1572 break; 1573 case FF_ROUTE: 1574 handle_route_msg(msg); 1575 break; 1576 case FF_TOP: 1577 handle_top_msg(msg); 1578 break; 1579 #ifdef FF_NETGRAPH 1580 case FF_NGCTL: 1581 handle_ngctl_msg(msg); 1582 break; 1583 #endif 1584 #ifdef FF_IPFW 1585 case FF_IPFW_CTL: 1586 handle_ipfw_msg(msg); 1587 break; 1588 #endif 1589 case FF_TRAFFIC: 1590 handle_traffic_msg(msg); 1591 break; 1592 #ifdef FF_KNI 1593 case FF_KNICTL: 1594 handle_knictl_msg(msg); 1595 break; 1596 #endif 1597 default: 1598 handle_default_msg(msg); 1599 break; 1600 } 1601 rte_ring_enqueue(msg_ring[proc_id].ring[msg->msg_type], msg); 1602 } 1603 1604 static inline int 1605 process_msg_ring(uint16_t proc_id) 1606 { 1607 void *msg; 1608 int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg); 1609 1610 if (unlikely(ret == 0)) { 1611 handle_msg((struct ff_msg *)msg, proc_id); 1612 } 1613 1614 return 0; 1615 } 1616 1617 /* Send burst of packets on an output interface */ 1618 static inline int 1619 send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port) 1620 { 1621 struct rte_mbuf **m_table; 1622 int ret; 1623 uint16_t queueid; 1624 1625 queueid = qconf->tx_queue_id[port]; 1626 m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table; 1627 1628 if (unlikely(ff_global_cfg.pcap.enable)) { 1629 uint16_t i; 1630 for (i = 0; i < n; i++) { 1631 ff_dump_packets( ff_global_cfg.pcap.save_path, m_table[i], 1632 ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len); 1633 } 1634 } 1635 1636 ret = rte_eth_tx_burst(port, queueid, m_table, n); 1637 ff_traffic.tx_packets += ret; 1638 uint16_t i; 1639 for (i = 0; i < ret; i++) { 1640 ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]); 1641 #ifdef FF_USE_PAGE_ARRAY 1642 if (qconf->tx_mbufs[port].bsd_m_table[i]) 1643 ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs); 1644 #endif 1645 } 1646 if (unlikely(ret < n)) { 1647 do { 1648 rte_pktmbuf_free(m_table[ret]); 1649 #ifdef FF_USE_PAGE_ARRAY 1650 if ( qconf->tx_mbufs[port].bsd_m_table[ret] ) 1651 ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]); 1652 #endif 1653 } while (++ret < n); 1654 } 1655 return 0; 1656 } 1657 1658 /* Enqueue a single packet, and send burst if queue is filled */ 1659 static inline int 1660 send_single_packet(struct rte_mbuf *m, uint8_t port) 1661 
{ 1662 uint16_t len; 1663 struct lcore_conf *qconf; 1664 1665 qconf = &lcore_conf; 1666 len = qconf->tx_mbufs[port].len; 1667 qconf->tx_mbufs[port].m_table[len] = m; 1668 len++; 1669 1670 /* enough pkts to be sent */ 1671 if (unlikely(len == MAX_PKT_BURST)) { 1672 send_burst(qconf, MAX_PKT_BURST, port); 1673 len = 0; 1674 } 1675 1676 qconf->tx_mbufs[port].len = len; 1677 return 0; 1678 } 1679 1680 int 1681 ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m, 1682 int total) 1683 { 1684 #ifdef FF_USE_PAGE_ARRAY 1685 struct lcore_conf *qconf = &lcore_conf; 1686 int len = 0; 1687 1688 len = ff_if_send_onepkt(ctx, m,total); 1689 if (unlikely(len == MAX_PKT_BURST)) { 1690 send_burst(qconf, MAX_PKT_BURST, ctx->port_id); 1691 len = 0; 1692 } 1693 qconf->tx_mbufs[ctx->port_id].len = len; 1694 return 0; 1695 #endif 1696 struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id]; 1697 struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool); 1698 if (head == NULL) { 1699 ff_mbuf_free(m); 1700 return -1; 1701 } 1702 1703 head->pkt_len = total; 1704 head->nb_segs = 0; 1705 1706 int off = 0; 1707 struct rte_mbuf *cur = head, *prev = NULL; 1708 while(total > 0) { 1709 if (cur == NULL) { 1710 cur = rte_pktmbuf_alloc(mbuf_pool); 1711 if (cur == NULL) { 1712 rte_pktmbuf_free(head); 1713 ff_mbuf_free(m); 1714 return -1; 1715 } 1716 } 1717 1718 if (prev != NULL) { 1719 prev->next = cur; 1720 } 1721 head->nb_segs++; 1722 1723 prev = cur; 1724 void *data = rte_pktmbuf_mtod(cur, void*); 1725 int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total; 1726 int ret = ff_mbuf_copydata(m, data, off, len); 1727 if (ret < 0) { 1728 rte_pktmbuf_free(head); 1729 ff_mbuf_free(m); 1730 return -1; 1731 } 1732 1733 1734 cur->data_len = len; 1735 off += len; 1736 total -= len; 1737 cur = NULL; 1738 } 1739 1740 struct ff_tx_offload offload = {0}; 1741 ff_mbuf_tx_offload(m, &offload); 1742 1743 void *data = rte_pktmbuf_mtod(head, void*); 1744 1745 if (offload.ip_csum) { 1746 /* ipv6 not supported yet */ 1747 struct rte_ipv4_hdr *iph; 1748 int iph_len; 1749 iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN); 1750 iph_len = (iph->version_ihl & 0x0f) << 2; 1751 1752 head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4; 1753 head->l2_len = RTE_ETHER_HDR_LEN; 1754 head->l3_len = iph_len; 1755 } 1756 1757 if (ctx->hw_features.tx_csum_l4) { 1758 struct rte_ipv4_hdr *iph; 1759 int iph_len; 1760 iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN); 1761 iph_len = (iph->version_ihl & 0x0f) << 2; 1762 1763 if (offload.tcp_csum) { 1764 head->ol_flags |= PKT_TX_TCP_CKSUM; 1765 head->l2_len = RTE_ETHER_HDR_LEN; 1766 head->l3_len = iph_len; 1767 } 1768 1769 /* 1770 * TCP segmentation offload. 1771 * 1772 * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag 1773 * implies PKT_TX_TCP_CKSUM) 1774 * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6 1775 * - if it's IPv4, set the PKT_TX_IP_CKSUM flag and 1776 * write the IP checksum to 0 in the packet 1777 * - fill the mbuf offload information: l2_len, 1778 * l3_len, l4_len, tso_segsz 1779 * - calculate the pseudo header checksum without taking ip_len 1780 * in account, and set it in the TCP header. Refer to 1781 * rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be 1782 * used as helpers. 
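 * The block below applies these steps for the IPv4 case only; IPv6 TSO is not handled here.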
1783 */ 1784 if (offload.tso_seg_size) { 1785 struct rte_tcp_hdr *tcph; 1786 int tcph_len; 1787 tcph = (struct rte_tcp_hdr *)((char *)iph + iph_len); 1788 tcph_len = (tcph->data_off & 0xf0) >> 2; 1789 tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG); 1790 1791 head->ol_flags |= PKT_TX_TCP_SEG; 1792 head->l4_len = tcph_len; 1793 head->tso_segsz = offload.tso_seg_size; 1794 } 1795 1796 if (offload.udp_csum) { 1797 head->ol_flags |= PKT_TX_UDP_CKSUM; 1798 head->l2_len = RTE_ETHER_HDR_LEN; 1799 head->l3_len = iph_len; 1800 } 1801 } 1802 1803 ff_mbuf_free(m); 1804 1805 return send_single_packet(head, ctx->port_id); 1806 } 1807 1808 static int 1809 main_loop(void *arg) 1810 { 1811 struct loop_routine *lr = (struct loop_routine *)arg; 1812 1813 struct rte_mbuf *pkts_burst[MAX_PKT_BURST]; 1814 uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc; 1815 int i, j, nb_rx, idle; 1816 uint16_t port_id, queue_id; 1817 struct lcore_conf *qconf; 1818 uint64_t drain_tsc = 0; 1819 struct ff_dpdk_if_context *ctx; 1820 1821 if (pkt_tx_delay) { 1822 drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * pkt_tx_delay; 1823 } 1824 1825 prev_tsc = 0; 1826 usch_tsc = 0; 1827 1828 qconf = &lcore_conf; 1829 1830 while (1) { 1831 cur_tsc = rte_rdtsc(); 1832 if (unlikely(freebsd_clock.expire < cur_tsc)) { 1833 rte_timer_manage(); 1834 } 1835 1836 idle = 1; 1837 sys_tsc = 0; 1838 usr_tsc = 0; 1839 1840 /* 1841 * TX burst queue drain 1842 */ 1843 diff_tsc = cur_tsc - prev_tsc; 1844 if (unlikely(diff_tsc >= drain_tsc)) { 1845 for (i = 0; i < qconf->nb_tx_port; i++) { 1846 port_id = qconf->tx_port_id[i]; 1847 if (qconf->tx_mbufs[port_id].len == 0) 1848 continue; 1849 1850 idle = 0; 1851 1852 send_burst(qconf, 1853 qconf->tx_mbufs[port_id].len, 1854 port_id); 1855 qconf->tx_mbufs[port_id].len = 0; 1856 } 1857 1858 prev_tsc = cur_tsc; 1859 } 1860 1861 /* 1862 * Read packet from RX queues 1863 */ 1864 for (i = 0; i < qconf->nb_rx_queue; ++i) { 1865 port_id = qconf->rx_queue_list[i].port_id; 1866 queue_id = qconf->rx_queue_list[i].queue_id; 1867 ctx = veth_ctx[port_id]; 1868 1869 #ifdef FF_KNI 1870 if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) { 1871 ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST); 1872 } 1873 #endif 1874 1875 process_dispatch_ring(port_id, queue_id, pkts_burst, ctx); 1876 1877 nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst, 1878 MAX_PKT_BURST); 1879 if (nb_rx == 0) 1880 continue; 1881 1882 idle = 0; 1883 1884 /* Prefetch first packets */ 1885 for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) { 1886 rte_prefetch0(rte_pktmbuf_mtod( 1887 pkts_burst[j], void *)); 1888 } 1889 1890 /* Prefetch and handle already prefetched packets */ 1891 for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) { 1892 rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[ 1893 j + PREFETCH_OFFSET], void *)); 1894 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0); 1895 } 1896 1897 /* Handle remaining prefetched packets */ 1898 for (; j < nb_rx; j++) { 1899 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0); 1900 } 1901 } 1902 1903 process_msg_ring(qconf->proc_id); 1904 1905 div_tsc = rte_rdtsc(); 1906 1907 if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc >= drain_tsc))) { 1908 usch_tsc = cur_tsc; 1909 lr->loop(lr->arg); 1910 } 1911 1912 idle_sleep_tsc = rte_rdtsc(); 1913 if (likely(idle && idle_sleep)) { 1914 usleep(idle_sleep); 1915 end_tsc = rte_rdtsc(); 1916 } else { 1917 end_tsc = idle_sleep_tsc; 1918 } 1919 1920 if 
(usch_tsc == cur_tsc) { 1921 usr_tsc = idle_sleep_tsc - div_tsc; 1922 } 1923 1924 if (!idle) { 1925 sys_tsc = div_tsc - cur_tsc; 1926 ff_top_status.sys_tsc += sys_tsc; 1927 } 1928 1929 ff_top_status.usr_tsc += usr_tsc; 1930 ff_top_status.work_tsc += end_tsc - cur_tsc; 1931 ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc; 1932 1933 ff_top_status.loops++; 1934 } 1935 1936 return 0; 1937 } 1938 1939 int 1940 ff_dpdk_if_up(void) { 1941 int i; 1942 struct lcore_conf *qconf = &lcore_conf; 1943 for (i = 0; i < qconf->nb_tx_port; i++) { 1944 uint16_t port_id = qconf->tx_port_id[i]; 1945 1946 struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id]; 1947 veth_ctx[port_id] = ff_veth_attach(pconf); 1948 if (veth_ctx[port_id] == NULL) { 1949 rte_exit(EXIT_FAILURE, "ff_veth_attach failed"); 1950 } 1951 } 1952 1953 return 0; 1954 } 1955 1956 void 1957 ff_dpdk_run(loop_func_t loop, void *arg) { 1958 struct loop_routine *lr = rte_malloc(NULL, 1959 sizeof(struct loop_routine), 0); 1960 lr->loop = loop; 1961 lr->arg = arg; 1962 rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER); 1963 rte_eal_mp_wait_lcore(); 1964 rte_free(lr); 1965 } 1966 1967 void 1968 ff_dpdk_pktmbuf_free(void *m) 1969 { 1970 rte_pktmbuf_free_seg((struct rte_mbuf *)m); 1971 } 1972 1973 static uint32_t 1974 toeplitz_hash(unsigned keylen, const uint8_t *key, 1975 unsigned datalen, const uint8_t *data) 1976 { 1977 uint32_t hash = 0, v; 1978 u_int i, b; 1979 1980 /* XXXRW: Perhaps an assertion about key length vs. data length? */ 1981 1982 v = (key[0]<<24) + (key[1]<<16) + (key[2] <<8) + key[3]; 1983 for (i = 0; i < datalen; i++) { 1984 for (b = 0; b < 8; b++) { 1985 if (data[i] & (1<<(7-b))) 1986 hash ^= v; 1987 v <<= 1; 1988 if ((i + 4) < keylen && 1989 (key[i+4] & (1<<(7-b)))) 1990 v |= 1; 1991 } 1992 } 1993 return (hash); 1994 } 1995 1996 int 1997 ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr, 1998 uint16_t sport, uint16_t dport) 1999 { 2000 struct lcore_conf *qconf = &lcore_conf; 2001 struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc); 2002 uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id]; 2003 2004 if (nb_queues <= 1) { 2005 return 1; 2006 } 2007 2008 uint16_t reta_size = rss_reta_size[ctx->port_id]; 2009 uint16_t queueid = qconf->tx_queue_id[ctx->port_id]; 2010 2011 uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) + 2012 sizeof(dport)]; 2013 2014 unsigned datalen = 0; 2015 2016 bcopy(&saddr, &data[datalen], sizeof(saddr)); 2017 datalen += sizeof(saddr); 2018 2019 bcopy(&daddr, &data[datalen], sizeof(daddr)); 2020 datalen += sizeof(daddr); 2021 2022 bcopy(&sport, &data[datalen], sizeof(sport)); 2023 datalen += sizeof(sport); 2024 2025 bcopy(&dport, &data[datalen], sizeof(dport)); 2026 datalen += sizeof(dport); 2027 2028 uint32_t hash = 0; 2029 hash = toeplitz_hash(rsskey_len, rsskey, datalen, data); 2030 2031 return ((hash & (reta_size - 1)) % nb_queues) == queueid; 2032 } 2033 2034 void 2035 ff_regist_packet_dispatcher(dispatch_func_t func) 2036 { 2037 packet_dispatcher = func; 2038 } 2039 2040 uint64_t 2041 ff_get_tsc_ns() 2042 { 2043 uint64_t cur_tsc = rte_rdtsc(); 2044 uint64_t hz = rte_get_tsc_hz(); 2045 return ((double)cur_tsc/(double)hz) * NS_PER_S; 2046 } 2047 2048
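/*
 * Typical call order for the entry points defined in this file (a minimal
 * illustrative sketch, not part of the build; loop_fn and loop_arg are
 * placeholder names, and applications normally go through the ff_api
 * wrappers rather than calling these directly):
 *
 *     ff_dpdk_init(dpdk_argc, dpdk_argv);   // EAL, mbuf pools, rings, ports
 *     ff_dpdk_if_up();                      // attach an ff_veth per TX port
 *     ff_dpdk_run(loop_fn, loop_arg);       // launch main_loop on the lcores
 */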