1 /* 2 * Copyright (C) 2017 THL A29 Limited, a Tencent company. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * 8 * 1. Redistributions of source code must retain the above copyright notice, this 9 * list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright notice, 11 * this list of conditions and the following disclaimer in the documentation 12 * and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 *
 */
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_memory.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_common.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_eth_bond.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"
#include "ff_memory.h"

#ifdef FF_KNI
/* Mbuf headroom reserved per KNI port and size of each KNI queue. */
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

int enable_kni;                                      /* set from ff_global_cfg.kni.enable in ff_dpdk_init() */
static int kni_accept;                               /* non-zero when kni.method == "accept" */
static int knictl_action = FF_KNICTL_ACTION_DEFAULT; /* runtime KNI steering policy */
#endif

static int numa_on;              /* NUMA-aware pool/queue placement, from config */

static unsigned idle_sleep;      /* microseconds to sleep when the loop is idle, from config */
static unsigned pkt_tx_delay;    /* TX drain delay, clamped to BURST_TX_DRAIN_US in ff_dpdk_init() */

/* Periodic rte_timer that drives the FreeBSD stack's hardclock tick. */
static struct rte_timer freebsd_clock;

// Mellanox Linux's driver key
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

/* 52-byte RSS key, selected when dev_info.hash_key_size == 52 (see init_port_start). */
static uint8_t default_rsskey_52bytes[52] = {
    0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23,
    0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30,
    0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02,
    0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c,
    0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55,
    0x7d,
          0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e,
    0x81, 0x15, 0x03, 0x66
};

/* Repeating 0x6d5a byte pair: with Toeplitz hashing this yields symmetric
 * RSS (src/dst swapped flows land on the same queue). Selected via config. */
static uint8_t symmetric_rsskey[52] = {
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a
};

/* Active RSS key; may be switched to the 52-byte or symmetric key at port init. */
static int rsskey_len = sizeof(default_rsskey_40bytes);
static uint8_t *rsskey = default_rsskey_40bytes;

/* Per-process lcore state: queues owned by this process, socket id, etc. */
struct lcore_conf lcore_conf;

/* One pktmbuf pool per NUMA socket. */
struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

/* dispatch_ring[port][queue]: rings used to hand packets between lcores. */
static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

/* RSS redirection-table size reported by each port's driver. */
static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

#define BOND_DRIVER_NAME "net_bonding"

static inline int send_single_packet(struct rte_mbuf *m, uint8_t port);

struct ff_msg_ring {
    char ring_name[FF_MSG_NUM][RTE_RING_NAMESIZE];
    /* ring[0] for lcore recv msg, other send */
    /* ring[1] for lcore send msg, other read */
    struct rte_ring *ring[FF_MSG_NUM];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;
static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

static struct ff_top_args ff_top_status;
static struct ff_traffic_args ff_traffic;
extern void ff_hardclock(void);

/* rte_timer callback: advance the FreeBSD stack's clock tick and refresh
 * the cached current timestamp. Armed as PERIODICAL in init_clock(). */
static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
    ff_update_current_ts();
}

/*
 * Bind a FreeBSD softc/ifnet pair to a DPDK port described by cfg.
 * Returns a heap-allocated context (caller releases it with
 * ff_dpdk_deregister_if), or NULL on allocation failure.
 */
struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id =
cfg->port_id; 162 ctx->hw_features = cfg->hw_features; 163 164 return ctx; 165 } 166 167 void 168 ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx) 169 { 170 free(ctx); 171 } 172 173 static void 174 check_all_ports_link_status(void) 175 { 176 #define CHECK_INTERVAL 100 /* 100ms */ 177 #define MAX_CHECK_TIME 90 /* 9s (90 * 100ms) in total */ 178 179 uint16_t portid; 180 uint8_t count, all_ports_up, print_flag = 0; 181 struct rte_eth_link link; 182 183 printf("\nChecking link status"); 184 fflush(stdout); 185 186 int i, nb_ports; 187 nb_ports = ff_global_cfg.dpdk.nb_ports; 188 for (count = 0; count <= MAX_CHECK_TIME; count++) { 189 all_ports_up = 1; 190 for (i = 0; i < nb_ports; i++) { 191 uint16_t portid = ff_global_cfg.dpdk.portid_list[i]; 192 memset(&link, 0, sizeof(link)); 193 rte_eth_link_get_nowait(portid, &link); 194 195 /* print link status if flag set */ 196 if (print_flag == 1) { 197 if (link.link_status) { 198 printf("Port %d Link Up - speed %u " 199 "Mbps - %s\n", (int)portid, 200 (unsigned)link.link_speed, 201 (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? 
202 ("full-duplex") : ("half-duplex\n")); 203 } else { 204 printf("Port %d Link Down\n", (int)portid); 205 } 206 continue; 207 } 208 /* clear all_ports_up flag if any link down */ 209 if (link.link_status == 0) { 210 all_ports_up = 0; 211 break; 212 } 213 } 214 215 /* after finally printing all link status, get out */ 216 if (print_flag == 1) 217 break; 218 219 if (all_ports_up == 0) { 220 printf("."); 221 fflush(stdout); 222 rte_delay_ms(CHECK_INTERVAL); 223 } 224 225 /* set the print_flag if all ports up or timeout */ 226 if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) { 227 print_flag = 1; 228 printf("done\n"); 229 } 230 } 231 } 232 233 static int 234 init_lcore_conf(void) 235 { 236 uint8_t nb_dev_ports = rte_eth_dev_count_avail(); 237 if (nb_dev_ports == 0) { 238 rte_exit(EXIT_FAILURE, "No probed ethernet devices\n"); 239 } 240 241 if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) { 242 rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n", 243 ff_global_cfg.dpdk.max_portid); 244 } 245 246 lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs; 247 lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id; 248 249 uint16_t socket_id = 0; 250 if (numa_on) { 251 socket_id = rte_lcore_to_socket_id(rte_lcore_id()); 252 } 253 254 lcore_conf.socket_id = socket_id; 255 256 uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id]; 257 if (!rte_lcore_is_enabled(lcore_id)) { 258 rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id); 259 } 260 261 int j; 262 for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) { 263 uint16_t port_id = ff_global_cfg.dpdk.portid_list[j]; 264 struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id]; 265 266 int queueid = -1; 267 int i; 268 for (i = 0; i < pconf->nb_lcores; i++) { 269 if (pconf->lcore_list[i] == lcore_id) { 270 queueid = i; 271 } 272 } 273 if (queueid < 0) { 274 continue; 275 } 276 printf("lcore: %u, port: %u, queue: %u\n", lcore_id, port_id, queueid); 277 uint16_t nb_rx_queue = 
lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        /* Enable pcap dump */
        /* NOTE(review): called once per owned port; presumably idempotent —
         * confirm ff_enable_pcap tolerates repeat calls. */
        if (ff_global_cfg.pcap.enable) {
            ff_enable_pcap(ff_global_cfg.pcap.save_path, ff_global_cfg.pcap.snap_len);
        }

        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}

/*
 * Create (primary) or look up (secondary) the per-socket pktmbuf pools.
 * The nb_mbuf formula is a worst-case sizing heuristic covering RX/TX
 * descriptors, per-lcore bursts, cache slots, dispatch rings and
 * (optionally) KNI queues, rounded up to a multiple of 8192.
 */
static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;
    uint16_t max_portid = ff_global_cfg.dpdk.max_portid;

    unsigned nb_mbuf = RTE_ALIGN_CEIL (
        (nb_rx_queue * (max_portid + 1) * 2 * RX_QUEUE_SIZE +
        nb_ports * (max_portid + 1) * 2 * nb_lcores * MAX_PKT_BURST +
        nb_ports * (max_portid + 1) * 2 * nb_tx_queue * TX_QUEUE_SIZE +
        nb_lcores * MEMPOOL_CACHE_SIZE +
#ifdef FF_KNI
        nb_ports * KNI_MBUF_MAX +
        nb_ports * KNI_QUEUE_SIZE +
#endif
        nb_lcores * nb_ports * DISPATCH_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    /* One pool per NUMA socket that hosts at least one worker lcore. */
    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
                socketid, i, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        /* Primary creates the pool; secondaries attach by name. */
        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d",
socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }

#ifdef FF_USE_PAGE_ARRAY
        /* Smaller companion pool used by the page-array TX path. */
        nb_mbuf = RTE_ALIGN_CEIL (
            nb_ports*nb_lcores*MAX_PKT_BURST +
            nb_ports*nb_tx_queue*TX_QUEUE_SIZE +
            nb_lcores*MEMPOOL_CACHE_SIZE,
            (unsigned)4096);
        ff_init_ref_pool(nb_mbuf, socketid);
#endif
    }

    return 0;
}

/*
 * Create (primary) or look up (secondary) a named rte_ring.
 * Never returns NULL: any failure terminates the process.
 */
static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL) {
        rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ring = rte_ring_create(name, count, socket_id, flags);
    } else {
        ring = rte_ring_lookup(name);
    }

    if (ring == NULL) {
        rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
    }

    return ring;
}

/*
 * Allocate the per-port array of dispatch rings and create one
 * single-consumer ring per (port, queue) for inter-lcore packet handoff.
 */
static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create ring according to ports actually being used.
     */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        /* One SC-dequeue ring per queue: only the owning lcore dequeues. */
        for(queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

/*
 * rte_mempool object constructor: lay out each element as a ff_msg header
 * followed by its payload buffer (the remainder of the element).
 */
static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
    msg->original_buf = NULL;
    msg->original_buf_len = 0;
}

/*
 * Create the shared message pool and, per process, one inbound ring
 * (ring[0]) plus one outbound ring per message class (FF_SYSCTL..).
 */
static int
init_msg_ring(void)
{
    uint16_t i, j;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
            MSG_RING_SIZE * 2 * nb_procs,
            MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
            NULL, NULL,
            ff_msg_init, NULL,
            socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for(i = 0; i < nb_procs; ++i) {
        /* ring[0]: requests into this lcore (SP enq / SC deq). */
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[0]);

        /* One reply ring per message type from FF_SYSCTL upward. */
        for (j = FF_SYSCTL; j < FF_MSG_NUM; j++) {
            snprintf(msg_ring[i].ring_name[j], RTE_RING_NAMESIZE,
                "%s%u_%u", FF_MSG_RING_OUT, i, j);
            msg_ring[i].ring[j] = create_ring(msg_ring[i].ring_name[j],
                MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
            if (msg_ring[i].ring[j] == NULL)
                rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[j]);
        }
    }

    return 0;
}

#ifdef FF_KNI

/* Map the configured kni_action string to its enum; unknown or NULL
 * strings fall back to FF_KNICTL_ACTION_DEFAULT. */
static enum FF_KNICTL_CMD get_kni_action(const char *c){
    if (!c)
        return FF_KNICTL_ACTION_DEFAULT;
    if (0 == strcasecmp(c, "alltokni")){
        return FF_KNICTL_ACTION_ALL_TO_KNI;
    } else if (0 == strcasecmp(c, "alltoff")){
        return FF_KNICTL_ACTION_ALL_TO_FF;
    } else if (0 == strcasecmp(c, "default")){
        return FF_KNICTL_ACTION_DEFAULT;
    } else {
        return FF_KNICTL_ACTION_DEFAULT;
    }
}

/*
 * Initialise the KNI subsystem for all probed devices and allocate one
 * KNI interface per configured port.
 */
static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count_avail();
    kni_accept = 0;
    if(strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    knictl_action = get_kni_action(ff_global_cfg.kni.kni_action);

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    /* Switch from probed-device count to the configured port list. */
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i, ret;
    for (i = 0; i < nb_ports; i++) {
        uint16_t
        port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
#endif

//RSS reta update will failed when enable flow isolate
#ifndef FF_FLOW_ISOLATE
/*
 * Program the NIC's RSS indirection table so that reta entries are
 * spread round-robin across the nb_queues RX queues.
 */
static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash=0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}
#endif

/*
 * Configure and start every configured port (and, for bonded ports, each
 * slave first): RSS, offloads, RX/TX queues, then device start. Only the
 * primary process performs device configuration; secondaries skip it.
 * Returns 0 on success or the first negative DPDK error code.
 */
static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i, j;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id, u_port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[u_port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        /* Iterate the slaves first (j < nb_slaves), then the port itself. */
        for (j=0; j<=pconf->nb_slaves; j++) {
            if (j < pconf->nb_slaves) {
                port_id = pconf->slave_portid_list[j];
                printf("To init %s's %d'st slave port[%d]\n",
                    ff_global_cfg.dpdk.bond_cfgs->name,
                    j, port_id);
            } else {
                port_id = u_port_id;
            }

            struct rte_eth_dev_info dev_info;
            struct rte_eth_conf port_conf = {0};
            struct rte_eth_rxconf rxq_conf;
            struct rte_eth_txconf txq_conf;

            int ret = rte_eth_dev_info_get(port_id, &dev_info);
            if (ret != 0)
                rte_exit(EXIT_FAILURE,
                    "Error during getting device (port %u) info: %s\n",
                    port_id, strerror(-ret));

            if (nb_queues >
                dev_info.max_rx_queues) {
                rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_rx_queues);
            }

            if (nb_queues > dev_info.max_tx_queues) {
                rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_tx_queues);
            }

            struct rte_ether_addr addr;
            rte_eth_macaddr_get(port_id, &addr);
            printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                (unsigned)port_id,
                addr.addr_bytes[0], addr.addr_bytes[1],
                addr.addr_bytes[2], addr.addr_bytes[3],
                addr.addr_bytes[4], addr.addr_bytes[5]);

            rte_memcpy(pconf->mac,
                addr.addr_bytes, RTE_ETHER_ADDR_LEN);

            /* Set RSS mode */
            uint64_t default_rss_hf = ETH_RSS_PROTO_MASK;
            port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
            port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf;
            /* NOTE: writes module-level rsskey/rsskey_len; the last port
             * initialised decides the key used by create_tcp_flow(). */
            if (dev_info.hash_key_size == 52) {
                rsskey = default_rsskey_52bytes;
                rsskey_len = 52;
            }
            if (ff_global_cfg.dpdk.symmetric_rss) {
                printf("Use symmetric Receive-side Scaling(RSS) key\n");
                rsskey = symmetric_rsskey;
            }
            port_conf.rx_adv_conf.rss_conf.rss_key = rsskey;
            port_conf.rx_adv_conf.rss_conf.rss_key_len = rsskey_len;
            /* Keep only hash types the hardware actually supports. */
            port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads;
            if (port_conf.rx_adv_conf.rss_conf.rss_hf !=
                ETH_RSS_PROTO_MASK) {
                printf("Port %u modified RSS hash function based on hardware support,"
                    "requested:%#"PRIx64" configured:%#"PRIx64"\n",
                    port_id, default_rss_hf,
                    port_conf.rx_adv_conf.rss_conf.rss_hf);
            }

            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) {
                port_conf.txmode.offloads |=
                    DEV_TX_OFFLOAD_MBUF_FAST_FREE;
            }

            /* Set Rx VLAN stripping */
            if (ff_global_cfg.dpdk.vlan_strip) {
                if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                    port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
                }
            }

            /* Enable HW CRC stripping */
            port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_KEEP_CRC;

            /* FIXME: Enable TCP LRO ?*/
#if 0
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
                printf("LRO is supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
                pconf->hw_features.rx_lro = 1;
            }
#endif

            /* Set Rx checksum checking */
            /* All three (IP/UDP/TCP) must be supported before enabling. */
            if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
                printf("RX checksum offload supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
                pconf->hw_features.rx_csum = 1;
            }

            if (ff_global_cfg.dpdk.tx_csum_offoad_skip == 0) {
                if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
                    printf("TX ip checksum offload supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
                    pconf->hw_features.tx_csum_ip = 1;
                }

                if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
                    (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
                    printf("TX TCP&UDP checksum offload supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM;
                    pconf->hw_features.tx_csum_l4 = 1;
                }
            } else {
                printf("TX checksum offoad is disabled\n");
            }

            if (ff_global_cfg.dpdk.tso) {
                if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                    printf("TSO is supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
                    pconf->hw_features.tx_tso = 1;
                }
            } else {
                printf("TSO is disabled\n");
            }

            if (dev_info.reta_size) {
                /* reta size must be power of 2 */
                assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

                rss_reta_size[port_id] = dev_info.reta_size;
                printf("port[%d]: rss table size: %d\n", port_id,
                    dev_info.reta_size);
            }

            /* Secondary processes attach only; no device (re)configuration. */
            if (rte_eal_process_type() !=
                RTE_PROC_PRIMARY) {
                continue;
            }

            ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
            if (ret != 0) {
                return ret;
            }

            /* static: the adjusted descriptor counts persist across ports.
             * NOTE(review): values adjusted for one port are reused for the
             * next — presumably acceptable since all ports share the same
             * configured sizes; confirm for heterogeneous NICs. */
            static uint16_t nb_rxd = RX_QUEUE_SIZE;
            static uint16_t nb_txd = TX_QUEUE_SIZE;
            ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
            if (ret < 0)
                printf("Could not adjust number of descriptors "
                    "for port%u (%d)\n", (unsigned)port_id, ret);

            /* One TX and one RX queue per serving lcore, placed on the
             * lcore's NUMA socket when numa_on is set. */
            uint16_t q;
            for (q = 0; q < nb_queues; q++) {
                if (numa_on) {
                    uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                    socketid = rte_lcore_to_socket_id(lcore_id);
                }
                mbuf_pool = pktmbuf_pool[socketid];

                txq_conf = dev_info.default_txconf;
                txq_conf.offloads = port_conf.txmode.offloads;
                ret = rte_eth_tx_queue_setup(port_id, q, nb_txd,
                    socketid, &txq_conf);
                if (ret < 0) {
                    return ret;
                }

                rxq_conf = dev_info.default_rxconf;
                rxq_conf.offloads = port_conf.rxmode.offloads;
                ret = rte_eth_rx_queue_setup(port_id, q, nb_rxd,
                    socketid, &rxq_conf, mbuf_pool);
                if (ret < 0) {
                    return ret;
                }
            }


            /* Bonding device: report its MAC, mode and slave list. */
            if (strncmp(dev_info.driver_name, BOND_DRIVER_NAME,
                strlen(dev_info.driver_name)) == 0) {

                rte_eth_macaddr_get(port_id, &addr);
                printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                    " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                    (unsigned)port_id,
                    addr.addr_bytes[0], addr.addr_bytes[1],
                    addr.addr_bytes[2], addr.addr_bytes[3],
                    addr.addr_bytes[4], addr.addr_bytes[5]);

                rte_memcpy(pconf->mac,
                    addr.addr_bytes, RTE_ETHER_ADDR_LEN);

                int mode, count, x;
                uint16_t slaves[RTE_MAX_ETHPORTS], len = RTE_MAX_ETHPORTS;

                mode = rte_eth_bond_mode_get(port_id);
                printf("Port %u, bond mode:%d\n", port_id, mode);

                count = rte_eth_bond_slaves_get(port_id, slaves, len);
                printf("Port %u, %s's slave ports count:%d\n", port_id,
                    ff_global_cfg.dpdk.bond_cfgs->name, count);
                for
                (x=0; x<count; x++) {
                    printf("Port %u, %s's slave port[%u]\n", port_id,
                        ff_global_cfg.dpdk.bond_cfgs->name, slaves[x]);
                }
            }

            ret = rte_eth_dev_start(port_id);
            if (ret < 0) {
                return ret;
            }
            //RSS reta update will failed when enable flow isolate
#ifndef FF_FLOW_ISOLATE
            if (nb_queues > 1) {
                /* set HW rss hash function to Toeplitz. */
                /* NOTE(review): legacy filter API (deprecated in newer DPDK);
                 * kept for the DPDK version this file targets. */
                if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                    struct rte_eth_hash_filter_info info = {0};
                    info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                    info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                    if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                        RTE_ETH_FILTER_SET, &info) < 0) {
                        rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                            port_id);
                    }
                }

                set_rss_table(port_id, dev_info.reta_size, nb_queues);
            }
#endif

            /* Enable RX in promiscuous mode for the Ethernet device. */
            if (ff_global_cfg.dpdk.promiscuous) {
                ret = rte_eth_promiscuous_enable(port_id);
                if (ret == 0) {
                    printf("set port %u to promiscuous mode ok\n", port_id);
                } else {
                    printf("set port %u to promiscuous mode error\n", port_id);
                }
            }
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

/*
 * Arm the periodic freebsd_clock timer so ff_hardclock_job() fires at the
 * FreeBSD stack's configured hz. tsc = TSC cycles per interval, computed
 * from ms-per-tick (intrs) and the timer frequency rounded up per ms.
 */
static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S/ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S*intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}

#ifdef FF_FLOW_ISOLATE
/** Print a message out of a flow error.
 *
 * Decodes error->type into a human-readable string, prints cause/message,
 * and returns -rte_errno (captured before printing can clobber it).
 */
static int
port_flow_complain(struct rte_flow_error *error)
{
    static const char *const errstrlist[] = {
        [RTE_FLOW_ERROR_TYPE_NONE] = "no error",
        [RTE_FLOW_ERROR_TYPE_UNSPECIFIED] = "cause unspecified",
        [RTE_FLOW_ERROR_TYPE_HANDLE] = "flow rule (handle)",
        [RTE_FLOW_ERROR_TYPE_ATTR_GROUP] = "group field",
        [RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY] = "priority field",
        [RTE_FLOW_ERROR_TYPE_ATTR_INGRESS] = "ingress field",
        [RTE_FLOW_ERROR_TYPE_ATTR_EGRESS] = "egress field",
        [RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER] = "transfer field",
        [RTE_FLOW_ERROR_TYPE_ATTR] = "attributes structure",
        [RTE_FLOW_ERROR_TYPE_ITEM_NUM] = "pattern length",
        [RTE_FLOW_ERROR_TYPE_ITEM_SPEC] = "item specification",
        [RTE_FLOW_ERROR_TYPE_ITEM_LAST] = "item specification range",
        [RTE_FLOW_ERROR_TYPE_ITEM_MASK] = "item specification mask",
        [RTE_FLOW_ERROR_TYPE_ITEM] = "specific pattern item",
        [RTE_FLOW_ERROR_TYPE_ACTION_NUM] = "number of actions",
        [RTE_FLOW_ERROR_TYPE_ACTION_CONF] = "action configuration",
        [RTE_FLOW_ERROR_TYPE_ACTION] = "specific action",
    };
    const char *errstr;
    char buf[32];
    int err = rte_errno;

    if ((unsigned int)error->type >= RTE_DIM(errstrlist) ||
        !errstrlist[error->type])
        errstr = "unknown type";
    else
        errstr = errstrlist[error->type];
    printf("Caught error type %d (%s): %s%s: %s\n",
        error->type, errstr,
        error->cause ? (snprintf(buf, sizeof(buf), "cause: %p, ",
            error->cause), buf) : "",
        error->message ? error->message : "(no stated reason)",
        rte_strerror(err));
    return -err;
}

/*
 * Toggle flow isolation on a port: when set, only traffic matching
 * explicit flow rules reaches the application queues.
 */
static int
port_flow_isolate(uint16_t port_id, int set)
{
    struct rte_flow_error error;

    /* Poisoning to make sure PMDs update it in case of error.
     */
    memset(&error, 0x66, sizeof(error));
    if (rte_flow_isolate(port_id, set, &error))
        return port_flow_complain(&error);
    printf("Ingress traffic on port %u is %s to the defined flow rules\n",
        port_id,
        set ? "now restricted" : "not restricted anymore");
    return 0;
}

/*
 * Install two rte_flow rules RSS-spreading IPv4/TCP traffic whose
 * destination (rule 1) or source (rule 2) port equals tcp_port across all
 * configured queues. Returns 1 on success, or port_flow_complain()'s
 * negative code when rule creation fails after validation.
 */
static int
create_tcp_flow(uint16_t port_id, uint16_t tcp_port) {
    struct rte_flow_attr attr = {.ingress = 1};
    struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
    int nb_queues = pconf->nb_lcores;
    uint16_t queue[RTE_MAX_QUEUES_PER_PORT];
    int i = 0, j = 0;
    for (i = 0, j = 0; i < nb_queues; ++i)
        queue[j++] = i;
    /* Uses the module-level RSS key chosen during init_port_start(). */
    struct rte_flow_action_rss rss = {
        .types = ETH_RSS_NONFRAG_IPV4_TCP,
        .key_len = rsskey_len,
        .key = rsskey,
        .queue_num = j,
        .queue = queue,
    };

    struct rte_eth_dev_info dev_info;
    int ret = rte_eth_dev_info_get(port_id, &dev_info);
    if (ret != 0)
        rte_exit(EXIT_FAILURE, "Error during getting device (port %u) info: %s\n", port_id, strerror(-ret));

    struct rte_flow_item pattern[3];
    struct rte_flow_action action[2];
    struct rte_flow_item_tcp tcp_spec;
    /* Match on destination port only. */
    struct rte_flow_item_tcp tcp_mask = {
        .hdr = {
            .src_port = RTE_BE16(0x0000),
            .dst_port = RTE_BE16(0xffff),
        },
    };
    struct rte_flow_error error;

    memset(pattern, 0, sizeof(pattern));
    memset(action, 0, sizeof(action));

    /* set the dst ipv4 packet to the required value */
    pattern[0].type = RTE_FLOW_ITEM_TYPE_IPV4;

    memset(&tcp_spec, 0, sizeof(struct rte_flow_item_tcp));
    tcp_spec.hdr.dst_port = rte_cpu_to_be_16(tcp_port);
    pattern[1].type = RTE_FLOW_ITEM_TYPE_TCP;
    pattern[1].spec = &tcp_spec;
    pattern[1].mask = &tcp_mask;

    /* end the pattern array */
    pattern[2].type = RTE_FLOW_ITEM_TYPE_END;

    /* create the action */
    action[0].type = RTE_FLOW_ACTION_TYPE_RSS;
    action[0].conf = &rss;
    action[1].type = RTE_FLOW_ACTION_TYPE_END;
    struct rte_flow *flow;
    /* validate and create the flow rule */
    if (!rte_flow_validate(port_id, &attr, pattern, action, &error)) {
        flow = rte_flow_create(port_id, &attr, pattern, action, &error);
        if (!flow) {
            return port_flow_complain(&error);
        }
    }

    /* Second rule: same RSS action, matching on the TCP *source* port. */
    memset(pattern, 0, sizeof(pattern));

    /* set the dst ipv4 packet to the required value */
    pattern[0].type = RTE_FLOW_ITEM_TYPE_IPV4;

    struct rte_flow_item_tcp tcp_src_mask = {
        .hdr = {
            .src_port = RTE_BE16(0xffff),
            .dst_port = RTE_BE16(0x0000),
        },
    };

    memset(&tcp_spec, 0, sizeof(struct rte_flow_item_tcp));
    tcp_spec.hdr.src_port = rte_cpu_to_be_16(tcp_port);
    pattern[1].type = RTE_FLOW_ITEM_TYPE_TCP;
    pattern[1].spec = &tcp_spec;
    pattern[1].mask = &tcp_src_mask;

    /* end the pattern array */
    pattern[2].type = RTE_FLOW_ITEM_TYPE_END;

    /* validate and create the flow rule */
    if (!rte_flow_validate(port_id, &attr, pattern, action, &error)) {
        flow = rte_flow_create(port_id, &attr, pattern, action, &error);
        if (!flow) {
            return port_flow_complain(&error);
        }
    }

    return 1;
}

/*
 * Under flow isolation, admit only the flows F-Stack needs: the TCP port
 * handled by create_tcp_flow() plus ARP (ethertype 0x0806) steered to
 * queue 0. Exits the process if the TCP rule cannot be installed.
 */
static int
init_flow(uint16_t port_id, uint16_t tcp_port) {
    // struct ff_flow_cfg fcfg = ff_global_cfg.dpdk.flow_cfgs[0];

    // int i;
    // for (i = 0; i < fcfg.nb_port; i++) {
    //     if(!create_tcp_flow(fcfg.port_id, fcfg.tcp_ports[i])) {
    //         return 0;
    //     }
    // }

    if(!create_tcp_flow(port_id, tcp_port)) {
        rte_exit(EXIT_FAILURE, "create tcp flow failed\n");
        return -1;
    }

    /* ARP rule */
    struct rte_flow_attr attr = {.ingress = 1};
    struct rte_flow_action_queue queue = {.index = 0};

    struct rte_flow_item pattern_[2];
    struct rte_flow_action action[2];
    struct rte_flow_item_eth eth_type = {.type = RTE_BE16(0x0806)};
    struct rte_flow_item_eth eth_mask = {
        .type = RTE_BE16(0xffff)
    };
    memset(pattern_, 0, sizeof(pattern_));
    memset(action, 0, sizeof(action));

    pattern_[0].type = RTE_FLOW_ITEM_TYPE_ETH;
    pattern_[0].spec = &eth_type;
    pattern_[0].mask = &eth_mask;

    pattern_[1].type = RTE_FLOW_ITEM_TYPE_END;

    /* create the action */
    action[0].type = RTE_FLOW_ACTION_TYPE_QUEUE;
    action[0].conf = &queue;
    action[1].type = RTE_FLOW_ACTION_TYPE_END;

    struct rte_flow *flow;
    struct rte_flow_error error;
    /* validate and create the flow rule */
    if (!rte_flow_validate(port_id, &attr, pattern_, action, &error)) {
        flow = rte_flow_create(port_id, &attr, pattern_, action, &error);
        if (!flow) {
            return port_flow_complain(&error);
        }
    }

    return 1;
}

#endif

/*
 * Top-level F-Stack/DPDK bring-up: validate proc config, run the EAL,
 * then initialise lcore state, pools, rings, KNI and ports in order.
 * Returns 0 on success; any failure terminates the process.
 */
int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    idle_sleep = ff_global_cfg.dpdk.idle_sleep;
    /* Clamp the configured TX drain delay to BURST_TX_DRAIN_US. */
    pkt_tx_delay = ff_global_cfg.dpdk.pkt_tx_delay > BURST_TX_DRAIN_US ?
\ 1074 BURST_TX_DRAIN_US : ff_global_cfg.dpdk.pkt_tx_delay; 1075 1076 init_lcore_conf(); 1077 1078 init_mem_pool(); 1079 1080 init_dispatch_ring(); 1081 1082 init_msg_ring(); 1083 1084 #ifdef FF_KNI 1085 enable_kni = ff_global_cfg.kni.enable; 1086 if (enable_kni) { 1087 init_kni(); 1088 } 1089 #endif 1090 1091 #ifdef FF_USE_PAGE_ARRAY 1092 ff_mmap_init(); 1093 #endif 1094 1095 #ifdef FF_FLOW_ISOLATE 1096 // run once in primary process 1097 if (0 == lcore_conf.tx_queue_id[0]){ 1098 ret = port_flow_isolate(0, 1); 1099 if (ret < 0) 1100 rte_exit(EXIT_FAILURE, "init_port_isolate failed\n"); 1101 } 1102 #endif 1103 1104 ret = init_port_start(); 1105 if (ret < 0) { 1106 rte_exit(EXIT_FAILURE, "init_port_start failed\n"); 1107 } 1108 1109 init_clock(); 1110 #ifdef FF_FLOW_ISOLATE 1111 //TODO: using config options replace magic number 1112 ret = init_flow(0, 80); 1113 if (ret < 0) { 1114 rte_exit(EXIT_FAILURE, "init_port_flow failed\n"); 1115 } 1116 #endif 1117 return 0; 1118 } 1119 1120 static void 1121 ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt) 1122 { 1123 uint8_t rx_csum = ctx->hw_features.rx_csum; 1124 if (rx_csum) { 1125 if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) { 1126 rte_pktmbuf_free(pkt); 1127 return; 1128 } 1129 } 1130 1131 void *data = rte_pktmbuf_mtod(pkt, void*); 1132 uint16_t len = rte_pktmbuf_data_len(pkt); 1133 1134 void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum); 1135 if (hdr == NULL) { 1136 rte_pktmbuf_free(pkt); 1137 return; 1138 } 1139 1140 if (pkt->ol_flags & PKT_RX_VLAN_STRIPPED) { 1141 ff_mbuf_set_vlan_info(hdr, pkt->vlan_tci); 1142 } 1143 1144 struct rte_mbuf *pn = pkt->next; 1145 void *prev = hdr; 1146 while(pn != NULL) { 1147 data = rte_pktmbuf_mtod(pn, void*); 1148 len = rte_pktmbuf_data_len(pn); 1149 1150 void *mb = ff_mbuf_get(prev, data, len); 1151 if (mb == NULL) { 1152 ff_mbuf_free(hdr); 1153 rte_pktmbuf_free(pkt); 1154 return; 1155 } 1156 pn = pn->next; 1157 prev = mb; 
1158 } 1159 1160 ff_veth_process_packet(ctx->ifp, hdr); 1161 } 1162 1163 static enum FilterReturn 1164 protocol_filter(const void *data, uint16_t len) 1165 { 1166 if(len < RTE_ETHER_ADDR_LEN) 1167 return FILTER_UNKNOWN; 1168 1169 const struct rte_ether_hdr *hdr; 1170 const struct rte_vlan_hdr *vlanhdr; 1171 hdr = (const struct rte_ether_hdr *)data; 1172 uint16_t ether_type = rte_be_to_cpu_16(hdr->ether_type); 1173 data += RTE_ETHER_HDR_LEN; 1174 len -= RTE_ETHER_HDR_LEN; 1175 1176 if (ether_type == RTE_ETHER_TYPE_VLAN) { 1177 vlanhdr = (struct rte_vlan_hdr *)data; 1178 ether_type = rte_be_to_cpu_16(vlanhdr->eth_proto); 1179 data += sizeof(struct rte_vlan_hdr); 1180 len -= sizeof(struct rte_vlan_hdr); 1181 } 1182 1183 if(ether_type == RTE_ETHER_TYPE_ARP) 1184 return FILTER_ARP; 1185 1186 #ifdef INET6 1187 if (ether_type == RTE_ETHER_TYPE_IPV6) { 1188 return ff_kni_proto_filter(data, 1189 len, ether_type); 1190 } 1191 #endif 1192 1193 #ifndef FF_KNI 1194 return FILTER_UNKNOWN; 1195 #else 1196 if (!enable_kni) { 1197 return FILTER_UNKNOWN; 1198 } 1199 1200 if(ether_type != RTE_ETHER_TYPE_IPV4) 1201 return FILTER_UNKNOWN; 1202 1203 return ff_kni_proto_filter(data, 1204 len, ether_type); 1205 #endif 1206 } 1207 1208 static inline void 1209 pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m) 1210 { 1211 struct rte_mbuf *md; 1212 void *src, *dst; 1213 1214 dst = rte_pktmbuf_mtod(mi, void *); 1215 src = rte_pktmbuf_mtod(m, void *); 1216 1217 mi->data_len = m->data_len; 1218 rte_memcpy(dst, src, m->data_len); 1219 1220 mi->port = m->port; 1221 mi->vlan_tci = m->vlan_tci; 1222 mi->vlan_tci_outer = m->vlan_tci_outer; 1223 mi->tx_offload = m->tx_offload; 1224 mi->hash = m->hash; 1225 mi->ol_flags = m->ol_flags; 1226 mi->packet_type = m->packet_type; 1227 } 1228 1229 /* copied from rte_pktmbuf_clone */ 1230 static inline struct rte_mbuf * 1231 pktmbuf_deep_clone(const struct rte_mbuf *md, 1232 struct rte_mempool *mp) 1233 { 1234 struct rte_mbuf *mc, *mi, **prev; 
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely ((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

    /* Deep-copy every segment of md into freshly allocated mbufs. */
    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of new indirect segment failed */
    if (unlikely (mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}

/*
 * Core RX handler for a burst of packets on (port_id, queue_id).
 * For each packet: optionally dump it to pcap, account RX traffic, run the
 * user dispatch callback (which may answer directly, drop, or re-steer the
 * packet to another queue's dispatch ring), then classify it —
 * ARP/NDP frames are deep-cloned to every other queue (and to KNI), and
 * everything else goes to the f-stack input path or KNI depending on the
 * knictl policy.  `pkts_from_ring` marks packets that already passed
 * dispatch on another core, which must not be dispatched or counted again.
 */
static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely( ff_global_cfg.pcap.enable)) {
            if (!pkts_from_ring) {
                ff_dump_packets( ff_global_cfg.pcap.save_path, rtem, ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void*);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring) {
            ff_traffic.rx_packets++;
            ff_traffic.rx_bytes += len;
        }

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues);
            if (ret == FF_DISPATCH_RESPONSE) {
                /* The callback wrote a reply in place; send it back out. */
                rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len;

                /*
                 * We have not support vlan out strip: re-insert the VLAN
                 * header by hand before transmitting the response.
                 */
                if (rtem->vlan_tci) {
                    data = rte_pktmbuf_prepend(rtem, sizeof(struct rte_vlan_hdr));
                    if (data != NULL) {
                        memmove(data, data + sizeof(struct rte_vlan_hdr), RTE_ETHER_HDR_LEN);
                        struct rte_ether_hdr *etherhdr = (struct rte_ether_hdr *)data;
                        struct rte_vlan_hdr *vlanhdr = (struct rte_vlan_hdr *)(data + RTE_ETHER_HDR_LEN);
                        vlanhdr->vlan_tci = rte_cpu_to_be_16(rtem->vlan_tci);
                        vlanhdr->eth_proto = etherhdr->ether_type;
                        etherhdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN);
                    }
                }
                send_single_packet(rtem, port_id);
                continue;
            }

            if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            /* Callback steered the packet to a different queue. */
            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
#ifdef INET6
        if (filter == FILTER_ARP || filter == FILTER_NDP) {
#else
        if (filter == FILTER_ARP) {
#endif
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                /* Broadcast ARP/NDP to every other queue's dispatch ring,
                 * cloning from the pool on that queue's NUMA node. */
                uint16_t j;
                for(j = 0; j < nb_queues; ++j) {
                    if(j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if(mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if(mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni) {
            /* knictl policy decides between the kernel (KNI) and f-stack. */
            if (knictl_action == FF_KNICTL_ACTION_ALL_TO_KNI){
                ff_kni_enqueue(port_id, rtem);
            } else if (knictl_action == FF_KNICTL_ACTION_ALL_TO_FF){
                ff_veth_input(ctx, rtem);
            } else if (knictl_action == FF_KNICTL_ACTION_DEFAULT){
                if (enable_kni &&
                    ((filter == FILTER_KNI && kni_accept) ||
                    (filter == FILTER_UNKNOWN && !kni_accept)) ) {
                    ff_kni_enqueue(port_id, rtem);
                } else {
                    ff_veth_input(ctx, rtem);
                }
            } else {
                ff_veth_input(ctx, rtem);
            }
#endif
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

/*
 * Drain packets other cores steered to this queue's dispatch ring and run
 * them through process_packets() with pkts_from_ring=1.
 */
static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* read packet from ring buf and to process */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if(nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

/* Serve a sysctl request from the message ring; result 0 or errno. */
static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

/*
 * Serve an ioctl request: open a throwaway UDP socket (IPv6 when the
 * message type is FF_IOCTL6), issue the ioctl through it, close it, and
 * report 0 or errno in msg->result.
 */
static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
#ifdef INET6
    if (msg->msg_type == FF_IOCTL6) {
        fd = ff_socket(AF_INET6, SOCK_DGRAM, 0);
    } else
#endif
    fd = ff_socket(AF_INET, SOCK_DGRAM, 0);

    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

/* Serve a routing-socket request (continues in the next chunk). */
static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len,
msg->route.maxlen); 1457 if (ret < 0) { 1458 msg->result = errno; 1459 } else { 1460 msg->result = 0; 1461 } 1462 } 1463 1464 static inline void 1465 handle_top_msg(struct ff_msg *msg) 1466 { 1467 msg->top = ff_top_status; 1468 msg->result = 0; 1469 } 1470 1471 #ifdef FF_NETGRAPH 1472 static inline void 1473 handle_ngctl_msg(struct ff_msg *msg) 1474 { 1475 int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data); 1476 if (ret < 0) { 1477 msg->result = errno; 1478 } else { 1479 msg->result = 0; 1480 msg->ngctl.ret = ret; 1481 } 1482 } 1483 #endif 1484 1485 #ifdef FF_IPFW 1486 static inline void 1487 handle_ipfw_msg(struct ff_msg *msg) 1488 { 1489 int fd, ret; 1490 fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW); 1491 if (fd < 0) { 1492 ret = -1; 1493 goto done; 1494 } 1495 1496 switch (msg->ipfw.cmd) { 1497 case FF_IPFW_GET: 1498 ret = ff_getsockopt_freebsd(fd, msg->ipfw.level, 1499 msg->ipfw.optname, msg->ipfw.optval, 1500 msg->ipfw.optlen); 1501 break; 1502 case FF_IPFW_SET: 1503 ret = ff_setsockopt_freebsd(fd, msg->ipfw.level, 1504 msg->ipfw.optname, msg->ipfw.optval, 1505 *(msg->ipfw.optlen)); 1506 break; 1507 default: 1508 ret = -1; 1509 errno = ENOTSUP; 1510 break; 1511 } 1512 1513 ff_close(fd); 1514 1515 done: 1516 if (ret < 0) { 1517 msg->result = errno; 1518 } else { 1519 msg->result = 0; 1520 } 1521 } 1522 #endif 1523 1524 static inline void 1525 handle_traffic_msg(struct ff_msg *msg) 1526 { 1527 msg->traffic = ff_traffic; 1528 msg->result = 0; 1529 } 1530 1531 #ifdef FF_KNI 1532 static inline void 1533 handle_knictl_msg(struct ff_msg *msg) 1534 { 1535 if (msg->knictl.kni_cmd == FF_KNICTL_CMD_SET){ 1536 switch (msg->knictl.kni_action){ 1537 case FF_KNICTL_ACTION_ALL_TO_FF: knictl_action = FF_KNICTL_ACTION_ALL_TO_FF; msg->result = 0; printf("new kni action: alltoff\n"); break; 1538 case FF_KNICTL_ACTION_ALL_TO_KNI: knictl_action = FF_KNICTL_ACTION_ALL_TO_KNI; msg->result = 0; printf("new kni action: alltokni\n"); break; 1539 case FF_KNICTL_ACTION_DEFAULT: 
knictl_action = FF_KNICTL_ACTION_DEFAULT; msg->result = 0; printf("new kni action: default\n"); break; 1540 default: msg->result = -1; 1541 } 1542 } 1543 else if (msg->knictl.kni_cmd == FF_KNICTL_CMD_GET){ 1544 msg->knictl.kni_action = knictl_action; 1545 } else { 1546 msg->result = -2; 1547 } 1548 } 1549 #endif 1550 1551 static inline void 1552 handle_default_msg(struct ff_msg *msg) 1553 { 1554 msg->result = ENOTSUP; 1555 } 1556 1557 static inline void 1558 handle_msg(struct ff_msg *msg, uint16_t proc_id) 1559 { 1560 switch (msg->msg_type) { 1561 case FF_SYSCTL: 1562 handle_sysctl_msg(msg); 1563 break; 1564 case FF_IOCTL: 1565 #ifdef INET6 1566 case FF_IOCTL6: 1567 #endif 1568 handle_ioctl_msg(msg); 1569 break; 1570 case FF_ROUTE: 1571 handle_route_msg(msg); 1572 break; 1573 case FF_TOP: 1574 handle_top_msg(msg); 1575 break; 1576 #ifdef FF_NETGRAPH 1577 case FF_NGCTL: 1578 handle_ngctl_msg(msg); 1579 break; 1580 #endif 1581 #ifdef FF_IPFW 1582 case FF_IPFW_CTL: 1583 handle_ipfw_msg(msg); 1584 break; 1585 #endif 1586 case FF_TRAFFIC: 1587 handle_traffic_msg(msg); 1588 break; 1589 #ifdef FF_KNI 1590 case FF_KNICTL: 1591 handle_knictl_msg(msg); 1592 break; 1593 #endif 1594 default: 1595 handle_default_msg(msg); 1596 break; 1597 } 1598 rte_ring_enqueue(msg_ring[proc_id].ring[msg->msg_type], msg); 1599 } 1600 1601 static inline int 1602 process_msg_ring(uint16_t proc_id) 1603 { 1604 void *msg; 1605 int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg); 1606 1607 if (unlikely(ret == 0)) { 1608 handle_msg((struct ff_msg *)msg, proc_id); 1609 } 1610 1611 return 0; 1612 } 1613 1614 /* Send burst of packets on an output interface */ 1615 static inline int 1616 send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port) 1617 { 1618 struct rte_mbuf **m_table; 1619 int ret; 1620 uint16_t queueid; 1621 1622 queueid = qconf->tx_queue_id[port]; 1623 m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table; 1624 1625 if (unlikely(ff_global_cfg.pcap.enable)) { 1626 
        /* Dump outgoing packets to pcap before handing them to the NIC. */
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets( ff_global_cfg.pcap.save_path, m_table[i],
                ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
        }
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    ff_traffic.tx_packets += ret;
    uint16_t i;
    for (i = 0; i < ret; i++) {
        ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]);
#ifdef FF_USE_PAGE_ARRAY
        if (qconf->tx_mbufs[port].bsd_m_table[i])
            ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs);
#endif
    }
    /* Free whatever the NIC did not accept. */
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
#ifdef FF_USE_PAGE_ARRAY
            if ( qconf->tx_mbufs[port].bsd_m_table[ret] )
                ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]);
#endif
        } while (++ret < n);
    }
    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}

/*
 * Transmit a BSD mbuf chain of `total` bytes out of ctx->port_id.
 * Under FF_USE_PAGE_ARRAY the payload is mapped zero-copy via
 * ff_if_send_onepkt(); otherwise the data is copied into a fresh rte_mbuf
 * chain, TX offload flags (IP/TCP/UDP checksum, TSO) are derived from the
 * BSD mbuf, and the packet is queued via send_single_packet().
 * Returns 0 on success, -1 on allocation/copy failure (the BSD mbuf is
 * always consumed).
 */
int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
#ifdef FF_USE_PAGE_ARRAY
    struct lcore_conf *qconf = &lcore_conf;
    int len = 0;

    len = ff_if_send_onepkt(ctx, m,total);
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, ctx->port_id);
        len = 0;
    }
    qconf->tx_mbufs[ctx->port_id].len = len;
    return 0;
#endif
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    /* Copy the BSD mbuf data into one or more rte_mbuf segments,
     * RTE_MBUF_DEFAULT_DATAROOM bytes at a time. */
    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while(total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        head->nb_segs++;

        prev = cur;
        void *data = rte_pktmbuf_mtod(cur, void*);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }


        cur->data_len = len;
        off += len;
        total -= len;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void*);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct rte_ipv4_hdr *iph;
        int iph_len;
        iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = RTE_ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct rte_ipv4_hdr *iph;
        int iph_len;
        iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = RTE_ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         * TCP segmentation offload.
         *
         * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *   implies PKT_TX_TCP_CKSUM)
         * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         * - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *   write the IP checksum to 0 in the packet
         * - fill the mbuf offload information: l2_len,
         *   l3_len, l4_len, tso_segsz
         * - calculate the pseudo header checksum without taking ip_len
         *   in account, and set it in the TCP header. Refer to
         *   rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *   used as helpers.
         */
        if (offload.tso_seg_size) {
            struct rte_tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct rte_tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = RTE_ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

/*
 * Per-lcore event loop: drain TX queues on a timer, poll KNI, the dispatch
 * ring and the NIC RX queues, serve control messages, run the user loop
 * callback, and account usr/sys/idle TSC cycles for `ff_top`.
 * Runs forever; `arg` is the struct loop_routine passed by ff_dpdk_run().
 */
static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc;
    int i, j, nb_rx, idle;
    uint16_t port_id, queue_id;
    struct lcore_conf *qconf;
    uint64_t drain_tsc = 0;
    struct ff_dpdk_if_context *ctx;

    if (pkt_tx_delay) {
        /* Convert the drain delay from microseconds to TSC ticks. */
        drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * pkt_tx_delay;
    }

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

    while (1) {
        cur_tsc = rte_rdtsc();
        /* Drive the FreeBSD callout clock when its timer expires. */
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc =
            0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc >= drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packet from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }
#endif

            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                    pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                    j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        /* Run the user callback when busy, or at most once per drain_tsc. */
        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc >= drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        idle_sleep_tsc = rte_rdtsc();
        if (likely(idle && idle_sleep)) {
            /* Nothing to do: yield the CPU for `idle_sleep` microseconds. */
            usleep(idle_sleep);
            end_tsc = rte_rdtsc();
        } else {
            end_tsc = idle_sleep_tsc;
        }

        /* Attribute elapsed cycles to usr/sys/idle for ff_top accounting. */
        if (usch_tsc == cur_tsc) {
            usr_tsc = idle_sleep_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_top_status.sys_tsc += sys_tsc;
        }

        ff_top_status.usr_tsc += usr_tsc;
        ff_top_status.work_tsc += end_tsc - cur_tsc;
        ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_top_status.loops++;
    }

    return 0;
}

/*
 * Attach an f-stack virtual interface to every TX port of this lcore.
 * Exits the process if any attach fails.  Returns 0.
 */
int
ff_dpdk_if_up(void) {
    int i;
    struct lcore_conf *qconf = &lcore_conf;
    for (i = 0; i < qconf->nb_tx_port; i++) {
        uint16_t port_id = qconf->tx_port_id[i];

        struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
        veth_ctx[port_id] = ff_veth_attach(pconf);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed");
        }
    }

    return 0;
}

/*
 * Launch main_loop() on every lcore with the user's loop callback and
 * block until all lcores return (normally never).
 */
void
ff_dpdk_run(loop_func_t loop, void *arg) {
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}

/* Free a DPDK mbuf on behalf of code that only sees it as void*. */
void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

/*
 * Software Toeplitz hash over `data` using `key`, as used by NIC RSS.
 * Mirrors the FreeBSD implementation so user-space can predict which RX
 * queue the hardware will pick for a given flow.
 */
static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0]<<24) + (key[1]<<16) + (key[2] <<8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1<<(7-b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i+4] & (1<<(7-b))))
                v |= 1;
        }
    }
    return (hash);
}

/*
 * Return nonzero if the RSS hash of the 4-tuple (saddr, daddr, sport,
 * dport) maps to this lcore's queue — i.e. whether a connection bound to
 * that tuple would be delivered here.  Always true with a single queue.
 * Addresses/ports are expected in the byte order the NIC hashes
 * (network order, as laid out in the packet) — NOTE(review): confirm
 * against callers.
 */
int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;
    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];

    if (nb_queues <= 1) {
        return 1;
    }

    uint16_t reta_size = rss_reta_size[ctx->port_id];
    uint16_t queueid = qconf->tx_queue_id[ctx->port_id];

    /* Hash input is the concatenation saddr|daddr|sport|dport. */
    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = 0;
    hash = toeplitz_hash(rsskey_len, rsskey, datalen, data);

    /* Emulate the NIC's redirection table lookup (reta_size power of 2). */
    return ((hash & (reta_size - 1)) % nb_queues) == queueid;
}

/* Register the user packet-dispatch callback used by process_packets(). */
void
ff_regist_packet_dispatcher(dispatch_func_t func)
{
    packet_dispatcher = func;
}

/* Current TSC converted to nanoseconds (double-precision scaling). */
uint64_t
ff_get_tsc_ns()
{
    uint64_t cur_tsc = rte_rdtsc();
    uint64_t hz = rte_get_tsc_hz();
    return ((double)cur_tsc/(double)hz) * NS_PER_S;
}