1 /*
2 * Copyright (C) 2017-2021 THL A29 Limited, a Tencent company.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice, this
9 * list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright notice,
11 * this list of conditions and the following disclaimer in the documentation
12 * and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
18 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
21 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
23 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 *
25 */
26 #include <assert.h>
27 #include <unistd.h>
28 #include <sys/mman.h>
29 #include <errno.h>
30
31 #include <rte_common.h>
32 #include <rte_byteorder.h>
33 #include <rte_log.h>
34 #include <rte_memory.h>
35 #include <rte_memcpy.h>
36 #include <rte_memzone.h>
37 #include <rte_config.h>
38 #include <rte_eal.h>
39 #include <rte_pci.h>
40 #include <rte_mbuf.h>
41 #include <rte_memory.h>
42 #include <rte_lcore.h>
43 #include <rte_launch.h>
44 #include <rte_ethdev.h>
45 #include <rte_debug.h>
46 #include <rte_common.h>
47 #include <rte_ether.h>
48 #include <rte_malloc.h>
49 #include <rte_cycles.h>
50 #include <rte_timer.h>
51 #include <rte_thash.h>
52 #include <rte_ip.h>
53 #include <rte_tcp.h>
54 #include <rte_udp.h>
55 #include <rte_eth_bond.h>
56
57 #include "ff_dpdk_if.h"
58 #include "ff_dpdk_pcap.h"
59 #include "ff_dpdk_kni.h"
60 #include "ff_config.h"
61 #include "ff_veth.h"
62 #include "ff_host_interface.h"
63 #include "ff_msg.h"
64 #include "ff_api.h"
65 #include "ff_memory.h"
66
67 #ifdef FF_KNI
68 #define KNI_MBUF_MAX 2048
69 #define KNI_QUEUE_SIZE 2048
70
71 int enable_kni;
72 static int kni_accept;
73 static int knictl_action = FF_KNICTL_ACTION_DEFAULT;
74 #endif
75
76 static int numa_on;
77
78 static unsigned idle_sleep;
79 static unsigned pkt_tx_delay;
80 static uint64_t usr_cb_tsc;
81
82 static struct rte_timer freebsd_clock;
83
84 // Default 40-byte RSS key from the Mellanox Linux driver
85 static uint8_t default_rsskey_40bytes[40] = {
86 0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
87 0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
88 0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
89 0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
90 0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
91 };
92
93 static uint8_t default_rsskey_52bytes[52] = {
94 0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23,
95 0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30,
96 0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02,
97 0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c,
98 0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55,
99 0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e,
100 0x81, 0x15, 0x03, 0x66
101 };
102
103 static uint8_t symmetric_rsskey[52] = {
104 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
105 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
106 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
107 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
108 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
109 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
110 0x6d, 0x5a, 0x6d, 0x5a
111 };
112
113 static int rsskey_len = sizeof(default_rsskey_40bytes);
114 static uint8_t *rsskey = default_rsskey_40bytes;
115
116 struct lcore_conf lcore_conf;
117
118 struct rte_mempool *pktmbuf_pool[NB_SOCKETS];
119
120 static pcblddr_func_t pcblddr_fun;
121
122 static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
123 static dispatch_func_t packet_dispatcher;
124
125 static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];
126
127 #define BOND_DRIVER_NAME "net_bonding"
128
129 static inline int send_single_packet(struct rte_mbuf *m, uint8_t port);
130
131 struct ff_msg_ring {
132 char ring_name[FF_MSG_NUM][RTE_RING_NAMESIZE];
133 /* ring[0]: the lcore receives msgs that other processes send */
134 /* ring[1]: the lcore sends msgs that other processes read */
135 struct rte_ring *ring[FF_MSG_NUM];
136 } __rte_cache_aligned;
137
138 static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
139 static struct rte_mempool *message_pool;
140 static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];
141
142 static struct ff_top_args ff_top_status;
143 static struct ff_traffic_args ff_traffic;
144 extern void ff_hardclock(void);
145
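/* Periodic rte_timer callback: drives the FreeBSD hardclock tick and refreshes the cached current timestamp. */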
146 static void
147 ff_hardclock_job(__rte_unused struct rte_timer *timer,
148 __rte_unused void *arg) {
149 ff_hardclock();
150 ff_update_current_ts();
151 }
152
153 struct ff_dpdk_if_context *
154 ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
155 {
156 struct ff_dpdk_if_context *ctx;
157
158 ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
159 if (ctx == NULL)
160 return NULL;
161
162 ctx->sc = sc;
163 ctx->ifp = ifp;
164 ctx->port_id = cfg->port_id;
165 ctx->hw_features = cfg->hw_features;
166
167 return ctx;
168 }
169
170 void
171 ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
172 {
173 free(ctx);
174 }
175
176 static void
177 check_all_ports_link_status(void)
178 {
179 #define CHECK_INTERVAL 100 /* 100ms */
180 #define MAX_CHECK_TIME 90 /* 9s (90 * 100ms) in total */
181
182 uint16_t portid;
183 uint8_t count, all_ports_up, print_flag = 0;
184 struct rte_eth_link link;
185
186 printf("\nChecking link status");
187 fflush(stdout);
188
189 int i, nb_ports;
190 nb_ports = ff_global_cfg.dpdk.nb_ports;
191 for (count = 0; count <= MAX_CHECK_TIME; count++) {
192 all_ports_up = 1;
193 for (i = 0; i < nb_ports; i++) {
194 uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
195 memset(&link, 0, sizeof(link));
196 rte_eth_link_get_nowait(portid, &link);
197
198 /* print link status if flag set */
199 if (print_flag == 1) {
200 if (link.link_status) {
201 printf("Port %d Link Up - speed %u "
202 "Mbps - %s\n", (int)portid,
203 (unsigned)link.link_speed,
204 (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
205 ("full-duplex") : ("half-duplex\n"));
206 } else {
207 printf("Port %d Link Down\n", (int)portid);
208 }
209 continue;
210 }
211 /* clear all_ports_up flag if any link down */
212 if (link.link_status == 0) {
213 all_ports_up = 0;
214 break;
215 }
216 }
217
218 /* after finally printing all link status, get out */
219 if (print_flag == 1)
220 break;
221
222 if (all_ports_up == 0) {
223 printf(".");
224 fflush(stdout);
225 rte_delay_ms(CHECK_INTERVAL);
226 }
227
228 /* set the print_flag if all ports up or timeout */
229 if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
230 print_flag = 1;
231 printf("done\n");
232 }
233 }
234 }
235
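/*
 * Bind this process's lcore to its share of the configured ports: record the
 * (port, queue) pairs this lcore polls for RX and the TX queue it owns on each port.
 */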
236 static int
237 init_lcore_conf(void)
238 {
239 uint8_t nb_dev_ports = rte_eth_dev_count_avail();
240 if (nb_dev_ports == 0) {
241 rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
242 }
243
244 if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
245 rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
246 ff_global_cfg.dpdk.max_portid);
247 }
248
249 lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
250 lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;
251
252 uint16_t socket_id = 0;
253 if (numa_on) {
254 socket_id = rte_lcore_to_socket_id(rte_lcore_id());
255 }
256
257 lcore_conf.socket_id = socket_id;
258
259 uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
260 if (!rte_lcore_is_enabled(lcore_id)) {
261 rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
262 }
263
264 int j;
265 for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
266 uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
267 struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
268
269 int queueid = -1;
270 int i;
271 for (i = 0; i < pconf->nb_lcores; i++) {
272 if (pconf->lcore_list[i] == lcore_id) {
273 queueid = i;
274 }
275 }
276 if (queueid < 0) {
277 continue;
278 }
279 printf("lcore: %u, port: %u, queue: %u\n", lcore_id, port_id, queueid);
280 uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
281 lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
282 lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
283 lcore_conf.nb_rx_queue++;
284
285 lcore_conf.tx_queue_id[port_id] = queueid;
286 lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
287 lcore_conf.nb_tx_port++;
288
289 /* Enable pcap dump */
290 if (ff_global_cfg.pcap.enable) {
291 ff_enable_pcap(ff_global_cfg.pcap.save_path, ff_global_cfg.pcap.snap_len);
292 }
293
294 lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
295 }
296
297 if (lcore_conf.nb_rx_queue == 0) {
298 rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
299 }
300
301 return 0;
302 }
303
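/*
 * Create (primary process) or look up (secondary process) one pktmbuf pool per
 * NUMA socket in use. The pool size is an upper-bound estimate covering RX/TX
 * descriptors, per-lcore bursts, mempool caches, dispatch rings and, when
 * enabled, the KNI queues.
 */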
304 static int
305 init_mem_pool(void)
306 {
307 uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
308 uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
309 uint32_t nb_tx_queue = nb_lcores;
310 uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;
311 uint16_t max_portid = ff_global_cfg.dpdk.max_portid;
312
313 unsigned nb_mbuf = RTE_ALIGN_CEIL (
314 (nb_rx_queue * (max_portid + 1) * 2 * RX_QUEUE_SIZE +
315 nb_ports * (max_portid + 1) * 2 * nb_lcores * MAX_PKT_BURST +
316 nb_ports * (max_portid + 1) * 2 * nb_tx_queue * TX_QUEUE_SIZE +
317 nb_lcores * MEMPOOL_CACHE_SIZE +
318 #ifdef FF_KNI
319 nb_ports * KNI_MBUF_MAX +
320 nb_ports * KNI_QUEUE_SIZE +
321 #endif
322 nb_lcores * nb_ports * DISPATCH_RING_SIZE),
323 (unsigned)8192);
324
325 unsigned socketid = 0;
326 uint16_t i, lcore_id;
327 char s[64];
328
329 for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
330 lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
331 if (numa_on) {
332 socketid = rte_lcore_to_socket_id(lcore_id);
333 }
334
335 if (socketid >= NB_SOCKETS) {
336 rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
337 socketid, lcore_id, NB_SOCKETS);
338 }
339
340 if (pktmbuf_pool[socketid] != NULL) {
341 continue;
342 }
343
344 if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
345 snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
346 pktmbuf_pool[socketid] =
347 rte_pktmbuf_pool_create(s, nb_mbuf,
348 MEMPOOL_CACHE_SIZE, 0,
349 RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
350 } else {
351 snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
352 pktmbuf_pool[socketid] = rte_mempool_lookup(s);
353 }
354
355 if (pktmbuf_pool[socketid] == NULL) {
356 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
357 } else {
358 printf("create mbuf pool on socket %d\n", socketid);
359 }
360
361 #ifdef FF_USE_PAGE_ARRAY
362 nb_mbuf = RTE_ALIGN_CEIL (
363 nb_ports*nb_lcores*MAX_PKT_BURST +
364 nb_ports*nb_tx_queue*TX_QUEUE_SIZE +
365 nb_lcores*MEMPOOL_CACHE_SIZE,
366 (unsigned)4096);
367 ff_init_ref_pool(nb_mbuf, socketid);
368 #endif
369 }
370
371 return 0;
372 }
373
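/*
 * Helper: the primary process creates the named ring, secondary processes
 * attach to it by name; exits on failure.
 */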
374 static struct rte_ring *
375 create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
376 {
377 struct rte_ring *ring;
378
379 if (name == NULL) {
380 rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
381 }
382
383 if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
384 ring = rte_ring_create(name, count, socket_id, flags);
385 } else {
386 ring = rte_ring_lookup(name);
387 }
388
389 if (ring == NULL) {
390 rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
391 }
392
393 return ring;
394 }
395
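/*
 * Allocate one software ring per (port, queue). The rings carry packets that a
 * user dispatcher or an ARP/NDP broadcast redirects to a queue owned by another
 * lcore of the same port.
 */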
396 static int
397 init_dispatch_ring(void)
398 {
399 int j;
400 char name_buf[RTE_RING_NAMESIZE];
401 int queueid;
402
403 unsigned socketid = lcore_conf.socket_id;
404
405 /* Create ring according to ports actually being used. */
406 int nb_ports = ff_global_cfg.dpdk.nb_ports;
407 for (j = 0; j < nb_ports; j++) {
408 uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
409 struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
410 int nb_queues = pconf->nb_lcores;
411 if (dispatch_ring[portid] == NULL) {
412 snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);
413
414 dispatch_ring[portid] = rte_zmalloc(name_buf,
415 sizeof(struct rte_ring *) * nb_queues,
416 RTE_CACHE_LINE_SIZE);
417 if (dispatch_ring[portid] == NULL) {
418 rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
419 "failed\n", name_buf);
420 }
421 }
422
423 for(queueid = 0; queueid < nb_queues; ++queueid) {
424 snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
425 portid, queueid);
426 dispatch_ring[portid][queueid] = create_ring(name_buf,
427 DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);
428
429 if (dispatch_ring[portid][queueid] == NULL)
430 rte_panic("create ring:%s failed!\n", name_buf);
431
432 printf("create ring:%s success, %u ring entries are now free!\n",
433 name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
434 }
435 }
436
437 return 0;
438 }
439
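/* Mempool object constructor: point each ff_msg's data buffer just past the header. */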
440 static void
441 ff_msg_init(struct rte_mempool *mp,
442 __attribute__((unused)) void *opaque_arg,
443 void *obj, __attribute__((unused)) unsigned i)
444 {
445 struct ff_msg *msg = (struct ff_msg *)obj;
446 msg->msg_type = FF_UNKNOWN;
447 msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
448 msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
449 msg->original_buf = NULL;
450 msg->original_buf_len = 0;
451 }
452
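/*
 * Create the control-plane message pool plus, for every f-stack process, one
 * request ring (ring[0]) and one response ring per message type.
 */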
453 static int
454 init_msg_ring(void)
455 {
456 uint16_t i, j;
457 uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
458 unsigned socketid = lcore_conf.socket_id;
459
460 /* Create message buffer pool */
461 if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
462 message_pool = rte_mempool_create(FF_MSG_POOL,
463 MSG_RING_SIZE * 2 * nb_procs,
464 MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
465 NULL, NULL, ff_msg_init, NULL,
466 socketid, 0);
467 } else {
468 message_pool = rte_mempool_lookup(FF_MSG_POOL);
469 }
470
471 if (message_pool == NULL) {
472 rte_panic("Create msg mempool failed\n");
473 }
474
475 for(i = 0; i < nb_procs; ++i) {
476 snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
477 "%s%u", FF_MSG_RING_IN, i);
478 msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
479 MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
480 if (msg_ring[i].ring[0] == NULL)
481 rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[0]);
482
483 for (j = FF_SYSCTL; j < FF_MSG_NUM; j++) {
484 snprintf(msg_ring[i].ring_name[j], RTE_RING_NAMESIZE,
485 "%s%u_%u", FF_MSG_RING_OUT, i, j);
486 msg_ring[i].ring[j] = create_ring(msg_ring[i].ring_name[j],
487 MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
488 if (msg_ring[i].ring[j] == NULL)
489 rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[j]);
490 }
491 }
492
493 return 0;
494 }
495
496 #ifdef FF_KNI
497
498 static enum FF_KNICTL_CMD get_kni_action(const char *c){
499 if (!c)
500 return FF_KNICTL_ACTION_DEFAULT;
501 if (0 == strcasecmp(c, "alltokni")){
502 return FF_KNICTL_ACTION_ALL_TO_KNI;
503 } else if (0 == strcasecmp(c, "alltoff")){
504 return FF_KNICTL_ACTION_ALL_TO_FF;
505 } else if (0 == strcasecmp(c, "default")){
506 return FF_KNICTL_ACTION_DEFAULT;
507 } else {
508 return FF_KNICTL_ACTION_DEFAULT;
509 }
510 }
511
512 static int
513 init_kni(void)
514 {
515 int nb_ports = rte_eth_dev_count_avail();
516 kni_accept = 0;
517 if(strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
518 kni_accept = 1;
519
520 knictl_action = get_kni_action(ff_global_cfg.kni.kni_action);
521
522 ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
523 ff_global_cfg.kni.udp_port);
524
525 unsigned socket_id = lcore_conf.socket_id;
526 struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];
527
528 nb_ports = ff_global_cfg.dpdk.nb_ports;
529 int i, ret;
530 for (i = 0; i < nb_ports; i++) {
531 uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
532 ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
533 }
534
535 return 0;
536 }
537 #endif
538
539 // RSS RETA update will fail when flow isolation is enabled
540 #ifndef FF_FLOW_ISOLATE
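/*
 * Fill the NIC's RSS redirection table (RETA) so that hash buckets are spread
 * round-robin across the nb_queues RX queues.
 */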
541 static void
542 set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
543 {
544 if (reta_size == 0) {
545 return;
546 }
547
548 int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
549 struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];
550
551 /* config HW indirection table */
552 unsigned i, j, hash=0;
553 for (i = 0; i < reta_conf_size; i++) {
554 reta_conf[i].mask = ~0ULL;
555 for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
556 reta_conf[i].reta[j] = hash++ % nb_queues;
557 }
558 }
559
560 if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
561 rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
562 port_id);
563 }
564 }
565 #endif
566
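/*
 * Per-port bring-up: query capabilities, configure RSS and RX/TX offloads,
 * set up one RX/TX queue pair per lcore, start the port and program the RETA.
 * Secondary processes only record the capabilities and skip device setup.
 */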
567 static int
568 init_port_start(void)
569 {
570 int nb_ports = ff_global_cfg.dpdk.nb_ports;
571 unsigned socketid = 0;
572 struct rte_mempool *mbuf_pool;
573 uint16_t i, j;
574
575 for (i = 0; i < nb_ports; i++) {
576 uint16_t port_id, u_port_id = ff_global_cfg.dpdk.portid_list[i];
577 struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[u_port_id];
578 uint16_t nb_queues = pconf->nb_lcores;
579
580 for (j=0; j<=pconf->nb_slaves; j++) {
581 if (j < pconf->nb_slaves) {
582 port_id = pconf->slave_portid_list[j];
583 printf("To init %s's %d'st slave port[%d]\n",
584 ff_global_cfg.dpdk.bond_cfgs->name,
585 j, port_id);
586 } else {
587 port_id = u_port_id;
588 }
589
590 struct rte_eth_dev_info dev_info;
591 struct rte_eth_conf port_conf = {0};
592 struct rte_eth_rxconf rxq_conf;
593 struct rte_eth_txconf txq_conf;
594
595 int ret = rte_eth_dev_info_get(port_id, &dev_info);
596 if (ret != 0)
597 rte_exit(EXIT_FAILURE,
598 "Error during getting device (port %u) info: %s\n",
599 port_id, strerror(-ret));
600
601 if (nb_queues > dev_info.max_rx_queues) {
602 rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
603 nb_queues,
604 dev_info.max_rx_queues);
605 }
606
607 if (nb_queues > dev_info.max_tx_queues) {
608 rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
609 nb_queues,
610 dev_info.max_tx_queues);
611 }
612
613 struct rte_ether_addr addr;
614 rte_eth_macaddr_get(port_id, &addr);
615 printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
616 " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
617 (unsigned)port_id,
618 addr.addr_bytes[0], addr.addr_bytes[1],
619 addr.addr_bytes[2], addr.addr_bytes[3],
620 addr.addr_bytes[4], addr.addr_bytes[5]);
621
622 rte_memcpy(pconf->mac,
623 addr.addr_bytes, RTE_ETHER_ADDR_LEN);
624
625 /* Set RSS mode */
626 uint64_t default_rss_hf = ETH_RSS_PROTO_MASK;
627 port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
628 port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf;
629 if (dev_info.hash_key_size == 52) {
630 rsskey = default_rsskey_52bytes;
631 rsskey_len = 52;
632 }
633 if (ff_global_cfg.dpdk.symmetric_rss) {
634 printf("Use symmetric Receive-side Scaling(RSS) key\n");
635 rsskey = symmetric_rsskey;
636 }
637 port_conf.rx_adv_conf.rss_conf.rss_key = rsskey;
638 port_conf.rx_adv_conf.rss_conf.rss_key_len = rsskey_len;
639 port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads;
640 if (port_conf.rx_adv_conf.rss_conf.rss_hf !=
641 ETH_RSS_PROTO_MASK) {
642 printf("Port %u modified RSS hash function based on hardware support,"
643 "requested:%#"PRIx64" configured:%#"PRIx64"\n",
644 port_id, default_rss_hf,
645 port_conf.rx_adv_conf.rss_conf.rss_hf);
646 }
647
648 if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) {
649 port_conf.txmode.offloads |=
650 DEV_TX_OFFLOAD_MBUF_FAST_FREE;
651 }
652
653 /* Set Rx VLAN stripping */
654 if (ff_global_cfg.dpdk.vlan_strip) {
655 if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
656 port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
657 }
658 }
659
660 /* Enable HW CRC stripping */
661 port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_KEEP_CRC;
662
663 /* FIXME: Enable TCP LRO ?*/
664 #if 0
665 if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
666 printf("LRO is supported\n");
667 port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
668 pconf->hw_features.rx_lro = 1;
669 }
670 #endif
671
672 /* Set Rx checksum checking */
673 if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
674 (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
675 (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
676 printf("RX checksum offload supported\n");
677 port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
678 pconf->hw_features.rx_csum = 1;
679 }
680
681 if (ff_global_cfg.dpdk.tx_csum_offoad_skip == 0) {
682 if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
683 printf("TX ip checksum offload supported\n");
684 port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
685 pconf->hw_features.tx_csum_ip = 1;
686 }
687
688 if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
689 (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
690 printf("TX TCP&UDP checksum offload supported\n");
691 port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM;
692 pconf->hw_features.tx_csum_l4 = 1;
693 }
694 } else {
695 printf("TX checksum offoad is disabled\n");
696 }
697
698 if (ff_global_cfg.dpdk.tso) {
699 if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
700 printf("TSO is supported\n");
701 port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
702 pconf->hw_features.tx_tso = 1;
703 }
704 } else {
705 printf("TSO is disabled\n");
706 }
707
708 if (dev_info.reta_size) {
709 /* reta size must be power of 2 */
710 assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);
711
712 rss_reta_size[port_id] = dev_info.reta_size;
713 printf("port[%d]: rss table size: %d\n", port_id,
714 dev_info.reta_size);
715 }
716
717 if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
718 continue;
719 }
720
721 ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
722 if (ret != 0) {
723 return ret;
724 }
725
726 static uint16_t nb_rxd = RX_QUEUE_SIZE;
727 static uint16_t nb_txd = TX_QUEUE_SIZE;
728 ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
729 if (ret < 0)
730 printf("Could not adjust number of descriptors "
731 "for port%u (%d)\n", (unsigned)port_id, ret);
732
733 uint16_t q;
734 for (q = 0; q < nb_queues; q++) {
735 if (numa_on) {
736 uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
737 socketid = rte_lcore_to_socket_id(lcore_id);
738 }
739 mbuf_pool = pktmbuf_pool[socketid];
740
741 txq_conf = dev_info.default_txconf;
742 txq_conf.offloads = port_conf.txmode.offloads;
743 ret = rte_eth_tx_queue_setup(port_id, q, nb_txd,
744 socketid, &txq_conf);
745 if (ret < 0) {
746 return ret;
747 }
748
749 rxq_conf = dev_info.default_rxconf;
750 rxq_conf.offloads = port_conf.rxmode.offloads;
751 ret = rte_eth_rx_queue_setup(port_id, q, nb_rxd,
752 socketid, &rxq_conf, mbuf_pool);
753 if (ret < 0) {
754 return ret;
755 }
756 }
757
758
759 if (strncmp(dev_info.driver_name, BOND_DRIVER_NAME,
760 strlen(dev_info.driver_name)) == 0) {
761
762 rte_eth_macaddr_get(port_id, &addr);
763 printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
764 " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
765 (unsigned)port_id,
766 addr.addr_bytes[0], addr.addr_bytes[1],
767 addr.addr_bytes[2], addr.addr_bytes[3],
768 addr.addr_bytes[4], addr.addr_bytes[5]);
769
770 rte_memcpy(pconf->mac,
771 addr.addr_bytes, RTE_ETHER_ADDR_LEN);
772
773 int mode, count, x;
774 uint16_t slaves[RTE_MAX_ETHPORTS], len = RTE_MAX_ETHPORTS;
775
776 mode = rte_eth_bond_mode_get(port_id);
777 printf("Port %u, bond mode:%d\n", port_id, mode);
778
779 count = rte_eth_bond_slaves_get(port_id, slaves, len);
780 printf("Port %u, %s's slave ports count:%d\n", port_id,
781 ff_global_cfg.dpdk.bond_cfgs->name, count);
782 for (x=0; x<count; x++) {
783 printf("Port %u, %s's slave port[%u]\n", port_id,
784 ff_global_cfg.dpdk.bond_cfgs->name, slaves[x]);
785 }
786 }
787
788 ret = rte_eth_dev_start(port_id);
789 if (ret < 0) {
790 return ret;
791 }
792 // RSS RETA update will fail when flow isolation is enabled
793 #ifndef FF_FLOW_ISOLATE
794 if (nb_queues > 1) {
795 /*
796 * FIXME: modify RSS set to FDIR
797 */
798 set_rss_table(port_id, dev_info.reta_size, nb_queues);
799 }
800 #endif
801
802 /* Enable RX in promiscuous mode for the Ethernet device. */
803 if (ff_global_cfg.dpdk.promiscuous) {
804 ret = rte_eth_promiscuous_enable(port_id);
805 if (ret == 0) {
806 printf("set port %u to promiscuous mode ok\n", port_id);
807 } else {
808 printf("set port %u to promiscuous mode error\n", port_id);
809 }
810 }
811 }
812 }
813
814 if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
815 check_all_ports_link_status();
816 }
817
818 return 0;
819 }
820
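/*
 * Convert the configured FreeBSD tick rate (freebsd.hz) into a TSC period and
 * arm a periodical rte_timer that runs ff_hardclock_job on this lcore.
 */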
821 static int
822 init_clock(void)
823 {
824 rte_timer_subsystem_init();
825 uint64_t hz = rte_get_timer_hz();
826 uint64_t intrs = MS_PER_S/ff_global_cfg.freebsd.hz;
827 uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S*intrs;
828
829 rte_timer_init(&freebsd_clock);
830 rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
831 rte_lcore_id(), &ff_hardclock_job, NULL);
832
833 ff_update_current_ts();
834
835 return 0;
836 }
837
838 #ifdef FF_FLOW_ISOLATE
839 /** Print a message out of a flow error. */
840 static int
841 port_flow_complain(struct rte_flow_error *error)
842 {
843 static const char *const errstrlist[] = {
844 [RTE_FLOW_ERROR_TYPE_NONE] = "no error",
845 [RTE_FLOW_ERROR_TYPE_UNSPECIFIED] = "cause unspecified",
846 [RTE_FLOW_ERROR_TYPE_HANDLE] = "flow rule (handle)",
847 [RTE_FLOW_ERROR_TYPE_ATTR_GROUP] = "group field",
848 [RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY] = "priority field",
849 [RTE_FLOW_ERROR_TYPE_ATTR_INGRESS] = "ingress field",
850 [RTE_FLOW_ERROR_TYPE_ATTR_EGRESS] = "egress field",
851 [RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER] = "transfer field",
852 [RTE_FLOW_ERROR_TYPE_ATTR] = "attributes structure",
853 [RTE_FLOW_ERROR_TYPE_ITEM_NUM] = "pattern length",
854 [RTE_FLOW_ERROR_TYPE_ITEM_SPEC] = "item specification",
855 [RTE_FLOW_ERROR_TYPE_ITEM_LAST] = "item specification range",
856 [RTE_FLOW_ERROR_TYPE_ITEM_MASK] = "item specification mask",
857 [RTE_FLOW_ERROR_TYPE_ITEM] = "specific pattern item",
858 [RTE_FLOW_ERROR_TYPE_ACTION_NUM] = "number of actions",
859 [RTE_FLOW_ERROR_TYPE_ACTION_CONF] = "action configuration",
860 [RTE_FLOW_ERROR_TYPE_ACTION] = "specific action",
861 };
862 const char *errstr;
863 char buf[32];
864 int err = rte_errno;
865
866 if ((unsigned int)error->type >= RTE_DIM(errstrlist) ||
867 !errstrlist[error->type])
868 errstr = "unknown type";
869 else
870 errstr = errstrlist[error->type];
871 printf("Caught error type %d (%s): %s%s: %s\n",
872 error->type, errstr,
873 error->cause ? (snprintf(buf, sizeof(buf), "cause: %p, ",
874 error->cause), buf) : "",
875 error->message ? error->message : "(no stated reason)",
876 rte_strerror(err));
877 return -err;
878 }
879
880 static int
881 port_flow_isolate(uint16_t port_id, int set)
882 {
883 struct rte_flow_error error;
884
885 /* Poisoning to make sure PMDs update it in case of error. */
886 memset(&error, 0x66, sizeof(error));
887 if (rte_flow_isolate(port_id, set, &error))
888 return port_flow_complain(&error);
889 printf("Ingress traffic on port %u is %s to the defined flow rules\n",
890 port_id,
891 set ? "now restricted" : "not restricted anymore");
892 return 0;
893 }
894
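/*
 * Install two rte_flow rules for the given TCP port (one matching dst_port,
 * one matching src_port) that RSS-distribute matching IPv4/TCP packets across
 * all of this port's RX queues. Used together with flow isolate mode.
 */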
895 static int
896 create_tcp_flow(uint16_t port_id, uint16_t tcp_port) {
897 struct rte_flow_attr attr = {.ingress = 1};
898 struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
899 int nb_queues = pconf->nb_lcores;
900 uint16_t queue[RTE_MAX_QUEUES_PER_PORT];
901 int i = 0, j = 0;
902 for (i = 0, j = 0; i < nb_queues; ++i)
903 queue[j++] = i;
904 struct rte_flow_action_rss rss = {
905 .types = ETH_RSS_NONFRAG_IPV4_TCP,
906 .key_len = rsskey_len,
907 .key = rsskey,
908 .queue_num = j,
909 .queue = queue,
910 };
911
912 struct rte_eth_dev_info dev_info;
913 int ret = rte_eth_dev_info_get(port_id, &dev_info);
914 if (ret != 0)
915 rte_exit(EXIT_FAILURE, "Error during getting device (port %u) info: %s\n", port_id, strerror(-ret));
916
917 struct rte_flow_item pattern[3];
918 struct rte_flow_action action[2];
919 struct rte_flow_item_tcp tcp_spec;
920 struct rte_flow_item_tcp tcp_mask = {
921 .hdr = {
922 .src_port = RTE_BE16(0x0000),
923 .dst_port = RTE_BE16(0xffff),
924 },
925 };
926 struct rte_flow_error error;
927
928 memset(pattern, 0, sizeof(pattern));
929 memset(action, 0, sizeof(action));
930
931 /* set the dst ipv4 packet to the required value */
932 pattern[0].type = RTE_FLOW_ITEM_TYPE_IPV4;
933
934 memset(&tcp_spec, 0, sizeof(struct rte_flow_item_tcp));
935 tcp_spec.hdr.dst_port = rte_cpu_to_be_16(tcp_port);
936 pattern[1].type = RTE_FLOW_ITEM_TYPE_TCP;
937 pattern[1].spec = &tcp_spec;
938 pattern[1].mask = &tcp_mask;
939
940 /* end the pattern array */
941 pattern[2].type = RTE_FLOW_ITEM_TYPE_END;
942
943 /* create the action */
944 action[0].type = RTE_FLOW_ACTION_TYPE_RSS;
945 action[0].conf = &rss;
946 action[1].type = RTE_FLOW_ACTION_TYPE_END;
947
948 struct rte_flow *flow;
949 /* validate and create the flow rule */
950 if (!rte_flow_validate(port_id, &attr, pattern, action, &error)) {
951 flow = rte_flow_create(port_id, &attr, pattern, action, &error);
952 if (!flow) {
953 return port_flow_complain(&error);
954 }
955 }
956
957 memset(pattern, 0, sizeof(pattern));
958
959 /* set the dst ipv4 packet to the required value */
960 pattern[0].type = RTE_FLOW_ITEM_TYPE_IPV4;
961
962 struct rte_flow_item_tcp tcp_src_mask = {
963 .hdr = {
964 .src_port = RTE_BE16(0xffff),
965 .dst_port = RTE_BE16(0x0000),
966 },
967 };
968
969 memset(&tcp_spec, 0, sizeof(struct rte_flow_item_tcp));
970 tcp_spec.hdr.src_port = rte_cpu_to_be_16(tcp_port);
971 pattern[1].type = RTE_FLOW_ITEM_TYPE_TCP;
972 pattern[1].spec = &tcp_spec;
973 pattern[1].mask = &tcp_src_mask;
974
975 /* end the pattern array */
976 pattern[2].type = RTE_FLOW_ITEM_TYPE_END;
977
978 /* validate and create the flow rule */
979 if (!rte_flow_validate(port_id, &attr, pattern, action, &error)) {
980 flow = rte_flow_create(port_id, &attr, pattern, action, &error);
981 if (!flow) {
982 return port_flow_complain(&error);
983 }
984 }
985
986 return 1;
987 }
988
989 static int
990 init_flow(uint16_t port_id, uint16_t tcp_port) {
991 // struct ff_flow_cfg fcfg = ff_global_cfg.dpdk.flow_cfgs[0];
992
993 // int i;
994 // for (i = 0; i < fcfg.nb_port; i++) {
995 // if(!create_tcp_flow(fcfg.port_id, fcfg.tcp_ports[i])) {
996 // return 0;
997 // }
998 // }
999
1000 if(!create_tcp_flow(port_id, tcp_port)) {
1001 rte_exit(EXIT_FAILURE, "create tcp flow failed\n");
1002 return -1;
1003 }
1004
1005 /* ARP rule */
1006 struct rte_flow_attr attr = {.ingress = 1};
1007 struct rte_flow_action_queue queue = {.index = 0};
1008
1009 struct rte_flow_item pattern_[2];
1010 struct rte_flow_action action[2];
1011 struct rte_flow_item_eth eth_type = {.type = RTE_BE16(0x0806)};
1012 struct rte_flow_item_eth eth_mask = {
1013 .type = RTE_BE16(0xffff)
1014 };
1015
1016 memset(pattern_, 0, sizeof(pattern_));
1017 memset(action, 0, sizeof(action));
1018
1019 pattern_[0].type = RTE_FLOW_ITEM_TYPE_ETH;
1020 pattern_[0].spec = ð_type;
1021 pattern_[0].mask = ð_mask;
1022
1023 pattern_[1].type = RTE_FLOW_ITEM_TYPE_END;
1024
1025 /* create the action */
1026 action[0].type = RTE_FLOW_ACTION_TYPE_QUEUE;
1027 action[0].conf = &queue;
1028 action[1].type = RTE_FLOW_ACTION_TYPE_END;
1029
1030 struct rte_flow *flow;
1031 struct rte_flow_error error;
1032 /* validate and create the flow rule */
1033 if (!rte_flow_validate(port_id, &attr, pattern_, action, &error)) {
1034 flow = rte_flow_create(port_id, &attr, pattern_, action, &error);
1035 if (!flow) {
1036 return port_flow_complain(&error);
1037 }
1038 }
1039
1040 return 1;
1041 }
1042
1043 #endif
1044
1045 int
1046 ff_dpdk_init(int argc, char **argv)
1047 {
1048 if (ff_global_cfg.dpdk.nb_procs < 1 ||
1049 ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
1050 ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
1051 ff_global_cfg.dpdk.proc_id < 0) {
1052 printf("param num_procs[%d] or proc_id[%d] error!\n",
1053 ff_global_cfg.dpdk.nb_procs,
1054 ff_global_cfg.dpdk.proc_id);
1055 exit(1);
1056 }
1057
1058 int ret = rte_eal_init(argc, argv);
1059 if (ret < 0) {
1060 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1061 }
1062
1063 numa_on = ff_global_cfg.dpdk.numa_on;
1064
1065 idle_sleep = ff_global_cfg.dpdk.idle_sleep;
1066 pkt_tx_delay = ff_global_cfg.dpdk.pkt_tx_delay > BURST_TX_DRAIN_US ? \
1067 BURST_TX_DRAIN_US : ff_global_cfg.dpdk.pkt_tx_delay;
1068
1069 init_lcore_conf();
1070
1071 init_mem_pool();
1072
1073 init_dispatch_ring();
1074
1075 init_msg_ring();
1076
1077 #ifdef FF_KNI
1078 enable_kni = ff_global_cfg.kni.enable;
1079 if (enable_kni) {
1080 init_kni();
1081 }
1082 #endif
1083
1084 #ifdef FF_USE_PAGE_ARRAY
1085 ff_mmap_init();
1086 #endif
1087
1088 #ifdef FF_FLOW_ISOLATE
1089 // run once in primary process
1090 if (0 == lcore_conf.tx_queue_id[0]){
1091 ret = port_flow_isolate(0, 1);
1092 if (ret < 0)
1093 rte_exit(EXIT_FAILURE, "init_port_isolate failed\n");
1094 }
1095 #endif
1096
1097 ret = init_port_start();
1098 if (ret < 0) {
1099 rte_exit(EXIT_FAILURE, "init_port_start failed\n");
1100 }
1101
1102 init_clock();
1103 #ifdef FF_FLOW_ISOLATE
1104 // Example usage only: port_id = 0, tcp_port = 80.
1105 // Recommendations:
1106 // 1. init_flow should replace `set_rss_table` in the `init_port_start` loop, so it covers every port in port_id_list instead of only port 0.
1107 // 2. Use the `tcp_port` config option instead of the magic number 80.
1108 ret = init_flow(0, 80);
1109 if (ret < 0) {
1110 rte_exit(EXIT_FAILURE, "init_port_flow failed\n");
1111 }
1112 #endif
1113 return 0;
1114 }
1115
1116 static void
1117 ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
1118 {
1119 uint8_t rx_csum = ctx->hw_features.rx_csum;
1120 if (rx_csum) {
1121 if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
1122 rte_pktmbuf_free(pkt);
1123 return;
1124 }
1125 }
1126
1127 void *data = rte_pktmbuf_mtod(pkt, void*);
1128 uint16_t len = rte_pktmbuf_data_len(pkt);
1129
1130 void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
1131 if (hdr == NULL) {
1132 rte_pktmbuf_free(pkt);
1133 return;
1134 }
1135
1136 if (pkt->ol_flags & PKT_RX_VLAN_STRIPPED) {
1137 ff_mbuf_set_vlan_info(hdr, pkt->vlan_tci);
1138 }
1139
1140 struct rte_mbuf *pn = pkt->next;
1141 void *prev = hdr;
1142 while(pn != NULL) {
1143 data = rte_pktmbuf_mtod(pn, void*);
1144 len = rte_pktmbuf_data_len(pn);
1145
1146 void *mb = ff_mbuf_get(prev, pn, data, len);
1147 if (mb == NULL) {
1148 ff_mbuf_free(hdr);
1149 rte_pktmbuf_free(pkt);
1150 return;
1151 }
1152 pn = pn->next;
1153 prev = mb;
1154 }
1155
1156 ff_veth_process_packet(ctx->ifp, hdr);
1157 }
1158
1159 static enum FilterReturn
1160 protocol_filter(const void *data, uint16_t len)
1161 {
1162 if (len < RTE_ETHER_HDR_LEN)
1163 return FILTER_UNKNOWN;
1164
1165 const struct rte_ether_hdr *hdr;
1166 const struct rte_vlan_hdr *vlanhdr;
1167 hdr = (const struct rte_ether_hdr *)data;
1168 uint16_t ether_type = rte_be_to_cpu_16(hdr->ether_type);
1169 data += RTE_ETHER_HDR_LEN;
1170 len -= RTE_ETHER_HDR_LEN;
1171
1172 if (ether_type == RTE_ETHER_TYPE_VLAN) {
1173 vlanhdr = (struct rte_vlan_hdr *)data;
1174 ether_type = rte_be_to_cpu_16(vlanhdr->eth_proto);
1175 data += sizeof(struct rte_vlan_hdr);
1176 len -= sizeof(struct rte_vlan_hdr);
1177 }
1178
1179 if(ether_type == RTE_ETHER_TYPE_ARP)
1180 return FILTER_ARP;
1181
1182 #ifdef INET6
1183 if (ether_type == RTE_ETHER_TYPE_IPV6) {
1184 return ff_kni_proto_filter(data,
1185 len, ether_type);
1186 }
1187 #endif
1188
1189 #ifndef FF_KNI
1190 return FILTER_UNKNOWN;
1191 #else
1192 if (!enable_kni) {
1193 return FILTER_UNKNOWN;
1194 }
1195
1196 if(ether_type != RTE_ETHER_TYPE_IPV4)
1197 return FILTER_UNKNOWN;
1198
1199 return ff_kni_proto_filter(data,
1200 len, ether_type);
1201 #endif
1202 }
1203
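/*
 * Unlike rte_pktmbuf_attach(), copy the segment's payload and metadata into
 * the new mbuf so the clone does not share data with the original.
 */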
1204 static inline void
1205 pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
1206 {
1207 struct rte_mbuf *md;
1208 void *src, *dst;
1209
1210 dst = rte_pktmbuf_mtod(mi, void *);
1211 src = rte_pktmbuf_mtod(m, void *);
1212
1213 mi->data_len = m->data_len;
1214 rte_memcpy(dst, src, m->data_len);
1215
1216 mi->port = m->port;
1217 mi->vlan_tci = m->vlan_tci;
1218 mi->vlan_tci_outer = m->vlan_tci_outer;
1219 mi->tx_offload = m->tx_offload;
1220 mi->hash = m->hash;
1221 mi->ol_flags = m->ol_flags;
1222 mi->packet_type = m->packet_type;
1223 }
1224
1225 /* copied from rte_pktmbuf_clone */
1226 static inline struct rte_mbuf *
1227 pktmbuf_deep_clone(const struct rte_mbuf *md,
1228 struct rte_mempool *mp)
1229 {
1230 struct rte_mbuf *mc, *mi, **prev;
1231 uint32_t pktlen;
1232 uint8_t nseg;
1233
1234 if (unlikely ((mc = rte_pktmbuf_alloc(mp)) == NULL))
1235 return NULL;
1236
1237 mi = mc;
1238 prev = &mi->next;
1239 pktlen = md->pkt_len;
1240 nseg = 0;
1241
1242 do {
1243 nseg++;
1244 pktmbuf_deep_attach(mi, md);
1245 *prev = mi;
1246 prev = &mi->next;
1247 } while ((md = md->next) != NULL &&
1248 (mi = rte_pktmbuf_alloc(mp)) != NULL);
1249
1250 *prev = NULL;
1251 mc->nb_segs = nseg;
1252 mc->pkt_len = pktlen;
1253
1254 /* Allocation of new indirect segment failed */
1255 if (unlikely (mi == NULL)) {
1256 rte_pktmbuf_free(mc);
1257 return NULL;
1258 }
1259
1260 __rte_mbuf_sanity_check(mc, 1);
1261 return mc;
1262 }
1263
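/*
 * Per-burst RX path: optionally run the user dispatcher (which may answer,
 * drop or re-queue a packet to another lcore), broadcast ARP/NDP to the other
 * queues and to KNI, and hand everything else to the FreeBSD stack via
 * ff_veth_input().
 */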
1264 static inline void
1265 process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
1266 uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
1267 {
1268 struct lcore_conf *qconf = &lcore_conf;
1269 uint16_t nb_queues = qconf->nb_queue_list[port_id];
1270
1271 uint16_t i;
1272 for (i = 0; i < count; i++) {
1273 struct rte_mbuf *rtem = bufs[i];
1274
1275 if (unlikely( ff_global_cfg.pcap.enable)) {
1276 if (!pkts_from_ring) {
1277 ff_dump_packets( ff_global_cfg.pcap.save_path, rtem, ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
1278 }
1279 }
1280
1281 void *data = rte_pktmbuf_mtod(rtem, void*);
1282 uint16_t len = rte_pktmbuf_data_len(rtem);
1283
1284 if (!pkts_from_ring) {
1285 ff_traffic.rx_packets += rtem->nb_segs;
1286 ff_traffic.rx_bytes += rte_pktmbuf_pkt_len(rtem);
1287 }
1288
1289 if (!pkts_from_ring && packet_dispatcher) {
1290 uint64_t cur_tsc = rte_rdtsc();
1291 int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues);
1292 usr_cb_tsc += rte_rdtsc() - cur_tsc;
1293 if (ret == FF_DISPATCH_RESPONSE) {
1294 rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len;
1295
1296 /*
1297 * VLAN tag re-insertion on TX is not supported, so rebuild the VLAN header in software.
1298 */
1299 if (rtem->vlan_tci) {
1300 data = rte_pktmbuf_prepend(rtem, sizeof(struct rte_vlan_hdr));
1301 if (data != NULL) {
1302 memmove(data, data + sizeof(struct rte_vlan_hdr), RTE_ETHER_HDR_LEN);
1303 struct rte_ether_hdr *etherhdr = (struct rte_ether_hdr *)data;
1304 struct rte_vlan_hdr *vlanhdr = (struct rte_vlan_hdr *)(data + RTE_ETHER_HDR_LEN);
1305 vlanhdr->vlan_tci = rte_cpu_to_be_16(rtem->vlan_tci);
1306 vlanhdr->eth_proto = etherhdr->ether_type;
1307 etherhdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN);
1308 }
1309 }
1310 send_single_packet(rtem, port_id);
1311 continue;
1312 }
1313
1314 if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) {
1315 rte_pktmbuf_free(rtem);
1316 continue;
1317 }
1318
1319 if (ret != queue_id) {
1320 ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
1321 if (ret < 0)
1322 rte_pktmbuf_free(rtem);
1323
1324 continue;
1325 }
1326 }
1327
1328 enum FilterReturn filter = protocol_filter(data, len);
1329 #ifdef INET6
1330 if (filter == FILTER_ARP || filter == FILTER_NDP) {
1331 #else
1332 if (filter == FILTER_ARP) {
1333 #endif
1334 struct rte_mempool *mbuf_pool;
1335 struct rte_mbuf *mbuf_clone;
1336 if (!pkts_from_ring) {
1337 uint16_t j;
1338 for(j = 0; j < nb_queues; ++j) {
1339 if(j == queue_id)
1340 continue;
1341
1342 unsigned socket_id = 0;
1343 if (numa_on) {
1344 uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
1345 socket_id = rte_lcore_to_socket_id(lcore_id);
1346 }
1347 mbuf_pool = pktmbuf_pool[socket_id];
1348 mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
1349 if(mbuf_clone) {
1350 int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
1351 mbuf_clone);
1352 if (ret < 0)
1353 rte_pktmbuf_free(mbuf_clone);
1354 }
1355 }
1356 }
1357
1358 #ifdef FF_KNI
1359 if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
1360 mbuf_pool = pktmbuf_pool[qconf->socket_id];
1361 mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
1362 if(mbuf_clone) {
1363 ff_kni_enqueue(port_id, mbuf_clone);
1364 }
1365 }
1366 #endif
1367 ff_veth_input(ctx, rtem);
1368 #ifdef FF_KNI
1369 } else if (enable_kni) {
1370 if (knictl_action == FF_KNICTL_ACTION_ALL_TO_KNI){
1371 ff_kni_enqueue(port_id, rtem);
1372 } else if (knictl_action == FF_KNICTL_ACTION_ALL_TO_FF){
1373 ff_veth_input(ctx, rtem);
1374 } else if (knictl_action == FF_KNICTL_ACTION_DEFAULT){
1375 if (enable_kni &&
1376 ((filter == FILTER_KNI && kni_accept) ||
1377 (filter == FILTER_UNKNOWN && !kni_accept)) ) {
1378 ff_kni_enqueue(port_id, rtem);
1379 } else {
1380 ff_veth_input(ctx, rtem);
1381 }
1382 } else {
1383 ff_veth_input(ctx, rtem);
1384 }
1385 #endif
1386 } else {
1387 ff_veth_input(ctx, rtem);
1388 }
1389 }
1390 }
1391
1392 static inline int
1393 process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
1394 struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
1395 {
1396 /* read packet from ring buf and to process */
1397 uint16_t nb_rb;
1398 nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
1399 (void **)pkts_burst, MAX_PKT_BURST, NULL);
1400
1401 if(nb_rb > 0) {
1402 process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
1403 }
1404
1405 return nb_rb;
1406 }
1407
1408 static inline void
1409 handle_sysctl_msg(struct ff_msg *msg)
1410 {
1411 int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
1412 msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
1413 msg->sysctl.newlen);
1414
1415 if (ret < 0) {
1416 msg->result = errno;
1417 } else {
1418 msg->result = 0;
1419 }
1420 }
1421
1422 static inline void
1423 handle_ioctl_msg(struct ff_msg *msg)
1424 {
1425 int fd, ret;
1426 #ifdef INET6
1427 if (msg->msg_type == FF_IOCTL6) {
1428 fd = ff_socket(AF_INET6, SOCK_DGRAM, 0);
1429 } else
1430 #endif
1431 fd = ff_socket(AF_INET, SOCK_DGRAM, 0);
1432
1433 if (fd < 0) {
1434 ret = -1;
1435 goto done;
1436 }
1437
1438 ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);
1439
1440 ff_close(fd);
1441
1442 done:
1443 if (ret < 0) {
1444 msg->result = errno;
1445 } else {
1446 msg->result = 0;
1447 }
1448 }
1449
1450 static inline void
1451 handle_route_msg(struct ff_msg *msg)
1452 {
1453 int ret = ff_rtioctl(msg->route.fib, msg->route.data,
1454 &msg->route.len, msg->route.maxlen);
1455 if (ret < 0) {
1456 msg->result = errno;
1457 } else {
1458 msg->result = 0;
1459 }
1460 }
1461
1462 static inline void
1463 handle_top_msg(struct ff_msg *msg)
1464 {
1465 msg->top = ff_top_status;
1466 msg->result = 0;
1467 }
1468
1469 #ifdef FF_NETGRAPH
1470 static inline void
1471 handle_ngctl_msg(struct ff_msg *msg)
1472 {
1473 int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
1474 if (ret < 0) {
1475 msg->result = errno;
1476 } else {
1477 msg->result = 0;
1478 msg->ngctl.ret = ret;
1479 }
1480 }
1481 #endif
1482
1483 #ifdef FF_IPFW
1484 static inline void
1485 handle_ipfw_msg(struct ff_msg *msg)
1486 {
1487 int fd, ret;
1488 fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
1489 if (fd < 0) {
1490 ret = -1;
1491 goto done;
1492 }
1493
1494 switch (msg->ipfw.cmd) {
1495 case FF_IPFW_GET:
1496 ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
1497 msg->ipfw.optname, msg->ipfw.optval,
1498 msg->ipfw.optlen);
1499 break;
1500 case FF_IPFW_SET:
1501 ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
1502 msg->ipfw.optname, msg->ipfw.optval,
1503 *(msg->ipfw.optlen));
1504 break;
1505 default:
1506 ret = -1;
1507 errno = ENOTSUP;
1508 break;
1509 }
1510
1511 ff_close(fd);
1512
1513 done:
1514 if (ret < 0) {
1515 msg->result = errno;
1516 } else {
1517 msg->result = 0;
1518 }
1519 }
1520 #endif
1521
1522 static inline void
1523 handle_traffic_msg(struct ff_msg *msg)
1524 {
1525 msg->traffic = ff_traffic;
1526 msg->result = 0;
1527 }
1528
1529 #ifdef FF_KNI
1530 static inline void
1531 handle_knictl_msg(struct ff_msg *msg)
1532 {
1533 if (msg->knictl.kni_cmd == FF_KNICTL_CMD_SET){
1534 switch (msg->knictl.kni_action){
1535 case FF_KNICTL_ACTION_ALL_TO_FF: knictl_action = FF_KNICTL_ACTION_ALL_TO_FF; msg->result = 0; printf("new kni action: alltoff\n"); break;
1536 case FF_KNICTL_ACTION_ALL_TO_KNI: knictl_action = FF_KNICTL_ACTION_ALL_TO_KNI; msg->result = 0; printf("new kni action: alltokni\n"); break;
1537 case FF_KNICTL_ACTION_DEFAULT: knictl_action = FF_KNICTL_ACTION_DEFAULT; msg->result = 0; printf("new kni action: default\n"); break;
1538 default: msg->result = -1;
1539 }
1540 }
1541 else if (msg->knictl.kni_cmd == FF_KNICTL_CMD_GET){
1542 msg->knictl.kni_action = knictl_action; msg->result = 0;
1543 } else {
1544 msg->result = -2;
1545 }
1546 }
1547 #endif
1548
1549 static inline void
1550 handle_default_msg(struct ff_msg *msg)
1551 {
1552 msg->result = ENOTSUP;
1553 }
1554
1555 static inline void
1556 handle_msg(struct ff_msg *msg, uint16_t proc_id)
1557 {
1558 switch (msg->msg_type) {
1559 case FF_SYSCTL:
1560 handle_sysctl_msg(msg);
1561 break;
1562 case FF_IOCTL:
1563 #ifdef INET6
1564 case FF_IOCTL6:
1565 #endif
1566 handle_ioctl_msg(msg);
1567 break;
1568 case FF_ROUTE:
1569 handle_route_msg(msg);
1570 break;
1571 case FF_TOP:
1572 handle_top_msg(msg);
1573 break;
1574 #ifdef FF_NETGRAPH
1575 case FF_NGCTL:
1576 handle_ngctl_msg(msg);
1577 break;
1578 #endif
1579 #ifdef FF_IPFW
1580 case FF_IPFW_CTL:
1581 handle_ipfw_msg(msg);
1582 break;
1583 #endif
1584 case FF_TRAFFIC:
1585 handle_traffic_msg(msg);
1586 break;
1587 #ifdef FF_KNI
1588 case FF_KNICTL:
1589 handle_knictl_msg(msg);
1590 break;
1591 #endif
1592 default:
1593 handle_default_msg(msg);
1594 break;
1595 }
1596 if (rte_ring_enqueue(msg_ring[proc_id].ring[msg->msg_type], msg) < 0) {
1597 if (msg->original_buf) {
1598 rte_free(msg->buf_addr);
1599 msg->buf_addr = msg->original_buf;
1600 msg->buf_len = msg->original_buf_len;
1601 msg->original_buf = NULL;
1602 }
1603
1604 rte_mempool_put(message_pool, msg);
1605 }
1606 }
1607
1608 static inline int
1609 process_msg_ring(uint16_t proc_id, struct rte_mbuf **pkts_burst)
1610 {
1611 /* read msg from ring buf and to process */
1612 uint16_t nb_rb;
1613 int i;
1614
1615 nb_rb = rte_ring_dequeue_burst(msg_ring[proc_id].ring[0],
1616 (void **)pkts_burst, MAX_PKT_BURST, NULL);
1617
1618 if (likely(nb_rb == 0))
1619 return 0;
1620
1621 for (i = 0; i < nb_rb; ++i) {
1622 handle_msg((struct ff_msg *)pkts_burst[i], proc_id);
1623 }
1624
1625 return 0;
1626 }
1627
1628 /* Send burst of packets on an output interface */
1629 static inline int
1630 send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
1631 {
1632 struct rte_mbuf **m_table;
1633 int ret;
1634 uint16_t queueid;
1635
1636 queueid = qconf->tx_queue_id[port];
1637 m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;
1638
1639 if (unlikely(ff_global_cfg.pcap.enable)) {
1640 uint16_t i;
1641 for (i = 0; i < n; i++) {
1642 ff_dump_packets( ff_global_cfg.pcap.save_path, m_table[i],
1643 ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
1644 }
1645 }
1646
1647 ret = rte_eth_tx_burst(port, queueid, m_table, n);
1648 ff_traffic.tx_packets += ret;
1649 uint16_t i;
1650 for (i = 0; i < ret; i++) {
1651 ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]);
1652 #ifdef FF_USE_PAGE_ARRAY
1653 if (qconf->tx_mbufs[port].bsd_m_table[i])
1654 ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs);
1655 #endif
1656 }
1657 if (unlikely(ret < n)) {
1658 do {
1659 rte_pktmbuf_free(m_table[ret]);
1660 #ifdef FF_USE_PAGE_ARRAY
1661 if ( qconf->tx_mbufs[port].bsd_m_table[ret] )
1662 ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]);
1663 #endif
1664 } while (++ret < n);
1665 }
1666 return 0;
1667 }
1668
1669 /* Enqueue a single packet, and send burst if queue is filled */
1670 static inline int
1671 send_single_packet(struct rte_mbuf *m, uint8_t port)
1672 {
1673 uint16_t len;
1674 struct lcore_conf *qconf;
1675
1676 qconf = &lcore_conf;
1677 len = qconf->tx_mbufs[port].len;
1678 qconf->tx_mbufs[port].m_table[len] = m;
1679 len++;
1680
1681 /* enough pkts to be sent */
1682 if (unlikely(len == MAX_PKT_BURST)) {
1683 send_burst(qconf, MAX_PKT_BURST, port);
1684 len = 0;
1685 }
1686
1687 qconf->tx_mbufs[port].len = len;
1688 return 0;
1689 }
1690
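/*
 * Transmit path: copy the FreeBSD mbuf chain into freshly allocated DPDK
 * mbufs, translate the requested checksum/TSO offloads into ol_flags, then
 * queue the packet for the port's TX burst.
 */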
1691 int
1692 ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
1693 int total)
1694 {
1695 #ifdef FF_USE_PAGE_ARRAY
1696 struct lcore_conf *qconf = &lcore_conf;
1697 int len = 0;
1698
1699 len = ff_if_send_onepkt(ctx, m,total);
1700 if (unlikely(len == MAX_PKT_BURST)) {
1701 send_burst(qconf, MAX_PKT_BURST, ctx->port_id);
1702 len = 0;
1703 }
1704 qconf->tx_mbufs[ctx->port_id].len = len;
1705 return 0;
1706 #endif
1707 struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
1708 struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
1709 if (head == NULL) {
1710 ff_mbuf_free(m);
1711 return -1;
1712 }
1713
1714 head->pkt_len = total;
1715 head->nb_segs = 0;
1716
1717 int off = 0;
1718 struct rte_mbuf *cur = head, *prev = NULL;
1719 while(total > 0) {
1720 if (cur == NULL) {
1721 cur = rte_pktmbuf_alloc(mbuf_pool);
1722 if (cur == NULL) {
1723 rte_pktmbuf_free(head);
1724 ff_mbuf_free(m);
1725 return -1;
1726 }
1727 }
1728
1729 if (prev != NULL) {
1730 prev->next = cur;
1731 }
1732 head->nb_segs++;
1733
1734 prev = cur;
1735 void *data = rte_pktmbuf_mtod(cur, void*);
1736 int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
1737 int ret = ff_mbuf_copydata(m, data, off, len);
1738 if (ret < 0) {
1739 rte_pktmbuf_free(head);
1740 ff_mbuf_free(m);
1741 return -1;
1742 }
1743
1744
1745 cur->data_len = len;
1746 off += len;
1747 total -= len;
1748 cur = NULL;
1749 }
1750
1751 struct ff_tx_offload offload = {0};
1752 ff_mbuf_tx_offload(m, &offload);
1753
1754 void *data = rte_pktmbuf_mtod(head, void*);
1755
1756 if (offload.ip_csum) {
1757 /* ipv6 not supported yet */
1758 struct rte_ipv4_hdr *iph;
1759 int iph_len;
1760 iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN);
1761 iph_len = (iph->version_ihl & 0x0f) << 2;
1762
1763 head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
1764 head->l2_len = RTE_ETHER_HDR_LEN;
1765 head->l3_len = iph_len;
1766 }
1767
1768 if (ctx->hw_features.tx_csum_l4) {
1769 struct rte_ipv4_hdr *iph;
1770 int iph_len;
1771 iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN);
1772 iph_len = (iph->version_ihl & 0x0f) << 2;
1773
1774 if (offload.tcp_csum) {
1775 head->ol_flags |= PKT_TX_TCP_CKSUM;
1776 head->l2_len = RTE_ETHER_HDR_LEN;
1777 head->l3_len = iph_len;
1778 }
1779
1780 /*
1781 * TCP segmentation offload.
1782 *
1783 * - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
1784 * implies PKT_TX_TCP_CKSUM)
1785 * - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
1786 * - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
1787 * write the IP checksum to 0 in the packet
1788 * - fill the mbuf offload information: l2_len,
1789 * l3_len, l4_len, tso_segsz
1790 * - calculate the pseudo header checksum without taking ip_len
1791 * in account, and set it in the TCP header. Refer to
1792 * rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
1793 * used as helpers.
1794 */
1795 if (offload.tso_seg_size) {
1796 struct rte_tcp_hdr *tcph;
1797 int tcph_len;
1798 tcph = (struct rte_tcp_hdr *)((char *)iph + iph_len);
1799 tcph_len = (tcph->data_off & 0xf0) >> 2;
1800 tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);
1801
1802 head->ol_flags |= PKT_TX_TCP_SEG;
1803 head->l4_len = tcph_len;
1804 head->tso_segsz = offload.tso_seg_size;
1805 }
1806
1807 if (offload.udp_csum) {
1808 head->ol_flags |= PKT_TX_UDP_CKSUM;
1809 head->l2_len = RTE_ETHER_HDR_LEN;
1810 head->l3_len = iph_len;
1811 }
1812 }
1813
1814 ff_mbuf_free(m);
1815
1816 return send_single_packet(head, ctx->port_id);
1817 }
1818
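/*
 * Per-lcore event loop: expire timers, drain pending TX bursts, poll the
 * dispatch ring and the NIC RX queues, service control messages, run the
 * user loop callback and account usr/sys/idle cycles for ff_top.
 */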
1819 static int
1820 main_loop(void *arg)
1821 {
1822 struct loop_routine *lr = (struct loop_routine *)arg;
1823
1824 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1825 uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc;
1826 int i, j, nb_rx, idle;
1827 uint16_t port_id, queue_id;
1828 struct lcore_conf *qconf;
1829 uint64_t drain_tsc = 0;
1830 struct ff_dpdk_if_context *ctx;
1831
1832 if (pkt_tx_delay) {
1833 drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * pkt_tx_delay;
1834 }
1835
1836 prev_tsc = 0;
1837 usch_tsc = 0;
1838
1839 qconf = &lcore_conf;
1840
1841 while (1) {
1842 cur_tsc = rte_rdtsc();
1843 if (unlikely(freebsd_clock.expire < cur_tsc)) {
1844 rte_timer_manage();
1845 }
1846
1847 idle = 1;
1848 sys_tsc = 0;
1849 usr_tsc = 0;
1850 usr_cb_tsc = 0;
1851
1852 /*
1853 * TX burst queue drain
1854 */
1855 diff_tsc = cur_tsc - prev_tsc;
1856 if (unlikely(diff_tsc >= drain_tsc)) {
1857 for (i = 0; i < qconf->nb_tx_port; i++) {
1858 port_id = qconf->tx_port_id[i];
1859 if (qconf->tx_mbufs[port_id].len == 0)
1860 continue;
1861
1862 idle = 0;
1863
1864 send_burst(qconf,
1865 qconf->tx_mbufs[port_id].len,
1866 port_id);
1867 qconf->tx_mbufs[port_id].len = 0;
1868 }
1869
1870 prev_tsc = cur_tsc;
1871 }
1872
1873 /*
1874 * Read packet from RX queues
1875 */
1876 for (i = 0; i < qconf->nb_rx_queue; ++i) {
1877 port_id = qconf->rx_queue_list[i].port_id;
1878 queue_id = qconf->rx_queue_list[i].queue_id;
1879 ctx = veth_ctx[port_id];
1880
1881 #ifdef FF_KNI
1882 if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
1883 ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
1884 }
1885 #endif
1886
1887 idle &= !process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);
1888
1889 nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
1890 MAX_PKT_BURST);
1891 if (nb_rx == 0)
1892 continue;
1893
1894 idle = 0;
1895
1896 /* Prefetch first packets */
1897 for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
1898 rte_prefetch0(rte_pktmbuf_mtod(
1899 pkts_burst[j], void *));
1900 }
1901
1902 /* Prefetch and handle already prefetched packets */
1903 for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
1904 rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
1905 j + PREFETCH_OFFSET], void *));
1906 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
1907 }
1908
1909 /* Handle remaining prefetched packets */
1910 for (; j < nb_rx; j++) {
1911 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
1912 }
1913 }
1914
1915 process_msg_ring(qconf->proc_id, pkts_burst);
1916
1917 div_tsc = rte_rdtsc();
1918
1919 if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc >= drain_tsc))) {
1920 usch_tsc = cur_tsc;
1921 lr->loop(lr->arg);
1922 }
1923
1924 idle_sleep_tsc = rte_rdtsc();
1925 if (likely(idle && idle_sleep)) {
1926 usleep(idle_sleep);
1927 end_tsc = rte_rdtsc();
1928 } else {
1929 end_tsc = idle_sleep_tsc;
1930 }
1931
1932 usr_tsc = usr_cb_tsc;
1933 if (usch_tsc == cur_tsc) {
1934 usr_tsc += idle_sleep_tsc - div_tsc;
1935 }
1936
1937 if (!idle) {
1938 sys_tsc = div_tsc - cur_tsc - usr_cb_tsc;
1939 ff_top_status.sys_tsc += sys_tsc;
1940 }
1941
1942 ff_top_status.usr_tsc += usr_tsc;
1943 ff_top_status.work_tsc += end_tsc - cur_tsc;
1944 ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;
1945
1946 ff_top_status.loops++;
1947 }
1948
1949 return 0;
1950 }
1951
1952 int
1953 ff_dpdk_if_up(void) {
1954 int i;
1955 struct lcore_conf *qconf = &lcore_conf;
1956 for (i = 0; i < qconf->nb_tx_port; i++) {
1957 uint16_t port_id = qconf->tx_port_id[i];
1958
1959 struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
1960 veth_ctx[port_id] = ff_veth_attach(pconf);
1961 if (veth_ctx[port_id] == NULL) {
1962 rte_exit(EXIT_FAILURE, "ff_veth_attach failed");
1963 }
1964 }
1965
1966 return 0;
1967 }
1968
1969 void
1970 ff_dpdk_run(loop_func_t loop, void *arg) {
1971 struct loop_routine *lr = rte_malloc(NULL,
1972 sizeof(struct loop_routine), 0);
1973 lr->loop = loop;
1974 lr->arg = arg;
1975 rte_eal_mp_remote_launch(main_loop, lr, CALL_MAIN);
1976 rte_eal_mp_wait_lcore();
1977 rte_free(lr);
1978 }
1979
1980 void
1981 ff_dpdk_pktmbuf_free(void *m)
1982 {
1983 rte_pktmbuf_free_seg((struct rte_mbuf *)m);
1984 }
1985
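/* Software Toeplitz hash, matching the NIC's RSS computation for a given key. */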
1986 static uint32_t
1987 toeplitz_hash(unsigned keylen, const uint8_t *key,
1988 unsigned datalen, const uint8_t *data)
1989 {
1990 uint32_t hash = 0, v;
1991 u_int i, b;
1992
1993 /* XXXRW: Perhaps an assertion about key length vs. data length? */
1994
1995 v = (key[0]<<24) + (key[1]<<16) + (key[2] <<8) + key[3];
1996 for (i = 0; i < datalen; i++) {
1997 for (b = 0; b < 8; b++) {
1998 if (data[i] & (1<<(7-b)))
1999 hash ^= v;
2000 v <<= 1;
2001 if ((i + 4) < keylen &&
2002 (key[i+4] & (1<<(7-b))))
2003 v |= 1;
2004 }
2005 }
2006 return (hash);
2007 }
2008
2009 int
2010 ff_in_pcbladdr(uint16_t family, void *faddr, uint16_t fport, void *laddr)
2011 {
2012 int ret = 0;
2013 uint16_t fa;
2014
2015 if (!pcblddr_fun)
2016 return ret;
2017
2018 if (family == AF_INET)
2019 fa = AF_INET;
2020 else if (family == AF_INET6_FREEBSD)
2021 fa = AF_INET6_LINUX;
2022 else
2023 return EADDRNOTAVAIL;
2024
2025 ret = (*pcblddr_fun)(fa, faddr, fport, laddr);
2026
2027 return ret;
2028 }
2029
2030 void
2031 ff_regist_pcblddr_fun(pcblddr_func_t func)
2032 {
2033 pcblddr_fun = func;
2034 }
2035
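/*
 * Predict which RX queue the NIC's RSS would pick for this 4-tuple and return
 * non-zero only if it maps to the queue owned by the current lcore, so that
 * locally chosen addresses/ports keep the connection on this core.
 */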
2036 int
2037 ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
2038 uint16_t sport, uint16_t dport)
2039 {
2040 struct lcore_conf *qconf = &lcore_conf;
2041 struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
2042 uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];
2043
2044 if (nb_queues <= 1) {
2045 return 1;
2046 }
2047
2048 uint16_t reta_size = rss_reta_size[ctx->port_id];
2049 uint16_t queueid = qconf->tx_queue_id[ctx->port_id];
2050
2051 uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
2052 sizeof(dport)];
2053
2054 unsigned datalen = 0;
2055
2056 bcopy(&saddr, &data[datalen], sizeof(saddr));
2057 datalen += sizeof(saddr);
2058
2059 bcopy(&daddr, &data[datalen], sizeof(daddr));
2060 datalen += sizeof(daddr);
2061
2062 bcopy(&sport, &data[datalen], sizeof(sport));
2063 datalen += sizeof(sport);
2064
2065 bcopy(&dport, &data[datalen], sizeof(dport));
2066 datalen += sizeof(dport);
2067
2068 uint32_t hash = 0;
2069 hash = toeplitz_hash(rsskey_len, rsskey, datalen, data);
2070
2071 return ((hash & (reta_size - 1)) % nb_queues) == queueid;
2072 }
2073
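/*
 * A minimal sketch of a user dispatcher, assuming the dispatch_func_t
 * signature implied by the call in process_packets() (see ff_api.h for the
 * authoritative prototype). Returning a queue id re-queues the packet to that
 * lcore; FF_DISPATCH_RESPONSE sends the (possibly rewritten) packet back out;
 * FF_DISPATCH_ERROR drops it.
 *
 *   static int
 *   my_dispatcher(void *data, uint16_t *len, uint16_t queue_id, uint16_t nb_queues)
 *   {
 *       (void)data; (void)len; (void)nb_queues;
 *       return queue_id;   // keep the packet on the queue that received it
 *   }
 *
 *   ff_regist_packet_dispatcher(my_dispatcher);
 */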
2074 void
2075 ff_regist_packet_dispatcher(dispatch_func_t func)
2076 {
2077 packet_dispatcher = func;
2078 }
2079
2080 uint64_t
2081 ff_get_tsc_ns()
2082 {
2083 uint64_t cur_tsc = rte_rdtsc();
2084 uint64_t hz = rte_get_tsc_hz();
2085 return ((double)cur_tsc/(double)hz) * NS_PER_S;
2086 }
2087
2088