/* xref: /f-stack/lib/ff_dpdk_if.c (revision 8640edf1) */
/*
 * Copyright (C) 2017-2021 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_eth_bond.h>
#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"
#include "ff_memory.h"

#ifdef FF_KNI
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

int enable_kni;
static int kni_accept;
static int knictl_action = FF_KNICTL_ACTION_DEFAULT;
#endif

static int numa_on;

static unsigned idle_sleep;
static unsigned pkt_tx_delay;
static uint64_t usr_cb_tsc;

static struct rte_timer freebsd_clock;
// Default 40-byte RSS key, taken from the Mellanox Linux driver
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

static uint8_t default_rsskey_52bytes[52] = {
    0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23,
    0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30,
    0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02,
    0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c,
    0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55,
    0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e,
    0x81, 0x15, 0x03, 0x66
};

static uint8_t symmetric_rsskey[52] = {
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a
};
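
/*
 * Note on the symmetric key above: repeating the 16-bit pattern 0x6d5a
 * makes the Toeplitz hash symmetric, i.e. swapping source and destination
 * IP/port yields the same hash value, so both directions of a TCP/UDP
 * flow land on the same RX queue (and therefore the same lcore).
 */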

static int rsskey_len = sizeof(default_rsskey_40bytes);
static uint8_t *rsskey = default_rsskey_40bytes;

struct lcore_conf lcore_conf;

struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static pcblddr_func_t pcblddr_fun;

static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

#define BOND_DRIVER_NAME    "net_bonding"

static inline int send_single_packet(struct rte_mbuf *m, uint8_t port);

struct ff_msg_ring {
    char ring_name[FF_MSG_NUM][RTE_RING_NAMESIZE];
    /* ring[0]: the lcore receives msgs on it, other processes send */
    /* ring[1..]: the lcore sends replies on them, other processes read */
    struct rte_ring *ring[FF_MSG_NUM];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;
static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

static struct ff_top_args ff_top_status;
static struct ff_traffic_args ff_traffic;
extern void ff_hardclock(void);

static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
    ff_update_current_ts();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
    #define CHECK_INTERVAL 100 /* 100ms */
    #define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u "
                        "Mbps - %s\n", (int)portid,
                        (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

static int
init_lcore_conf(void)
{
    uint8_t nb_dev_ports = rte_eth_dev_count_avail();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
                 ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    if (!rte_lcore_is_enabled(lcore_id)) {
        rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
    }

    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %d\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        /* Enable pcap dump */
        if (ff_global_cfg.pcap.enable) {
            ff_enable_pcap(ff_global_cfg.pcap.save_path, ff_global_cfg.pcap.snap_len);
        }

        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}
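
/*
 * Illustration of the mapping above (example values, not from any shipped
 * config): if port 0 is bound to lcores {2, 3} in lcore_list, the process
 * running on lcore 3 finds itself at index 1 and therefore polls RX queue 1
 * and transmits on TX queue 1 of port 0. Each queue is owned by exactly
 * one process.
 */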

static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;
    uint16_t max_portid = ff_global_cfg.dpdk.max_portid;

    unsigned nb_mbuf = RTE_ALIGN_CEIL(
        (nb_rx_queue * (max_portid + 1) * 2 * RX_QUEUE_SIZE +
        nb_ports * (max_portid + 1) * 2 * nb_lcores * MAX_PKT_BURST +
        nb_ports * (max_portid + 1) * 2 * nb_tx_queue * TX_QUEUE_SIZE +
        nb_lcores * MEMPOOL_CACHE_SIZE +
#ifdef FF_KNI
        nb_ports * KNI_MBUF_MAX +
        nb_ports * KNI_QUEUE_SIZE +
#endif
        nb_lcores * nb_ports * DISPATCH_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
                socketid, i, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }

#ifdef FF_USE_PAGE_ARRAY
        nb_mbuf = RTE_ALIGN_CEIL(
            nb_ports * nb_lcores * MAX_PKT_BURST +
            nb_ports * nb_tx_queue * TX_QUEUE_SIZE +
            nb_lcores * MEMPOOL_CACHE_SIZE,
            (unsigned)4096);
        ff_init_ref_pool(nb_mbuf, socketid);
#endif
    }

    return 0;
}
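
/*
 * Sizing sketch for nb_mbuf above (values illustrative; the real constants
 * live in the headers): with 1 port, 2 procs, max_portid 0, and assumed
 * RX_QUEUE_SIZE/TX_QUEUE_SIZE = 512, MAX_PKT_BURST = 32,
 * MEMPOOL_CACHE_SIZE = 256, DISPATCH_RING_SIZE = 2048, the sum budgets for
 * mbufs sitting in RX rings, in-flight TX bursts, TX rings, per-lcore
 * mempool caches, optional KNI queues and the dispatch rings, then
 * RTE_ALIGN_CEIL rounds the total up to a multiple of 8192.
 */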

static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL) {
        rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ring = rte_ring_create(name, count, socket_id, flags);
    } else {
        ring = rte_ring_lookup(name);
    }

    if (ring == NULL) {
        rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
    }

    return ring;
}

static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create rings according to the ports actually being used. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}
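
/*
 * Each "dispatch_ring_p<port>_q<queue>" is created with RING_F_SC_DEQ only:
 * the process that owns the queue is the sole consumer, while any other
 * process may enqueue (the default multi-producer mode) when it steers a
 * packet, e.g. a broadcast ARP clone, toward that queue.
 */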

static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
    msg->original_buf = NULL;
    msg->original_buf_len = 0;
}
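
/*
 * Layout of one message-pool element after ff_msg_init:
 *
 *   +----------------+----------------------------------------+
 *   | struct ff_msg  | payload buffer (buf_addr, buf_len)     |
 *   +----------------+----------------------------------------+
 *   |<------------------ mp->elt_size --------------------->|
 *
 * Header and payload share a single allocation, so a message can travel
 * between processes through a ring as one pointer.
 */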

static int
init_msg_ring(void)
{
    uint16_t i, j;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
           MSG_RING_SIZE * 2 * nb_procs,
           MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
           NULL, NULL, ff_msg_init, NULL,
           socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[0]);

        for (j = FF_SYSCTL; j < FF_MSG_NUM; j++) {
            snprintf(msg_ring[i].ring_name[j], RTE_RING_NAMESIZE,
                "%s%u_%u", FF_MSG_RING_OUT, i, j);
            msg_ring[i].ring[j] = create_ring(msg_ring[i].ring_name[j],
                MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
            if (msg_ring[i].ring[j] == NULL)
                rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[j]);
        }
    }

    return 0;
}

#ifdef FF_KNI

static enum FF_KNICTL_CMD
get_kni_action(const char *c)
{
    if (!c)
        return FF_KNICTL_ACTION_DEFAULT;
    if (0 == strcasecmp(c, "alltokni")) {
        return FF_KNICTL_ACTION_ALL_TO_KNI;
    } else if (0 == strcasecmp(c, "alltoff")) {
        return FF_KNICTL_ACTION_ALL_TO_FF;
    } else {
        return FF_KNICTL_ACTION_DEFAULT;
    }
}

static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count_avail();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    knictl_action = get_kni_action(ff_global_cfg.kni.kni_action);

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
#endif

// RSS RETA updates will fail when flow isolation is enabled
#ifndef FF_FLOW_ISOLATE
static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}
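
/*
 * Example of the resulting indirection table (illustrative): with
 * reta_size = 128 and nb_queues = 4, entries are filled round-robin as
 * 0,1,2,3,0,1,2,3,... so RSS hash values spread evenly across the four
 * RX queues.
 */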
#endif

static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i, j;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id, u_port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[u_port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        for (j = 0; j <= pconf->nb_slaves; j++) {
            if (j < pconf->nb_slaves) {
                port_id = pconf->slave_portid_list[j];
                printf("To init %s's slave %d: port[%d]\n",
                        ff_global_cfg.dpdk.bond_cfgs->name,
                        j, port_id);
            } else {
                port_id = u_port_id;
            }

            struct rte_eth_dev_info dev_info;
            struct rte_eth_conf port_conf = {0};
            struct rte_eth_rxconf rxq_conf;
            struct rte_eth_txconf txq_conf;

            int ret = rte_eth_dev_info_get(port_id, &dev_info);
            if (ret != 0)
                rte_exit(EXIT_FAILURE,
                    "Error during getting device (port %u) info: %s\n",
                    port_id, strerror(-ret));

            if (nb_queues > dev_info.max_rx_queues) {
                rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_rx_queues);
            }

            if (nb_queues > dev_info.max_tx_queues) {
                rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_tx_queues);
            }

            struct rte_ether_addr addr;
            rte_eth_macaddr_get(port_id, &addr);
            printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                       " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                    (unsigned)port_id,
                    addr.addr_bytes[0], addr.addr_bytes[1],
                    addr.addr_bytes[2], addr.addr_bytes[3],
                    addr.addr_bytes[4], addr.addr_bytes[5]);

            rte_memcpy(pconf->mac,
                addr.addr_bytes, RTE_ETHER_ADDR_LEN);

            /* Set RSS mode */
            uint64_t default_rss_hf = ETH_RSS_PROTO_MASK;
            port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
            port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf;
            if (dev_info.hash_key_size == 52) {
                rsskey = default_rsskey_52bytes;
                rsskey_len = 52;
            }
            if (ff_global_cfg.dpdk.symmetric_rss) {
                printf("Use symmetric Receive-side Scaling (RSS) key\n");
                rsskey = symmetric_rsskey;
            }
            port_conf.rx_adv_conf.rss_conf.rss_key = rsskey;
            port_conf.rx_adv_conf.rss_conf.rss_key_len = rsskey_len;
            port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads;
            if (port_conf.rx_adv_conf.rss_conf.rss_hf !=
                    ETH_RSS_PROTO_MASK) {
                printf("Port %u modified RSS hash function based on hardware support, "
                        "requested:%#"PRIx64" configured:%#"PRIx64"\n",
                        port_id, default_rss_hf,
                        port_conf.rx_adv_conf.rss_conf.rss_hf);
            }

            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) {
                port_conf.txmode.offloads |=
                    DEV_TX_OFFLOAD_MBUF_FAST_FREE;
            }

            /* Set Rx VLAN stripping */
            if (ff_global_cfg.dpdk.vlan_strip) {
                if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                    port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
                }
            }

            /* Enable HW CRC stripping */
            port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_KEEP_CRC;

            /* FIXME: Enable TCP LRO? */
            #if 0
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
                printf("LRO is supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
                pconf->hw_features.rx_lro = 1;
            }
            #endif

            /* Set Rx checksum checking */
            if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
                printf("RX checksum offload supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
                pconf->hw_features.rx_csum = 1;
            }

            if (ff_global_cfg.dpdk.tx_csum_offoad_skip == 0) {
                if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
                    printf("TX ip checksum offload supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
                    pconf->hw_features.tx_csum_ip = 1;
                }

                if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
                    (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
                    printf("TX TCP&UDP checksum offload supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM;
                    pconf->hw_features.tx_csum_l4 = 1;
                }
            } else {
                printf("TX checksum offload is disabled\n");
            }

            if (ff_global_cfg.dpdk.tso) {
                if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                    printf("TSO is supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
                    pconf->hw_features.tx_tso = 1;
                }
            } else {
                printf("TSO is disabled\n");
            }

            if (dev_info.reta_size) {
                /* reta size must be power of 2 */
                assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

                rss_reta_size[port_id] = dev_info.reta_size;
                printf("port[%d]: rss table size: %d\n", port_id,
                    dev_info.reta_size);
            }

            if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
                continue;
            }

            ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
            if (ret != 0) {
                return ret;
            }

            static uint16_t nb_rxd = RX_QUEUE_SIZE;
            static uint16_t nb_txd = TX_QUEUE_SIZE;
            ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
            if (ret < 0)
                printf("Could not adjust number of descriptors "
                        "for port%u (%d)\n", (unsigned)port_id, ret);

            uint16_t q;
            for (q = 0; q < nb_queues; q++) {
                if (numa_on) {
                    uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                    socketid = rte_lcore_to_socket_id(lcore_id);
                }
                mbuf_pool = pktmbuf_pool[socketid];

                txq_conf = dev_info.default_txconf;
                txq_conf.offloads = port_conf.txmode.offloads;
                ret = rte_eth_tx_queue_setup(port_id, q, nb_txd,
                    socketid, &txq_conf);
                if (ret < 0) {
                    return ret;
                }

                rxq_conf = dev_info.default_rxconf;
                rxq_conf.offloads = port_conf.rxmode.offloads;
                ret = rte_eth_rx_queue_setup(port_id, q, nb_rxd,
                    socketid, &rxq_conf, mbuf_pool);
                if (ret < 0) {
                    return ret;
                }
            }

            if (strncmp(dev_info.driver_name, BOND_DRIVER_NAME,
                    strlen(dev_info.driver_name)) == 0) {

                rte_eth_macaddr_get(port_id, &addr);
                printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                           " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                        (unsigned)port_id,
                        addr.addr_bytes[0], addr.addr_bytes[1],
                        addr.addr_bytes[2], addr.addr_bytes[3],
                        addr.addr_bytes[4], addr.addr_bytes[5]);

                rte_memcpy(pconf->mac,
                    addr.addr_bytes, RTE_ETHER_ADDR_LEN);

                int mode, count, x;
                uint16_t slaves[RTE_MAX_ETHPORTS], len = RTE_MAX_ETHPORTS;

                mode = rte_eth_bond_mode_get(port_id);
                printf("Port %u, bond mode:%d\n", port_id, mode);

                count = rte_eth_bond_slaves_get(port_id, slaves, len);
                printf("Port %u, %s's slave ports count:%d\n", port_id,
                            ff_global_cfg.dpdk.bond_cfgs->name, count);
                for (x = 0; x < count; x++) {
                    printf("Port %u, %s's slave port[%u]\n", port_id,
                            ff_global_cfg.dpdk.bond_cfgs->name, slaves[x]);
                }
            }

            ret = rte_eth_dev_start(port_id);
            if (ret < 0) {
                return ret;
            }
    // RSS RETA updates will fail when flow isolation is enabled
    #ifndef FF_FLOW_ISOLATE
            if (nb_queues > 1) {
                /*
                 * FIXME: modify RSS set to FDIR
                 */
                set_rss_table(port_id, dev_info.reta_size, nb_queues);
            }
    #endif

            /* Enable RX in promiscuous mode for the Ethernet device. */
            if (ff_global_cfg.dpdk.promiscuous) {
                ret = rte_eth_promiscuous_enable(port_id);
                if (ret == 0) {
                    printf("set port %u to promiscuous mode ok\n", port_id);
                } else {
                    printf("set port %u to promiscuous mode error\n", port_id);
                }
            }
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}
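
/*
 * Worked example (numbers assumed, not from any config): with a 2.5 GHz
 * TSC and freebsd.hz = 100, intrs = 1000/100 = 10 ms per FreeBSD tick and
 * tsc = ceil(2.5e9 / 1000) * 10 = 25,000,000 cycles, so ff_hardclock_job
 * fires every 10 ms on this lcore.
 */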

#ifdef FF_FLOW_ISOLATE
/** Print a message out of a flow error. */
static int
port_flow_complain(struct rte_flow_error *error)
{
    static const char *const errstrlist[] = {
        [RTE_FLOW_ERROR_TYPE_NONE] = "no error",
        [RTE_FLOW_ERROR_TYPE_UNSPECIFIED] = "cause unspecified",
        [RTE_FLOW_ERROR_TYPE_HANDLE] = "flow rule (handle)",
        [RTE_FLOW_ERROR_TYPE_ATTR_GROUP] = "group field",
        [RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY] = "priority field",
        [RTE_FLOW_ERROR_TYPE_ATTR_INGRESS] = "ingress field",
        [RTE_FLOW_ERROR_TYPE_ATTR_EGRESS] = "egress field",
        [RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER] = "transfer field",
        [RTE_FLOW_ERROR_TYPE_ATTR] = "attributes structure",
        [RTE_FLOW_ERROR_TYPE_ITEM_NUM] = "pattern length",
        [RTE_FLOW_ERROR_TYPE_ITEM_SPEC] = "item specification",
        [RTE_FLOW_ERROR_TYPE_ITEM_LAST] = "item specification range",
        [RTE_FLOW_ERROR_TYPE_ITEM_MASK] = "item specification mask",
        [RTE_FLOW_ERROR_TYPE_ITEM] = "specific pattern item",
        [RTE_FLOW_ERROR_TYPE_ACTION_NUM] = "number of actions",
        [RTE_FLOW_ERROR_TYPE_ACTION_CONF] = "action configuration",
        [RTE_FLOW_ERROR_TYPE_ACTION] = "specific action",
    };
    const char *errstr;
    char buf[32];
    int err = rte_errno;

    if ((unsigned int)error->type >= RTE_DIM(errstrlist) ||
        !errstrlist[error->type])
        errstr = "unknown type";
    else
        errstr = errstrlist[error->type];
    printf("Caught error type %d (%s): %s%s: %s\n",
           error->type, errstr,
           error->cause ? (snprintf(buf, sizeof(buf), "cause: %p, ",
                                    error->cause), buf) : "",
           error->message ? error->message : "(no stated reason)",
           rte_strerror(err));
    return -err;
}

static int
port_flow_isolate(uint16_t port_id, int set)
{
    struct rte_flow_error error;

    /* Poisoning to make sure PMDs update it in case of error. */
    memset(&error, 0x66, sizeof(error));
    if (rte_flow_isolate(port_id, set, &error))
        return port_flow_complain(&error);
    printf("Ingress traffic on port %u is %s to the defined flow rules\n",
           port_id,
           set ? "now restricted" : "not restricted anymore");
    return 0;
}

static int
create_tcp_flow(uint16_t port_id, uint16_t tcp_port)
{
    struct rte_flow_attr attr = {.ingress = 1};
    struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
    int nb_queues = pconf->nb_lcores;
    uint16_t queue[RTE_MAX_QUEUES_PER_PORT];
    int i = 0, j = 0;
    for (i = 0, j = 0; i < nb_queues; ++i)
        queue[j++] = i;
    struct rte_flow_action_rss rss = {
        .types = ETH_RSS_NONFRAG_IPV4_TCP,
        .key_len = rsskey_len,
        .key = rsskey,
        .queue_num = j,
        .queue = queue,
    };

    struct rte_eth_dev_info dev_info;
    int ret = rte_eth_dev_info_get(port_id, &dev_info);
    if (ret != 0)
        rte_exit(EXIT_FAILURE, "Error during getting device (port %u) info: %s\n",
            port_id, strerror(-ret));

    struct rte_flow_item pattern[3];
    struct rte_flow_action action[2];
    struct rte_flow_item_tcp tcp_spec;
    struct rte_flow_item_tcp tcp_mask = {
        .hdr = {
            .src_port = RTE_BE16(0x0000),
            .dst_port = RTE_BE16(0xffff),
        },
    };
    struct rte_flow_error error;

    memset(pattern, 0, sizeof(pattern));
    memset(action, 0, sizeof(action));

    /* match any IPv4 packet */
    pattern[0].type = RTE_FLOW_ITEM_TYPE_IPV4;

    memset(&tcp_spec, 0, sizeof(struct rte_flow_item_tcp));
    tcp_spec.hdr.dst_port = rte_cpu_to_be_16(tcp_port);
    pattern[1].type = RTE_FLOW_ITEM_TYPE_TCP;
    pattern[1].spec = &tcp_spec;
    pattern[1].mask = &tcp_mask;

    /* end the pattern array */
    pattern[2].type = RTE_FLOW_ITEM_TYPE_END;

    /* create the action */
    action[0].type = RTE_FLOW_ACTION_TYPE_RSS;
    action[0].conf = &rss;
    action[1].type = RTE_FLOW_ACTION_TYPE_END;

    struct rte_flow *flow;
    /* validate and create the flow rule */
    if (!rte_flow_validate(port_id, &attr, pattern, action, &error)) {
        flow = rte_flow_create(port_id, &attr, pattern, action, &error);
        if (!flow) {
            return port_flow_complain(&error);
        }
    }

    memset(pattern, 0, sizeof(pattern));

    /* match any IPv4 packet */
    pattern[0].type = RTE_FLOW_ITEM_TYPE_IPV4;

    struct rte_flow_item_tcp tcp_src_mask = {
        .hdr = {
            .src_port = RTE_BE16(0xffff),
            .dst_port = RTE_BE16(0x0000),
        },
    };

    memset(&tcp_spec, 0, sizeof(struct rte_flow_item_tcp));
    tcp_spec.hdr.src_port = rte_cpu_to_be_16(tcp_port);
    pattern[1].type = RTE_FLOW_ITEM_TYPE_TCP;
    pattern[1].spec = &tcp_spec;
    pattern[1].mask = &tcp_src_mask;

    /* end the pattern array */
    pattern[2].type = RTE_FLOW_ITEM_TYPE_END;

    /* validate and create the flow rule */
    if (!rte_flow_validate(port_id, &attr, pattern, action, &error)) {
        flow = rte_flow_create(port_id, &attr, pattern, action, &error);
        if (!flow) {
            return port_flow_complain(&error);
        }
    }

    return 1;
}
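
/*
 * The two rules above steer ingress TCP traffic whose destination port OR
 * source port equals tcp_port, RSS-spreading the matches across all worker
 * queues with the same key the port itself uses, so flow isolation does not
 * break multi-queue scaling for the configured service port.
 */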

static int
init_flow(uint16_t port_id, uint16_t tcp_port)
{
    // struct ff_flow_cfg fcfg = ff_global_cfg.dpdk.flow_cfgs[0];

    // int i;
    // for (i = 0; i < fcfg.nb_port; i++) {
    //     if(!create_tcp_flow(fcfg.port_id, fcfg.tcp_ports[i])) {
    //         return 0;
    //     }
    // }

    if (!create_tcp_flow(port_id, tcp_port)) {
        rte_exit(EXIT_FAILURE, "create tcp flow failed\n");
        return -1;
    }

    /* ARP rule */
    struct rte_flow_attr attr = {.ingress = 1};
    struct rte_flow_action_queue queue = {.index = 0};

    struct rte_flow_item pattern_[2];
    struct rte_flow_action action[2];
    struct rte_flow_item_eth eth_type = {.type = RTE_BE16(0x0806)};
    struct rte_flow_item_eth eth_mask = {
        .type = RTE_BE16(0xffff)
    };

    memset(pattern_, 0, sizeof(pattern_));
    memset(action, 0, sizeof(action));

    pattern_[0].type = RTE_FLOW_ITEM_TYPE_ETH;
    pattern_[0].spec = &eth_type;
    pattern_[0].mask = &eth_mask;

    pattern_[1].type = RTE_FLOW_ITEM_TYPE_END;

    /* create the action */
    action[0].type = RTE_FLOW_ACTION_TYPE_QUEUE;
    action[0].conf = &queue;
    action[1].type = RTE_FLOW_ACTION_TYPE_END;

    struct rte_flow *flow;
    struct rte_flow_error error;
    /* validate and create the flow rule */
    if (!rte_flow_validate(port_id, &attr, pattern_, action, &error)) {
        flow = rte_flow_create(port_id, &attr, pattern_, action, &error);
        if (!flow) {
            return port_flow_complain(&error);
        }
    }

    return 1;
}

#endif

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    idle_sleep = ff_global_cfg.dpdk.idle_sleep;
    pkt_tx_delay = ff_global_cfg.dpdk.pkt_tx_delay > BURST_TX_DRAIN_US ?
        BURST_TX_DRAIN_US : ff_global_cfg.dpdk.pkt_tx_delay;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

#ifdef FF_KNI
    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }
#endif

#ifdef FF_USE_PAGE_ARRAY
    ff_mmap_init();
#endif

#ifdef FF_FLOW_ISOLATE
    // run once, in the primary process
    if (0 == lcore_conf.tx_queue_id[0]) {
        ret = port_flow_isolate(0, 1);
        if (ret < 0)
            rte_exit(EXIT_FAILURE, "init_port_isolate failed\n");
    }
#endif

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();
#ifdef FF_FLOW_ISOLATE
    // Example usage only: port_id=0, tcp_port=80.
    // Recommended improvements:
    // 1. init_flow should replace set_rss_table in the init_port_start loop,
    //    so that every port in port_id_list is covered instead of only port 0.
    // 2. use the config option tcp_port instead of the magic number 80.
    ret = init_flow(0, 80);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_flow failed\n");
    }
#endif
    return 0;
}
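
/*
 * Typical call sequence (sketch, assuming the usual F-Stack entry points
 * ff_init()/ff_run() from ff_api.h, which drive this file internally):
 *
 *   int loop(void *arg) { ... return 0; }
 *
 *   int main(int argc, char *argv[])
 *   {
 *       ff_init(argc, argv);   // parses the config and calls ff_dpdk_init()
 *       ff_run(loop, NULL);    // polls RX queues, msg rings and timers
 *   }
 */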

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    void *data = rte_pktmbuf_mtod(pkt, void*);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    if (pkt->ol_flags & PKT_RX_VLAN_STRIPPED) {
        ff_mbuf_set_vlan_info(hdr, pkt->vlan_tci);
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void*);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, pn, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    /* need a full Ethernet header to read ether_type */
    if (len < RTE_ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct rte_ether_hdr *hdr;
    const struct rte_vlan_hdr *vlanhdr;
    hdr = (const struct rte_ether_hdr *)data;
    uint16_t ether_type = rte_be_to_cpu_16(hdr->ether_type);
    data += RTE_ETHER_HDR_LEN;
    len -= RTE_ETHER_HDR_LEN;

    if (ether_type == RTE_ETHER_TYPE_VLAN) {
        vlanhdr = (struct rte_vlan_hdr *)data;
        ether_type = rte_be_to_cpu_16(vlanhdr->eth_proto);
        data += sizeof(struct rte_vlan_hdr);
        len -= sizeof(struct rte_vlan_hdr);
    }

    if (ether_type == RTE_ETHER_TYPE_ARP)
        return FILTER_ARP;

#ifdef INET6
    if (ether_type == RTE_ETHER_TYPE_IPV6) {
        return ff_kni_proto_filter(data,
            len, ether_type);
    }
#endif

#ifndef FF_KNI
    return FILTER_UNKNOWN;
#else
    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ether_type != RTE_ETHER_TYPE_IPV4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data,
        len, ether_type);
#endif
}

static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}

/* adapted from rte_pktmbuf_clone(), but copies the payload instead of
 * attaching indirect segments */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of a new segment failed */
    if (unlikely(mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}
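
/*
 * A deep copy is needed here because the same received packet may be handed
 * to several consumers at once (other lcores' dispatch rings, KNI, and the
 * local stack); an indirect clone would share one data buffer across
 * processes and complicate its free semantics.
 */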

static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(ff_global_cfg.pcap.enable)) {
            if (!pkts_from_ring) {
                ff_dump_packets(ff_global_cfg.pcap.save_path, rtem,
                    ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void*);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring) {
            ff_traffic.rx_packets += rtem->nb_segs;
            ff_traffic.rx_bytes += rte_pktmbuf_pkt_len(rtem);
        }

        if (!pkts_from_ring && packet_dispatcher) {
            uint64_t cur_tsc = rte_rdtsc();
            int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues);
            usr_cb_tsc += rte_rdtsc() - cur_tsc;
            if (ret == FF_DISPATCH_RESPONSE) {
                rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len;

                /*
                 * VLAN TX insertion offload is not supported yet, so
                 * re-insert the VLAN header in software before sending.
                 */
                if (rtem->vlan_tci) {
                    data = rte_pktmbuf_prepend(rtem, sizeof(struct rte_vlan_hdr));
                    if (data != NULL) {
                        memmove(data, data + sizeof(struct rte_vlan_hdr), RTE_ETHER_HDR_LEN);
                        struct rte_ether_hdr *etherhdr = (struct rte_ether_hdr *)data;
                        struct rte_vlan_hdr *vlanhdr = (struct rte_vlan_hdr *)(data + RTE_ETHER_HDR_LEN);
                        vlanhdr->vlan_tci = rte_cpu_to_be_16(rtem->vlan_tci);
                        vlanhdr->eth_proto = etherhdr->ether_type;
                        etherhdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN);
                    }
                }
                send_single_packet(rtem, port_id);
                continue;
            }

            if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
#ifdef INET6
        if (filter == FILTER_ARP || filter == FILTER_NDP) {
#else
        if (filter == FILTER_ARP) {
#endif
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni) {
            if (knictl_action == FF_KNICTL_ACTION_ALL_TO_KNI) {
                ff_kni_enqueue(port_id, rtem);
            } else if (knictl_action == FF_KNICTL_ACTION_ALL_TO_FF) {
                ff_veth_input(ctx, rtem);
            } else if (knictl_action == FF_KNICTL_ACTION_DEFAULT) {
                if (enable_kni &&
                        ((filter == FILTER_KNI && kni_accept) ||
                        (filter == FILTER_UNKNOWN && !kni_accept))) {
                        ff_kni_enqueue(port_id, rtem);
                } else {
                    ff_veth_input(ctx, rtem);
                }
            } else {
                ff_veth_input(ctx, rtem);
            }
#endif
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}
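
/*
 * Sketch of a user packet dispatcher for the hook above (illustrative only;
 * "my_dispatcher" is hypothetical). The return value semantics follow the
 * dispatch logic in process_packets(): FF_DISPATCH_RESPONSE sends the
 * possibly rewritten frame back out, FF_DISPATCH_ERROR drops it, and a
 * value in [0, nb_queues) steers it to that queue's process.
 */
#if 0
static int
my_dispatcher(void *data, uint16_t *len, uint16_t queue_id, uint16_t nb_queues)
{
    /* drop frames too short to carry an Ethernet header */
    if (*len < RTE_ETHER_HDR_LEN)
        return FF_DISPATCH_ERROR;

    /* keep everything on the queue that received it */
    (void)data;
    (void)nb_queues;
    return queue_id;
}
/* registered via ff_regist_packet_dispatcher(my_dispatcher), see ff_api.h */
#endif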

static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* dequeue packets steered to this queue by other processes */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return nb_rb;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
#ifdef INET6
    if (msg->msg_type == FF_IOCTL6) {
        fd = ff_socket(AF_INET6, SOCK_DGRAM, 0);
    } else
#endif
        fd = ff_socket(AF_INET, SOCK_DGRAM, 0);

    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_top_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_traffic_msg(struct ff_msg *msg)
{
    msg->traffic = ff_traffic;
    msg->result = 0;
}

#ifdef FF_KNI
static inline void
handle_knictl_msg(struct ff_msg *msg)
{
    if (msg->knictl.kni_cmd == FF_KNICTL_CMD_SET) {
        switch (msg->knictl.kni_action) {
            case FF_KNICTL_ACTION_ALL_TO_FF:
                knictl_action = FF_KNICTL_ACTION_ALL_TO_FF;
                msg->result = 0;
                printf("new kni action: alltoff\n");
                break;
            case FF_KNICTL_ACTION_ALL_TO_KNI:
                knictl_action = FF_KNICTL_ACTION_ALL_TO_KNI;
                msg->result = 0;
                printf("new kni action: alltokni\n");
                break;
            case FF_KNICTL_ACTION_DEFAULT:
                knictl_action = FF_KNICTL_ACTION_DEFAULT;
                msg->result = 0;
                printf("new kni action: default\n");
                break;
            default:
                msg->result = -1;
        }
    } else if (msg->knictl.kni_cmd == FF_KNICTL_CMD_GET) {
        msg->knictl.kni_action = knictl_action;
        /* the original left msg->result untouched here; report success */
        msg->result = 0;
    } else {
        msg->result = -2;
    }
}
#endif

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
#ifdef INET6
        case FF_IOCTL6:
#endif
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        case FF_TRAFFIC:
            handle_traffic_msg(msg);
            break;
#ifdef FF_KNI
        case FF_KNICTL:
            handle_knictl_msg(msg);
            break;
#endif
        default:
            handle_default_msg(msg);
            break;
    }
    if (rte_ring_enqueue(msg_ring[proc_id].ring[msg->msg_type], msg) < 0) {
        if (msg->original_buf) {
            rte_free(msg->buf_addr);
            msg->buf_addr = msg->original_buf;
            msg->buf_len = msg->original_buf_len;
            msg->original_buf = NULL;
        }

        rte_mempool_put(message_pool, msg);
    }
}
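
/*
 * Replies are enqueued on the per-type out ring ring[msg->msg_type],
 * matching the rings created in init_msg_ring(); if the caller's ring is
 * full, the message is recycled back to the pool (restoring any original
 * buffer first) instead of leaking.
 */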
1607 
1608 static inline int
1609 process_msg_ring(uint16_t proc_id, struct rte_mbuf **pkts_burst)
1610 {
1611     /* dequeue messages from the ring buffer and process them */
1612     uint16_t nb_rb;
1613     int i;
1614 
1615     nb_rb = rte_ring_dequeue_burst(msg_ring[proc_id].ring[0],
1616         (void **)pkts_burst, MAX_PKT_BURST, NULL);
1617 
1618     if (likely(nb_rb == 0))
1619         return 0;
1620 
1621     for (i = 0; i < nb_rb; ++i) {
1622         handle_msg((struct ff_msg *)pkts_burst[i], proc_id);
1623     }
1624 
1625     return 0;
1626 }
1627 
1628 /* Send burst of packets on an output interface */
1629 static inline int
1630 send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
1631 {
1632     struct rte_mbuf **m_table;
1633     int ret;
1634     uint16_t queueid;
1635 
1636     queueid = qconf->tx_queue_id[port];
1637     m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;
1638 
1639     if (unlikely(ff_global_cfg.pcap.enable)) {
1640         uint16_t i;
1641         for (i = 0; i < n; i++) {
1642             ff_dump_packets(ff_global_cfg.pcap.save_path, m_table[i],
1643                 ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
1644         }
1645     }
1646 
1647     ret = rte_eth_tx_burst(port, queueid, m_table, n);
1648     ff_traffic.tx_packets += ret;
1649     uint16_t i;
1650     for (i = 0; i < ret; i++) {
1651         ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]);
1652 #ifdef FF_USE_PAGE_ARRAY
1653         if (qconf->tx_mbufs[port].bsd_m_table[i])
1654             ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs);
1655 #endif
1656     }
1657     if (unlikely(ret < n)) {
1658         do {
1659             rte_pktmbuf_free(m_table[ret]);
1660 #ifdef FF_USE_PAGE_ARRAY
1661             if (qconf->tx_mbufs[port].bsd_m_table[ret])
1662                 ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]);
1663 #endif
1664         } while (++ret < n);
1665     }
1666     return 0;
1667 }
1668 
1669 /* Enqueue a single packet, and send burst if queue is filled */
1670 static inline int
1671 send_single_packet(struct rte_mbuf *m, uint8_t port)
1672 {
1673     uint16_t len;
1674     struct lcore_conf *qconf;
1675 
1676     qconf = &lcore_conf;
1677     len = qconf->tx_mbufs[port].len;
1678     qconf->tx_mbufs[port].m_table[len] = m;
1679     len++;
1680 
1681     /* enough pkts to be sent */
1682     if (unlikely(len == MAX_PKT_BURST)) {
1683         send_burst(qconf, MAX_PKT_BURST, port);
1684         len = 0;
1685     }
1686 
1687     qconf->tx_mbufs[port].len = len;
1688     return 0;
1689 }
1690 
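     /*
      * Transmit a BSD mbuf chain: with FF_USE_PAGE_ARRAY the chain is handed
      * to ff_if_send_onepkt, otherwise its payload is copied into a chain of
      * freshly allocated rte_mbuf segments, the checksum/TSO requests are
      * translated into ol_flags, and the packet is queued for the next TX
      * burst. In the copy path the BSD mbuf is freed once its data is copied.
      */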
1691 int
1692 ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
1693     int total)
1694 {
1695 #ifdef FF_USE_PAGE_ARRAY
1696     struct lcore_conf *qconf = &lcore_conf;
1697     int len = 0;
1698 
1699     len = ff_if_send_onepkt(ctx, m, total);
1700     if (unlikely(len == MAX_PKT_BURST)) {
1701         send_burst(qconf, MAX_PKT_BURST, ctx->port_id);
1702         len = 0;
1703     }
1704     qconf->tx_mbufs[ctx->port_id].len = len;
1705     return 0;
1706 #endif
1707     struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
1708     struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
1709     if (head == NULL) {
1710         ff_mbuf_free(m);
1711         return -1;
1712     }
1713 
1714     head->pkt_len = total;
1715     head->nb_segs = 0;
1716 
1717     int off = 0;
1718     struct rte_mbuf *cur = head, *prev = NULL;
1719     while (total > 0) {
1720         if (cur == NULL) {
1721             cur = rte_pktmbuf_alloc(mbuf_pool);
1722             if (cur == NULL) {
1723                 rte_pktmbuf_free(head);
1724                 ff_mbuf_free(m);
1725                 return -1;
1726             }
1727         }
1728 
1729         if (prev != NULL) {
1730             prev->next = cur;
1731         }
1732         head->nb_segs++;
1733 
1734         prev = cur;
1735         void *data = rte_pktmbuf_mtod(cur, void*);
1736         int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
1737         int ret = ff_mbuf_copydata(m, data, off, len);
1738         if (ret < 0) {
1739             rte_pktmbuf_free(head);
1740             ff_mbuf_free(m);
1741             return -1;
1742         }
1743 
1745         cur->data_len = len;
1746         off += len;
1747         total -= len;
1748         cur = NULL;
1749     }
1750 
1751     struct ff_tx_offload offload = {0};
1752     ff_mbuf_tx_offload(m, &offload);
1753 
1754     void *data = rte_pktmbuf_mtod(head, void*);
1755 
1756     if (offload.ip_csum) {
1757         /* IPv6 checksum offload is not supported yet */
1758         struct rte_ipv4_hdr *iph;
1759         int iph_len;
1760         iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN);
1761         iph_len = (iph->version_ihl & 0x0f) << 2;
1762 
1763         head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
1764         head->l2_len = RTE_ETHER_HDR_LEN;
1765         head->l3_len = iph_len;
1766     }
1767 
1768     if (ctx->hw_features.tx_csum_l4) {
1769         struct rte_ipv4_hdr *iph;
1770         int iph_len;
1771         iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN);
1772         iph_len = (iph->version_ihl & 0x0f) << 2;
1773 
1774         if (offload.tcp_csum) {
1775             head->ol_flags |= PKT_TX_TCP_CKSUM;
1776             head->l2_len = RTE_ETHER_HDR_LEN;
1777             head->l3_len = iph_len;
1778         }
1779 
1780         /*
1781          *  TCP segmentation offload.
1782          *
1783          *  - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
1784          *    implies PKT_TX_TCP_CKSUM)
1785          *  - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
1786          *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
1787          *    write the IP checksum to 0 in the packet
1788          *  - fill the mbuf offload information: l2_len,
1789          *    l3_len, l4_len, tso_segsz
1790          *  - calculate the pseudo header checksum without taking ip_len
1791          *    in account, and set it in the TCP header. Refer to
1792          *    rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
1793          *    used as helpers.
1794          */
1795         if (offload.tso_seg_size) {
1796             struct rte_tcp_hdr *tcph;
1797             int tcph_len;
1798             tcph = (struct rte_tcp_hdr *)((char *)iph + iph_len);
1799             tcph_len = (tcph->data_off & 0xf0) >> 2;
1800             tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);
1801 
1802             head->ol_flags |= PKT_TX_TCP_SEG;
1803             head->l4_len = tcph_len;
1804             head->tso_segsz = offload.tso_seg_size;
1805         }
1806 
1807         if (offload.udp_csum) {
1808             head->ol_flags |= PKT_TX_UDP_CKSUM;
1809             head->l2_len = RTE_ETHER_HDR_LEN;
1810             head->l3_len = iph_len;
1811         }
1812     }
1813 
1814     ff_mbuf_free(m);
1815 
1816     return send_single_packet(head, ctx->port_id);
1817 }
1818 
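     /*
      * Per-lcore event loop: manage the FreeBSD clock timer, drain the TX
      * burst queues, poll the dispatch ring and RX queues, service the
      * control-message ring, invoke the user loop callback, and optionally
      * usleep() when idle, accounting usr/sys/idle cycles for ff_top.
      */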
1819 static int
1820 main_loop(void *arg)
1821 {
1822     struct loop_routine *lr = (struct loop_routine *)arg;
1823 
1824     struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1825     uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc;
1826     int i, j, nb_rx, idle;
1827     uint16_t port_id, queue_id;
1828     struct lcore_conf *qconf;
1829     uint64_t drain_tsc = 0;
1830     struct ff_dpdk_if_context *ctx;
1831 
1832     if (pkt_tx_delay) {
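             /* convert the configured TX delay (us) into TSC ticks, rounding up */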
1833         drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * pkt_tx_delay;
1834     }
1835 
1836     prev_tsc = 0;
1837     usch_tsc = 0;
1838 
1839     qconf = &lcore_conf;
1840 
1841     while (1) {
1842         cur_tsc = rte_rdtsc();
1843         if (unlikely(freebsd_clock.expire < cur_tsc)) {
1844             rte_timer_manage();
1845         }
1846 
1847         idle = 1;
1848         sys_tsc = 0;
1849         usr_tsc = 0;
1850         usr_cb_tsc = 0;
1851 
1852         /*
1853          * TX burst queue drain
1854          */
1855         diff_tsc = cur_tsc - prev_tsc;
1856         if (unlikely(diff_tsc >= drain_tsc)) {
1857             for (i = 0; i < qconf->nb_tx_port; i++) {
1858                 port_id = qconf->tx_port_id[i];
1859                 if (qconf->tx_mbufs[port_id].len == 0)
1860                     continue;
1861 
1862                 idle = 0;
1863 
1864                 send_burst(qconf,
1865                     qconf->tx_mbufs[port_id].len,
1866                     port_id);
1867                 qconf->tx_mbufs[port_id].len = 0;
1868             }
1869 
1870             prev_tsc = cur_tsc;
1871         }
1872 
1873         /*
1874          * Read packet from RX queues
1875          */
1876         for (i = 0; i < qconf->nb_rx_queue; ++i) {
1877             port_id = qconf->rx_queue_list[i].port_id;
1878             queue_id = qconf->rx_queue_list[i].queue_id;
1879             ctx = veth_ctx[port_id];
1880 
1881 #ifdef FF_KNI
1882             if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
1883                 ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
1884             }
1885 #endif
1886 
1887             idle &= !process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);
1888 
1889             nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
1890                 MAX_PKT_BURST);
1891             if (nb_rx == 0)
1892                 continue;
1893 
1894             idle = 0;
1895 
1896             /* Prefetch first packets */
1897             for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
1898                 rte_prefetch0(rte_pktmbuf_mtod(
1899                         pkts_burst[j], void *));
1900             }
1901 
1902             /* Prefetch and handle already prefetched packets */
1903             for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
1904                 rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
1905                         j + PREFETCH_OFFSET], void *));
1906                 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
1907             }
1908 
1909             /* Handle remaining prefetched packets */
1910             for (; j < nb_rx; j++) {
1911                 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
1912             }
1913         }
1914 
1915         process_msg_ring(qconf->proc_id, pkts_burst);
1916 
1917         div_tsc = rte_rdtsc();
1918 
1919         if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc >= drain_tsc))) {
1920             usch_tsc = cur_tsc;
1921             lr->loop(lr->arg);
1922         }
1923 
1924         idle_sleep_tsc = rte_rdtsc();
1925         if (likely(idle && idle_sleep)) {
1926             usleep(idle_sleep);
1927             end_tsc = rte_rdtsc();
1928         } else {
1929             end_tsc = idle_sleep_tsc;
1930         }
1931 
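         /*
          * Cycle accounting for ff_top: the user loop callback counts as
          * usr time, packet/message processing as sys time, and whatever
          * remains of this iteration as idle time.
          */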
1932         usr_tsc = usr_cb_tsc;
1933         if (usch_tsc == cur_tsc) {
1934             usr_tsc += idle_sleep_tsc - div_tsc;
1935         }
1936 
1937         if (!idle) {
1938             sys_tsc = div_tsc - cur_tsc - usr_cb_tsc;
1939             ff_top_status.sys_tsc += sys_tsc;
1940         }
1941 
1942         ff_top_status.usr_tsc += usr_tsc;
1943         ff_top_status.work_tsc += end_tsc - cur_tsc;
1944         ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;
1945 
1946         ff_top_status.loops++;
1947     }
1948 
1949     return 0;
1950 }
1951 
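     /* Bring the interfaces up: attach an ff_veth context to every TX port. */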
1952 int
1953 ff_dpdk_if_up(void) {
1954     int i;
1955     struct lcore_conf *qconf = &lcore_conf;
1956     for (i = 0; i < qconf->nb_tx_port; i++) {
1957         uint16_t port_id = qconf->tx_port_id[i];
1958 
1959         struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
1960         veth_ctx[port_id] = ff_veth_attach(pconf);
1961         if (veth_ctx[port_id] == NULL) {
1962             rte_exit(EXIT_FAILURE, "ff_veth_attach failed");
1963         }
1964     }
1965 
1966     return 0;
1967 }
1968 
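     /*
      * Launch main_loop on every configured lcore and block until all of
      * them return. Minimal usage sketch (my_loop is a hypothetical
      * application callback, shown only for illustration):
      *
      *     static int my_loop(void *arg) { return 0; }
      *     ...
      *     ff_dpdk_run(my_loop, NULL);
      */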
1969 void
1970 ff_dpdk_run(loop_func_t loop, void *arg) {
1971     struct loop_routine *lr = rte_malloc(NULL,
1972         sizeof(struct loop_routine), 0);
1973     lr->loop = loop;
1974     lr->arg = arg;
1975     rte_eal_mp_remote_launch(main_loop, lr, CALL_MAIN);
1976     rte_eal_mp_wait_lcore();
1977     rte_free(lr);
1978 }
1979 
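     /* Free a single rte_mbuf segment previously handed to the stack. */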
1980 void
1981 ff_dpdk_pktmbuf_free(void *m)
1982 {
1983     rte_pktmbuf_free_seg((struct rte_mbuf *)m);
1984 }
1985 
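     /*
      * Software Toeplitz hash over `data` using the RSS `key`, mirroring
      * the hash a NIC computes for RSS queue selection (this routine
      * follows the FreeBSD implementation).
      */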
1986 static uint32_t
1987 toeplitz_hash(unsigned keylen, const uint8_t *key,
1988     unsigned datalen, const uint8_t *data)
1989 {
1990     uint32_t hash = 0, v;
1991     u_int i, b;
1992 
1993     /* XXXRW: Perhaps an assertion about key length vs. data length? */
1994 
1995     v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
1996     for (i = 0; i < datalen; i++) {
1997         for (b = 0; b < 8; b++) {
1998             if (data[i] & (1<<(7-b)))
1999                 hash ^= v;
2000             v <<= 1;
2001             if ((i + 4) < keylen &&
2002                 (key[i+4] & (1<<(7-b))))
2003                 v |= 1;
2004         }
2005     }
2006     return (hash);
2007 }
2008 
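     /*
      * Let the registered callback pick a local address for an outgoing
      * connection; FreeBSD's AF_INET6 value is translated to the Linux
      * value before the call.
      */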
2009 int
2010 ff_in_pcbladdr(uint16_t family, void *faddr, uint16_t fport, void *laddr)
2011 {
2012     int ret = 0;
2013     uint16_t fa;
2014 
2015     if (!pcblddr_fun)
2016         return ret;
2017 
2018     if (family == AF_INET)
2019         fa = AF_INET;
2020     else if (family == AF_INET6_FREEBSD)
2021         fa = AF_INET6_LINUX;
2022     else
2023         return EADDRNOTAVAIL;
2024 
2025     ret = (*pcblddr_fun)(fa, faddr, fport, laddr);
2026 
2027     return ret;
2028 }
2029 
2030 void
2031 ff_regist_pcblddr_fun(pcblddr_func_t func)
2032 {
2033     pcblddr_fun = func;
2034 }
2035 
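     /*
      * Predict RSS steering for a 4-tuple: hash it with the configured
      * Toeplitz key and return non-zero when the flow would land on this
      * lcore's own queue, so the stack can choose ports whose return
      * traffic arrives locally.
      */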
2036 int
2037 ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
2038     uint16_t sport, uint16_t dport)
2039 {
2040     struct lcore_conf *qconf = &lcore_conf;
2041     struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
2042     uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];
2043 
2044     if (nb_queues <= 1) {
2045         return 1;
2046     }
2047 
2048     uint16_t reta_size = rss_reta_size[ctx->port_id];
2049     uint16_t queueid = qconf->tx_queue_id[ctx->port_id];
2050 
2051     uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
2052         sizeof(dport)];
2053 
2054     unsigned datalen = 0;
2055 
2056     bcopy(&saddr, &data[datalen], sizeof(saddr));
2057     datalen += sizeof(saddr);
2058 
2059     bcopy(&daddr, &data[datalen], sizeof(daddr));
2060     datalen += sizeof(daddr);
2061 
2062     bcopy(&sport, &data[datalen], sizeof(sport));
2063     datalen += sizeof(sport);
2064 
2065     bcopy(&dport, &data[datalen], sizeof(dport));
2066     datalen += sizeof(dport);
2067 
2068     uint32_t hash = 0;
2069     hash = toeplitz_hash(rsskey_len, rsskey, datalen, data);
2070 
2071     return ((hash & (reta_size - 1)) % nb_queues) == queueid;
2072 }
2073 
2074 void
2075 ff_regist_packet_dispatcher(dispatch_func_t func)
2076 {
2077     packet_dispatcher = func;
2078 }
2079 
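     /* Current TSC reading converted to nanoseconds. */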
2080 uint64_t
2081 ff_get_tsc_ns(void)
2082 {
2083     uint64_t cur_tsc = rte_rdtsc();
2084     uint64_t hz = rte_get_tsc_hz();
2085     return ((double)cur_tsc/(double)hz) * NS_PER_S;
2086 }
2087 
2088