xref: /f-stack/lib/ff_dpdk_if.c (revision 73cd51a2)
/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_eth_bond.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"
#include "ff_memory.h"

#ifdef FF_KNI
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

int enable_kni;
static int kni_accept;
static int knictl_action = FF_KNICTL_ACTION_DEFAULT;
#endif

static int numa_on;

static unsigned idle_sleep;
static unsigned pkt_tx_delay;

static struct rte_timer freebsd_clock;

// Default RSS key from the Mellanox Linux driver
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

static uint8_t default_rsskey_52bytes[52] = {
    0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23,
    0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30,
    0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02,
    0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c,
    0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55,
    0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e,
    0x81, 0x15, 0x03, 0x66
};

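/*
 * Symmetric RSS key: the repeated 0x6d5a byte pattern makes the Toeplitz
 * hash produce the same value for a flow and its reverse (src/dst swapped),
 * so both directions of a connection land on the same queue.
 */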
static uint8_t symmetric_rsskey[52] = {
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a
};

static int rsskey_len = sizeof(default_rsskey_40bytes);
static uint8_t *rsskey = default_rsskey_40bytes;

struct lcore_conf lcore_conf;

struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

#define BOND_DRIVER_NAME    "net_bonding"

static inline int send_single_packet(struct rte_mbuf *m, uint8_t port);

struct ff_msg_ring {
    char ring_name[FF_MSG_NUM][RTE_RING_NAMESIZE];
    /* ring[0]: lcore receives msgs that other processes send */
    /* ring[1..]: lcore sends replies that other processes read */
    struct rte_ring *ring[FF_MSG_NUM];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;
static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

static struct ff_top_args ff_top_status;
static struct ff_traffic_args ff_traffic;
extern void ff_hardclock(void);

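/*
 * rte_timer callback that drives the FreeBSD stack's periodic clock tick
 * and refreshes f-stack's cached current timestamp.
 */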
static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg)
{
    ff_hardclock();
    ff_update_current_ts();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
    #define CHECK_INTERVAL 100 /* 100ms */
    #define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u "
                        "Mbps - %s\n", (int)portid,
                        (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

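/*
 * Map this process to its lcore and record which port/queue pairs it will
 * poll, based on the port and lcore lists in ff_global_cfg.
 */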
static int
init_lcore_conf(void)
{
    uint8_t nb_dev_ports = rte_eth_dev_count_avail();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
                 ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    if (!rte_lcore_is_enabled(lcore_id)) {
        rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
    }

    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %d\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        /* Enable pcap dump */
        if (ff_global_cfg.pcap.enable) {
            ff_enable_pcap(ff_global_cfg.pcap.save_path, ff_global_cfg.pcap.snap_len);
        }

        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}

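/*
 * Create (primary process) or look up (secondary process) the per-socket
 * mbuf pools; the pool size is estimated from queue, burst, and cache sizes.
 */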
static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;
    uint16_t max_portid = ff_global_cfg.dpdk.max_portid;

    unsigned nb_mbuf = RTE_ALIGN_CEIL(
        (nb_rx_queue * (max_portid + 1) * 2 * RX_QUEUE_SIZE +
        nb_ports * (max_portid + 1) * 2 * nb_lcores * MAX_PKT_BURST +
        nb_ports * (max_portid + 1) * 2 * nb_tx_queue * TX_QUEUE_SIZE +
        nb_lcores * MEMPOOL_CACHE_SIZE +
#ifdef FF_KNI
        nb_ports * KNI_MBUF_MAX +
        nb_ports * KNI_QUEUE_SIZE +
#endif
        nb_lcores * nb_ports * DISPATCH_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }

#ifdef FF_USE_PAGE_ARRAY
        nb_mbuf = RTE_ALIGN_CEIL(
            nb_ports * nb_lcores * MAX_PKT_BURST +
            nb_ports * nb_tx_queue * TX_QUEUE_SIZE +
            nb_lcores * MEMPOOL_CACHE_SIZE,
            (unsigned)4096);
        ff_init_ref_pool(nb_mbuf, socketid);
#endif
    }

    return 0;
}

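/* Create a ring in the primary process, or look it up by name in a secondary. */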
static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL) {
        rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ring = rte_ring_create(name, count, socket_id, flags);
    } else {
        ring = rte_ring_lookup(name);
    }

    if (ring == NULL) {
        rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
    }

    return ring;
}

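/*
 * Create one software ring per port/queue pair. Packets that the dispatcher
 * assigns to another queue are passed between processes through these rings.
 */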
static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create rings according to the ports actually being used. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("created ring %s, %u ring entries are now free\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

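/* Mempool constructor: lay out each ff_msg with its payload buffer placed right after the header. */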
static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
    msg->original_buf = NULL;
    msg->original_buf_len = 0;
}

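/*
 * Create the message pool and the per-process request/reply rings used to
 * exchange control messages (sysctl, ioctl, route, ...) with external tools.
 */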
static int
init_msg_ring(void)
{
    uint16_t i, j;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
           MSG_RING_SIZE * 2 * nb_procs,
           MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
           NULL, NULL, ff_msg_init, NULL,
           socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[0]);

        for (j = FF_SYSCTL; j < FF_MSG_NUM; j++) {
            snprintf(msg_ring[i].ring_name[j], RTE_RING_NAMESIZE,
                "%s%u_%u", FF_MSG_RING_OUT, i, j);
            msg_ring[i].ring[j] = create_ring(msg_ring[i].ring_name[j],
                MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
            if (msg_ring[i].ring[j] == NULL)
                rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[j]);
        }
    }

    return 0;
}

#ifdef FF_KNI

static enum FF_KNICTL_CMD
get_kni_action(const char *c)
{
    if (!c)
        return FF_KNICTL_ACTION_DEFAULT;
    if (0 == strcasecmp(c, "alltokni")) {
        return FF_KNICTL_ACTION_ALL_TO_KNI;
    } else if (0 == strcasecmp(c, "alltoff")) {
        return FF_KNICTL_ACTION_ALL_TO_FF;
    } else {
        return FF_KNICTL_ACTION_DEFAULT;
    }
}

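/* Initialize the KNI subsystem and allocate one KNI interface per configured port. */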
static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count_avail();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    knictl_action = get_kni_action(ff_global_cfg.kni.kni_action);

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
#endif

// RSS RETA updates will fail when flow isolation is enabled.
#ifndef FF_FLOW_ISOLATE
static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}
#endif

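/*
 * Configure and start every port (including bonding slaves): query device
 * capabilities, set RSS and offloads, and set up the RX/TX queues.
 * Device configuration is only performed in the primary process.
 */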
static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i, j;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id, u_port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[u_port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        for (j = 0; j <= pconf->nb_slaves; j++) {
            if (j < pconf->nb_slaves) {
                port_id = pconf->slave_portid_list[j];
                printf("To init %s's %d-th slave port[%d]\n",
                        ff_global_cfg.dpdk.bond_cfgs->name,
                        j, port_id);
            } else {
                port_id = u_port_id;
            }

            struct rte_eth_dev_info dev_info;
            struct rte_eth_conf port_conf = {0};
            struct rte_eth_rxconf rxq_conf;
            struct rte_eth_txconf txq_conf;

            int ret = rte_eth_dev_info_get(port_id, &dev_info);
            if (ret != 0)
                rte_exit(EXIT_FAILURE,
                    "Error during getting device (port %u) info: %s\n",
                    port_id, strerror(-ret));

            if (nb_queues > dev_info.max_rx_queues) {
                rte_exit(EXIT_FAILURE, "num_procs[%d] greater than max_rx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_rx_queues);
            }

            if (nb_queues > dev_info.max_tx_queues) {
                rte_exit(EXIT_FAILURE, "num_procs[%d] greater than max_tx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_tx_queues);
            }

            struct rte_ether_addr addr;
            rte_eth_macaddr_get(port_id, &addr);
            printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                       " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                    (unsigned)port_id,
                    addr.addr_bytes[0], addr.addr_bytes[1],
                    addr.addr_bytes[2], addr.addr_bytes[3],
                    addr.addr_bytes[4], addr.addr_bytes[5]);

            rte_memcpy(pconf->mac,
                addr.addr_bytes, RTE_ETHER_ADDR_LEN);

            /* Set RSS mode */
            uint64_t default_rss_hf = ETH_RSS_PROTO_MASK;
            port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
            port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf;
            if (dev_info.hash_key_size == 52) {
                rsskey = default_rsskey_52bytes;
                rsskey_len = 52;
            }
            if (ff_global_cfg.dpdk.symmetric_rss) {
                printf("Use symmetric Receive-side Scaling (RSS) key\n");
                rsskey = symmetric_rsskey;
            }
            port_conf.rx_adv_conf.rss_conf.rss_key = rsskey;
            port_conf.rx_adv_conf.rss_conf.rss_key_len = rsskey_len;
            port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads;
            if (port_conf.rx_adv_conf.rss_conf.rss_hf !=
                    ETH_RSS_PROTO_MASK) {
                printf("Port %u modified RSS hash function based on hardware support, "
                        "requested:%#"PRIx64" configured:%#"PRIx64"\n",
                        port_id, default_rss_hf,
                        port_conf.rx_adv_conf.rss_conf.rss_hf);
            }

            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) {
                port_conf.txmode.offloads |=
                    DEV_TX_OFFLOAD_MBUF_FAST_FREE;
            }

            /* Set Rx VLAN stripping */
            if (ff_global_cfg.dpdk.vlan_strip) {
                if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                    port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
                }
            }

            /* Enable HW CRC stripping */
            port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_KEEP_CRC;

            /* FIXME: Enable TCP LRO? */
            #if 0
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
                printf("LRO is supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
                pconf->hw_features.rx_lro = 1;
            }
            #endif

            /* Set Rx checksum checking */
            if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
                printf("RX checksum offload supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
                pconf->hw_features.rx_csum = 1;
            }

            if (ff_global_cfg.dpdk.tx_csum_offoad_skip == 0) {
                if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
                    printf("TX ip checksum offload supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
                    pconf->hw_features.tx_csum_ip = 1;
                }

                if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
                    (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
                    printf("TX TCP&UDP checksum offload supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM;
                    pconf->hw_features.tx_csum_l4 = 1;
                }
            } else {
                printf("TX checksum offload is disabled\n");
            }

            if (ff_global_cfg.dpdk.tso) {
                if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                    printf("TSO is supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
                    pconf->hw_features.tx_tso = 1;
                }
            } else {
                printf("TSO is disabled\n");
            }

            if (dev_info.reta_size) {
                /* reta size must be a power of 2 */
                assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

                rss_reta_size[port_id] = dev_info.reta_size;
                printf("port[%d]: rss table size: %d\n", port_id,
                    dev_info.reta_size);
            }

            if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
                continue;
            }

            ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
            if (ret != 0) {
                return ret;
            }

            static uint16_t nb_rxd = RX_QUEUE_SIZE;
            static uint16_t nb_txd = TX_QUEUE_SIZE;
            ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
            if (ret < 0)
                printf("Could not adjust number of descriptors "
                        "for port%u (%d)\n", (unsigned)port_id, ret);

            uint16_t q;
            for (q = 0; q < nb_queues; q++) {
                if (numa_on) {
                    uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                    socketid = rte_lcore_to_socket_id(lcore_id);
                }
                mbuf_pool = pktmbuf_pool[socketid];

                txq_conf = dev_info.default_txconf;
                txq_conf.offloads = port_conf.txmode.offloads;
                ret = rte_eth_tx_queue_setup(port_id, q, nb_txd,
                    socketid, &txq_conf);
                if (ret < 0) {
                    return ret;
                }

                rxq_conf = dev_info.default_rxconf;
                rxq_conf.offloads = port_conf.rxmode.offloads;
                ret = rte_eth_rx_queue_setup(port_id, q, nb_rxd,
                    socketid, &rxq_conf, mbuf_pool);
                if (ret < 0) {
                    return ret;
                }
            }

            if (strncmp(dev_info.driver_name, BOND_DRIVER_NAME,
                    strlen(dev_info.driver_name)) == 0) {

                rte_eth_macaddr_get(port_id, &addr);
                printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                           " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                        (unsigned)port_id,
                        addr.addr_bytes[0], addr.addr_bytes[1],
                        addr.addr_bytes[2], addr.addr_bytes[3],
                        addr.addr_bytes[4], addr.addr_bytes[5]);

                rte_memcpy(pconf->mac,
                    addr.addr_bytes, RTE_ETHER_ADDR_LEN);

                int mode, count, x;
                uint16_t slaves[RTE_MAX_ETHPORTS], len = RTE_MAX_ETHPORTS;

                mode = rte_eth_bond_mode_get(port_id);
                printf("Port %u, bond mode:%d\n", port_id, mode);

                count = rte_eth_bond_slaves_get(port_id, slaves, len);
                printf("Port %u, %s's slave ports count:%d\n", port_id,
                            ff_global_cfg.dpdk.bond_cfgs->name, count);
                for (x = 0; x < count; x++) {
                    printf("Port %u, %s's slave port[%u]\n", port_id,
                            ff_global_cfg.dpdk.bond_cfgs->name, slaves[x]);
                }
            }

            ret = rte_eth_dev_start(port_id);
            if (ret < 0) {
                return ret;
            }
    // RSS RETA updates will fail when flow isolation is enabled.
    #ifndef FF_FLOW_ISOLATE
            if (nb_queues > 1) {
                /* set HW rss hash function to Toeplitz. */
                if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                    struct rte_eth_hash_filter_info info = {0};
                    info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                    info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                    if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                        RTE_ETH_FILTER_SET, &info) < 0) {
                        rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                            port_id);
                    }
                }

                set_rss_table(port_id, dev_info.reta_size, nb_queues);
            }
    #endif

            /* Enable RX in promiscuous mode for the Ethernet device. */
            if (ff_global_cfg.dpdk.promiscuous) {
                ret = rte_eth_promiscuous_enable(port_id);
                if (ret == 0) {
                    printf("set port %u to promiscuous mode ok\n", port_id);
                } else {
                    printf("set port %u to promiscuous mode error\n", port_id);
                }
            }
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

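/* Start the periodic rte_timer that emulates FreeBSD's hardclock at freebsd.hz. */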
static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}

#ifdef FF_FLOW_ISOLATE
/** Print a message out of a flow error. */
static int
port_flow_complain(struct rte_flow_error *error)
{
    static const char *const errstrlist[] = {
        [RTE_FLOW_ERROR_TYPE_NONE] = "no error",
        [RTE_FLOW_ERROR_TYPE_UNSPECIFIED] = "cause unspecified",
        [RTE_FLOW_ERROR_TYPE_HANDLE] = "flow rule (handle)",
        [RTE_FLOW_ERROR_TYPE_ATTR_GROUP] = "group field",
        [RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY] = "priority field",
        [RTE_FLOW_ERROR_TYPE_ATTR_INGRESS] = "ingress field",
        [RTE_FLOW_ERROR_TYPE_ATTR_EGRESS] = "egress field",
        [RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER] = "transfer field",
        [RTE_FLOW_ERROR_TYPE_ATTR] = "attributes structure",
        [RTE_FLOW_ERROR_TYPE_ITEM_NUM] = "pattern length",
        [RTE_FLOW_ERROR_TYPE_ITEM_SPEC] = "item specification",
        [RTE_FLOW_ERROR_TYPE_ITEM_LAST] = "item specification range",
        [RTE_FLOW_ERROR_TYPE_ITEM_MASK] = "item specification mask",
        [RTE_FLOW_ERROR_TYPE_ITEM] = "specific pattern item",
        [RTE_FLOW_ERROR_TYPE_ACTION_NUM] = "number of actions",
        [RTE_FLOW_ERROR_TYPE_ACTION_CONF] = "action configuration",
        [RTE_FLOW_ERROR_TYPE_ACTION] = "specific action",
    };
    const char *errstr;
    char buf[32];
    int err = rte_errno;

    if ((unsigned int)error->type >= RTE_DIM(errstrlist) ||
        !errstrlist[error->type])
        errstr = "unknown type";
    else
        errstr = errstrlist[error->type];
    printf("Caught error type %d (%s): %s%s: %s\n",
           error->type, errstr,
           error->cause ? (snprintf(buf, sizeof(buf), "cause: %p, ",
                                    error->cause), buf) : "",
           error->message ? error->message : "(no stated reason)",
           rte_strerror(err));
    return -err;
}

static int
port_flow_isolate(uint16_t port_id, int set)
{
    struct rte_flow_error error;

    /* Poisoning to make sure PMDs update it in case of error. */
    memset(&error, 0x66, sizeof(error));
    if (rte_flow_isolate(port_id, set, &error))
        return port_flow_complain(&error);
    printf("Ingress traffic on port %u is %s to the defined flow rules\n",
           port_id,
           set ? "now restricted" : "not restricted anymore");
    return 0;
}

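/*
 * Install two flow rules that RSS-spread TCP traffic across all queues of
 * the port: one matching the TCP destination port, one matching the source
 * port, both using the configured RSS key.
 */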
static int
create_tcp_flow(uint16_t port_id, uint16_t tcp_port)
{
    struct rte_flow_attr attr = {.ingress = 1};
    struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
    int nb_queues = pconf->nb_lcores;
    uint16_t queue[RTE_MAX_QUEUES_PER_PORT];
    int i = 0, j = 0;
    for (i = 0, j = 0; i < nb_queues; ++i)
        queue[j++] = i;
    struct rte_flow_action_rss rss = {
        .types = ETH_RSS_NONFRAG_IPV4_TCP,
        .key_len = rsskey_len,
        .key = rsskey,
        .queue_num = j,
        .queue = queue,
    };

    struct rte_eth_dev_info dev_info;
    int ret = rte_eth_dev_info_get(port_id, &dev_info);
    if (ret != 0)
        rte_exit(EXIT_FAILURE, "Error during getting device (port %u) info: %s\n", port_id, strerror(-ret));

    struct rte_flow_item pattern[3];
    struct rte_flow_action action[2];
    struct rte_flow_item_tcp tcp_spec;
    struct rte_flow_item_tcp tcp_mask = {
            .hdr = {
                    .src_port = RTE_BE16(0x0000),
                    .dst_port = RTE_BE16(0xffff),
            },
    };
    struct rte_flow_error error;

    memset(pattern, 0, sizeof(pattern));
    memset(action, 0, sizeof(action));

    /* match IPv4 packets */
    pattern[0].type = RTE_FLOW_ITEM_TYPE_IPV4;

    /* match TCP packets with the required destination port */
    memset(&tcp_spec, 0, sizeof(struct rte_flow_item_tcp));
    tcp_spec.hdr.dst_port = rte_cpu_to_be_16(tcp_port);
    pattern[1].type = RTE_FLOW_ITEM_TYPE_TCP;
    pattern[1].spec = &tcp_spec;
    pattern[1].mask = &tcp_mask;

    /* end the pattern array */
    pattern[2].type = RTE_FLOW_ITEM_TYPE_END;

    /* create the action */
    action[0].type = RTE_FLOW_ACTION_TYPE_RSS;
    action[0].conf = &rss;
    action[1].type = RTE_FLOW_ACTION_TYPE_END;

    struct rte_flow *flow;
    /* validate and create the flow rule */
    if (!rte_flow_validate(port_id, &attr, pattern, action, &error)) {
        flow = rte_flow_create(port_id, &attr, pattern, action, &error);
        if (!flow) {
            return port_flow_complain(&error);
        }
    }

    memset(pattern, 0, sizeof(pattern));

    /* match IPv4 packets */
    pattern[0].type = RTE_FLOW_ITEM_TYPE_IPV4;

    struct rte_flow_item_tcp tcp_src_mask = {
            .hdr = {
                    .src_port = RTE_BE16(0xffff),
                    .dst_port = RTE_BE16(0x0000),
            },
    };

    /* match TCP packets with the required source port */
    memset(&tcp_spec, 0, sizeof(struct rte_flow_item_tcp));
    tcp_spec.hdr.src_port = rte_cpu_to_be_16(tcp_port);
    pattern[1].type = RTE_FLOW_ITEM_TYPE_TCP;
    pattern[1].spec = &tcp_spec;
    pattern[1].mask = &tcp_src_mask;

    /* end the pattern array */
    pattern[2].type = RTE_FLOW_ITEM_TYPE_END;

    /* validate and create the flow rule */
    if (!rte_flow_validate(port_id, &attr, pattern, action, &error)) {
        flow = rte_flow_create(port_id, &attr, pattern, action, &error);
        if (!flow) {
            return port_flow_complain(&error);
        }
    }

    return 1;
}

static int
init_flow(uint16_t port_id, uint16_t tcp_port)
{
    // struct ff_flow_cfg fcfg = ff_global_cfg.dpdk.flow_cfgs[0];

    // int i;
    // for (i = 0; i < fcfg.nb_port; i++) {
    //     if(!create_tcp_flow(fcfg.port_id, fcfg.tcp_ports[i])) {
    //         return 0;
    //     }
    // }

    if (!create_tcp_flow(port_id, tcp_port)) {
        rte_exit(EXIT_FAILURE, "create tcp flow failed\n");
        return -1;
    }

    /* ARP rule */
    struct rte_flow_attr attr = {.ingress = 1};
    struct rte_flow_action_queue queue = {.index = 0};

    struct rte_flow_item pattern_[2];
    struct rte_flow_action action[2];
    struct rte_flow_item_eth eth_type = {.type = RTE_BE16(0x0806)};
    struct rte_flow_item_eth eth_mask = {
            .type = RTE_BE16(0xffff)
    };

    memset(pattern_, 0, sizeof(pattern_));
    memset(action, 0, sizeof(action));

    pattern_[0].type = RTE_FLOW_ITEM_TYPE_ETH;
    pattern_[0].spec = &eth_type;
    pattern_[0].mask = &eth_mask;

    pattern_[1].type = RTE_FLOW_ITEM_TYPE_END;

    /* create the action */
    action[0].type = RTE_FLOW_ACTION_TYPE_QUEUE;
    action[0].conf = &queue;
    action[1].type = RTE_FLOW_ACTION_TYPE_END;

    struct rte_flow *flow;
    struct rte_flow_error error;
    /* validate and create the flow rule */
    if (!rte_flow_validate(port_id, &attr, pattern_, action, &error)) {
        flow = rte_flow_create(port_id, &attr, pattern_, action, &error);
        if (!flow) {
            return port_flow_complain(&error);
        }
    }

    return 1;
}

#endif

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    idle_sleep = ff_global_cfg.dpdk.idle_sleep;
    pkt_tx_delay = ff_global_cfg.dpdk.pkt_tx_delay > BURST_TX_DRAIN_US ?
        BURST_TX_DRAIN_US : ff_global_cfg.dpdk.pkt_tx_delay;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

#ifdef FF_KNI
    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }
#endif

#ifdef FF_USE_PAGE_ARRAY
    ff_mmap_init();
#endif

#ifdef FF_FLOW_ISOLATE
    // Run once, in the primary process only.
    if (0 == lcore_conf.tx_queue_id[0]) {
        ret = port_flow_isolate(0, 1);
        if (ret < 0)
            rte_exit(EXIT_FAILURE, "init_port_isolate failed\n");
    }
#endif

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();
#ifdef FF_FLOW_ISOLATE
    // Only an example usage is given here: port_id = 0, tcp_port = 80.
    // Recommendations:
    // 1. init_flow should replace `set_rss_table` in the `init_port_start` loop,
    //    so that every port in port_id_list is configured instead of only port 0.
    // 2. Use the `tcp_port` config option instead of the magic number 80.
    ret = init_flow(0, 80);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_flow failed\n");
    }
#endif
    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    void *data = rte_pktmbuf_mtod(pkt, void*);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    if (pkt->ol_flags & PKT_RX_VLAN_STRIPPED) {
        ff_mbuf_set_vlan_info(hdr, pkt->vlan_tci);
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void*);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, pn, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

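/*
 * Classify a frame by EtherType (skipping one VLAN tag, if present) to
 * decide whether it is ARP, destined for KNI, or unknown to the stack.
 */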
static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < RTE_ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct rte_ether_hdr *hdr;
    const struct rte_vlan_hdr *vlanhdr;
    hdr = (const struct rte_ether_hdr *)data;
    uint16_t ether_type = rte_be_to_cpu_16(hdr->ether_type);
    data += RTE_ETHER_HDR_LEN;
    len -= RTE_ETHER_HDR_LEN;

    if (ether_type == RTE_ETHER_TYPE_VLAN) {
        vlanhdr = (struct rte_vlan_hdr *)data;
        ether_type = rte_be_to_cpu_16(vlanhdr->eth_proto);
        data += sizeof(struct rte_vlan_hdr);
        len -= sizeof(struct rte_vlan_hdr);
    }

    if (ether_type == RTE_ETHER_TYPE_ARP)
        return FILTER_ARP;

#ifdef INET6
    if (ether_type == RTE_ETHER_TYPE_IPV6) {
        return ff_kni_proto_filter(data,
            len, ether_type);
    }
#endif

#ifndef FF_KNI
    return FILTER_UNKNOWN;
#else
    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ether_type != RTE_ETHER_TYPE_IPV4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data,
        len, ether_type);
#endif
}

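/* Copy one segment's payload and metadata from m into mi (a deep copy, not an indirect attach). */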
static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}

/* copied from rte_pktmbuf_clone */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of a new segment failed */
    if (unlikely(mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}

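/*
 * Main RX path: optionally dump to pcap, run the user dispatch callback,
 * broadcast ARP/NDP to the other queues (and KNI), and hand everything
 * else to the FreeBSD stack via ff_veth_input().
 */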
static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(ff_global_cfg.pcap.enable)) {
            if (!pkts_from_ring) {
                ff_dump_packets(ff_global_cfg.pcap.save_path, rtem,
                    ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void*);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring) {
            ff_traffic.rx_packets++;
            ff_traffic.rx_bytes += len;
        }

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues);
            if (ret == FF_DISPATCH_RESPONSE) {
                rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len;

                /*
                 * TX VLAN insertion offload is not supported here,
                 * so re-insert the VLAN header manually.
                 */
                if (rtem->vlan_tci) {
                    data = rte_pktmbuf_prepend(rtem, sizeof(struct rte_vlan_hdr));
                    if (data != NULL) {
                        memmove(data, data + sizeof(struct rte_vlan_hdr), RTE_ETHER_HDR_LEN);
                        struct rte_ether_hdr *etherhdr = (struct rte_ether_hdr *)data;
                        struct rte_vlan_hdr *vlanhdr = (struct rte_vlan_hdr *)(data + RTE_ETHER_HDR_LEN);
                        vlanhdr->vlan_tci = rte_cpu_to_be_16(rtem->vlan_tci);
                        vlanhdr->eth_proto = etherhdr->ether_type;
                        etherhdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN);
                    }
                }
                send_single_packet(rtem, port_id);
                continue;
            }

            if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
#ifdef INET6
        if (filter == FILTER_ARP || filter == FILTER_NDP) {
#else
        if (filter == FILTER_ARP) {
#endif
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni) {
            if (knictl_action == FF_KNICTL_ACTION_ALL_TO_KNI) {
                ff_kni_enqueue(port_id, rtem);
            } else if (knictl_action == FF_KNICTL_ACTION_ALL_TO_FF) {
                ff_veth_input(ctx, rtem);
            } else if (knictl_action == FF_KNICTL_ACTION_DEFAULT) {
                if (enable_kni &&
                        ((filter == FILTER_KNI && kni_accept) ||
                        (filter == FILTER_UNKNOWN && !kni_accept))) {
                        ff_kni_enqueue(port_id, rtem);
                } else {
                    ff_veth_input(ctx, rtem);
                }
            } else {
                ff_veth_input(ctx, rtem);
            }
#endif
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* Read packets from the dispatch ring and process them. */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
#ifdef INET6
    if (msg->msg_type == FF_IOCTL6) {
        fd = ff_socket(AF_INET6, SOCK_DGRAM, 0);
    } else
#endif
        fd = ff_socket(AF_INET, SOCK_DGRAM, 0);

    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_top_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_traffic_msg(struct ff_msg *msg)
{
    msg->traffic = ff_traffic;
    msg->result = 0;
}

#ifdef FF_KNI
static inline void
handle_knictl_msg(struct ff_msg *msg)
{
    if (msg->knictl.kni_cmd == FF_KNICTL_CMD_SET) {
        switch (msg->knictl.kni_action) {
            case FF_KNICTL_ACTION_ALL_TO_FF:
                knictl_action = FF_KNICTL_ACTION_ALL_TO_FF;
                msg->result = 0;
                printf("new kni action: alltoff\n");
                break;
            case FF_KNICTL_ACTION_ALL_TO_KNI:
                knictl_action = FF_KNICTL_ACTION_ALL_TO_KNI;
                msg->result = 0;
                printf("new kni action: alltokni\n");
                break;
            case FF_KNICTL_ACTION_DEFAULT:
                knictl_action = FF_KNICTL_ACTION_DEFAULT;
                msg->result = 0;
                printf("new kni action: default\n");
                break;
            default:
                msg->result = -1;
        }
    } else if (msg->knictl.kni_cmd == FF_KNICTL_CMD_GET) {
        msg->knictl.kni_action = knictl_action;
        msg->result = 0;
    } else {
        msg->result = -2;
    }
}
#endif

1554 static inline void
1555 handle_default_msg(struct ff_msg *msg)
1556 {
1557     msg->result = ENOTSUP;
1558 }
1559 
static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
#ifdef INET6
        case FF_IOCTL6:
#endif
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        case FF_TRAFFIC:
            handle_traffic_msg(msg);
            break;
#ifdef FF_KNI
        case FF_KNICTL:
            handle_knictl_msg(msg);
            break;
#endif
        default:
            handle_default_msg(msg);
            break;
    }
    rte_ring_enqueue(msg_ring[proc_id].ring[msg->msg_type], msg);
}

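/*
 * Poll the request ring (ring[0]); at most one pending control message
 * is handled per call.
 */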
static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint16_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid, i;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(ff_global_cfg.pcap.enable)) {
        for (i = 0; i < n; i++) {
            ff_dump_packets(ff_global_cfg.pcap.save_path, m_table[i],
                ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
        }
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    ff_traffic.tx_packets += ret;
    for (i = 0; i < ret; i++) {
        ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]);
#ifdef FF_USE_PAGE_ARRAY
        if (qconf->tx_mbufs[port].bsd_m_table[i])
            ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs);
#endif
    }
    /* Free the mbufs the NIC did not accept. */
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
#ifdef FF_USE_PAGE_ARRAY
            if (qconf->tx_mbufs[port].bsd_m_table[ret])
                ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]);
#endif
        } while (++ret < n);
    }
    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint16_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}

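/*
 * Transmit one packet from the FreeBSD stack: copy the host mbuf chain
 * into freshly allocated DPDK mbufs, translate the requested checksum
 * and TSO offloads into rte_mbuf ol_flags, and enqueue the result on
 * the per-port TX buffer.  With FF_USE_PAGE_ARRAY the packet is queued
 * through ff_if_send_onepkt() instead, which avoids this copy.
 */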
int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
#ifdef FF_USE_PAGE_ARRAY
    struct lcore_conf *qconf = &lcore_conf;
    int len = 0;

    len = ff_if_send_onepkt(ctx, m, total);
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, ctx->port_id);
        len = 0;
    }
    qconf->tx_mbufs[ctx->port_id].len = len;
    return 0;
#endif
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        head->nb_segs++;

        prev = cur;
        void *data = rte_pktmbuf_mtod(cur, void*);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        cur->data_len = len;
        off += len;
        total -= len;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void*);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct rte_ipv4_hdr *iph;
        int iph_len;
        iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = RTE_ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct rte_ipv4_hdr *iph;
        int iph_len;
        iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = RTE_ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         *  TCP segmentation offload.
         *
         *  - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *    implies PKT_TX_TCP_CKSUM)
         *  - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *    write the IP checksum to 0 in the packet
         *  - fill the mbuf offload information: l2_len,
         *    l3_len, l4_len, tso_segsz
         *  - calculate the pseudo header checksum without taking ip_len
         *    in account, and set it in the TCP header. Refer to
         *    rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *    used as helpers.
         */
        if (offload.tso_seg_size) {
            struct rte_tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct rte_tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = RTE_ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

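/*
 * Per-lcore event loop: drain buffered TX queues on a timer, poll every
 * assigned RX queue (optionally feeding KNI and the dispatch ring),
 * service the control-message ring, invoke the user loop callback, and
 * account usr/sys/idle cycles for ff_top.
 */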
static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc;
    int i, j, nb_rx, idle;
    uint16_t port_id, queue_id;
    struct lcore_conf *qconf;
    uint64_t drain_tsc = 0;
    struct ff_dpdk_if_context *ctx;

    if (pkt_tx_delay) {
        drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * pkt_tx_delay;
    }

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc >= drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packet from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }
#endif

            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                        pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                        j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc >= drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        idle_sleep_tsc = rte_rdtsc();
        if (likely(idle && idle_sleep)) {
            usleep(idle_sleep);
            end_tsc = rte_rdtsc();
        } else {
            end_tsc = idle_sleep_tsc;
        }

        if (usch_tsc == cur_tsc) {
            usr_tsc = idle_sleep_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_top_status.sys_tsc += sys_tsc;
        }

        ff_top_status.usr_tsc += usr_tsc;
        ff_top_status.work_tsc += end_tsc - cur_tsc;
        ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_top_status.loops++;
    }

    return 0;
}

int
ff_dpdk_if_up(void) {
    int i;
    struct lcore_conf *qconf = &lcore_conf;
    for (i = 0; i < qconf->nb_tx_port; i++) {
        uint16_t port_id = qconf->tx_port_id[i];

        struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
        veth_ctx[port_id] = ff_veth_attach(pconf);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed");
        }
    }

    return 0;
}

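/*
 * Launch main_loop on every lcore and block until all lcores return.
 * A minimal usage sketch (hypothetical application code; loop_func_t
 * here takes a void *arg and returns int):
 *
 *     static int my_loop(void *arg) { return 0; }
 *     ff_dpdk_run(my_loop, NULL);
 */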
void
ff_dpdk_run(loop_func_t loop, void *arg) {
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    if (lr == NULL) {
        rte_exit(EXIT_FAILURE, "rte_malloc loop_routine failed\n");
    }
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free_seg((struct rte_mbuf *)m);
}

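/*
 * Software Toeplitz (RSS) hash over `data` using the given key; a
 * bit-serial reference implementation in the style of FreeBSD's
 * toeplitz_hash().
 */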
static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1<<(7-b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i+4] & (1<<(7-b))))
                v |= 1;
        }
    }
    return (hash);
}

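/*
 * Predict whether NIC RSS would steer the given 4-tuple to this
 * lcore's own RX queue, so the stack only picks ports/flows it will
 * actually receive.  The mapping assumes the default redirection table
 * layout: RETA entry (hash & (reta_size - 1)) assigned round-robin
 * across nb_queues queues.
 */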
int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;
    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];

    if (nb_queues <= 1) {
        return 1;
    }

    uint16_t reta_size = rss_reta_size[ctx->port_id];
    uint16_t queueid = qconf->tx_queue_id[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = 0;
    hash = toeplitz_hash(rsskey_len, rsskey, datalen, data);

    return ((hash & (reta_size - 1)) % nb_queues) == queueid;
}

void
ff_regist_packet_dispatcher(dispatch_func_t func)
{
    packet_dispatcher = func;
}

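/*
 * Convert the current TSC reading to nanoseconds since the TSC epoch.
 * The conversion goes through double precision, so it may lose
 * nanosecond accuracy for very large TSC values (long uptimes).
 */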
uint64_t
ff_get_tsc_ns(void)
{
    uint64_t cur_tsc = rte_rdtsc();
    uint64_t hz = rte_get_tsc_hz();
    return ((double)cur_tsc / (double)hz) * NS_PER_S;
}