xref: /f-stack/lib/ff_dpdk_if.c (revision bee2fff9)
/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_eth_bond.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"
#include "ff_memory.h"

#ifdef FF_KNI
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

int enable_kni;
static int kni_accept;
static int knictl_action = FF_KNICTL_ACTION_DEFAULT;
#endif

static int numa_on;

static unsigned idle_sleep;
static unsigned pkt_tx_delay;

static struct rte_timer freebsd_clock;

// Default RSS hash key, taken from the Mellanox Linux driver
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

static uint8_t default_rsskey_52bytes[52] = {
    0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23,
    0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30,
    0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02,
    0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c,
    0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55,
    0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e,
    0x81, 0x15, 0x03, 0x66
};
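
/* Some NICs report a 52-byte hash key (e.g. Intel i40e); init_port_start()
 * below switches to this key when dev_info.hash_key_size == 52. */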

static uint8_t symmetric_rsskey[52] = {
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a
};
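
/* Repeating the 16-bit pattern 0x6d5a makes the Toeplitz hash symmetric:
 * a flow hashes to the same value with source and destination (address,
 * port) swapped, so both directions of a connection land on the same
 * RSS queue. */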

static int rsskey_len = sizeof(default_rsskey_40bytes);
static uint8_t *rsskey = default_rsskey_40bytes;

struct lcore_conf lcore_conf;

struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

#define BOND_DRIVER_NAME    "net_bonding"

static inline int send_single_packet(struct rte_mbuf *m, uint8_t port);

struct ff_msg_ring {
    char ring_name[FF_MSG_NUM][RTE_RING_NAMESIZE];
    /* ring[0]: the lcore receives requests, other processes send */
    /* ring[1..]: the lcore sends replies, other processes read */
    struct rte_ring *ring[FF_MSG_NUM];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;
static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

static struct ff_top_args ff_top_status;
static struct ff_traffic_args ff_traffic;
extern void ff_hardclock(void);

static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg)
{
    ff_hardclock();
    ff_update_current_ts();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
    #define CHECK_INTERVAL 100 /* 100ms */
    #define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u "
                        "Mbps - %s\n", (int)portid,
                        (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

static int
init_lcore_conf(void)
{
    uint8_t nb_dev_ports = rte_eth_dev_count_avail();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
                 ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    if (!rte_lcore_is_enabled(lcore_id)) {
        rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
    }

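    /* A queue id is this lcore's index in the port's lcore_list; e.g.
     * (illustrative) if port 0 is served by lcores {2, 3} and this
     * process runs on lcore 3, it owns RX/TX queue 1 of port 0. */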
    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %d\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        /* Enable pcap dump */
        if (ff_global_cfg.pcap.enable) {
            ff_enable_pcap(ff_global_cfg.pcap.save_path, ff_global_cfg.pcap.snap_len);
        }

        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}

static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;
    uint16_t max_portid = ff_global_cfg.dpdk.max_portid;

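    /* Rough upper bound on mbufs in flight at once: RX descriptors, TX
     * bursts and TX descriptors per lcore, per-lcore mempool caches,
     * optional KNI queues, and the dispatch rings, rounded up to a
     * multiple of 8192. The (max_portid + 1) factors over-provision when
     * port ids are sparse. */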
    unsigned nb_mbuf = RTE_ALIGN_CEIL(
        (nb_rx_queue * (max_portid + 1) * 2 * RX_QUEUE_SIZE          +
        nb_ports * (max_portid + 1) * 2 * nb_lcores * MAX_PKT_BURST    +
        nb_ports * (max_portid + 1) * 2 * nb_tx_queue * TX_QUEUE_SIZE  +
        nb_lcores * MEMPOOL_CACHE_SIZE +
#ifdef FF_KNI
        nb_ports * KNI_MBUF_MAX +
        nb_ports * KNI_QUEUE_SIZE +
#endif
        nb_lcores * nb_ports * DISPATCH_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %u of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }

#ifdef FF_USE_PAGE_ARRAY
        nb_mbuf = RTE_ALIGN_CEIL(
            nb_ports * nb_lcores * MAX_PKT_BURST    +
            nb_ports * nb_tx_queue * TX_QUEUE_SIZE  +
            nb_lcores * MEMPOOL_CACHE_SIZE,
            (unsigned)4096);
        ff_init_ref_pool(nb_mbuf, socketid);
#endif
    }

    return 0;
}

static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL) {
        rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ring = rte_ring_create(name, count, socket_id, flags);
    } else {
        ring = rte_ring_lookup(name);
    }

    if (ring == NULL) {
        rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
    }

    return ring;
}

static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create rings according to the ports actually in use. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

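/* Mempool object constructor: each element is laid out as
 * [struct ff_msg | payload buffer]; buf_addr points just past the header
 * and buf_len covers the remainder of elt_size. */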
static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
    msg->original_buf = NULL;
    msg->original_buf_len = 0;
}

static int
init_msg_ring(void)
{
    uint16_t i, j;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
            MSG_RING_SIZE * 2 * nb_procs,
            MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
            NULL, NULL, ff_msg_init, NULL,
            socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[0]);

        for (j = FF_SYSCTL; j < FF_MSG_NUM; j++) {
            snprintf(msg_ring[i].ring_name[j], RTE_RING_NAMESIZE,
                "%s%u_%u", FF_MSG_RING_OUT, i, j);
            msg_ring[i].ring[j] = create_ring(msg_ring[i].ring_name[j],
                MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
            if (msg_ring[i].ring[j] == NULL)
                rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[j]);
        }
    }

    return 0;
}
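
/* Per-process message topology: process i owns one request ring
 * (FF_MSG_RING_IN + "i") that its lcore consumes, plus one reply ring per
 * message type (FF_MSG_RING_OUT + "i_j"); handle_msg() below enqueues the
 * reply on the ring indexed by msg_type. */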

#ifdef FF_KNI

static enum FF_KNICTL_CMD
get_kni_action(const char *c)
{
    if (!c)
        return FF_KNICTL_ACTION_DEFAULT;
    if (0 == strcasecmp(c, "alltokni")) {
        return FF_KNICTL_ACTION_ALL_TO_KNI;
    } else if (0 == strcasecmp(c, "alltoff")) {
        return FF_KNICTL_ACTION_ALL_TO_FF;
    } else {
        return FF_KNICTL_ACTION_DEFAULT;
    }
}

static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count_avail();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    knictl_action = get_kni_action(ff_global_cfg.kni.kni_action);

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
#endif

/* RSS RETA updates will fail when flow isolation is enabled. */
#ifndef FF_FLOW_ISOLATE
static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}
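
/* Example: with reta_size = 128 and nb_queues = 4, the indirection table
 * becomes 0,1,2,3,0,1,2,3,... so RSS hash values spread packets over the
 * four F-Stack queues round-robin. */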
#endif

static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i, j;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id, u_port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[u_port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        for (j = 0; j <= pconf->nb_slaves; j++) {
            if (j < pconf->nb_slaves) {
                port_id = pconf->slave_portid_list[j];
                printf("Initializing %s's slave %d, port[%d]\n",
                        ff_global_cfg.dpdk.bond_cfgs->name,
                        j, port_id);
            } else {
                port_id = u_port_id;
            }

            struct rte_eth_dev_info dev_info;
            struct rte_eth_conf port_conf = {0};
            struct rte_eth_rxconf rxq_conf;
            struct rte_eth_txconf txq_conf;

            int ret = rte_eth_dev_info_get(port_id, &dev_info);
            if (ret != 0)
                rte_exit(EXIT_FAILURE,
                    "Error during getting device (port %u) info: %s\n",
                    port_id, strerror(-ret));

            if (nb_queues > dev_info.max_rx_queues) {
                rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_rx_queues);
            }

            if (nb_queues > dev_info.max_tx_queues) {
                rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_tx_queues);
            }

            struct rte_ether_addr addr;
            rte_eth_macaddr_get(port_id, &addr);
            printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                       " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                    (unsigned)port_id,
                    addr.addr_bytes[0], addr.addr_bytes[1],
                    addr.addr_bytes[2], addr.addr_bytes[3],
                    addr.addr_bytes[4], addr.addr_bytes[5]);

            rte_memcpy(pconf->mac,
                addr.addr_bytes, RTE_ETHER_ADDR_LEN);

            /* Set RSS mode */
            uint64_t default_rss_hf = ETH_RSS_PROTO_MASK;
            port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
            port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf;
            if (dev_info.hash_key_size == 52) {
                rsskey = default_rsskey_52bytes;
                rsskey_len = 52;
            }
            if (ff_global_cfg.dpdk.symmetric_rss) {
                printf("Use symmetric Receive-side Scaling (RSS) key\n");
                rsskey = symmetric_rsskey;
            }
            port_conf.rx_adv_conf.rss_conf.rss_key = rsskey;
            port_conf.rx_adv_conf.rss_conf.rss_key_len = rsskey_len;
            port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads;
            if (port_conf.rx_adv_conf.rss_conf.rss_hf !=
                    ETH_RSS_PROTO_MASK) {
                printf("Port %u modified RSS hash function based on hardware support, "
                        "requested:%#"PRIx64" configured:%#"PRIx64"\n",
                        port_id, default_rss_hf,
                        port_conf.rx_adv_conf.rss_conf.rss_hf);
            }

            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) {
                port_conf.txmode.offloads |=
                    DEV_TX_OFFLOAD_MBUF_FAST_FREE;
            }

            /* Set Rx VLAN stripping */
            if (ff_global_cfg.dpdk.vlan_strip) {
                if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                    port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
                }
            }

            /* Enable HW CRC stripping */
            port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_KEEP_CRC;

            /* FIXME: Enable TCP LRO? */
            #if 0
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
                printf("LRO is supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
                pconf->hw_features.rx_lro = 1;
            }
            #endif

            /* Set Rx checksum checking */
            if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
                printf("RX checksum offload supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
                pconf->hw_features.rx_csum = 1;
            }

            if (ff_global_cfg.dpdk.tx_csum_offoad_skip == 0) {
                if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
                    printf("TX ip checksum offload supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
                    pconf->hw_features.tx_csum_ip = 1;
                }

                if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
                    (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
                    printf("TX TCP&UDP checksum offload supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM;
                    pconf->hw_features.tx_csum_l4 = 1;
                }
            } else {
                printf("TX checksum offload is disabled\n");
            }

            if (ff_global_cfg.dpdk.tso) {
                if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                    printf("TSO is supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
                    pconf->hw_features.tx_tso = 1;
                }
            } else {
                printf("TSO is disabled\n");
            }

            if (dev_info.reta_size) {
                /* reta size must be power of 2 */
                assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

                rss_reta_size[port_id] = dev_info.reta_size;
                printf("port[%d]: rss table size: %d\n", port_id,
                    dev_info.reta_size);
            }

            if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
                continue;
            }

            ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
            if (ret != 0) {
                return ret;
            }

            static uint16_t nb_rxd = RX_QUEUE_SIZE;
            static uint16_t nb_txd = TX_QUEUE_SIZE;
            ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
            if (ret < 0)
                printf("Could not adjust number of descriptors "
                        "for port%u (%d)\n", (unsigned)port_id, ret);

            uint16_t q;
            for (q = 0; q < nb_queues; q++) {
                if (numa_on) {
                    uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                    socketid = rte_lcore_to_socket_id(lcore_id);
                }
                mbuf_pool = pktmbuf_pool[socketid];

                txq_conf = dev_info.default_txconf;
                txq_conf.offloads = port_conf.txmode.offloads;
                ret = rte_eth_tx_queue_setup(port_id, q, nb_txd,
                    socketid, &txq_conf);
                if (ret < 0) {
                    return ret;
                }

                rxq_conf = dev_info.default_rxconf;
                rxq_conf.offloads = port_conf.rxmode.offloads;
                ret = rte_eth_rx_queue_setup(port_id, q, nb_rxd,
                    socketid, &rxq_conf, mbuf_pool);
                if (ret < 0) {
                    return ret;
                }
            }

            if (strncmp(dev_info.driver_name, BOND_DRIVER_NAME,
                    strlen(dev_info.driver_name)) == 0) {

                rte_eth_macaddr_get(port_id, &addr);
                printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                           " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                        (unsigned)port_id,
                        addr.addr_bytes[0], addr.addr_bytes[1],
                        addr.addr_bytes[2], addr.addr_bytes[3],
                        addr.addr_bytes[4], addr.addr_bytes[5]);

                rte_memcpy(pconf->mac,
                    addr.addr_bytes, RTE_ETHER_ADDR_LEN);

                int mode, count, x;
                uint16_t slaves[RTE_MAX_ETHPORTS], len = RTE_MAX_ETHPORTS;

                mode = rte_eth_bond_mode_get(port_id);
                printf("Port %u, bond mode:%d\n", port_id, mode);

                count = rte_eth_bond_slaves_get(port_id, slaves, len);
                printf("Port %u, %s's slave ports count:%d\n", port_id,
                            ff_global_cfg.dpdk.bond_cfgs->name, count);
                for (x = 0; x < count; x++) {
                    printf("Port %u, %s's slave port[%u]\n", port_id,
                            ff_global_cfg.dpdk.bond_cfgs->name, slaves[x]);
                }
            }

            ret = rte_eth_dev_start(port_id);
            if (ret < 0) {
                return ret;
            }
    /* RSS RETA updates will fail when flow isolation is enabled. */
    #ifndef FF_FLOW_ISOLATE
            if (nb_queues > 1) {
                /* set HW rss hash function to Toeplitz. */
                if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                    struct rte_eth_hash_filter_info info = {0};
                    info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                    info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                    if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                        RTE_ETH_FILTER_SET, &info) < 0) {
                        rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                            port_id);
                    }
                }

                set_rss_table(port_id, dev_info.reta_size, nb_queues);
            }
    #endif

            /* Enable RX in promiscuous mode for the Ethernet device. */
            if (ff_global_cfg.dpdk.promiscuous) {
                ret = rte_eth_promiscuous_enable(port_id);
                if (ret == 0) {
                    printf("set port %u to promiscuous mode ok\n", port_id);
                } else {
                    printf("set port %u to promiscuous mode error\n", port_id);
                }
            }
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

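/* Tick math: rte_timer periods are in TSC cycles. intrs is the FreeBSD
 * tick length in ms (e.g. freebsd.hz = 100 gives 10 ms), and tsc is that
 * many milliseconds' worth of TSC cycles; e.g. (illustrative) with a
 * 2.5 GHz TSC, tsc = 2,500,000 * 10 = 25,000,000 cycles per tick. */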
static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}

#ifdef FF_FLOW_ISOLATE
/** Print a message out of a flow error. */
static int
port_flow_complain(struct rte_flow_error *error)
{
    static const char *const errstrlist[] = {
        [RTE_FLOW_ERROR_TYPE_NONE] = "no error",
        [RTE_FLOW_ERROR_TYPE_UNSPECIFIED] = "cause unspecified",
        [RTE_FLOW_ERROR_TYPE_HANDLE] = "flow rule (handle)",
        [RTE_FLOW_ERROR_TYPE_ATTR_GROUP] = "group field",
        [RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY] = "priority field",
        [RTE_FLOW_ERROR_TYPE_ATTR_INGRESS] = "ingress field",
        [RTE_FLOW_ERROR_TYPE_ATTR_EGRESS] = "egress field",
        [RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER] = "transfer field",
        [RTE_FLOW_ERROR_TYPE_ATTR] = "attributes structure",
        [RTE_FLOW_ERROR_TYPE_ITEM_NUM] = "pattern length",
        [RTE_FLOW_ERROR_TYPE_ITEM_SPEC] = "item specification",
        [RTE_FLOW_ERROR_TYPE_ITEM_LAST] = "item specification range",
        [RTE_FLOW_ERROR_TYPE_ITEM_MASK] = "item specification mask",
        [RTE_FLOW_ERROR_TYPE_ITEM] = "specific pattern item",
        [RTE_FLOW_ERROR_TYPE_ACTION_NUM] = "number of actions",
        [RTE_FLOW_ERROR_TYPE_ACTION_CONF] = "action configuration",
        [RTE_FLOW_ERROR_TYPE_ACTION] = "specific action",
    };
    const char *errstr;
    char buf[32];
    int err = rte_errno;

    if ((unsigned int)error->type >= RTE_DIM(errstrlist) ||
        !errstrlist[error->type])
        errstr = "unknown type";
    else
        errstr = errstrlist[error->type];
    printf("Caught error type %d (%s): %s%s: %s\n",
           error->type, errstr,
           error->cause ? (snprintf(buf, sizeof(buf), "cause: %p, ",
                                    error->cause), buf) : "",
           error->message ? error->message : "(no stated reason)",
           rte_strerror(err));
    return -err;
}

static int
port_flow_isolate(uint16_t port_id, int set)
{
    struct rte_flow_error error;

    /* Poisoning to make sure PMDs update it in case of error. */
    memset(&error, 0x66, sizeof(error));
    if (rte_flow_isolate(port_id, set, &error))
        return port_flow_complain(&error);
    printf("Ingress traffic on port %u is %s to the defined flow rules\n",
           port_id,
           set ? "now restricted" : "not restricted anymore");
    return 0;
}

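/* Install two rte_flow rules that RSS-spread TCP traffic over the F-Stack
 * queues: one matching dst_port == tcp_port (inbound connections) and one
 * matching src_port == tcp_port (traffic from a local listener's peer).
 * With flow isolation enabled, packets matching no rule never reach
 * these queues. */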
static int
create_tcp_flow(uint16_t port_id, uint16_t tcp_port)
{
    struct rte_flow_attr attr = {.ingress = 1};
    struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
    int nb_queues = pconf->nb_lcores;
    uint16_t queue[RTE_MAX_QUEUES_PER_PORT];
    int i = 0, j = 0;
    for (i = 0, j = 0; i < nb_queues; ++i)
        queue[j++] = i;
    struct rte_flow_action_rss rss = {
        .types = ETH_RSS_NONFRAG_IPV4_TCP,
        .key_len = rsskey_len,
        .key = rsskey,
        .queue_num = j,
        .queue = queue,
    };

    struct rte_eth_dev_info dev_info;
    int ret = rte_eth_dev_info_get(port_id, &dev_info);
    if (ret != 0)
        rte_exit(EXIT_FAILURE, "Error during getting device (port %u) info: %s\n",
            port_id, strerror(-ret));

    struct rte_flow_item pattern[3];
    struct rte_flow_action action[2];
    struct rte_flow_item_tcp tcp_spec;
    struct rte_flow_item_tcp tcp_mask = {
        .hdr = {
            .src_port = RTE_BE16(0x0000),
            .dst_port = RTE_BE16(0xffff),
        },
    };
    struct rte_flow_error error;

    memset(pattern, 0, sizeof(pattern));
    memset(action, 0, sizeof(action));

    /* match any IPv4 packet... */
    pattern[0].type = RTE_FLOW_ITEM_TYPE_IPV4;

    /* ...whose TCP destination port equals tcp_port */
    memset(&tcp_spec, 0, sizeof(struct rte_flow_item_tcp));
    tcp_spec.hdr.dst_port = rte_cpu_to_be_16(tcp_port);
    pattern[1].type = RTE_FLOW_ITEM_TYPE_TCP;
    pattern[1].spec = &tcp_spec;
    pattern[1].mask = &tcp_mask;

    /* end the pattern array */
    pattern[2].type = RTE_FLOW_ITEM_TYPE_END;

    /* create the action */
    action[0].type = RTE_FLOW_ACTION_TYPE_RSS;
    action[0].conf = &rss;
    action[1].type = RTE_FLOW_ACTION_TYPE_END;

    struct rte_flow *flow;
    /* validate and create the flow rule */
    if (!rte_flow_validate(port_id, &attr, pattern, action, &error)) {
        flow = rte_flow_create(port_id, &attr, pattern, action, &error);
        if (!flow) {
            return port_flow_complain(&error);
        }
    }

    memset(pattern, 0, sizeof(pattern));

    /* second rule: match any IPv4 packet... */
    pattern[0].type = RTE_FLOW_ITEM_TYPE_IPV4;

    struct rte_flow_item_tcp tcp_src_mask = {
        .hdr = {
            .src_port = RTE_BE16(0xffff),
            .dst_port = RTE_BE16(0x0000),
        },
    };

    /* ...whose TCP source port equals tcp_port */
    memset(&tcp_spec, 0, sizeof(struct rte_flow_item_tcp));
    tcp_spec.hdr.src_port = rte_cpu_to_be_16(tcp_port);
    pattern[1].type = RTE_FLOW_ITEM_TYPE_TCP;
    pattern[1].spec = &tcp_spec;
    pattern[1].mask = &tcp_src_mask;

    /* end the pattern array */
    pattern[2].type = RTE_FLOW_ITEM_TYPE_END;

    /* validate and create the flow rule */
    if (!rte_flow_validate(port_id, &attr, pattern, action, &error)) {
        flow = rte_flow_create(port_id, &attr, pattern, action, &error);
        if (!flow) {
            return port_flow_complain(&error);
        }
    }

    return 1;
}

static int
init_flow(uint16_t port_id, uint16_t tcp_port)
{
    // struct ff_flow_cfg fcfg = ff_global_cfg.dpdk.flow_cfgs[0];

    // int i;
    // for (i = 0; i < fcfg.nb_port; i++) {
    //     if (!create_tcp_flow(fcfg.port_id, fcfg.tcp_ports[i])) {
    //         return 0;
    //     }
    // }

    if (!create_tcp_flow(port_id, tcp_port)) {
        rte_exit(EXIT_FAILURE, "create tcp flow failed\n");
        return -1;
    }

    /* ARP rule: steer all ARP frames (ether type 0x0806) to queue 0 */
    struct rte_flow_attr attr = {.ingress = 1};
    struct rte_flow_action_queue queue = {.index = 0};

    struct rte_flow_item pattern_[2];
    struct rte_flow_action action[2];
    struct rte_flow_item_eth eth_type = {.type = RTE_BE16(0x0806)};
    struct rte_flow_item_eth eth_mask = {
        .type = RTE_BE16(0xffff)
    };

    memset(pattern_, 0, sizeof(pattern_));
    memset(action, 0, sizeof(action));

    pattern_[0].type = RTE_FLOW_ITEM_TYPE_ETH;
    pattern_[0].spec = &eth_type;
    pattern_[0].mask = &eth_mask;

    pattern_[1].type = RTE_FLOW_ITEM_TYPE_END;

    /* create the action */
    action[0].type = RTE_FLOW_ACTION_TYPE_QUEUE;
    action[0].conf = &queue;
    action[1].type = RTE_FLOW_ACTION_TYPE_END;

    struct rte_flow *flow;
    struct rte_flow_error error;
    /* validate and create the flow rule */
    if (!rte_flow_validate(port_id, &attr, pattern_, action, &error)) {
        flow = rte_flow_create(port_id, &attr, pattern_, action, &error);
        if (!flow) {
            return port_flow_complain(&error);
        }
    }

    return 1;
}

#endif

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    idle_sleep = ff_global_cfg.dpdk.idle_sleep;
    pkt_tx_delay = ff_global_cfg.dpdk.pkt_tx_delay > BURST_TX_DRAIN_US ?
        BURST_TX_DRAIN_US : ff_global_cfg.dpdk.pkt_tx_delay;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

#ifdef FF_KNI
    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }
#endif

#ifdef FF_USE_PAGE_ARRAY
    ff_mmap_init();
#endif

#ifdef FF_FLOW_ISOLATE
    /* run once, in the process that owns queue 0 */
    if (lcore_conf.tx_queue_id[0] == 0) {
        ret = port_flow_isolate(0, 1);
        if (ret < 0)
            rte_exit(EXIT_FAILURE, "init_port_isolate failed\n");
    }
#endif

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();
#ifdef FF_FLOW_ISOLATE
    /* TODO: replace the magic numbers (port 0, TCP port 80) with config options */
    ret = init_flow(0, 80);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_flow failed\n");
    }
#endif
    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    void *data = rte_pktmbuf_mtod(pkt, void*);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    if (pkt->ol_flags & PKT_RX_VLAN_STRIPPED) {
        ff_mbuf_set_vlan_info(hdr, pkt->vlan_tci);
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void*);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < RTE_ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct rte_ether_hdr *hdr;
    const struct rte_vlan_hdr *vlanhdr;
    hdr = (const struct rte_ether_hdr *)data;
    uint16_t ether_type = rte_be_to_cpu_16(hdr->ether_type);
    data += RTE_ETHER_HDR_LEN;
    len -= RTE_ETHER_HDR_LEN;

    if (ether_type == RTE_ETHER_TYPE_VLAN) {
        vlanhdr = (const struct rte_vlan_hdr *)data;
        ether_type = rte_be_to_cpu_16(vlanhdr->eth_proto);
        data += sizeof(struct rte_vlan_hdr);
        len -= sizeof(struct rte_vlan_hdr);
    }

    if (ether_type == RTE_ETHER_TYPE_ARP)
        return FILTER_ARP;

#ifdef INET6
    if (ether_type == RTE_ETHER_TYPE_IPV6) {
        return ff_kni_proto_filter(data,
            len, ether_type);
    }
#endif

#ifndef FF_KNI
    return FILTER_UNKNOWN;
#else
    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ether_type != RTE_ETHER_TYPE_IPV4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data,
        len, ether_type);
#endif
}

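/* Unlike rte_pktmbuf_clone(), which creates indirect mbufs sharing the
 * source buffer, the deep-copy helpers below duplicate every segment's
 * data. Broadcast frames (e.g. ARP) are handed to several dispatch rings
 * and to KNI at once, and each consumer frees its copy independently. */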
static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}

/* adapted from rte_pktmbuf_clone(), but performs a deep copy */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of a new segment failed mid-chain */
    if (unlikely(mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}

static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(ff_global_cfg.pcap.enable)) {
            if (!pkts_from_ring) {
                ff_dump_packets(ff_global_cfg.pcap.save_path, rtem,
                    ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void*);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring) {
            ff_traffic.rx_packets++;
            ff_traffic.rx_bytes += len;
        }

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues);
            if (ret == FF_DISPATCH_RESPONSE) {
                rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len;

                /*
                 * VLAN TX re-insertion offload is not supported yet,
                 * so rebuild the VLAN header in software before sending.
                 */
                if (rtem->vlan_tci) {
                    data = rte_pktmbuf_prepend(rtem, sizeof(struct rte_vlan_hdr));
                    if (data != NULL) {
                        memmove(data, data + sizeof(struct rte_vlan_hdr), RTE_ETHER_HDR_LEN);
                        struct rte_ether_hdr *etherhdr = (struct rte_ether_hdr *)data;
                        struct rte_vlan_hdr *vlanhdr = (struct rte_vlan_hdr *)(data + RTE_ETHER_HDR_LEN);
                        vlanhdr->vlan_tci = rte_cpu_to_be_16(rtem->vlan_tci);
                        vlanhdr->eth_proto = etherhdr->ether_type;
                        etherhdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN);
                    }
                }
                send_single_packet(rtem, port_id);
                continue;
            }

            if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
#ifdef INET6
        if (filter == FILTER_ARP || filter == FILTER_NDP) {
#else
        if (filter == FILTER_ARP) {
#endif
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni) {
            if (knictl_action == FF_KNICTL_ACTION_ALL_TO_KNI) {
                ff_kni_enqueue(port_id, rtem);
            } else if (knictl_action == FF_KNICTL_ACTION_ALL_TO_FF) {
                ff_veth_input(ctx, rtem);
            } else if (knictl_action == FF_KNICTL_ACTION_DEFAULT) {
                if ((filter == FILTER_KNI && kni_accept) ||
                        (filter == FILTER_UNKNOWN && !kni_accept)) {
                    ff_kni_enqueue(port_id, rtem);
                } else {
                    ff_veth_input(ctx, rtem);
                }
            } else {
                ff_veth_input(ctx, rtem);
            }
#endif
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}
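
/* Illustrative sketch only (not part of the original file): a custom
 * dispatcher matching the (*packet_dispatcher)(data, &len, queue_id,
 * nb_queues) call above. It must return a target queue in [0, nb_queues),
 * FF_DISPATCH_RESPONSE to transmit the (possibly rewritten) packet back
 * out, or FF_DISPATCH_ERROR to drop it. Applications would register such
 * a callback through F-Stack's public API; the example assumes an
 * untagged IPv4/UDP frame. */
#if 0
static int
example_dispatcher(void *data, uint16_t *len, uint16_t queue_id,
    uint16_t nb_queues)
{
    if (*len < RTE_ETHER_HDR_LEN + sizeof(struct rte_ipv4_hdr) +
            sizeof(struct rte_udp_hdr))
        return FF_DISPATCH_ERROR;

    struct rte_ether_hdr *eth = (struct rte_ether_hdr *)data;
    if (eth->ether_type != rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4))
        return FF_DISPATCH_ERROR;

    struct rte_ipv4_hdr *ip = (struct rte_ipv4_hdr *)(eth + 1);
    if (ip->next_proto_id != IPPROTO_UDP)
        return FF_DISPATCH_ERROR;

    /* pin each UDP destination port to a fixed queue */
    struct rte_udp_hdr *udp = (struct rte_udp_hdr *)
        ((char *)ip + ((ip->version_ihl & 0x0f) << 2));
    return rte_be_to_cpu_16(udp->dst_port) % nb_queues;
}
#endif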

static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* Read packets from the dispatch ring and process them. */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
#ifdef INET6
    if (msg->msg_type == FF_IOCTL6) {
        fd = ff_socket(AF_INET6, SOCK_DGRAM, 0);
    } else
#endif
        fd = ff_socket(AF_INET, SOCK_DGRAM, 0);

    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_top_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_traffic_msg(struct ff_msg *msg)
{
    msg->traffic = ff_traffic;
    msg->result = 0;
}

#ifdef FF_KNI
static inline void
handle_knictl_msg(struct ff_msg *msg)
{
    if (msg->knictl.kni_cmd == FF_KNICTL_CMD_SET) {
        switch (msg->knictl.kni_action) {
            case FF_KNICTL_ACTION_ALL_TO_FF:
                knictl_action = FF_KNICTL_ACTION_ALL_TO_FF;
                msg->result = 0;
                printf("new kni action: alltoff\n");
                break;
            case FF_KNICTL_ACTION_ALL_TO_KNI:
                knictl_action = FF_KNICTL_ACTION_ALL_TO_KNI;
                msg->result = 0;
                printf("new kni action: alltokni\n");
                break;
            case FF_KNICTL_ACTION_DEFAULT:
                knictl_action = FF_KNICTL_ACTION_DEFAULT;
                msg->result = 0;
                printf("new kni action: default\n");
                break;
            default:
                msg->result = -1;
        }
    } else if (msg->knictl.kni_cmd == FF_KNICTL_CMD_GET) {
        msg->knictl.kni_action = knictl_action;
        msg->result = 0;
    } else {
        msg->result = -2;
    }
}
#endif

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
#ifdef INET6
        case FF_IOCTL6:
#endif
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        case FF_TRAFFIC:
            handle_traffic_msg(msg);
            break;
#ifdef FF_KNI
        case FF_KNICTL:
            handle_knictl_msg(msg);
            break;
#endif
        default:
            handle_default_msg(msg);
            break;
    }
    rte_ring_enqueue(msg_ring[proc_id].ring[msg->msg_type], msg);
}

static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}
1613 
1614 /* Send burst of packets on an output interface */
1615 static inline int
1616 send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
1617 {
1618     struct rte_mbuf **m_table;
1619     int ret;
1620     uint16_t queueid;
1621 
1622     queueid = qconf->tx_queue_id[port];
1623     m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;
1624 
1625     if (unlikely(ff_global_cfg.pcap.enable)) {
1626         uint16_t i;
1627         for (i = 0; i < n; i++) {
1628             ff_dump_packets(ff_global_cfg.pcap.save_path, m_table[i],
1629                 ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
1630         }
1631     }
1632 
1633     ret = rte_eth_tx_burst(port, queueid, m_table, n);
1634     ff_traffic.tx_packets += ret;
1635     uint16_t i;
1636     for (i = 0; i < ret; i++) {
1637         ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]);
1638 #ifdef FF_USE_PAGE_ARRAY
1639         if (qconf->tx_mbufs[port].bsd_m_table[i])
1640             ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs);
1641 #endif
1642     }
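         /* The NIC accepted only `ret` of the `n` mbufs; drop the rest
          * (and their BSD counterparts when FF_USE_PAGE_ARRAY is set). */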
1643     if (unlikely(ret < n)) {
1644         do {
1645             rte_pktmbuf_free(m_table[ret]);
1646 #ifdef FF_USE_PAGE_ARRAY
1647             if (qconf->tx_mbufs[port].bsd_m_table[ret])
1648                 ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]);
1649 #endif
1650         } while (++ret < n);
1651     }
1652     return 0;
1653 }
1654 
1655 /* Enqueue a single packet, and send burst if queue is filled */
1656 static inline int
1657 send_single_packet(struct rte_mbuf *m, uint8_t port)
1658 {
1659     uint16_t len;
1660     struct lcore_conf *qconf;
1661 
1662     qconf = &lcore_conf;
1663     len = qconf->tx_mbufs[port].len;
1664     qconf->tx_mbufs[port].m_table[len] = m;
1665     len++;
1666 
1667     /* burst queue is full; flush it now */
1668     if (unlikely(len == MAX_PKT_BURST)) {
1669         send_burst(qconf, MAX_PKT_BURST, port);
1670         len = 0;
1671     }
1672 
1673     qconf->tx_mbufs[port].len = len;
1674     return 0;
1675 }
1676 
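     /*
      * Transmit one packet handed down from the FreeBSD stack.  With
      * FF_USE_PAGE_ARRAY the BSD mbuf data is attached to DPDK mbufs and
      * only freed after transmission (see the bsd_m_table bookkeeping in
      * send_burst); otherwise the payload is copied into a fresh rte_mbuf
      * chain and offload flags are derived from the BSD mbuf metadata.
      */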
1677 int
1678 ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
1679     int total)
1680 {
1681 #ifdef FF_USE_PAGE_ARRAY
1682     struct lcore_conf *qconf = &lcore_conf;
1683     int len = 0;
1684 
1685     len = ff_if_send_onepkt(ctx, m, total);
1686     if (unlikely(len == MAX_PKT_BURST)) {
1687         send_burst(qconf, MAX_PKT_BURST, ctx->port_id);
1688         len = 0;
1689     }
1690     qconf->tx_mbufs[ctx->port_id].len = len;
1691     return 0;
1692 #endif
1693     struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
1694     struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
1695     if (head == NULL) {
1696         ff_mbuf_free(m);
1697         return -1;
1698     }
1699 
1700     head->pkt_len = total;
1701     head->nb_segs = 0;
1702 
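         /*
          * Copy the BSD mbuf chain into a chain of rte_mbufs, filling
          * each segment with up to RTE_MBUF_DEFAULT_DATAROOM bytes.
          */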
1703     int off = 0;
1704     struct rte_mbuf *cur = head, *prev = NULL;
1705     while (total > 0) {
1706         if (cur == NULL) {
1707             cur = rte_pktmbuf_alloc(mbuf_pool);
1708             if (cur == NULL) {
1709                 rte_pktmbuf_free(head);
1710                 ff_mbuf_free(m);
1711                 return -1;
1712             }
1713         }
1714 
1715         if (prev != NULL) {
1716             prev->next = cur;
1717         }
1718         head->nb_segs++;
1719 
1720         prev = cur;
1721         void *data = rte_pktmbuf_mtod(cur, void*);
1722         int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
1723         int ret = ff_mbuf_copydata(m, data, off, len);
1724         if (ret < 0) {
1725             rte_pktmbuf_free(head);
1726             ff_mbuf_free(m);
1727             return -1;
1728         }
1729 
1731         cur->data_len = len;
1732         off += len;
1733         total -= len;
1734         cur = NULL;
1735     }
1736 
1737     struct ff_tx_offload offload = {0};
1738     ff_mbuf_tx_offload(m, &offload);
1739 
1740     void *data = rte_pktmbuf_mtod(head, void*);
1741 
1742     if (offload.ip_csum) {
1743         /* IPv4 header checksum offload (IPv6 not supported yet) */
1744         struct rte_ipv4_hdr *iph;
1745         int iph_len;
1746         iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN);
1747         iph_len = (iph->version_ihl & 0x0f) << 2;
1748 
1749         head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
1750         head->l2_len = RTE_ETHER_HDR_LEN;
1751         head->l3_len = iph_len;
1752     }
1753 
1754     if (ctx->hw_features.tx_csum_l4) {
1755         struct rte_ipv4_hdr *iph;
1756         int iph_len;
1757         iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN);
1758         iph_len = (iph->version_ihl & 0x0f) << 2;
1759 
1760         if (offload.tcp_csum) {
1761             head->ol_flags |= PKT_TX_TCP_CKSUM;
1762             head->l2_len = RTE_ETHER_HDR_LEN;
1763             head->l3_len = iph_len;
1764         }
1765 
1766         /*
1767          *  TCP segmentation offload.
1768          *
1769          *  - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
1770          *    implies PKT_TX_TCP_CKSUM)
1771          *  - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
1772          *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
1773          *    write the IP checksum to 0 in the packet
1774          *  - fill the mbuf offload information: l2_len,
1775          *    l3_len, l4_len, tso_segsz
1776          *  - calculate the pseudo header checksum without taking ip_len
1777          *    in account, and set it in the TCP header. Refer to
1778          *    rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
1779          *    used as helpers.
1780          */
1781         if (offload.tso_seg_size) {
1782             struct rte_tcp_hdr *tcph;
1783             int tcph_len;
1784             tcph = (struct rte_tcp_hdr *)((char *)iph + iph_len);
1785             tcph_len = (tcph->data_off & 0xf0) >> 2;
1786             tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);
1787 
1788             head->ol_flags |= PKT_TX_TCP_SEG;
1789             head->l4_len = tcph_len;
1790             head->tso_segsz = offload.tso_seg_size;
1791         }
1792 
1793         if (offload.udp_csum) {
1794             head->ol_flags |= PKT_TX_UDP_CKSUM;
1795             head->l2_len = RTE_ETHER_HDR_LEN;
1796             head->l3_len = iph_len;
1797         }
1798     }
1799 
1800     ff_mbuf_free(m);
1801 
1802     return send_single_packet(head, ctx->port_id);
1803 }
1804 
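     /*
      * Per-lcore event loop: fire the FreeBSD clock timer when due, drain
      * pending TX bursts, poll every assigned RX queue (plus the KNI and
      * dispatch rings), service the control-message ring, call the user's
      * loop callback, then optionally usleep() while idle.  The rdtsc
      * samples taken along the way feed the ff_top usr/sys/idle counters.
      */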
1805 static int
1806 main_loop(void *arg)
1807 {
1808     struct loop_routine *lr = (struct loop_routine *)arg;
1809 
1810     struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1811     uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc;
1812     int i, j, nb_rx, idle;
1813     uint16_t port_id, queue_id;
1814     struct lcore_conf *qconf;
1815     uint64_t drain_tsc = 0;
1816     struct ff_dpdk_if_context *ctx;
1817 
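         /* pkt_tx_delay is in microseconds: compute TSC ticks per us
          * (rounded up) and scale by the configured delay. */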
1818     if (pkt_tx_delay) {
1819         drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * pkt_tx_delay;
1820     }
1821 
1822     prev_tsc = 0;
1823     usch_tsc = 0;
1824 
1825     qconf = &lcore_conf;
1826 
1827     while (1) {
1828         cur_tsc = rte_rdtsc();
1829         if (unlikely(freebsd_clock.expire < cur_tsc)) {
1830             rte_timer_manage();
1831         }
1832 
1833         idle = 1;
1834         sys_tsc = 0;
1835         usr_tsc = 0;
1836 
1837         /*
1838          * TX burst queue drain
1839          */
1840         diff_tsc = cur_tsc - prev_tsc;
1841         if (unlikely(diff_tsc >= drain_tsc)) {
1842             for (i = 0; i < qconf->nb_tx_port; i++) {
1843                 port_id = qconf->tx_port_id[i];
1844                 if (qconf->tx_mbufs[port_id].len == 0)
1845                     continue;
1846 
1847                 idle = 0;
1848 
1849                 send_burst(qconf,
1850                     qconf->tx_mbufs[port_id].len,
1851                     port_id);
1852                 qconf->tx_mbufs[port_id].len = 0;
1853             }
1854 
1855             prev_tsc = cur_tsc;
1856         }
1857 
1858         /*
1859          * Read packet from RX queues
1860          */
1861         for (i = 0; i < qconf->nb_rx_queue; ++i) {
1862             port_id = qconf->rx_queue_list[i].port_id;
1863             queue_id = qconf->rx_queue_list[i].queue_id;
1864             ctx = veth_ctx[port_id];
1865 
1866 #ifdef FF_KNI
1867             if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
1868                 ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
1869             }
1870 #endif
1871 
1872             process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);
1873 
1874             nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
1875                 MAX_PKT_BURST);
1876             if (nb_rx == 0)
1877                 continue;
1878 
1879             idle = 0;
1880 
1881             /* Prefetch first packets */
1882             for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
1883                 rte_prefetch0(rte_pktmbuf_mtod(
1884                         pkts_burst[j], void *));
1885             }
1886 
1887             /* Prefetch and handle already prefetched packets */
1888             for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
1889                 rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
1890                         j + PREFETCH_OFFSET], void *));
1891                 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
1892             }
1893 
1894             /* Handle remaining prefetched packets */
1895             for (; j < nb_rx; j++) {
1896                 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
1897             }
1898         }
1899 
1900         process_msg_ring(qconf->proc_id);
1901 
1902         div_tsc = rte_rdtsc();
1903 
1904         if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc >= drain_tsc))) {
1905             usch_tsc = cur_tsc;
1906             lr->loop(lr->arg);
1907         }
1908 
1909         idle_sleep_tsc = rte_rdtsc();
1910         if (likely(idle && idle_sleep)) {
1911             usleep(idle_sleep);
1912             end_tsc = rte_rdtsc();
1913         } else {
1914             end_tsc = idle_sleep_tsc;
1915         }
1916 
1917         if (usch_tsc == cur_tsc) {
1918             usr_tsc = idle_sleep_tsc - div_tsc;
1919         }
1920 
1921         if (!idle) {
1922             sys_tsc = div_tsc - cur_tsc;
1923             ff_top_status.sys_tsc += sys_tsc;
1924         }
1925 
1926         ff_top_status.usr_tsc += usr_tsc;
1927         ff_top_status.work_tsc += end_tsc - cur_tsc;
1928         ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;
1929 
1930         ff_top_status.loops++;
1931     }
1932 
1933     return 0;
1934 }
1935 
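     /*
      * Attach a veth ifnet to the FreeBSD stack for every TX port owned
      * by this lcore, recording the per-port context for the RX path.
      */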
1936 int
1937 ff_dpdk_if_up(void) {
1938     int i;
1939     struct lcore_conf *qconf = &lcore_conf;
1940     for (i = 0; i < qconf->nb_tx_port; i++) {
1941         uint16_t port_id = qconf->tx_port_id[i];
1942 
1943         struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
1944         veth_ctx[port_id] = ff_veth_attach(pconf);
1945         if (veth_ctx[port_id] == NULL) {
1946             rte_exit(EXIT_FAILURE, "ff_veth_attach failed");
1947         }
1948     }
1949 
1950     return 0;
1951 }
1952 
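     /*
      * Launch main_loop on all configured lcores (master included) and
      * block until every lcore returns.
      */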
1953 void
1954 ff_dpdk_run(loop_func_t loop, void *arg) {
1955     struct loop_routine *lr = rte_malloc(NULL,
1956         sizeof(struct loop_routine), 0);
         if (lr == NULL) {
             rte_exit(EXIT_FAILURE, "rte_malloc loop_routine failed");
         }
1957     lr->loop = loop;
1958     lr->arg = arg;
1959     rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
1960     rte_eal_mp_wait_lcore();
1961     rte_free(lr);
1962 }
1963 
1964 void
1965 ff_dpdk_pktmbuf_free(void *m)
1966 {
1967     rte_pktmbuf_free((struct rte_mbuf *)m);
1968 }
1969 
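     /*
      * Software Toeplitz hash, matching the RSS hash computed by the NIC:
      * for each set bit of the input, XOR in the 32-bit window of the key
      * aligned with that bit, sliding the window one bit at a time.  Used
      * below with the same key the ports were configured with, e.g.
      *
      *     hash = toeplitz_hash(rsskey_len, rsskey, datalen, data);
      */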
1970 static uint32_t
1971 toeplitz_hash(unsigned keylen, const uint8_t *key,
1972     unsigned datalen, const uint8_t *data)
1973 {
1974     uint32_t hash = 0, v;
1975     u_int i, b;
1976 
1977     /* XXXRW: Perhaps an assertion about key length vs. data length? */
1978 
1979     v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
1980     for (i = 0; i < datalen; i++) {
1981         for (b = 0; b < 8; b++) {
1982             if (data[i] & (1<<(7-b)))
1983                 hash ^= v;
1984             v <<= 1;
1985             if ((i + 4) < keylen &&
1986                 (key[i+4] & (1<<(7-b))))
1987                 v |= 1;
1988         }
1989     }
1990     return (hash);
1991 }
1992 
1993 int
1994 ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
1995     uint16_t sport, uint16_t dport)
1996 {
1997     struct lcore_conf *qconf = &lcore_conf;
1998     struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
1999     uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];
2000 
2001     if (nb_queues <= 1) {
2002         return 1;
2003     }
2004 
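         /*
          * Rebuild the 4-tuple in the order the NIC hashes it (saddr,
          * daddr, sport, dport) and check whether its RETA bucket maps to
          * this lcore's queue.  The final modulo assumes the redirection
          * table was filled round-robin (bucket i -> queue i % nb_queues)
          * at init time.
          */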
2005     uint16_t reta_size = rss_reta_size[ctx->port_id];
2006     uint16_t queueid = qconf->tx_queue_id[ctx->port_id];
2007 
2008     uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
2009         sizeof(dport)];
2010 
2011     unsigned datalen = 0;
2012 
2013     bcopy(&saddr, &data[datalen], sizeof(saddr));
2014     datalen += sizeof(saddr);
2015 
2016     bcopy(&daddr, &data[datalen], sizeof(daddr));
2017     datalen += sizeof(daddr);
2018 
2019     bcopy(&sport, &data[datalen], sizeof(sport));
2020     datalen += sizeof(sport);
2021 
2022     bcopy(&dport, &data[datalen], sizeof(dport));
2023     datalen += sizeof(dport);
2024 
2025     uint32_t hash = 0;
2026     hash = toeplitz_hash(rsskey_len, rsskey, datalen, data);
2027 
2028     return ((hash & (reta_size - 1)) % nb_queues) == queueid;
2029 }
2030 
2031 void
2032 ff_regist_packet_dispatcher(dispatch_func_t func)
2033 {
2034     packet_dispatcher = func;
2035 }
2036 
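     /*
      * Convert the current TSC reading to nanoseconds.  The math is done
      * in double precision because cur_tsc * NS_PER_S would overflow
      * 64-bit arithmetic within seconds of uptime on GHz-class TSCs.
      */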
2037 uint64_t
2038 ff_get_tsc_ns(void)
2039 {
2040     uint64_t cur_tsc = rte_rdtsc();
2041     uint64_t hz = rte_get_tsc_hz();
2042     return ((double)cur_tsc/(double)hz) * NS_PER_S;
2043 }
2044 
2045