xref: /f-stack/lib/ff_dpdk_if.c (revision 2d9fd380)
1 /*
2  * Copyright (C) 2017 THL A29 Limited, a Tencent company.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice, this
9  *   list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright notice,
11  *   this list of conditions and the following disclaimer in the documentation
12  *   and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
18  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
21  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
23  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  *
25  */
26 #include <assert.h>
27 #include <unistd.h>
28 #include <sys/mman.h>
29 #include <errno.h>
30 
31 #include <rte_common.h>
32 #include <rte_byteorder.h>
33 #include <rte_log.h>
34 #include <rte_memory.h>
35 #include <rte_memcpy.h>
36 #include <rte_memzone.h>
37 #include <rte_config.h>
38 #include <rte_eal.h>
39 #include <rte_pci.h>
40 #include <rte_mbuf.h>
41 #include <rte_memory.h>
42 #include <rte_lcore.h>
43 #include <rte_launch.h>
44 #include <rte_ethdev.h>
45 #include <rte_debug.h>
46 #include <rte_common.h>
47 #include <rte_ether.h>
48 #include <rte_malloc.h>
49 #include <rte_cycles.h>
50 #include <rte_timer.h>
51 #include <rte_thash.h>
52 #include <rte_ip.h>
53 #include <rte_tcp.h>
54 #include <rte_udp.h>
55 #include <rte_eth_bond.h>
56 
57 #include "ff_dpdk_if.h"
58 #include "ff_dpdk_pcap.h"
59 #include "ff_dpdk_kni.h"
60 #include "ff_config.h"
61 #include "ff_veth.h"
62 #include "ff_host_interface.h"
63 #include "ff_msg.h"
64 #include "ff_api.h"
65 #include "ff_memory.h"
66 
67 #ifdef FF_KNI
68 #define KNI_MBUF_MAX 2048
69 #define KNI_QUEUE_SIZE 2048
70 
71 int enable_kni;
72 static int kni_accept;
73 static int knictl_action = FF_KNICTL_ACTION_DEFAULT;
74 #endif
75 
76 static int numa_on;
77 
78 static unsigned idle_sleep;
79 static unsigned pkt_tx_delay;
80 
81 static struct rte_timer freebsd_clock;
82 
83 /* Default RSS hash key, taken from Mellanox's Linux driver */
84 static uint8_t default_rsskey_40bytes[40] = {
85     0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
86     0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
87     0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
88     0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
89     0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
90 };
91 
92 static uint8_t default_rsskey_52bytes[52] = {
93     0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23,
94     0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30,
95     0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02,
96     0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c,
97     0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55,
98     0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e,
99     0x81, 0x15, 0x03, 0x66
100 };
101 
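/*
 * A 52-byte key built from the repeating 16-bit pattern 0x6d5a. With a
 * Toeplitz hash this makes RSS symmetric: a flow hashes to the same queue
 * regardless of whether its src/dst addresses and ports are swapped.
 */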
102 static uint8_t symmetric_rsskey[52] = {
103     0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
104     0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
105     0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
106     0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
107     0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
108     0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
109     0x6d, 0x5a, 0x6d, 0x5a
110 };
111 
112 static int rsskey_len = sizeof(default_rsskey_40bytes);
113 static uint8_t *rsskey = default_rsskey_40bytes;
114 
115 struct lcore_conf lcore_conf;
116 
117 struct rte_mempool *pktmbuf_pool[NB_SOCKETS];
118 
119 static pcblddr_func_t pcblddr_fun;
120 
121 static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
122 static dispatch_func_t packet_dispatcher;
123 
124 static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];
125 
126 #define BOND_DRIVER_NAME    "net_bonding"
127 
128 static inline int send_single_packet(struct rte_mbuf *m, uint8_t port);
129 
130 struct ff_msg_ring {
131     char ring_name[FF_MSG_NUM][RTE_RING_NAMESIZE];
132     /* ring[0]: messages sent by other processes, received by the f-stack lcore */
133     /* ring[1..]: replies sent by the f-stack lcore, read by other processes */
134     struct rte_ring *ring[FF_MSG_NUM];
135 } __rte_cache_aligned;
136 
137 static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
138 static struct rte_mempool *message_pool;
139 static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];
140 
141 static struct ff_top_args ff_top_status;
142 static struct ff_traffic_args ff_traffic;
143 extern void ff_hardclock(void);
144 
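/* Periodic rte_timer callback: drive the FreeBSD hardclock tick and refresh the cached timestamp. */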
145 static void
146 ff_hardclock_job(__rte_unused struct rte_timer *timer,
147     __rte_unused void *arg) {
148     ff_hardclock();
149     ff_update_current_ts();
150 }
151 
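/*
 * Bind a FreeBSD interface (softc/ifp) to its DPDK port so received mbufs
 * can be handed to the right ifnet and outgoing packets to the right port.
 */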
152 struct ff_dpdk_if_context *
153 ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
154 {
155     struct ff_dpdk_if_context *ctx;
156 
157     ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
158     if (ctx == NULL)
159         return NULL;
160 
161     ctx->sc = sc;
162     ctx->ifp = ifp;
163     ctx->port_id = cfg->port_id;
164     ctx->hw_features = cfg->hw_features;
165 
166     return ctx;
167 }
168 
169 void
170 ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
171 {
172     free(ctx);
173 }
174 
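/* Poll link status of all configured ports, waiting up to 9 seconds for every link to come up. */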
175 static void
176 check_all_ports_link_status(void)
177 {
178     #define CHECK_INTERVAL 100 /* 100ms */
179     #define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */
180 
182     uint8_t count, all_ports_up, print_flag = 0;
183     struct rte_eth_link link;
184 
185     printf("\nChecking link status");
186     fflush(stdout);
187 
188     int i, nb_ports;
189     nb_ports = ff_global_cfg.dpdk.nb_ports;
190     for (count = 0; count <= MAX_CHECK_TIME; count++) {
191         all_ports_up = 1;
192         for (i = 0; i < nb_ports; i++) {
193             uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
194             memset(&link, 0, sizeof(link));
195             rte_eth_link_get_nowait(portid, &link);
196 
197             /* print link status if flag set */
198             if (print_flag == 1) {
199                 if (link.link_status) {
200                     printf("Port %d Link Up - speed %u "
201                         "Mbps - %s\n", (int)portid,
202                         (unsigned)link.link_speed,
203                         (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
204                         ("full-duplex") : ("half-duplex"));
205                 } else {
206                     printf("Port %d Link Down\n", (int)portid);
207                 }
208                 continue;
209             }
210             /* clear all_ports_up flag if any link down */
211             if (link.link_status == 0) {
212                 all_ports_up = 0;
213                 break;
214             }
215         }
216 
217         /* after finally printing all link status, get out */
218         if (print_flag == 1)
219             break;
220 
221         if (all_ports_up == 0) {
222             printf(".");
223             fflush(stdout);
224             rte_delay_ms(CHECK_INTERVAL);
225         }
226 
227         /* set the print_flag if all ports up or timeout */
228         if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
229             print_flag = 1;
230             printf("done\n");
231         }
232     }
233 }
234 
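/*
 * Work out this process's role: for every configured port, find which RX/TX
 * queue (if any) is served by our lcore and record the mapping in the global
 * lcore_conf. Exits if the lcore ends up with no queue to poll.
 */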
235 static int
236 init_lcore_conf(void)
237 {
238     uint16_t nb_dev_ports = rte_eth_dev_count_avail();
239     if (nb_dev_ports == 0) {
240         rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
241     }
242 
243     if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
244         rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
245                  ff_global_cfg.dpdk.max_portid);
246     }
247 
248     lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
249     lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;
250 
251     uint16_t socket_id = 0;
252     if (numa_on) {
253         socket_id = rte_lcore_to_socket_id(rte_lcore_id());
254     }
255 
256     lcore_conf.socket_id = socket_id;
257 
258     uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
259     if (!rte_lcore_is_enabled(lcore_id)) {
260         rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
261     }
262 
263     int j;
264     for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
265         uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
266         struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
267 
268         int queueid = -1;
269         int i;
270         for (i = 0; i < pconf->nb_lcores; i++) {
271             if (pconf->lcore_list[i] == lcore_id) {
272                 queueid = i;
273             }
274         }
275         if (queueid < 0) {
276             continue;
277         }
278         printf("lcore: %u, port: %u, queue: %d\n", lcore_id, port_id, queueid);
279         uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
280         lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
281         lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
282         lcore_conf.nb_rx_queue++;
283 
284         lcore_conf.tx_queue_id[port_id] = queueid;
285         lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
286         lcore_conf.nb_tx_port++;
287 
288         /* Enable pcap dump */
289         if (ff_global_cfg.pcap.enable) {
290             ff_enable_pcap(ff_global_cfg.pcap.save_path, ff_global_cfg.pcap.snap_len);
291         }
292 
293         lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
294     }
295 
296     if (lcore_conf.nb_rx_queue == 0) {
297         rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
298     }
299 
300     return 0;
301 }
302 
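/*
 * Create (primary process) or look up (secondary) one pktmbuf pool per NUMA
 * socket in use. The pool size is a worst-case estimate covering the RX/TX
 * descriptor rings, in-flight bursts, per-lcore caches and, when enabled,
 * the KNI and dispatch rings, rounded up to a multiple of 8192.
 */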
303 static int
304 init_mem_pool(void)
305 {
306     uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
307     uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
308     uint32_t nb_tx_queue = nb_lcores;
309     uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;
310     uint16_t max_portid = ff_global_cfg.dpdk.max_portid;
311 
312     unsigned nb_mbuf = RTE_ALIGN_CEIL(
313         (nb_rx_queue * (max_portid + 1) * 2 * RX_QUEUE_SIZE +
314         nb_ports * (max_portid + 1) * 2 * nb_lcores * MAX_PKT_BURST +
315         nb_ports * (max_portid + 1) * 2 * nb_tx_queue * TX_QUEUE_SIZE +
316         nb_lcores * MEMPOOL_CACHE_SIZE +
317 #ifdef FF_KNI
318         nb_ports * KNI_MBUF_MAX +
319         nb_ports * KNI_QUEUE_SIZE +
320 #endif
321         nb_lcores * nb_ports * DISPATCH_RING_SIZE),
322         (unsigned)8192);
323 
324     unsigned socketid = 0;
325     uint16_t i, lcore_id;
326     char s[64];
327 
328     for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
329         lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
330         if (numa_on) {
331             socketid = rte_lcore_to_socket_id(lcore_id);
332         }
333 
334         if (socketid >= NB_SOCKETS) {
335             rte_exit(EXIT_FAILURE, "Socket %u of lcore %u is out of range %d\n",
336                 socketid, lcore_id, NB_SOCKETS);
337         }
338 
339         if (pktmbuf_pool[socketid] != NULL) {
340             continue;
341         }
342 
343         if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
344             snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
345             pktmbuf_pool[socketid] =
346                 rte_pktmbuf_pool_create(s, nb_mbuf,
347                     MEMPOOL_CACHE_SIZE, 0,
348                     RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
349         } else {
350             snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
351             pktmbuf_pool[socketid] = rte_mempool_lookup(s);
352         }
353 
354         if (pktmbuf_pool[socketid] == NULL) {
355             rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
356         } else {
357             printf("create mbuf pool on socket %d\n", socketid);
358         }
359 
360 #ifdef FF_USE_PAGE_ARRAY
361         nb_mbuf = RTE_ALIGN_CEIL(
362             nb_ports * nb_lcores * MAX_PKT_BURST +
363             nb_ports * nb_tx_queue * TX_QUEUE_SIZE +
364             nb_lcores * MEMPOOL_CACHE_SIZE,
365             (unsigned)4096);
366         ff_init_ref_pool(nb_mbuf, socketid);
367 #endif
368     }
369 
370     return 0;
371 }
372 
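/* Create a named ring in the primary process, or look it up in a secondary one; exits on failure. */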
373 static struct rte_ring *
374 create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
375 {
376     struct rte_ring *ring;
377 
378     if (name == NULL) {
379         rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
380     }
381 
382     if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
383         ring = rte_ring_create(name, count, socket_id, flags);
384     } else {
385         ring = rte_ring_lookup(name);
386     }
387 
388     if (ring == NULL) {
389         rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
390     }
391 
392     return ring;
393 }
394 
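/*
 * Set up one single-consumer ring per (port, queue) pair. Packets that the
 * dispatcher assigns to another queue, and clones of broadcast frames such
 * as ARP, are handed to their owning lcore through these rings.
 */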
395 static int
396 init_dispatch_ring(void)
397 {
398     int j;
399     char name_buf[RTE_RING_NAMESIZE];
400     int queueid;
401 
402     unsigned socketid = lcore_conf.socket_id;
403 
404     /* Create rings only for the ports actually in use. */
405     int nb_ports = ff_global_cfg.dpdk.nb_ports;
406     for (j = 0; j < nb_ports; j++) {
407         uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
408         struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
409         int nb_queues = pconf->nb_lcores;
410         if (dispatch_ring[portid] == NULL) {
411             snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);
412 
413             dispatch_ring[portid] = rte_zmalloc(name_buf,
414                 sizeof(struct rte_ring *) * nb_queues,
415                 RTE_CACHE_LINE_SIZE);
416             if (dispatch_ring[portid] == NULL) {
417                 rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
418                     "failed\n", name_buf);
419             }
420         }
421 
422         for (queueid = 0; queueid < nb_queues; ++queueid) {
423             snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
424                 portid, queueid);
425             dispatch_ring[portid][queueid] = create_ring(name_buf,
426                 DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);
427 
428             if (dispatch_ring[portid][queueid] == NULL)
429                 rte_panic("create ring:%s failed!\n", name_buf);
430 
431             printf("create ring:%s success, %u ring entries are now free!\n",
432                 name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
433         }
434     }
435 
436     return 0;
437 }
438 
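/* Mempool object constructor: each element is a struct ff_msg immediately followed by its data buffer. */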
439 static void
440 ff_msg_init(struct rte_mempool *mp,
441     __attribute__((unused)) void *opaque_arg,
442     void *obj, __attribute__((unused)) unsigned i)
443 {
444     struct ff_msg *msg = (struct ff_msg *)obj;
445     msg->msg_type = FF_UNKNOWN;
446     msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
447     msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
448     msg->original_buf = NULL;
449     msg->original_buf_len = 0;
450 }
451 
452 static int
453 init_msg_ring(void)
454 {
455     uint16_t i, j;
456     uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
457     unsigned socketid = lcore_conf.socket_id;
458 
459     /* Create message buffer pool */
460     if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
461         message_pool = rte_mempool_create(FF_MSG_POOL,
462            MSG_RING_SIZE * 2 * nb_procs,
463            MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
464            NULL, NULL, ff_msg_init, NULL,
465            socketid, 0);
466     } else {
467         message_pool = rte_mempool_lookup(FF_MSG_POOL);
468     }
469 
470     if (message_pool == NULL) {
471         rte_panic("Create msg mempool failed\n");
472     }
473 
474     for (i = 0; i < nb_procs; ++i) {
475         snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
476             "%s%u", FF_MSG_RING_IN, i);
477         msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
478             MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
479         if (msg_ring[i].ring[0] == NULL)
480             rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[0]);
481 
482         for (j = FF_SYSCTL; j < FF_MSG_NUM; j++) {
483             snprintf(msg_ring[i].ring_name[j], RTE_RING_NAMESIZE,
484                 "%s%u_%u", FF_MSG_RING_OUT, i, j);
485             msg_ring[i].ring[j] = create_ring(msg_ring[i].ring_name[j],
486                 MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
487             if (msg_ring[i].ring[j] == NULL)
488                 rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[j]);
489         }
490     }
491 
492     return 0;
493 }
494 
495 #ifdef FF_KNI
496 
497 static int get_kni_action(const char *c) {
498     if (!c)
499         return FF_KNICTL_ACTION_DEFAULT;
500     if (0 == strcasecmp(c, "alltokni")) {
501         return FF_KNICTL_ACTION_ALL_TO_KNI;
502     } else if (0 == strcasecmp(c, "alltoff")) {
503         return FF_KNICTL_ACTION_ALL_TO_FF;
504     } else if (0 == strcasecmp(c, "default")) {
505         return FF_KNICTL_ACTION_DEFAULT;
506     } else {
507         return FF_KNICTL_ACTION_DEFAULT;
508     }
509 }
510 
511 static int
512 init_kni(void)
513 {
514     int nb_ports = rte_eth_dev_count_avail();
515     kni_accept = 0;
516     if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
517         kni_accept = 1;
518 
519     knictl_action = get_kni_action(ff_global_cfg.kni.kni_action);
520 
521     ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
522         ff_global_cfg.kni.udp_port);
523 
524     unsigned socket_id = lcore_conf.socket_id;
525     struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];
526 
527     nb_ports = ff_global_cfg.dpdk.nb_ports;
528     int i, ret;
529     for (i = 0; i < nb_ports; i++) {
530         uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
531         ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
532     }
533 
534     return 0;
535 }
536 #endif
537 
538 /* RSS RETA updates fail when flow isolation is enabled. */
539 #ifndef FF_FLOW_ISOLATE
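/*
 * Program the NIC's RSS redirection table so that entry k points to queue
 * k % nb_queues, spreading flows evenly across the configured queues.
 */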
540 static void
541 set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
542 {
543     if (reta_size == 0) {
544         return;
545     }
546 
547     int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
548     struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];
549 
550     /* config HW indirection table */
551     unsigned i, j, hash = 0;
552     for (i = 0; i < reta_conf_size; i++) {
553         reta_conf[i].mask = ~0ULL;
554         for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
555             reta_conf[i].reta[j] = hash++ % nb_queues;
556         }
557     }
558 
559     if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
560         rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
561             port_id);
562     }
563 }
564 #endif
565 
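/*
 * Per-port bring-up: query device capabilities, pick the RSS key and hash
 * functions, enable the supported RX/TX offloads (VLAN strip, checksum,
 * TSO), set up one RX/TX queue pair per lcore, start the port and, unless
 * flow isolation is used, program the RSS redirection table. Bond slave
 * ports are initialized before the bonded device itself.
 */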
566 static int
567 init_port_start(void)
568 {
569     int nb_ports = ff_global_cfg.dpdk.nb_ports;
570     unsigned socketid = 0;
571     struct rte_mempool *mbuf_pool;
572     uint16_t i, j;
573 
574     for (i = 0; i < nb_ports; i++) {
575         uint16_t port_id, u_port_id = ff_global_cfg.dpdk.portid_list[i];
576         struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[u_port_id];
577         uint16_t nb_queues = pconf->nb_lcores;
578 
579         for (j = 0; j <= pconf->nb_slaves; j++) {
580             if (j < pconf->nb_slaves) {
581                 port_id = pconf->slave_portid_list[j];
582                 printf("Initializing %s slave %d (port %d)\n",
583                         ff_global_cfg.dpdk.bond_cfgs->name,
584                         j, port_id);
585             } else {
586                 port_id = u_port_id;
587             }
588 
589             struct rte_eth_dev_info dev_info;
590             struct rte_eth_conf port_conf = {0};
591             struct rte_eth_rxconf rxq_conf;
592             struct rte_eth_txconf txq_conf;
593 
594             int ret = rte_eth_dev_info_get(port_id, &dev_info);
595             if (ret != 0)
596                 rte_exit(EXIT_FAILURE,
597                     "Error during getting device (port %u) info: %s\n",
598                     port_id, strerror(-ret));
599 
600             if (nb_queues > dev_info.max_rx_queues) {
601                 rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
602                     nb_queues,
603                     dev_info.max_rx_queues);
604             }
605 
606             if (nb_queues > dev_info.max_tx_queues) {
607                 rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
608                     nb_queues,
609                     dev_info.max_tx_queues);
610             }
611 
612             struct rte_ether_addr addr;
613             rte_eth_macaddr_get(port_id, &addr);
614             printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
615                        " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
616                     (unsigned)port_id,
617                     addr.addr_bytes[0], addr.addr_bytes[1],
618                     addr.addr_bytes[2], addr.addr_bytes[3],
619                     addr.addr_bytes[4], addr.addr_bytes[5]);
620 
621             rte_memcpy(pconf->mac,
622                 addr.addr_bytes, RTE_ETHER_ADDR_LEN);
623 
624             /* Set RSS mode */
625             uint64_t default_rss_hf = ETH_RSS_PROTO_MASK;
626             port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
627             port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf;
628             if (dev_info.hash_key_size == 52) {
629                 rsskey = default_rsskey_52bytes;
630                 rsskey_len = 52;
631             }
632             if (ff_global_cfg.dpdk.symmetric_rss) {
633                 printf("Use symmetric Receive-side Scaling(RSS) key\n");
634                 rsskey = symmetric_rsskey;
635             }
636             port_conf.rx_adv_conf.rss_conf.rss_key = rsskey;
637             port_conf.rx_adv_conf.rss_conf.rss_key_len = rsskey_len;
638             port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads;
639             if (port_conf.rx_adv_conf.rss_conf.rss_hf !=
640                     ETH_RSS_PROTO_MASK) {
641                 printf("Port %u modified RSS hash function based on hardware support,"
642                         "requested:%#"PRIx64" configured:%#"PRIx64"\n",
643                         port_id, default_rss_hf,
644                         port_conf.rx_adv_conf.rss_conf.rss_hf);
645             }
646 
647             if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) {
648                 port_conf.txmode.offloads |=
649                     DEV_TX_OFFLOAD_MBUF_FAST_FREE;
650             }
651 
652             /* Set Rx VLAN stripping */
653             if (ff_global_cfg.dpdk.vlan_strip) {
654                 if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
655                     port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
656                 }
657             }
658 
659             /* Enable HW CRC stripping */
660             port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_KEEP_CRC;
661 
662             /* FIXME: enable TCP LRO? */
663             #if 0
664             if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
665                 printf("LRO is supported\n");
666                 port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
667                 pconf->hw_features.rx_lro = 1;
668             }
669             #endif
670 
671             /* Set Rx checksum checking */
672             if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
673                 (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
674                 (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
675                 printf("RX checksum offload supported\n");
676                 port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
677                 pconf->hw_features.rx_csum = 1;
678             }
679 
680             if (ff_global_cfg.dpdk.tx_csum_offoad_skip == 0) {
681                 if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
682                     printf("TX ip checksum offload supported\n");
683                     port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
684                     pconf->hw_features.tx_csum_ip = 1;
685                 }
686 
687                 if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
688                     (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
689                     printf("TX TCP&UDP checksum offload supported\n");
690                     port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM;
691                     pconf->hw_features.tx_csum_l4 = 1;
692                 }
693             } else {
694                 printf("TX checksum offoad is disabled\n");
695             }
696 
697             if (ff_global_cfg.dpdk.tso) {
698                 if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
699                     printf("TSO is supported\n");
700                     port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
701                     pconf->hw_features.tx_tso = 1;
702                 }
703             } else {
704                 printf("TSO is disabled\n");
705             }
706 
707             if (dev_info.reta_size) {
708                 /* reta size must be power of 2 */
709                 assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);
710 
711                 rss_reta_size[port_id] = dev_info.reta_size;
712                 printf("port[%d]: rss table size: %d\n", port_id,
713                     dev_info.reta_size);
714             }
715 
716             if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
717                 continue;
718             }
719 
720             ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
721             if (ret != 0) {
722                 return ret;
723             }
724 
725             uint16_t nb_rxd = RX_QUEUE_SIZE;
726             uint16_t nb_txd = TX_QUEUE_SIZE;
727             ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
728             if (ret < 0)
729                 printf("Could not adjust number of descriptors "
730                         "for port%u (%d)\n", (unsigned)port_id, ret);
731 
732             uint16_t q;
733             for (q = 0; q < nb_queues; q++) {
734                 if (numa_on) {
735                     uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
736                     socketid = rte_lcore_to_socket_id(lcore_id);
737                 }
738                 mbuf_pool = pktmbuf_pool[socketid];
739 
740                 txq_conf = dev_info.default_txconf;
741                 txq_conf.offloads = port_conf.txmode.offloads;
742                 ret = rte_eth_tx_queue_setup(port_id, q, nb_txd,
743                     socketid, &txq_conf);
744                 if (ret < 0) {
745                     return ret;
746                 }
747 
748                 rxq_conf = dev_info.default_rxconf;
749                 rxq_conf.offloads = port_conf.rxmode.offloads;
750                 ret = rte_eth_rx_queue_setup(port_id, q, nb_rxd,
751                     socketid, &rxq_conf, mbuf_pool);
752                 if (ret < 0) {
753                     return ret;
754                 }
755             }
756 
757 
758             if (strncmp(dev_info.driver_name, BOND_DRIVER_NAME,
759                     strlen(BOND_DRIVER_NAME)) == 0) {
760 
761                 rte_eth_macaddr_get(port_id, &addr);
762                 printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
763                            " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
764                         (unsigned)port_id,
765                         addr.addr_bytes[0], addr.addr_bytes[1],
766                         addr.addr_bytes[2], addr.addr_bytes[3],
767                         addr.addr_bytes[4], addr.addr_bytes[5]);
768 
769                 rte_memcpy(pconf->mac,
770                     addr.addr_bytes, RTE_ETHER_ADDR_LEN);
771 
772                 int mode, count, x;
773                 uint16_t slaves[RTE_MAX_ETHPORTS], len = RTE_MAX_ETHPORTS;
774 
775                 mode = rte_eth_bond_mode_get(port_id);
776                 printf("Port %u, bond mode:%d\n", port_id, mode);
777 
778                 count = rte_eth_bond_slaves_get(port_id, slaves, len);
779                 printf("Port %u, %s's slave ports count:%d\n", port_id,
780                             ff_global_cfg.dpdk.bond_cfgs->name, count);
781                 for (x = 0; x < count; x++) {
782                     printf("Port %u, %s's slave port[%u]\n", port_id,
783                             ff_global_cfg.dpdk.bond_cfgs->name, slaves[x]);
784                 }
785             }
786 
787             ret = rte_eth_dev_start(port_id);
788             if (ret < 0) {
789                 return ret;
790             }
791     /* RSS RETA updates fail when flow isolation is enabled. */
792     #ifndef FF_FLOW_ISOLATE
793             if (nb_queues > 1) {
794                 /*
795                  * FIXME: modify RSS set to FDIR
796                  */
797                 set_rss_table(port_id, dev_info.reta_size, nb_queues);
798             }
799     #endif
800 
801             /* Enable RX in promiscuous mode for the Ethernet device. */
802             if (ff_global_cfg.dpdk.promiscuous) {
803                 ret = rte_eth_promiscuous_enable(port_id);
804                 if (ret == 0) {
805                     printf("set port %u to promiscuous mode ok\n", port_id);
806                 } else {
807                     printf("set port %u to promiscuous mode error\n", port_id);
808                 }
809             }
810         }
811     }
812 
813     if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
814         check_all_ports_link_status();
815     }
816 
817     return 0;
818 }
819 
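/*
 * Arm a periodic timer that calls ff_hardclock at the configured FreeBSD
 * tick rate. For example, with freebsd.hz = 100 the interval is
 * MS_PER_S / 100 = 10 ms, converted to TSC cycles for rte_timer_reset.
 */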
820 static int
821 init_clock(void)
822 {
823     rte_timer_subsystem_init();
824     uint64_t hz = rte_get_timer_hz();
825     uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
826     uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;
827 
828     rte_timer_init(&freebsd_clock);
829     rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
830         rte_lcore_id(), &ff_hardclock_job, NULL);
831 
832     ff_update_current_ts();
833 
834     return 0;
835 }
836 
837 #ifdef FF_FLOW_ISOLATE
838 /** Print a message out of a flow error. */
839 static int
840 port_flow_complain(struct rte_flow_error *error)
841 {
842     static const char *const errstrlist[] = {
843         [RTE_FLOW_ERROR_TYPE_NONE] = "no error",
844         [RTE_FLOW_ERROR_TYPE_UNSPECIFIED] = "cause unspecified",
845         [RTE_FLOW_ERROR_TYPE_HANDLE] = "flow rule (handle)",
846         [RTE_FLOW_ERROR_TYPE_ATTR_GROUP] = "group field",
847         [RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY] = "priority field",
848         [RTE_FLOW_ERROR_TYPE_ATTR_INGRESS] = "ingress field",
849         [RTE_FLOW_ERROR_TYPE_ATTR_EGRESS] = "egress field",
850         [RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER] = "transfer field",
851         [RTE_FLOW_ERROR_TYPE_ATTR] = "attributes structure",
852         [RTE_FLOW_ERROR_TYPE_ITEM_NUM] = "pattern length",
853         [RTE_FLOW_ERROR_TYPE_ITEM_SPEC] = "item specification",
854         [RTE_FLOW_ERROR_TYPE_ITEM_LAST] = "item specification range",
855         [RTE_FLOW_ERROR_TYPE_ITEM_MASK] = "item specification mask",
856         [RTE_FLOW_ERROR_TYPE_ITEM] = "specific pattern item",
857         [RTE_FLOW_ERROR_TYPE_ACTION_NUM] = "number of actions",
858         [RTE_FLOW_ERROR_TYPE_ACTION_CONF] = "action configuration",
859         [RTE_FLOW_ERROR_TYPE_ACTION] = "specific action",
860     };
861     const char *errstr;
862     char buf[32];
863     int err = rte_errno;
864 
865     if ((unsigned int)error->type >= RTE_DIM(errstrlist) ||
866         !errstrlist[error->type])
867         errstr = "unknown type";
868     else
869         errstr = errstrlist[error->type];
870     printf("Caught error type %d (%s): %s%s: %s\n",
871            error->type, errstr,
872            error->cause ? (snprintf(buf, sizeof(buf), "cause: %p, ",
873                                     error->cause), buf) : "",
874            error->message ? error->message : "(no stated reason)",
875            rte_strerror(err));
876     return -err;
877 }
878 
879 static int
880 port_flow_isolate(uint16_t port_id, int set)
881 {
882     struct rte_flow_error error;
883 
884     /* Poisoning to make sure PMDs update it in case of error. */
885     memset(&error, 0x66, sizeof(error));
886     if (rte_flow_isolate(port_id, set, &error))
887         return port_flow_complain(&error);
888     printf("Ingress traffic on port %u is %s to the defined flow rules\n",
889            port_id,
890            set ? "now restricted" : "not restricted anymore");
891     return 0;
892 }
893 
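/*
 * Install two rte_flow rules for IPv4/TCP traffic on tcp_port: one matching
 * the destination port, one matching the source port. Both spread matching
 * packets over all worker queues with an RSS action using the global key.
 */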
894 static int
895 create_tcp_flow(uint16_t port_id, uint16_t tcp_port) {
896     struct rte_flow_attr attr = {.ingress = 1};
897     struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
898     int nb_queues = pconf->nb_lcores;
899     uint16_t queue[RTE_MAX_QUEUES_PER_PORT];
900     int i = 0, j = 0;
901     for (i = 0, j = 0; i < nb_queues; ++i)
902         queue[j++] = i;
903     struct rte_flow_action_rss rss = {
904         .types = ETH_RSS_NONFRAG_IPV4_TCP,
905         .key_len = rsskey_len,
906         .key = rsskey,
907         .queue_num = j,
908         .queue = queue,
909     };
910 
911     struct rte_eth_dev_info dev_info;
912     int ret = rte_eth_dev_info_get(port_id, &dev_info);
913     if (ret != 0)
914         rte_exit(EXIT_FAILURE, "Error during getting device (port %u) info: %s\n", port_id, strerror(-ret));
915 
916     struct rte_flow_item pattern[3];
917     struct rte_flow_action action[2];
918     struct rte_flow_item_tcp tcp_spec;
919     struct rte_flow_item_tcp tcp_mask = {
920         .hdr = {
921             .src_port = RTE_BE16(0x0000),
922             .dst_port = RTE_BE16(0xffff),
923         },
924     };
925     struct rte_flow_error error;
926 
927     memset(pattern, 0, sizeof(pattern));
928     memset(action, 0, sizeof(action));
929 
930     /* match any IPv4 packet */
931     pattern[0].type = RTE_FLOW_ITEM_TYPE_IPV4;
932 
933     memset(&tcp_spec, 0, sizeof(struct rte_flow_item_tcp));
934     tcp_spec.hdr.dst_port = rte_cpu_to_be_16(tcp_port);
935     pattern[1].type = RTE_FLOW_ITEM_TYPE_TCP;
936     pattern[1].spec = &tcp_spec;
937     pattern[1].mask = &tcp_mask;
938 
939     /* end the pattern array */
940     pattern[2].type = RTE_FLOW_ITEM_TYPE_END;
941 
942     /* create the action */
943     action[0].type = RTE_FLOW_ACTION_TYPE_RSS;
944     action[0].conf = &rss;
945     action[1].type = RTE_FLOW_ACTION_TYPE_END;
946 
947     struct rte_flow *flow;
948     /* validate and create the flow rule */
949     if (!rte_flow_validate(port_id, &attr, pattern, action, &error)) {
950         flow = rte_flow_create(port_id, &attr, pattern, action, &error);
951         if (!flow) {
952             return port_flow_complain(&error);
953         }
954     }
955 
956     memset(pattern, 0, sizeof(pattern));
957 
958     /* match any IPv4 packet */
959     pattern[0].type = RTE_FLOW_ITEM_TYPE_IPV4;
960 
961     struct rte_flow_item_tcp tcp_src_mask = {
962         .hdr = {
963             .src_port = RTE_BE16(0xffff),
964             .dst_port = RTE_BE16(0x0000),
965         },
966     };
967 
968     memset(&tcp_spec, 0, sizeof(struct rte_flow_item_tcp));
969     tcp_spec.hdr.src_port = rte_cpu_to_be_16(tcp_port);
970     pattern[1].type = RTE_FLOW_ITEM_TYPE_TCP;
971     pattern[1].spec = &tcp_spec;
972     pattern[1].mask = &tcp_src_mask;
973 
974     /* end the pattern array */
975     pattern[2].type = RTE_FLOW_ITEM_TYPE_END;
976 
977     /* validate and create the flow rule */
978     if (!rte_flow_validate(port_id, &attr, pattern, action, &error)) {
979         flow = rte_flow_create(port_id, &attr, pattern, action, &error);
980         if (!flow) {
981             return port_flow_complain(&error);
982         }
983     }
984 
985     return 1;
986 }
987 
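/*
 * Flow setup for an isolated port: TCP rules via create_tcp_flow, plus a
 * rule steering ARP frames (EtherType 0x0806) to queue 0.
 */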
988 static int
989 init_flow(uint16_t port_id, uint16_t tcp_port) {
990     // struct ff_flow_cfg fcfg = ff_global_cfg.dpdk.flow_cfgs[0];
991 
992     // int i;
993     // for (i = 0; i < fcfg.nb_port; i++) {
994     //     if(!create_tcp_flow(fcfg.port_id, fcfg.tcp_ports[i])) {
995     //         return 0;
996     //     }
997     // }
998 
999     if (!create_tcp_flow(port_id, tcp_port)) {
1000         rte_exit(EXIT_FAILURE, "create tcp flow failed\n");
1001         return -1;
1002     }
1003 
1004     /* ARP rule */
1005     struct rte_flow_attr attr = {.ingress = 1};
1006     struct rte_flow_action_queue queue = {.index = 0};
1007 
1008     struct rte_flow_item pattern_[2];
1009     struct rte_flow_action action[2];
1010     struct rte_flow_item_eth eth_type = {.type = RTE_BE16(0x0806)};
1011     struct rte_flow_item_eth eth_mask = {
1012         .type = RTE_BE16(0xffff)
1013     };
1014 
1015     memset(pattern_, 0, sizeof(pattern_));
1016     memset(action, 0, sizeof(action));
1017 
1018     pattern_[0].type = RTE_FLOW_ITEM_TYPE_ETH;
1019     pattern_[0].spec = &eth_type;
1020     pattern_[0].mask = &eth_mask;
1021 
1022     pattern_[1].type = RTE_FLOW_ITEM_TYPE_END;
1023 
1024     /* create the action */
1025     action[0].type = RTE_FLOW_ACTION_TYPE_QUEUE;
1026     action[0].conf = &queue;
1027     action[1].type = RTE_FLOW_ACTION_TYPE_END;
1028 
1029     struct rte_flow *flow;
1030     struct rte_flow_error error;
1031     /* validate and create the flow rule */
1032     if (!rte_flow_validate(port_id, &attr, pattern_, action, &error)) {
1033         flow = rte_flow_create(port_id, &attr, pattern_, action, &error);
1034         if (!flow) {
1035             return port_flow_complain(&error);
1036         }
1037     }
1038 
1039     return 1;
1040 }
1041 
1042 #endif
1043 
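/*
 * Main initialization entry point: validate the process configuration, boot
 * the DPDK EAL, then set up the lcore mapping, mbuf pools, dispatch and
 * message rings, optional KNI and flow isolation, the ports themselves and
 * finally the FreeBSD clock timer.
 */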
1044 int
1045 ff_dpdk_init(int argc, char **argv)
1046 {
1047     if (ff_global_cfg.dpdk.nb_procs < 1 ||
1048         ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
1049         ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
1050         ff_global_cfg.dpdk.proc_id < 0) {
1051         printf("param num_procs[%d] or proc_id[%d] error!\n",
1052             ff_global_cfg.dpdk.nb_procs,
1053             ff_global_cfg.dpdk.proc_id);
1054         exit(1);
1055     }
1056 
1057     int ret = rte_eal_init(argc, argv);
1058     if (ret < 0) {
1059         rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1060     }
1061 
1062     numa_on = ff_global_cfg.dpdk.numa_on;
1063 
1064     idle_sleep = ff_global_cfg.dpdk.idle_sleep;
1065     pkt_tx_delay = ff_global_cfg.dpdk.pkt_tx_delay > BURST_TX_DRAIN_US ? \
1066         BURST_TX_DRAIN_US : ff_global_cfg.dpdk.pkt_tx_delay;
1067 
1068     init_lcore_conf();
1069 
1070     init_mem_pool();
1071 
1072     init_dispatch_ring();
1073 
1074     init_msg_ring();
1075 
1076 #ifdef FF_KNI
1077     enable_kni = ff_global_cfg.kni.enable;
1078     if (enable_kni) {
1079         init_kni();
1080     }
1081 #endif
1082 
1083 #ifdef FF_USE_PAGE_ARRAY
1084     ff_mmap_init();
1085 #endif
1086 
1087 #ifdef FF_FLOW_ISOLATE
1088     /* Flow isolation setup must run only once, from the process that owns queue 0. */
1089     if (0 == lcore_conf.tx_queue_id[0]) {
1090         ret = port_flow_isolate(0, 1);
1091         if (ret < 0)
1092             rte_exit(EXIT_FAILURE, "init_port_isolate failed\n");
1093     }
1094 #endif
1095 
1096     ret = init_port_start();
1097     if (ret < 0) {
1098         rte_exit(EXIT_FAILURE, "init_port_start failed\n");
1099     }
1100 
1101     init_clock();
1102 #ifdef FF_FLOW_ISOLATE
1103     /* Example usage only: port_id = 0, tcp_port = 80. Recommended improvements:
1104      * 1. call init_flow from the init_port_start loop in place of set_rss_table,
1105      *    so that every port in port_id_list is configured instead of only port 0;
1106      * 2. take the TCP port from the `tcp_port` config option instead of the magic number 80. */
1107     ret = init_flow(0, 80);
1108     if (ret < 0) {
1109         rte_exit(EXIT_FAILURE, "init_port_flow failed\n");
1110     }
1111 #endif
1112     return 0;
1113 }
1114 
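/*
 * Hand a received mbuf chain to the FreeBSD stack: drop it if hardware
 * checksum validation failed, wrap every DPDK segment in a BSD mbuf, carry
 * over the stripped VLAN tag, then inject it into the ifnet.
 */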
1115 static void
1116 ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
1117 {
1118     uint8_t rx_csum = ctx->hw_features.rx_csum;
1119     if (rx_csum) {
1120         if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
1121             rte_pktmbuf_free(pkt);
1122             return;
1123         }
1124     }
1125 
1126     void *data = rte_pktmbuf_mtod(pkt, void*);
1127     uint16_t len = rte_pktmbuf_data_len(pkt);
1128 
1129     void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
1130     if (hdr == NULL) {
1131         rte_pktmbuf_free(pkt);
1132         return;
1133     }
1134 
1135     if (pkt->ol_flags & PKT_RX_VLAN_STRIPPED) {
1136         ff_mbuf_set_vlan_info(hdr, pkt->vlan_tci);
1137     }
1138 
1139     struct rte_mbuf *pn = pkt->next;
1140     void *prev = hdr;
1141     while (pn != NULL) {
1142         data = rte_pktmbuf_mtod(pn, void*);
1143         len = rte_pktmbuf_data_len(pn);
1144 
1145         void *mb = ff_mbuf_get(prev, pn, data, len);
1146         if (mb == NULL) {
1147             ff_mbuf_free(hdr);
1148             rte_pktmbuf_free(pkt);
1149             return;
1150         }
1151         pn = pn->next;
1152         prev = mb;
1153     }
1154 
1155     ff_veth_process_packet(ctx->ifp, hdr);
1156 }
1157 
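/*
 * Classify a frame by EtherType (skipping one VLAN tag if present): ARP is
 * always reported so it can be broadcast to every queue; other protocols
 * may be diverted to KNI depending on the configured policy.
 */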
1158 static enum FilterReturn
1159 protocol_filter(const void *data, uint16_t len)
1160 {
1161     if (len < RTE_ETHER_HDR_LEN)
1162         return FILTER_UNKNOWN;
1163 
1164     const struct rte_ether_hdr *hdr;
1165     const struct rte_vlan_hdr *vlanhdr;
1166     hdr = (const struct rte_ether_hdr *)data;
1167     uint16_t ether_type = rte_be_to_cpu_16(hdr->ether_type);
1168     data += RTE_ETHER_HDR_LEN;
1169     len -= RTE_ETHER_HDR_LEN;
1170 
1171     if (ether_type == RTE_ETHER_TYPE_VLAN) {
1172         vlanhdr = (struct rte_vlan_hdr *)data;
1173         ether_type = rte_be_to_cpu_16(vlanhdr->eth_proto);
1174         data += sizeof(struct rte_vlan_hdr);
1175         len -= sizeof(struct rte_vlan_hdr);
1176     }
1177 
1178     if (ether_type == RTE_ETHER_TYPE_ARP)
1179         return FILTER_ARP;
1180 
1181 #ifdef INET6
1182     if (ether_type == RTE_ETHER_TYPE_IPV6) {
1183         return ff_kni_proto_filter(data,
1184             len, ether_type);
1185     }
1186 #endif
1187 
1188 #ifndef FF_KNI
1189     return FILTER_UNKNOWN;
1190 #else
1191     if (!enable_kni) {
1192         return FILTER_UNKNOWN;
1193     }
1194 
1195     if (ether_type != RTE_ETHER_TYPE_IPV4)
1196         return FILTER_UNKNOWN;
1197 
1198     return ff_kni_proto_filter(data,
1199         len, ether_type);
1200 #endif
1201 }
1202 
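/* Copy one segment's payload and metadata into mi; a deep copy, unlike the reference-counting rte_pktmbuf_attach. */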
1203 static inline void
1204 pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
1205 {
1206     struct rte_mbuf *md;
1207     void *src, *dst;
1208 
1209     dst = rte_pktmbuf_mtod(mi, void *);
1210     src = rte_pktmbuf_mtod(m, void *);
1211 
1212     mi->data_len = m->data_len;
1213     rte_memcpy(dst, src, m->data_len);
1214 
1215     mi->port = m->port;
1216     mi->vlan_tci = m->vlan_tci;
1217     mi->vlan_tci_outer = m->vlan_tci_outer;
1218     mi->tx_offload = m->tx_offload;
1219     mi->hash = m->hash;
1220     mi->ol_flags = m->ol_flags;
1221     mi->packet_type = m->packet_type;
1222 }
1223 
1224 /* Deep-copy variant of rte_pktmbuf_clone: copies the data instead of attaching to it. */
1225 static inline struct rte_mbuf *
1226 pktmbuf_deep_clone(const struct rte_mbuf *md,
1227     struct rte_mempool *mp)
1228 {
1229     struct rte_mbuf *mc, *mi, **prev;
1230     uint32_t pktlen;
1231     uint8_t nseg;
1232 
1233     if (unlikely ((mc = rte_pktmbuf_alloc(mp)) == NULL))
1234         return NULL;
1235 
1236     mi = mc;
1237     prev = &mi->next;
1238     pktlen = md->pkt_len;
1239     nseg = 0;
1240 
1241     do {
1242         nseg++;
1243         pktmbuf_deep_attach(mi, md);
1244         *prev = mi;
1245         prev = &mi->next;
1246     } while ((md = md->next) != NULL &&
1247         (mi = rte_pktmbuf_alloc(mp)) != NULL);
1248 
1249     *prev = NULL;
1250     mc->nb_segs = nseg;
1251     mc->pkt_len = pktlen;
1252 
1253     /* Allocation of new indirect segment failed */
1254     if (unlikely (mi == NULL)) {
1255         rte_pktmbuf_free(mc);
1256         return NULL;
1257     }
1258 
1259     __rte_mbuf_sanity_check(mc, 1);
1260     return mc;
1261 }
1262 
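/*
 * Per-packet fast path: count traffic, optionally dump to pcap, run the
 * user dispatch callback (which may answer the packet directly, drop it, or
 * move it to another queue's ring), broadcast ARP/NDP to all queues and to
 * KNI, and finally pass the packet to the FreeBSD stack or to KNI.
 */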
1263 static inline void
1264 process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
1265     uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
1266 {
1267     struct lcore_conf *qconf = &lcore_conf;
1268     uint16_t nb_queues = qconf->nb_queue_list[port_id];
1269 
1270     uint16_t i;
1271     for (i = 0; i < count; i++) {
1272         struct rte_mbuf *rtem = bufs[i];
1273 
1274         if (unlikely(ff_global_cfg.pcap.enable)) {
1275             if (!pkts_from_ring) {
1276                 ff_dump_packets(ff_global_cfg.pcap.save_path, rtem, ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
1277             }
1278         }
1279 
1280         void *data = rte_pktmbuf_mtod(rtem, void*);
1281         uint16_t len = rte_pktmbuf_data_len(rtem);
1282 
1283         if (!pkts_from_ring) {
1284             ff_traffic.rx_packets++;
1285             ff_traffic.rx_bytes += len;
1286         }
1287 
1288         if (!pkts_from_ring && packet_dispatcher) {
1289             int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues);
1290             if (ret == FF_DISPATCH_RESPONSE) {
1291                 rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len;
1292 
1293                 /*
1294                  * TX VLAN offloading is not supported here, so re-insert the stripped VLAN header in software.
1295                  */
1296                 if (rtem->vlan_tci) {
1297                     data = rte_pktmbuf_prepend(rtem, sizeof(struct rte_vlan_hdr));
1298                     if (data != NULL) {
1299                         memmove(data, data + sizeof(struct rte_vlan_hdr), RTE_ETHER_HDR_LEN);
1300                         struct rte_ether_hdr *etherhdr = (struct rte_ether_hdr *)data;
1301                         struct rte_vlan_hdr *vlanhdr = (struct rte_vlan_hdr *)(data + RTE_ETHER_HDR_LEN);
1302                         vlanhdr->vlan_tci = rte_cpu_to_be_16(rtem->vlan_tci);
1303                         vlanhdr->eth_proto = etherhdr->ether_type;
1304                         etherhdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN);
1305                     }
1306                 }
1307                 send_single_packet(rtem, port_id);
1308                 continue;
1309             }
1310 
1311             if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) {
1312                 rte_pktmbuf_free(rtem);
1313                 continue;
1314             }
1315 
1316             if (ret != queue_id) {
1317                 ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
1318                 if (ret < 0)
1319                     rte_pktmbuf_free(rtem);
1320 
1321                 continue;
1322             }
1323         }
1324 
1325         enum FilterReturn filter = protocol_filter(data, len);
1326 #ifdef INET6
1327         if (filter == FILTER_ARP || filter == FILTER_NDP) {
1328 #else
1329         if (filter == FILTER_ARP) {
1330 #endif
1331             struct rte_mempool *mbuf_pool;
1332             struct rte_mbuf *mbuf_clone;
1333             if (!pkts_from_ring) {
1334                 uint16_t j;
1335                 for (j = 0; j < nb_queues; ++j) {
1336                     if (j == queue_id)
1337                         continue;
1338 
1339                     unsigned socket_id = 0;
1340                     if (numa_on) {
1341                         uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
1342                         socket_id = rte_lcore_to_socket_id(lcore_id);
1343                     }
1344                     mbuf_pool = pktmbuf_pool[socket_id];
1345                     mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
1346                     if (mbuf_clone) {
1347                         int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
1348                             mbuf_clone);
1349                         if (ret < 0)
1350                             rte_pktmbuf_free(mbuf_clone);
1351                     }
1352                 }
1353             }
1354 
1355 #ifdef FF_KNI
1356             if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
1357                 mbuf_pool = pktmbuf_pool[qconf->socket_id];
1358                 mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
1359                 if (mbuf_clone) {
1360                     ff_kni_enqueue(port_id, mbuf_clone);
1361                 }
1362             }
1363 #endif
1364             ff_veth_input(ctx, rtem);
1365 #ifdef FF_KNI
1366         } else if (enable_kni) {
1367             if (knictl_action == FF_KNICTL_ACTION_ALL_TO_KNI) {
1368                 ff_kni_enqueue(port_id, rtem);
1369             } else if (knictl_action == FF_KNICTL_ACTION_ALL_TO_FF) {
1370                 ff_veth_input(ctx, rtem);
1371             } else if (knictl_action == FF_KNICTL_ACTION_DEFAULT) {
1372                 if (enable_kni &&
1373                         ((filter == FILTER_KNI && kni_accept) ||
1374                         (filter == FILTER_UNKNOWN && !kni_accept)) ) {
1375                         ff_kni_enqueue(port_id, rtem);
1376                 } else {
1377                     ff_veth_input(ctx, rtem);
1378                 }
1379             } else {
1380                 ff_veth_input(ctx, rtem);
1381             }
1382 #endif
1383         } else {
1384             ff_veth_input(ctx, rtem);
1385         }
1386     }
1387 }
1388 
1389 static inline int
1390 process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
1391     struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
1392 {
1393     /* Read packets from the dispatch ring and process them. */
1394     uint16_t nb_rb;
1395     nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
1396         (void **)pkts_burst, MAX_PKT_BURST, NULL);
1397 
1398     if (nb_rb > 0) {
1399         process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
1400     }
1401 
1402     return 0;
1403 }
1404 
1405 static inline void
1406 handle_sysctl_msg(struct ff_msg *msg)
1407 {
1408     int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
1409         msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
1410         msg->sysctl.newlen);
1411 
1412     if (ret < 0) {
1413         msg->result = errno;
1414     } else {
1415         msg->result = 0;
1416     }
1417 }
1418 
1419 static inline void
1420 handle_ioctl_msg(struct ff_msg *msg)
1421 {
1422     int fd, ret;
1423 #ifdef INET6
1424     if (msg->msg_type == FF_IOCTL6) {
1425         fd = ff_socket(AF_INET6, SOCK_DGRAM, 0);
1426     } else
1427 #endif
1428         fd = ff_socket(AF_INET, SOCK_DGRAM, 0);
1429 
1430     if (fd < 0) {
1431         ret = -1;
1432         goto done;
1433     }
1434 
1435     ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);
1436 
1437     ff_close(fd);
1438 
1439 done:
1440     if (ret < 0) {
1441         msg->result = errno;
1442     } else {
1443         msg->result = 0;
1444     }
1445 }
1446 
1447 static inline void
1448 handle_route_msg(struct ff_msg *msg)
1449 {
1450     int ret = ff_rtioctl(msg->route.fib, msg->route.data,
1451         &msg->route.len, msg->route.maxlen);
1452     if (ret < 0) {
1453         msg->result = errno;
1454     } else {
1455         msg->result = 0;
1456     }
1457 }
1458 
1459 static inline void
1460 handle_top_msg(struct ff_msg *msg)
1461 {
1462     msg->top = ff_top_status;
1463     msg->result = 0;
1464 }
1465 
1466 #ifdef FF_NETGRAPH
1467 static inline void
1468 handle_ngctl_msg(struct ff_msg *msg)
1469 {
1470     int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
1471     if (ret < 0) {
1472         msg->result = errno;
1473     } else {
1474         msg->result = 0;
1475         msg->ngctl.ret = ret;
1476     }
1477 }
1478 #endif
1479 
1480 #ifdef FF_IPFW
1481 static inline void
1482 handle_ipfw_msg(struct ff_msg *msg)
1483 {
1484     int fd, ret;
1485     fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
1486     if (fd < 0) {
1487         ret = -1;
1488         goto done;
1489     }
1490 
1491     switch (msg->ipfw.cmd) {
1492         case FF_IPFW_GET:
1493             ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
1494                 msg->ipfw.optname, msg->ipfw.optval,
1495                 msg->ipfw.optlen);
1496             break;
1497         case FF_IPFW_SET:
1498             ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
1499                 msg->ipfw.optname, msg->ipfw.optval,
1500                 *(msg->ipfw.optlen));
1501             break;
1502         default:
1503             ret = -1;
1504             errno = ENOTSUP;
1505             break;
1506     }
1507 
1508     ff_close(fd);
1509 
1510 done:
1511     if (ret < 0) {
1512         msg->result = errno;
1513     } else {
1514         msg->result = 0;
1515     }
1516 }
1517 #endif
1518 
1519 static inline void
1520 handle_traffic_msg(struct ff_msg *msg)
1521 {
1522     msg->traffic = ff_traffic;
1523     msg->result = 0;
1524 }
1525 
1526 #ifdef FF_KNI
1527 static inline void
1528 handle_knictl_msg(struct ff_msg *msg)
1529 {
1530     if (msg->knictl.kni_cmd == FF_KNICTL_CMD_SET) {
1531         switch (msg->knictl.kni_action) {
1532             case FF_KNICTL_ACTION_ALL_TO_FF: knictl_action = FF_KNICTL_ACTION_ALL_TO_FF; msg->result = 0; printf("new kni action: alltoff\n"); break;
1533             case FF_KNICTL_ACTION_ALL_TO_KNI: knictl_action = FF_KNICTL_ACTION_ALL_TO_KNI; msg->result = 0; printf("new kni action: alltokni\n"); break;
1534             case FF_KNICTL_ACTION_DEFAULT: knictl_action = FF_KNICTL_ACTION_DEFAULT; msg->result = 0; printf("new kni action: default\n"); break;
1535             default: msg->result = -1;
1536         }
1537     }
1538     else if (msg->knictl.kni_cmd == FF_KNICTL_CMD_GET) {
1539         msg->knictl.kni_action = knictl_action;
        msg->result = 0;
1540     } else {
1541         msg->result = -2;
1542     }
1543 }
1544 #endif
1545 
1546 static inline void
1547 handle_default_msg(struct ff_msg *msg)
1548 {
1549     msg->result = ENOTSUP;
1550 }
1551 
1552 static inline void
1553 handle_msg(struct ff_msg *msg, uint16_t proc_id)
1554 {
1555     switch (msg->msg_type) {
1556         case FF_SYSCTL:
1557             handle_sysctl_msg(msg);
1558             break;
1559         case FF_IOCTL:
1560 #ifdef INET6
1561         case FF_IOCTL6:
1562 #endif
1563             handle_ioctl_msg(msg);
1564             break;
1565         case FF_ROUTE:
1566             handle_route_msg(msg);
1567             break;
1568         case FF_TOP:
1569             handle_top_msg(msg);
1570             break;
1571 #ifdef FF_NETGRAPH
1572         case FF_NGCTL:
1573             handle_ngctl_msg(msg);
1574             break;
1575 #endif
1576 #ifdef FF_IPFW
1577         case FF_IPFW_CTL:
1578             handle_ipfw_msg(msg);
1579             break;
1580 #endif
1581         case FF_TRAFFIC:
1582             handle_traffic_msg(msg);
1583             break;
1584 #ifdef FF_KNI
1585         case FF_KNICTL:
1586             handle_knictl_msg(msg);
1587             break;
1588 #endif
1589         default:
1590             handle_default_msg(msg);
1591             break;
1592     }
1593     if (rte_ring_enqueue(msg_ring[proc_id].ring[msg->msg_type], msg) < 0) {
1594         if (msg->original_buf) {
1595             rte_free(msg->buf_addr);
1596             msg->buf_addr = msg->original_buf;
1597             msg->buf_len = msg->original_buf_len;
1598             msg->original_buf = NULL;
1599         }
1600 
1601         rte_mempool_put(message_pool, msg);
1602     }
1603 }
1604 
1605 static inline int
1606 process_msg_ring(uint16_t proc_id, struct rte_mbuf **pkts_burst)
1607 {
1608     /* Read messages from the message ring and process them. */
1609     uint16_t nb_rb;
1610     int i;
1611 
1612     nb_rb = rte_ring_dequeue_burst(msg_ring[proc_id].ring[0],
1613         (void **)pkts_burst, MAX_PKT_BURST, NULL);
1614 
1615     if (likely(nb_rb == 0))
1616         return 0;
1617 
1618     for (i = 0; i < nb_rb; ++i) {
1619         handle_msg((struct ff_msg *)pkts_burst[i], proc_id);
1620     }
1621 
1622     return 0;
1623 }
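
/*
 * Illustrative sketch of the requesting side of this message protocol (not
 * part of this file; error handling elided, proc_id as in
 * process_msg_ring()). A control process allocates an ff_msg from the
 * shared pool, enqueues it on ring[0], and polls the per-type ring for the
 * reply that handle_msg() sends back:
 *
 *   struct ff_msg *msg;
 *   if (rte_mempool_get(message_pool, (void **)&msg) < 0)
 *       return;
 *   msg->msg_type = FF_TRAFFIC;
 *   rte_ring_enqueue(msg_ring[proc_id].ring[0], msg);
 *   while (rte_ring_dequeue(msg_ring[proc_id].ring[FF_TRAFFIC],
 *       (void **)&msg) != 0)
 *       ;  // poll until the f-stack loop replies
 *   // on success msg->result == 0 and msg->traffic holds the counters
 *   rte_mempool_put(message_pool, msg);
 */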
1624 
1625 /* Send burst of packets on an output interface */
1626 static inline int
1627 send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
1628 {
1629     struct rte_mbuf **m_table;
1630     int ret;
1631     uint16_t queueid;
1632 
1633     queueid = qconf->tx_queue_id[port];
1634     m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;
1635 
1636     if (unlikely(ff_global_cfg.pcap.enable)) {
1637         uint16_t i;
1638         for (i = 0; i < n; i++) {
            ff_dump_packets(ff_global_cfg.pcap.save_path, m_table[i],
                ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
1641         }
1642     }
1643 
1644     ret = rte_eth_tx_burst(port, queueid, m_table, n);
1645     ff_traffic.tx_packets += ret;
1646     uint16_t i;
1647     for (i = 0; i < ret; i++) {
1648         ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]);
1649 #ifdef FF_USE_PAGE_ARRAY
1650         if (qconf->tx_mbufs[port].bsd_m_table[i])
1651             ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs);
1652 #endif
1653     }
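    /* Drop and free whatever the NIC did not accept for transmission. */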
1654     if (unlikely(ret < n)) {
1655         do {
1656             rte_pktmbuf_free(m_table[ret]);
1657 #ifdef FF_USE_PAGE_ARRAY
            if (qconf->tx_mbufs[port].bsd_m_table[ret])
1659                 ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]);
1660 #endif
1661         } while (++ret < n);
1662     }
1663     return 0;
1664 }
1665 
1666 /* Enqueue a single packet, and send burst if queue is filled */
1667 static inline int
1668 send_single_packet(struct rte_mbuf *m, uint8_t port)
1669 {
1670     uint16_t len;
1671     struct lcore_conf *qconf;
1672 
1673     qconf = &lcore_conf;
1674     len = qconf->tx_mbufs[port].len;
1675     qconf->tx_mbufs[port].m_table[len] = m;
1676     len++;
1677 
1678     /* enough pkts to be sent */
1679     if (unlikely(len == MAX_PKT_BURST)) {
1680         send_burst(qconf, MAX_PKT_BURST, port);
1681         len = 0;
1682     }
1683 
1684     qconf->tx_mbufs[port].len = len;
1685     return 0;
1686 }
1687 
1688 int
1689 ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
1690     int total)
1691 {
1692 #ifdef FF_USE_PAGE_ARRAY
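    /*
     * Page-array TX path: the bsd mbuf chain is handed to the NIC without
     * the per-segment copy performed below.
     */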
1693     struct lcore_conf *qconf = &lcore_conf;
    int len = 0;

    len = ff_if_send_onepkt(ctx, m, total);
1697     if (unlikely(len == MAX_PKT_BURST)) {
1698         send_burst(qconf, MAX_PKT_BURST, ctx->port_id);
1699         len = 0;
1700     }
1701     qconf->tx_mbufs[ctx->port_id].len = len;
1702     return 0;
1703 #endif
1704     struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
1705     struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
1706     if (head == NULL) {
1707         ff_mbuf_free(m);
1708         return -1;
1709     }
1710 
1711     head->pkt_len = total;
1712     head->nb_segs = 0;
1713 
1714     int off = 0;
1715     struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
1717         if (cur == NULL) {
1718             cur = rte_pktmbuf_alloc(mbuf_pool);
1719             if (cur == NULL) {
1720                 rte_pktmbuf_free(head);
1721                 ff_mbuf_free(m);
1722                 return -1;
1723             }
1724         }
1725 
1726         if (prev != NULL) {
1727             prev->next = cur;
1728         }
1729         head->nb_segs++;
1730 
1731         prev = cur;
1732         void *data = rte_pktmbuf_mtod(cur, void*);
1733         int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
1734         int ret = ff_mbuf_copydata(m, data, off, len);
1735         if (ret < 0) {
1736             rte_pktmbuf_free(head);
1737             ff_mbuf_free(m);
1738             return -1;
1739         }
1740 
1742         cur->data_len = len;
1743         off += len;
1744         total -= len;
1745         cur = NULL;
1746     }
1747 
1748     struct ff_tx_offload offload = {0};
1749     ff_mbuf_tx_offload(m, &offload);
1750 
1751     void *data = rte_pktmbuf_mtod(head, void*);
1752 
1753     if (offload.ip_csum) {
1754         /* ipv6 not supported yet */
1755         struct rte_ipv4_hdr *iph;
1756         int iph_len;
1757         iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN);
1758         iph_len = (iph->version_ihl & 0x0f) << 2;
1759 
1760         head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
1761         head->l2_len = RTE_ETHER_HDR_LEN;
1762         head->l3_len = iph_len;
1763     }
1764 
1765     if (ctx->hw_features.tx_csum_l4) {
1766         struct rte_ipv4_hdr *iph;
1767         int iph_len;
1768         iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN);
1769         iph_len = (iph->version_ihl & 0x0f) << 2;
1770 
1771         if (offload.tcp_csum) {
1772             head->ol_flags |= PKT_TX_TCP_CKSUM;
1773             head->l2_len = RTE_ETHER_HDR_LEN;
1774             head->l3_len = iph_len;
1775         }
1776 
1777         /*
1778          *  TCP segmentation offload.
1779          *
1780          *  - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
1781          *    implies PKT_TX_TCP_CKSUM)
1782          *  - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
1783          *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
1784          *    write the IP checksum to 0 in the packet
1785          *  - fill the mbuf offload information: l2_len,
1786          *    l3_len, l4_len, tso_segsz
1787          *  - calculate the pseudo header checksum without taking ip_len
1788          *    in account, and set it in the TCP header. Refer to
1789          *    rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
1790          *    used as helpers.
1791          */
1792         if (offload.tso_seg_size) {
1793             struct rte_tcp_hdr *tcph;
1794             int tcph_len;
1795             tcph = (struct rte_tcp_hdr *)((char *)iph + iph_len);
1796             tcph_len = (tcph->data_off & 0xf0) >> 2;
1797             tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);
1798 
1799             head->ol_flags |= PKT_TX_TCP_SEG;
1800             head->l4_len = tcph_len;
1801             head->tso_segsz = offload.tso_seg_size;
1802         }
1803 
1804         if (offload.udp_csum) {
1805             head->ol_flags |= PKT_TX_UDP_CKSUM;
1806             head->l2_len = RTE_ETHER_HDR_LEN;
1807             head->l3_len = iph_len;
1808         }
1809     }
1810 
1811     ff_mbuf_free(m);
1812 
1813     return send_single_packet(head, ctx->port_id);
1814 }
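
/*
 * The copy path above chains mbufs until `total` bytes have been copied. A
 * self-contained sketch of that pattern (hypothetical helper, not used by
 * this file; assumes each mbuf in the pool offers RTE_MBUF_DEFAULT_DATAROOM
 * bytes of data room):
 *
 *   static struct rte_mbuf *
 *   copy_to_mbuf_chain(struct rte_mempool *mp, const char *src, int total)
 *   {
 *       struct rte_mbuf *head, *cur, *prev = NULL;
 *
 *       head = rte_pktmbuf_alloc(mp);
 *       if (head == NULL)
 *           return NULL;
 *       head->pkt_len = total;
 *       head->nb_segs = 0;
 *       cur = head;
 *       while (total > 0) {
 *           if (cur == NULL) {
 *               cur = rte_pktmbuf_alloc(mp);
 *               if (cur == NULL) {
 *                   rte_pktmbuf_free(head);  // frees the whole chain
 *                   return NULL;
 *               }
 *               prev->next = cur;
 *           }
 *           int len = RTE_MIN(total, RTE_MBUF_DEFAULT_DATAROOM);
 *           rte_memcpy(rte_pktmbuf_mtod(cur, void *), src, len);
 *           cur->data_len = len;
 *           src += len;
 *           total -= len;
 *           head->nb_segs++;
 *           prev = cur;
 *           cur = NULL;
 *       }
 *       return head;
 *   }
 */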
1815 
1816 static int
1817 main_loop(void *arg)
1818 {
1819     struct loop_routine *lr = (struct loop_routine *)arg;
1820 
1821     struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1822     uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc;
1823     int i, j, nb_rx, idle;
1824     uint16_t port_id, queue_id;
1825     struct lcore_conf *qconf;
1826     uint64_t drain_tsc = 0;
1827     struct ff_dpdk_if_context *ctx;
1828 
1829     if (pkt_tx_delay) {
1830         drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * pkt_tx_delay;
1831     }
1832 
1833     prev_tsc = 0;
1834     usch_tsc = 0;
1835 
1836     qconf = &lcore_conf;
1837 
1838     while (1) {
1839         cur_tsc = rte_rdtsc();
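        /* Avoid the cost of rte_timer_manage() until the clock timer is due. */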
1840         if (unlikely(freebsd_clock.expire < cur_tsc)) {
1841             rte_timer_manage();
1842         }
1843 
1844         idle = 1;
1845         sys_tsc = 0;
1846         usr_tsc = 0;
1847 
1848         /*
1849          * TX burst queue drain
1850          */
1851         diff_tsc = cur_tsc - prev_tsc;
1852         if (unlikely(diff_tsc >= drain_tsc)) {
1853             for (i = 0; i < qconf->nb_tx_port; i++) {
1854                 port_id = qconf->tx_port_id[i];
1855                 if (qconf->tx_mbufs[port_id].len == 0)
1856                     continue;
1857 
1858                 idle = 0;
1859 
1860                 send_burst(qconf,
1861                     qconf->tx_mbufs[port_id].len,
1862                     port_id);
1863                 qconf->tx_mbufs[port_id].len = 0;
1864             }
1865 
1866             prev_tsc = cur_tsc;
1867         }
1868 
1869         /*
1870          * Read packet from RX queues
1871          */
1872         for (i = 0; i < qconf->nb_rx_queue; ++i) {
1873             port_id = qconf->rx_queue_list[i].port_id;
1874             queue_id = qconf->rx_queue_list[i].queue_id;
1875             ctx = veth_ctx[port_id];
1876 
1877 #ifdef FF_KNI
1878             if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
1879                 ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
1880             }
1881 #endif
1882 
1883             process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);
1884 
1885             nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
1886                 MAX_PKT_BURST);
1887             if (nb_rx == 0)
1888                 continue;
1889 
1890             idle = 0;
1891 
1892             /* Prefetch first packets */
1893             for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
1894                 rte_prefetch0(rte_pktmbuf_mtod(
1895                         pkts_burst[j], void *));
1896             }
1897 
1898             /* Prefetch and handle already prefetched packets */
1899             for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
1900                 rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
1901                         j + PREFETCH_OFFSET], void *));
1902                 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
1903             }
1904 
1905             /* Handle remaining prefetched packets */
1906             for (; j < nb_rx; j++) {
1907                 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
1908             }
1909         }
1910 
1911         process_msg_ring(qconf->proc_id, pkts_burst);
1912 
1913         div_tsc = rte_rdtsc();
1914 
1915         if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc >= drain_tsc))) {
1916             usch_tsc = cur_tsc;
1917             lr->loop(lr->arg);
1918         }
1919 
1920         idle_sleep_tsc = rte_rdtsc();
1921         if (likely(idle && idle_sleep)) {
1922             usleep(idle_sleep);
1923             end_tsc = rte_rdtsc();
1924         } else {
1925             end_tsc = idle_sleep_tsc;
1926         }
1927 
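        /*
         * TSC accounting for ff_top: cur_tsc..div_tsc is stack (sys) time,
         * div_tsc..idle_sleep_tsc is the user loop (usr) time, and
         * idle_sleep_tsc..end_tsc is the optional idle sleep.
         */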
1928         if (usch_tsc == cur_tsc) {
1929             usr_tsc = idle_sleep_tsc - div_tsc;
1930         }
1931 
1932         if (!idle) {
1933             sys_tsc = div_tsc - cur_tsc;
1934             ff_top_status.sys_tsc += sys_tsc;
1935         }
1936 
1937         ff_top_status.usr_tsc += usr_tsc;
1938         ff_top_status.work_tsc += end_tsc - cur_tsc;
1939         ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;
1940 
1941         ff_top_status.loops++;
1942     }
1943 
1944     return 0;
1945 }
1946 
1947 int
ff_dpdk_if_up(void)
{
1949     int i;
1950     struct lcore_conf *qconf = &lcore_conf;
1951     for (i = 0; i < qconf->nb_tx_port; i++) {
1952         uint16_t port_id = qconf->tx_port_id[i];
1953 
1954         struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
1955         veth_ctx[port_id] = ff_veth_attach(pconf);
1956         if (veth_ctx[port_id] == NULL) {
1957             rte_exit(EXIT_FAILURE, "ff_veth_attach failed");
1958         }
1959     }
1960 
1961     return 0;
1962 }
1963 
1964 void
ff_dpdk_run(loop_func_t loop, void *arg)
{
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    if (lr == NULL) {
        rte_exit(EXIT_FAILURE, "rte_malloc loop_routine failed\n");
    }
    lr->loop = loop;
    lr->arg = arg;
1970     rte_eal_mp_remote_launch(main_loop, lr, CALL_MAIN);
1971     rte_eal_mp_wait_lcore();
1972     rte_free(lr);
1973 }
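
/*
 * Typical usage (illustrative; my_loop is a hypothetical application
 * callback): main_loop is launched on every lcore and invokes the callback
 * once per polling iteration.
 *
 *   static int
 *   my_loop(void *arg)
 *   {
 *       // per-iteration application housekeeping
 *       return 0;
 *   }
 *
 *   ff_dpdk_run(my_loop, NULL);
 */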
1974 
1975 void
1976 ff_dpdk_pktmbuf_free(void *m)
1977 {
1978     rte_pktmbuf_free_seg((struct rte_mbuf *)m);
1979 }
1980 
1981 static uint32_t
1982 toeplitz_hash(unsigned keylen, const uint8_t *key,
1983     unsigned datalen, const uint8_t *data)
1984 {
1985     uint32_t hash = 0, v;
1986     u_int i, b;
1987 
1988     /* XXXRW: Perhaps an assertion about key length vs. data length? */
1989 
    v = (key[0] << 24) + (key[1] << 16) + (key[2] << 8) + key[3];
1991     for (i = 0; i < datalen; i++) {
1992         for (b = 0; b < 8; b++) {
1993             if (data[i] & (1<<(7-b)))
1994                 hash ^= v;
1995             v <<= 1;
1996             if ((i + 4) < keylen &&
1997                 (key[i+4] & (1<<(7-b))))
1998                 v |= 1;
1999         }
2000     }
2001     return (hash);
2002 }
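
/*
 * The routine above is FreeBSD's bit-serial Toeplitz hash. DPDK's
 * rte_thash.h provides a word-oriented equivalent; a sketch of hashing the
 * same IPv4/L4 tuple with it (assumption: rte_softrss expects the tuple in
 * host byte order, while this file keeps addresses and ports big-endian):
 *
 *   struct rte_ipv4_tuple tuple = {
 *       .src_addr = rte_be_to_cpu_32(saddr),
 *       .dst_addr = rte_be_to_cpu_32(daddr),
 *       .sport = rte_be_to_cpu_16(sport),
 *       .dport = rte_be_to_cpu_16(dport),
 *   };
 *   uint32_t hash = rte_softrss((uint32_t *)&tuple,
 *       RTE_THASH_V4_L4_LEN, default_rsskey_40bytes);
 */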
2003 
2004 int
2005 ff_in_pcbladdr(uint16_t family, void *faddr, uint16_t fport, void *laddr)
2006 {
2007     int ret = 0;
2008     uint16_t fa;
2009 
2010     if (!pcblddr_fun)
2011         return ret;
2012 
2013     if (family == AF_INET)
2014         fa = AF_INET;
2015     else if (family == AF_INET6_FREEBSD)
2016         fa = AF_INET6_LINUX;
2017     else
2018         return EADDRNOTAVAIL;
2019 
2020     ret = (*pcblddr_fun)(fa, faddr, fport, laddr);
2021 
2022     return ret;
2023 }
2024 
2025 void
2026 ff_regist_pcblddr_fun(pcblddr_func_t func)
2027 {
2028     pcblddr_fun = func;
2029 }
2030 
2031 int
2032 ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
2033     uint16_t sport, uint16_t dport)
2034 {
2035     struct lcore_conf *qconf = &lcore_conf;
2036     struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
2037     uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];
2038 
2039     if (nb_queues <= 1) {
2040         return 1;
2041     }
2042 
2043     uint16_t reta_size = rss_reta_size[ctx->port_id];
2044     uint16_t queueid = qconf->tx_queue_id[ctx->port_id];
2045 
2046     uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
2047         sizeof(dport)];
2048 
2049     unsigned datalen = 0;
2050 
2051     bcopy(&saddr, &data[datalen], sizeof(saddr));
2052     datalen += sizeof(saddr);
2053 
2054     bcopy(&daddr, &data[datalen], sizeof(daddr));
2055     datalen += sizeof(daddr);
2056 
2057     bcopy(&sport, &data[datalen], sizeof(sport));
2058     datalen += sizeof(sport);
2059 
2060     bcopy(&dport, &data[datalen], sizeof(dport));
2061     datalen += sizeof(dport);
2062 
2063     uint32_t hash = 0;
2064     hash = toeplitz_hash(rsskey_len, rsskey, datalen, data);
2065 
2066     return ((hash & (reta_size - 1)) % nb_queues) == queueid;
2067 }
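
/*
 * ff_rss_check() returns 1 when RSS would steer the given tuple back to
 * this lcore's own queue. A sketch of choosing a source port that passes
 * the check (hypothetical helper; arguments are in network byte order and
 * in the same order ff_rss_check() takes them):
 *
 *   static uint16_t
 *   choose_sport(void *softc, uint32_t saddr, uint32_t daddr,
 *       uint16_t dport)
 *   {
 *       uint16_t port;
 *       for (port = 32768; port < 61000; port++) {
 *           uint16_t sport = rte_cpu_to_be_16(port);
 *           if (ff_rss_check(softc, saddr, daddr, sport, dport))
 *               return sport;
 *       }
 *       return 0;  // no port in the range maps to this queue
 *   }
 */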
2068 
2069 void
2070 ff_regist_packet_dispatcher(dispatch_func_t func)
2071 {
2072     packet_dispatcher = func;
2073 }
2074 
2075 uint64_t
ff_get_tsc_ns(void)
2077 {
2078     uint64_t cur_tsc = rte_rdtsc();
2079     uint64_t hz = rte_get_tsc_hz();
    return ((double)cur_tsc / (double)hz) * NS_PER_S;
2081 }
2082 
2083