/* xref: /f-stack/lib/ff_dpdk_if.c (revision 92bcc6b4) */
/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_eth_bond.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"
#include "ff_memory.h"

#ifdef FF_KNI
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

int enable_kni;
static int kni_accept;
static int knictl_action = FF_KNICTL_ACTION_DEFAULT;
#endif

static int numa_on;

static unsigned idle_sleep;
static unsigned pkt_tx_delay;

static struct rte_timer freebsd_clock;

// Default 40-byte RSS key, as used by Mellanox's Linux driver
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

static uint8_t default_rsskey_52bytes[52] = {
    0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23,
    0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30,
    0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02,
    0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c,
    0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55,
    0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e,
    0x81, 0x15, 0x03, 0x66
};

static uint8_t symmetric_rsskey[52] = {
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a
};

static int rsskey_len = sizeof(default_rsskey_40bytes);
static uint8_t *rsskey = default_rsskey_40bytes;
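
/*
 * Illustrative sketch (not compiled): why symmetric_rsskey makes RSS
 * direction-independent. With the repeating 0x6d5a pattern, the Toeplitz
 * hash of {src, dst} equals the hash of {dst, src}, so both directions of a
 * TCP connection land on the same queue. rte_softrss() from rte_thash.h can
 * verify this in software; the addresses and ports below are arbitrary.
 */
#if 0
static void
verify_symmetric_rss(void)
{
    /* 3-word IPv4/TCP tuple: src ip, dst ip, (src port << 16) | dst port */
    uint32_t fwd[3] = { RTE_IPV4(10, 0, 0, 1), RTE_IPV4(10, 0, 0, 2),
        (1234 << 16) | 80 };
    uint32_t rev[3] = { RTE_IPV4(10, 0, 0, 2), RTE_IPV4(10, 0, 0, 1),
        (80 << 16) | 1234 };

    uint32_t h1 = rte_softrss(fwd, 3, symmetric_rsskey);
    uint32_t h2 = rte_softrss(rev, 3, symmetric_rsskey);
    /* holds for the 0x6d5a key, not for Toeplitz keys in general */
    assert(h1 == h2);
}
#endif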

struct lcore_conf lcore_conf;

struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static pcblddr_func_t pcblddr_fun;

static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

#define BOND_DRIVER_NAME    "net_bonding"

static inline int send_single_packet(struct rte_mbuf *m, uint8_t port);

struct ff_msg_ring {
    char ring_name[FF_MSG_NUM][RTE_RING_NAMESIZE];
    /* ring[0]: other processes enqueue requests, the f-stack lcore dequeues them */
    /* ring[1..FF_MSG_NUM-1]: the lcore enqueues replies, other processes dequeue them */
    struct rte_ring *ring[FF_MSG_NUM];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;
static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

static struct ff_top_args ff_top_status;
static struct ff_traffic_args ff_traffic;
extern void ff_hardclock(void);

static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
    ff_update_current_ts();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
    #define CHECK_INTERVAL 100 /* 100ms */
    #define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u "
                        "Mbps - %s\n", (int)portid,
                        (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

static int
init_lcore_conf(void)
{
    uint8_t nb_dev_ports = rte_eth_dev_count_avail();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
                 ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    if (!rte_lcore_is_enabled(lcore_id)) {
        rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
    }

    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %d\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        /* Enable pcap dump */
        if (ff_global_cfg.pcap.enable) {
            ff_enable_pcap(ff_global_cfg.pcap.save_path, ff_global_cfg.pcap.snap_len);
        }

        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}

static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;
    uint16_t max_portid = ff_global_cfg.dpdk.max_portid;

    unsigned nb_mbuf = RTE_ALIGN_CEIL(
        (nb_rx_queue * (max_portid + 1) * 2 * RX_QUEUE_SIZE          +
        nb_ports * (max_portid + 1) * 2 * nb_lcores * MAX_PKT_BURST    +
        nb_ports * (max_portid + 1) * 2 * nb_tx_queue * TX_QUEUE_SIZE  +
        nb_lcores * MEMPOOL_CACHE_SIZE +
#ifdef FF_KNI
        nb_ports * KNI_MBUF_MAX +
        nb_ports * KNI_QUEUE_SIZE +
#endif
        nb_lcores * nb_ports * DISPATCH_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }

#ifdef FF_USE_PAGE_ARRAY
        nb_mbuf = RTE_ALIGN_CEIL(
            nb_ports * nb_lcores * MAX_PKT_BURST    +
            nb_ports * nb_tx_queue * TX_QUEUE_SIZE  +
            nb_lcores * MEMPOOL_CACHE_SIZE,
            (unsigned)4096);
        ff_init_ref_pool(nb_mbuf, socketid);
#endif
    }

    return 0;
}
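
/*
 * Worked example of the nb_mbuf sizing above, with illustrative values only
 * (the real RX_QUEUE_SIZE etc. are defined elsewhere): assume nb_ports = 1,
 * max_portid = 0, nb_lcores = 2, nb_rx_queue = 2, RX_QUEUE_SIZE =
 * TX_QUEUE_SIZE = 512, MAX_PKT_BURST = 32, MEMPOOL_CACHE_SIZE = 256,
 * DISPATCH_RING_SIZE = 2048, FF_KNI off:
 *
 *   rx queues:   2 * 1 * 2 * 512     = 2048
 *   tx bursts:   1 * 1 * 2 * 2 * 32  = 128
 *   tx queues:   1 * 1 * 2 * 2 * 512 = 2048
 *   lcore caches:            2 * 256 = 512
 *   dispatch rings:     2 * 1 * 2048 = 4096
 *
 * Sum 8832, and RTE_ALIGN_CEIL(8832, 8192) rounds up to 16384 mbufs.
 */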

static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL) {
        rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ring = rte_ring_create(name, count, socket_id, flags);
    } else {
        ring = rte_ring_lookup(name);
    }

    if (ring == NULL) {
        rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
    }

    return ring;
}

static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create rings according to the ports actually in use. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("created ring %s, %u ring entries are now free\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

static void
ff_msg_init(struct rte_mempool *mp,
    __rte_unused void *opaque_arg,
    void *obj, __rte_unused unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
    msg->original_buf = NULL;
    msg->original_buf_len = 0;
}

static int
init_msg_ring(void)
{
    uint16_t i, j;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
           MSG_RING_SIZE * 2 * nb_procs,
           MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
           NULL, NULL, ff_msg_init, NULL,
           socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[0]);

        for (j = FF_SYSCTL; j < FF_MSG_NUM; j++) {
            snprintf(msg_ring[i].ring_name[j], RTE_RING_NAMESIZE,
                "%s%u_%u", FF_MSG_RING_OUT, i, j);
            msg_ring[i].ring[j] = create_ring(msg_ring[i].ring_name[j],
                MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
            if (msg_ring[i].ring[j] == NULL)
                rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[j]);
        }
    }

    return 0;
}
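
/*
 * Illustrative sketch (not compiled): how a peer process (e.g. a control
 * tool) would be expected to talk to the f-stack lcore through the rings
 * built above, based only on the layout in this file. It takes a message
 * from message_pool, enqueues it on ring[0] (requests), and waits for the
 * reply on the ring indexed by the message type, which is where handle_msg()
 * below enqueues replies. The polling loop is simplified.
 */
#if 0
static int
send_msg_to_lcore(uint16_t proc_id, uint16_t type)
{
    struct ff_msg *msg;
    if (rte_mempool_get(message_pool, (void **)&msg) < 0)
        return -1;

    msg->msg_type = type;
    if (rte_ring_enqueue(msg_ring[proc_id].ring[0], msg) < 0) {
        rte_mempool_put(message_pool, msg);
        return -1;
    }

    /* busy-wait for the reply on the per-type reply ring */
    struct ff_msg *reply = NULL;
    while (rte_ring_dequeue(msg_ring[proc_id].ring[type], (void **)&reply) != 0)
        ;

    int result = reply->result;
    rte_mempool_put(message_pool, reply);
    return result;
}
#endif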

#ifdef FF_KNI

static enum FF_KNICTL_CMD
get_kni_action(const char *c)
{
    if (!c)
        return FF_KNICTL_ACTION_DEFAULT;
    if (strcasecmp(c, "alltokni") == 0) {
        return FF_KNICTL_ACTION_ALL_TO_KNI;
    } else if (strcasecmp(c, "alltoff") == 0) {
        return FF_KNICTL_ACTION_ALL_TO_FF;
    } else if (strcasecmp(c, "default") == 0) {
        return FF_KNICTL_ACTION_DEFAULT;
    } else {
        return FF_KNICTL_ACTION_DEFAULT;
    }
}

static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count_avail();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    knictl_action = get_kni_action(ff_global_cfg.kni.kni_action);

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
#endif

/* RSS RETA updates fail when flow isolation is enabled. */
#ifndef FF_FLOW_ISOLATE
static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* configure the HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}
#endif
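
/*
 * Worked example for set_rss_table(): with reta_size = 128 and nb_queues = 4
 * the loop fills the indirection table round-robin,
 * reta[0..127] = 0,1,2,3,0,1,2,3,... The NIC then picks the RX queue as
 * reta[hash mod reta_size] (the low bits of the Toeplitz hash), so the four
 * queues share the table, and hence the traffic, evenly.
 */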

static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i, j;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id, u_port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[u_port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        for (j = 0; j <= pconf->nb_slaves; j++) {
            if (j < pconf->nb_slaves) {
                port_id = pconf->slave_portid_list[j];
                printf("Initializing %s's slave %d: port[%d]\n",
                        ff_global_cfg.dpdk.bond_cfgs->name,
                        j, port_id);
            } else {
                port_id = u_port_id;
            }

            struct rte_eth_dev_info dev_info;
            struct rte_eth_conf port_conf = {0};
            struct rte_eth_rxconf rxq_conf;
            struct rte_eth_txconf txq_conf;

            int ret = rte_eth_dev_info_get(port_id, &dev_info);
            if (ret != 0)
                rte_exit(EXIT_FAILURE,
                    "Error during getting device (port %u) info: %s\n",
                    port_id, strerror(-ret));

            if (nb_queues > dev_info.max_rx_queues) {
                rte_exit(EXIT_FAILURE, "nb_queues[%d] is bigger than max_rx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_rx_queues);
            }

            if (nb_queues > dev_info.max_tx_queues) {
                rte_exit(EXIT_FAILURE, "nb_queues[%d] is bigger than max_tx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_tx_queues);
            }

            struct rte_ether_addr addr;
            rte_eth_macaddr_get(port_id, &addr);
            printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                       " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                    (unsigned)port_id,
                    addr.addr_bytes[0], addr.addr_bytes[1],
                    addr.addr_bytes[2], addr.addr_bytes[3],
                    addr.addr_bytes[4], addr.addr_bytes[5]);

            rte_memcpy(pconf->mac,
                addr.addr_bytes, RTE_ETHER_ADDR_LEN);

            /* Set RSS mode */
            uint64_t default_rss_hf = ETH_RSS_PROTO_MASK;
            port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
            port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf;
            if (dev_info.hash_key_size == 52) {
                rsskey = default_rsskey_52bytes;
                rsskey_len = 52;
            }
            if (ff_global_cfg.dpdk.symmetric_rss) {
                printf("Use symmetric Receive-side Scaling (RSS) key\n");
                rsskey = symmetric_rsskey;
            }
            port_conf.rx_adv_conf.rss_conf.rss_key = rsskey;
            port_conf.rx_adv_conf.rss_conf.rss_key_len = rsskey_len;
            port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads;
            if (port_conf.rx_adv_conf.rss_conf.rss_hf !=
                    ETH_RSS_PROTO_MASK) {
                printf("Port %u modified RSS hash function based on hardware support, "
                        "requested:%#"PRIx64" configured:%#"PRIx64"\n",
                        port_id, default_rss_hf,
                        port_conf.rx_adv_conf.rss_conf.rss_hf);
            }

            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) {
                port_conf.txmode.offloads |=
                    DEV_TX_OFFLOAD_MBUF_FAST_FREE;
            }

            /* Set Rx VLAN stripping */
            if (ff_global_cfg.dpdk.vlan_strip) {
                if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                    port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
                }
            }

            /* Enable HW CRC stripping */
            port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_KEEP_CRC;

            /* FIXME: enable TCP LRO? */
            #if 0
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
                printf("LRO is supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
                pconf->hw_features.rx_lro = 1;
            }
            #endif

            /* Set Rx checksum checking */
            if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
                printf("RX checksum offload supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
                pconf->hw_features.rx_csum = 1;
            }

            if (ff_global_cfg.dpdk.tx_csum_offoad_skip == 0) {
                if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
                    printf("TX ip checksum offload supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
                    pconf->hw_features.tx_csum_ip = 1;
                }

                if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
                    (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
                    printf("TX TCP&UDP checksum offload supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM;
                    pconf->hw_features.tx_csum_l4 = 1;
                }
            } else {
                printf("TX checksum offload is disabled\n");
            }

            if (ff_global_cfg.dpdk.tso) {
                if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                    printf("TSO is supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
                    pconf->hw_features.tx_tso = 1;
                }
            } else {
                printf("TSO is disabled\n");
            }

            if (dev_info.reta_size) {
                /* reta size must be power of 2 */
                assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

                rss_reta_size[port_id] = dev_info.reta_size;
                printf("port[%d]: rss table size: %d\n", port_id,
                    dev_info.reta_size);
            }

            if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
                continue;
            }

            ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
            if (ret != 0) {
                return ret;
            }

            static uint16_t nb_rxd = RX_QUEUE_SIZE;
            static uint16_t nb_txd = TX_QUEUE_SIZE;
            ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
            if (ret < 0)
                printf("Could not adjust number of descriptors "
                        "for port%u (%d)\n", (unsigned)port_id, ret);

            uint16_t q;
            for (q = 0; q < nb_queues; q++) {
                if (numa_on) {
                    uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                    socketid = rte_lcore_to_socket_id(lcore_id);
                }
                mbuf_pool = pktmbuf_pool[socketid];

                txq_conf = dev_info.default_txconf;
                txq_conf.offloads = port_conf.txmode.offloads;
                ret = rte_eth_tx_queue_setup(port_id, q, nb_txd,
                    socketid, &txq_conf);
                if (ret < 0) {
                    return ret;
                }

                rxq_conf = dev_info.default_rxconf;
                rxq_conf.offloads = port_conf.rxmode.offloads;
                ret = rte_eth_rx_queue_setup(port_id, q, nb_rxd,
                    socketid, &rxq_conf, mbuf_pool);
                if (ret < 0) {
                    return ret;
                }
            }

            if (strncmp(dev_info.driver_name, BOND_DRIVER_NAME,
                    strlen(BOND_DRIVER_NAME)) == 0) {

                rte_eth_macaddr_get(port_id, &addr);
                printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                           " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                        (unsigned)port_id,
                        addr.addr_bytes[0], addr.addr_bytes[1],
                        addr.addr_bytes[2], addr.addr_bytes[3],
                        addr.addr_bytes[4], addr.addr_bytes[5]);

                rte_memcpy(pconf->mac,
                    addr.addr_bytes, RTE_ETHER_ADDR_LEN);

                int mode, count, x;
                uint16_t slaves[RTE_MAX_ETHPORTS], len = RTE_MAX_ETHPORTS;

                mode = rte_eth_bond_mode_get(port_id);
                printf("Port %u, bond mode:%d\n", port_id, mode);

                count = rte_eth_bond_slaves_get(port_id, slaves, len);
                printf("Port %u, %s's slave ports count:%d\n", port_id,
                            ff_global_cfg.dpdk.bond_cfgs->name, count);
                for (x = 0; x < count; x++) {
                    printf("Port %u, %s's slave port[%u]\n", port_id,
                            ff_global_cfg.dpdk.bond_cfgs->name, slaves[x]);
                }
            }

            ret = rte_eth_dev_start(port_id);
            if (ret < 0) {
                return ret;
            }

            /* RSS RETA updates fail when flow isolation is enabled. */
#ifndef FF_FLOW_ISOLATE
            if (nb_queues > 1) {
                /* set the HW RSS hash function to Toeplitz */
                if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                    struct rte_eth_hash_filter_info info = {0};
                    info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                    info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                    if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                        RTE_ETH_FILTER_SET, &info) < 0) {
                        rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                            port_id);
                    }
                }

                set_rss_table(port_id, dev_info.reta_size, nb_queues);
            }
#endif

            /* Enable RX in promiscuous mode for the Ethernet device. */
            if (ff_global_cfg.dpdk.promiscuous) {
                ret = rte_eth_promiscuous_enable(port_id);
                if (ret == 0) {
                    printf("set port %u to promiscuous mode ok\n", port_id);
                } else {
                    printf("set port %u to promiscuous mode error\n", port_id);
                }
            }
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    /* milliseconds per FreeBSD clock tick */
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    /* TSC cycles per FreeBSD clock tick */
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}
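
/*
 * Illustrative sketch (not compiled): the periodic timer armed above only
 * fires when the polling lcore calls rte_timer_manage(). With, for example,
 * freebsd.hz = 100, the period computed above amounts to ~10 ms worth of TSC
 * cycles. A minimal loop shape, standing in for the real polling loop that
 * lives elsewhere in this file:
 */
#if 0
static void
poll_loop_skeleton(void)
{
    for (;;) {
        /* ... rx/tx work ... */
        rte_timer_manage();   /* runs ff_hardclock_job when the period elapses */
    }
}
#endif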

#ifdef FF_FLOW_ISOLATE
/** Print a message out of a flow error. */
static int
port_flow_complain(struct rte_flow_error *error)
{
    static const char *const errstrlist[] = {
        [RTE_FLOW_ERROR_TYPE_NONE] = "no error",
        [RTE_FLOW_ERROR_TYPE_UNSPECIFIED] = "cause unspecified",
        [RTE_FLOW_ERROR_TYPE_HANDLE] = "flow rule (handle)",
        [RTE_FLOW_ERROR_TYPE_ATTR_GROUP] = "group field",
        [RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY] = "priority field",
        [RTE_FLOW_ERROR_TYPE_ATTR_INGRESS] = "ingress field",
        [RTE_FLOW_ERROR_TYPE_ATTR_EGRESS] = "egress field",
        [RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER] = "transfer field",
        [RTE_FLOW_ERROR_TYPE_ATTR] = "attributes structure",
        [RTE_FLOW_ERROR_TYPE_ITEM_NUM] = "pattern length",
        [RTE_FLOW_ERROR_TYPE_ITEM_SPEC] = "item specification",
        [RTE_FLOW_ERROR_TYPE_ITEM_LAST] = "item specification range",
        [RTE_FLOW_ERROR_TYPE_ITEM_MASK] = "item specification mask",
        [RTE_FLOW_ERROR_TYPE_ITEM] = "specific pattern item",
        [RTE_FLOW_ERROR_TYPE_ACTION_NUM] = "number of actions",
        [RTE_FLOW_ERROR_TYPE_ACTION_CONF] = "action configuration",
        [RTE_FLOW_ERROR_TYPE_ACTION] = "specific action",
    };
    const char *errstr;
    char buf[32];
    int err = rte_errno;

    if ((unsigned int)error->type >= RTE_DIM(errstrlist) ||
        !errstrlist[error->type])
        errstr = "unknown type";
    else
        errstr = errstrlist[error->type];
    printf("Caught error type %d (%s): %s%s: %s\n",
           error->type, errstr,
           error->cause ? (snprintf(buf, sizeof(buf), "cause: %p, ",
                                    error->cause), buf) : "",
           error->message ? error->message : "(no stated reason)",
           rte_strerror(err));
    return -err;
}

static int
port_flow_isolate(uint16_t port_id, int set)
{
    struct rte_flow_error error;

    /* Poisoning to make sure PMDs update it in case of error. */
    memset(&error, 0x66, sizeof(error));
    if (rte_flow_isolate(port_id, set, &error))
        return port_flow_complain(&error);
    printf("Ingress traffic on port %u is %s to the defined flow rules\n",
           port_id,
           set ? "now restricted" : "not restricted anymore");
    return 0;
}

static int
create_tcp_flow(uint16_t port_id, uint16_t tcp_port)
{
    struct rte_flow_attr attr = {.ingress = 1};
    struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
    int nb_queues = pconf->nb_lcores;
    uint16_t queue[RTE_MAX_QUEUES_PER_PORT];
    int i = 0, j = 0;
    for (i = 0, j = 0; i < nb_queues; ++i)
        queue[j++] = i;
    struct rte_flow_action_rss rss = {
        .types = ETH_RSS_NONFRAG_IPV4_TCP,
        .key_len = rsskey_len,
        .key = rsskey,
        .queue_num = j,
        .queue = queue,
    };

    struct rte_eth_dev_info dev_info;
    int ret = rte_eth_dev_info_get(port_id, &dev_info);
    if (ret != 0)
        rte_exit(EXIT_FAILURE, "Error during getting device (port %u) info: %s\n", port_id, strerror(-ret));

    struct rte_flow_item pattern[3];
    struct rte_flow_action action[2];
    struct rte_flow_item_tcp tcp_spec;
    struct rte_flow_item_tcp tcp_mask = {
            .hdr = {
                    .src_port = RTE_BE16(0x0000),
                    .dst_port = RTE_BE16(0xffff),
            },
    };
    struct rte_flow_error error;

    memset(pattern, 0, sizeof(pattern));
    memset(action, 0, sizeof(action));

    /* match IPv4 packets destined to the given TCP port */
    pattern[0].type = RTE_FLOW_ITEM_TYPE_IPV4;

    memset(&tcp_spec, 0, sizeof(struct rte_flow_item_tcp));
    tcp_spec.hdr.dst_port = rte_cpu_to_be_16(tcp_port);
    pattern[1].type = RTE_FLOW_ITEM_TYPE_TCP;
    pattern[1].spec = &tcp_spec;
    pattern[1].mask = &tcp_mask;

    /* end the pattern array */
    pattern[2].type = RTE_FLOW_ITEM_TYPE_END;

    /* create the action */
    action[0].type = RTE_FLOW_ACTION_TYPE_RSS;
    action[0].conf = &rss;
    action[1].type = RTE_FLOW_ACTION_TYPE_END;

    struct rte_flow *flow;
    /* validate and create the flow rule */
    if (!rte_flow_validate(port_id, &attr, pattern, action, &error)) {
        flow = rte_flow_create(port_id, &attr, pattern, action, &error);
        if (!flow) {
            return port_flow_complain(&error);
        }
    }

    memset(pattern, 0, sizeof(pattern));

    /* match IPv4 packets originating from the given TCP port */
    pattern[0].type = RTE_FLOW_ITEM_TYPE_IPV4;

    struct rte_flow_item_tcp tcp_src_mask = {
            .hdr = {
                    .src_port = RTE_BE16(0xffff),
                    .dst_port = RTE_BE16(0x0000),
            },
    };

    memset(&tcp_spec, 0, sizeof(struct rte_flow_item_tcp));
    tcp_spec.hdr.src_port = rte_cpu_to_be_16(tcp_port);
    pattern[1].type = RTE_FLOW_ITEM_TYPE_TCP;
    pattern[1].spec = &tcp_spec;
    pattern[1].mask = &tcp_src_mask;

    /* end the pattern array */
    pattern[2].type = RTE_FLOW_ITEM_TYPE_END;

    /* validate and create the flow rule */
    if (!rte_flow_validate(port_id, &attr, pattern, action, &error)) {
        flow = rte_flow_create(port_id, &attr, pattern, action, &error);
        if (!flow) {
            return port_flow_complain(&error);
        }
    }

    return 1;
}
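
/*
 * Illustrative sketch (not compiled): the same pattern/action shape steers a
 * UDP port as well; only the L4 item type and the spec/mask structs change.
 * This mirrors the dst_port rule built above; create_udp_flow() is an
 * assumption for illustration, not an existing f-stack API.
 */
#if 0
static int
create_udp_flow(uint16_t port_id, uint16_t udp_port,
    const struct rte_flow_action *action)
{
    struct rte_flow_attr attr = {.ingress = 1};
    struct rte_flow_item pattern[3];
    struct rte_flow_item_udp udp_spec = {
        .hdr = { .dst_port = rte_cpu_to_be_16(udp_port) },
    };
    struct rte_flow_item_udp udp_mask = {
        .hdr = { .dst_port = RTE_BE16(0xffff) },
    };
    struct rte_flow_error error;

    memset(pattern, 0, sizeof(pattern));
    pattern[0].type = RTE_FLOW_ITEM_TYPE_IPV4;
    pattern[1].type = RTE_FLOW_ITEM_TYPE_UDP;
    pattern[1].spec = &udp_spec;
    pattern[1].mask = &udp_mask;
    pattern[2].type = RTE_FLOW_ITEM_TYPE_END;

    if (rte_flow_validate(port_id, &attr, pattern, action, &error) == 0 &&
        rte_flow_create(port_id, &attr, pattern, action, &error) != NULL)
        return 1;
    return port_flow_complain(&error);
}
#endif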

static int
init_flow(uint16_t port_id, uint16_t tcp_port)
{
    // struct ff_flow_cfg fcfg = ff_global_cfg.dpdk.flow_cfgs[0];

    // int i;
    // for (i = 0; i < fcfg.nb_port; i++) {
    //     if (create_tcp_flow(fcfg.port_id, fcfg.tcp_ports[i]) < 0) {
    //         return 0;
    //     }
    // }

    /* create_tcp_flow() returns 1 on success and a negative errno on failure */
    if (create_tcp_flow(port_id, tcp_port) < 0) {
        rte_exit(EXIT_FAILURE, "create tcp flow failed\n");
        return -1;
    }

    /* ARP rule: steer all ARP frames to queue 0 */
    struct rte_flow_attr attr = {.ingress = 1};
    struct rte_flow_action_queue queue = {.index = 0};

    struct rte_flow_item pattern_[2];
    struct rte_flow_action action[2];
    struct rte_flow_item_eth eth_type = {.type = RTE_BE16(0x0806)};
    struct rte_flow_item_eth eth_mask = {
            .type = RTE_BE16(0xffff)
    };

    memset(pattern_, 0, sizeof(pattern_));
    memset(action, 0, sizeof(action));

    pattern_[0].type = RTE_FLOW_ITEM_TYPE_ETH;
    pattern_[0].spec = &eth_type;
    pattern_[0].mask = &eth_mask;

    pattern_[1].type = RTE_FLOW_ITEM_TYPE_END;

    /* create the action */
    action[0].type = RTE_FLOW_ACTION_TYPE_QUEUE;
    action[0].conf = &queue;
    action[1].type = RTE_FLOW_ACTION_TYPE_END;

    struct rte_flow *flow;
    struct rte_flow_error error;
    /* validate and create the flow rule */
    if (!rte_flow_validate(port_id, &attr, pattern_, action, &error)) {
        flow = rte_flow_create(port_id, &attr, pattern_, action, &error);
        if (!flow) {
            return port_flow_complain(&error);
        }
    }

    return 1;
}

#endif

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    idle_sleep = ff_global_cfg.dpdk.idle_sleep;
    pkt_tx_delay = ff_global_cfg.dpdk.pkt_tx_delay > BURST_TX_DRAIN_US ?
        BURST_TX_DRAIN_US : ff_global_cfg.dpdk.pkt_tx_delay;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

#ifdef FF_KNI
    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }
#endif

#ifdef FF_USE_PAGE_ARRAY
    ff_mmap_init();
#endif

#ifdef FF_FLOW_ISOLATE
    /* run once, in the process that owns queue 0 (the primary) */
    if (lcore_conf.tx_queue_id[0] == 0) {
        ret = port_flow_isolate(0, 1);
        if (ret < 0)
            rte_exit(EXIT_FAILURE, "port_flow_isolate failed\n");
    }
#endif

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();
#ifdef FF_FLOW_ISOLATE
    /*
     * Example usage only: port_id = 0, tcp_port = 80.
     * Recommended improvements:
     * 1. Call init_flow() in place of set_rss_table() inside the
     *    init_port_start() loop, so that every port in port_id_list gets
     *    configured instead of only device 0.
     * 2. Take the port from the `tcp_port` config option instead of the
     *    magic number 80.
     */
    ret = init_flow(0, 80);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_flow failed\n");
    }
#endif
    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    void *data = rte_pktmbuf_mtod(pkt, void*);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    if (pkt->ol_flags & PKT_RX_VLAN_STRIPPED) {
        ff_mbuf_set_vlan_info(hdr, pkt->vlan_tci);
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void*);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, pn, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    /* need the full 14-byte Ethernet header before reading it */
    if (len < RTE_ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct rte_ether_hdr *hdr;
    const struct rte_vlan_hdr *vlanhdr;
    hdr = (const struct rte_ether_hdr *)data;
    uint16_t ether_type = rte_be_to_cpu_16(hdr->ether_type);
    data += RTE_ETHER_HDR_LEN;
    len -= RTE_ETHER_HDR_LEN;

    if (ether_type == RTE_ETHER_TYPE_VLAN) {
        vlanhdr = (const struct rte_vlan_hdr *)data;
        ether_type = rte_be_to_cpu_16(vlanhdr->eth_proto);
        data += sizeof(struct rte_vlan_hdr);
        len -= sizeof(struct rte_vlan_hdr);
    }

    if (ether_type == RTE_ETHER_TYPE_ARP)
        return FILTER_ARP;

#ifdef INET6
    if (ether_type == RTE_ETHER_TYPE_IPV6) {
        return ff_kni_proto_filter(data,
            len, ether_type);
    }
#endif

#ifndef FF_KNI
    return FILTER_UNKNOWN;
#else
    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ether_type != RTE_ETHER_TYPE_IPV4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data,
        len, ether_type);
#endif
}
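
/*
 * Illustrative sketch (not compiled): protocol_filter() classifies on the
 * EtherType, skipping one optional 802.1Q tag. A minimal self-test for the
 * ARP branch, building a bare frame by hand:
 */
#if 0
static void
protocol_filter_selftest(void)
{
    uint8_t frame[RTE_ETHER_HDR_LEN + 28] = {0}; /* header + ARP body */
    struct rte_ether_hdr *eh = (struct rte_ether_hdr *)frame;

    eh->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_ARP);
    assert(protocol_filter(frame, sizeof(frame)) == FILTER_ARP);
}
#endif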

static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}
1233 
1234 /* copied from rte_pktmbuf_clone */
1235 static inline struct rte_mbuf *
1236 pktmbuf_deep_clone(const struct rte_mbuf *md,
1237     struct rte_mempool *mp)
1238 {
1239     struct rte_mbuf *mc, *mi, **prev;
1240     uint32_t pktlen;
1241     uint8_t nseg;
1242 
1243     if (unlikely ((mc = rte_pktmbuf_alloc(mp)) == NULL))
1244         return NULL;
1245 
1246     mi = mc;
1247     prev = &mi->next;
1248     pktlen = md->pkt_len;
1249     nseg = 0;
1250 
1251     do {
1252         nseg++;
1253         pktmbuf_deep_attach(mi, md);
1254         *prev = mi;
1255         prev = &mi->next;
1256     } while ((md = md->next) != NULL &&
1257         (mi = rte_pktmbuf_alloc(mp)) != NULL);
1258 
1259     *prev = NULL;
1260     mc->nb_segs = nseg;
1261     mc->pkt_len = pktlen;
1262 
1263     /* Allocation of new indirect segment failed */
1264     if (unlikely (mi == NULL)) {
1265         rte_pktmbuf_free(mc);
1266         return NULL;
1267     }
1268 
1269     __rte_mbuf_sanity_check(mc, 1);
1270     return mc;
1271 }
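
/*
 * Illustrative sketch (not compiled): why a deep clone instead of
 * rte_pktmbuf_clone(). rte_pktmbuf_clone() creates indirect mbufs that share
 * the original's data via refcounting; here the copies go to other lcores'
 * dispatch rings and to KNI while the original is consumed by the local
 * stack, so each copy needs private data:
 */
#if 0
static void
broadcast_arp_example(struct rte_mbuf *pkt, struct rte_mempool *mp,
    struct rte_ring *peer_ring)
{
    struct rte_mbuf *copy = pktmbuf_deep_clone(pkt, mp);
    if (copy != NULL && rte_ring_enqueue(peer_ring, copy) < 0)
        rte_pktmbuf_free(copy);   /* peer ring full: drop the private copy */
    /* the original pkt remains owned by the caller */
}
#endif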

static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(ff_global_cfg.pcap.enable)) {
            if (!pkts_from_ring) {
                ff_dump_packets(ff_global_cfg.pcap.save_path, rtem, ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void*);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring) {
            ff_traffic.rx_packets++;
            ff_traffic.rx_bytes += len;
        }

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues);
            if (ret == FF_DISPATCH_RESPONSE) {
                rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len;

                /*
                 * VLAN TX stripping/insertion offload is not supported yet,
                 * so re-insert the VLAN header by hand before responding.
                 */
                if (rtem->vlan_tci) {
                    data = rte_pktmbuf_prepend(rtem, sizeof(struct rte_vlan_hdr));
                    if (data != NULL) {
                        memmove(data, data + sizeof(struct rte_vlan_hdr), RTE_ETHER_HDR_LEN);
                        struct rte_ether_hdr *etherhdr = (struct rte_ether_hdr *)data;
                        struct rte_vlan_hdr *vlanhdr = (struct rte_vlan_hdr *)(data + RTE_ETHER_HDR_LEN);
                        vlanhdr->vlan_tci = rte_cpu_to_be_16(rtem->vlan_tci);
                        vlanhdr->eth_proto = etherhdr->ether_type;
                        etherhdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN);
                    }
                }
                send_single_packet(rtem, port_id);
                continue;
            }

            if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
#ifdef INET6
        if (filter == FILTER_ARP || filter == FILTER_NDP) {
#else
        if (filter == FILTER_ARP) {
#endif
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni) {
            if (knictl_action == FF_KNICTL_ACTION_ALL_TO_KNI) {
                ff_kni_enqueue(port_id, rtem);
            } else if (knictl_action == FF_KNICTL_ACTION_ALL_TO_FF) {
                ff_veth_input(ctx, rtem);
            } else if (knictl_action == FF_KNICTL_ACTION_DEFAULT) {
                if ((filter == FILTER_KNI && kni_accept) ||
                        (filter == FILTER_UNKNOWN && !kni_accept)) {
                    ff_kni_enqueue(port_id, rtem);
                } else {
                    ff_veth_input(ctx, rtem);
                }
            } else {
                ff_veth_input(ctx, rtem);
            }
#endif
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}
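
/*
 * Illustrative sketch (not compiled): the shape of a user dispatch callback
 * consumed by the packet_dispatcher branch above. Returning a queue index
 * redirects the packet to that queue's dispatch ring, FF_DISPATCH_RESPONSE
 * echoes the (possibly rewritten) frame back out, and FF_DISPATCH_ERROR
 * drops it. The hashing policy below is only an example; how the callback
 * is registered is outside this file.
 */
#if 0
static int
example_dispatcher(void *data, uint16_t *len, __rte_unused uint16_t queue_id,
    uint16_t nb_queues)
{
    if (*len < RTE_ETHER_HDR_LEN + sizeof(struct rte_ipv4_hdr))
        return FF_DISPATCH_ERROR;

    struct rte_ipv4_hdr *ip =
        (struct rte_ipv4_hdr *)((char *)data + RTE_ETHER_HDR_LEN);
    /* pin each source address to one queue; non-IPv4 frames need handling too */
    return rte_be_to_cpu_32(ip->src_addr) % nb_queues;
}
#endif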

static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* read packets from the dispatch ring and process them */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
#ifdef INET6
    if (msg->msg_type == FF_IOCTL6) {
        fd = ff_socket(AF_INET6, SOCK_DGRAM, 0);
    } else
#endif
        fd = ff_socket(AF_INET, SOCK_DGRAM, 0);

    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_top_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_traffic_msg(struct ff_msg *msg)
{
    msg->traffic = ff_traffic;
    msg->result = 0;
}

#ifdef FF_KNI
static inline void
handle_knictl_msg(struct ff_msg *msg)
{
    if (msg->knictl.kni_cmd == FF_KNICTL_CMD_SET) {
        switch (msg->knictl.kni_action) {
            case FF_KNICTL_ACTION_ALL_TO_FF:
                knictl_action = FF_KNICTL_ACTION_ALL_TO_FF;
                msg->result = 0;
                printf("new kni action: alltoff\n");
                break;
            case FF_KNICTL_ACTION_ALL_TO_KNI:
                knictl_action = FF_KNICTL_ACTION_ALL_TO_KNI;
                msg->result = 0;
                printf("new kni action: alltokni\n");
                break;
            case FF_KNICTL_ACTION_DEFAULT:
                knictl_action = FF_KNICTL_ACTION_DEFAULT;
                msg->result = 0;
                printf("new kni action: default\n");
                break;
            default:
                msg->result = -1;
        }
    } else if (msg->knictl.kni_cmd == FF_KNICTL_CMD_GET) {
        msg->knictl.kni_action = knictl_action;
        msg->result = 0;
    } else {
        msg->result = -2;
    }
}
#endif

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
#ifdef INET6
        case FF_IOCTL6:
#endif
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        case FF_TRAFFIC:
            handle_traffic_msg(msg);
            break;
#ifdef FF_KNI
        case FF_KNICTL:
            handle_knictl_msg(msg);
            break;
#endif
        default:
            handle_default_msg(msg);
            break;
    }
    if (rte_ring_enqueue(msg_ring[proc_id].ring[msg->msg_type], msg) < 0) {
        if (msg->original_buf) {
            rte_free(msg->buf_addr);
            msg->buf_addr = msg->original_buf;
            msg->buf_len = msg->original_buf_len;
            msg->original_buf = NULL;
        }

        rte_mempool_put(message_pool, msg);
    }
}
1614 
1615 static inline int
1616 process_msg_ring(uint16_t proc_id, struct rte_mbuf **pkts_burst)
1617 {
1618     /* Read control messages from the msg ring and process them. */
1619     uint16_t nb_rb;
1620     int i;
1621 
1622     nb_rb = rte_ring_dequeue_burst(msg_ring[proc_id].ring[0],
1623         (void **)pkts_burst, MAX_PKT_BURST, NULL);
1624 
1625     if (likely(nb_rb == 0))
1626         return 0;
1627 
1628     for (i = 0; i < nb_rb; ++i) {
1629         handle_msg((struct ff_msg *)pkts_burst[i], proc_id);
1630     }
1631 
1632     return 0;
1633 }
1634 
1635 /* Send burst of packets on an output interface */
1636 static inline int
1637 send_burst(struct lcore_conf *qconf, uint16_t n, uint16_t port)
1638 {
1639     struct rte_mbuf **m_table;
1640     int ret;
1641     uint16_t queueid;
1642 
1643     queueid = qconf->tx_queue_id[port];
1644     m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;
1645 
1646     if (unlikely(ff_global_cfg.pcap.enable)) {
1647         uint16_t i;
1648         for (i = 0; i < n; i++) {
1649             ff_dump_packets(ff_global_cfg.pcap.save_path, m_table[i],
1650                 ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
1651         }
1652     }
1653 
1654     ret = rte_eth_tx_burst(port, queueid, m_table, n);
1655     ff_traffic.tx_packets += ret;
1656     uint16_t i;
1657     for (i = 0; i < ret; i++) {
1658         ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]);
1659 #ifdef FF_USE_PAGE_ARRAY
1660         if (qconf->tx_mbufs[port].bsd_m_table[i])
1661             ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs);
1662 #endif
1663     }
1664     if (unlikely(ret < n)) {
1665         do {
1666             rte_pktmbuf_free(m_table[ret]);
1667 #ifdef FF_USE_PAGE_ARRAY
1668             if (qconf->tx_mbufs[port].bsd_m_table[ret])
1669                 ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]);
1670 #endif
1671         } while (++ret < n);
1672     }
1673     return 0;
1674 }
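
/*
 * Note on the partial-send path above: rte_eth_tx_burst() may accept
 * fewer than n packets, and this function frees whatever the NIC did
 * not take, i.e. the policy is drop-on-overflow rather than retry.
 * A retry variant (sketch only, same arguments; not what this file
 * does) would look like:
 *
 *     uint16_t sent = 0;
 *     while (sent < n)
 *         sent += rte_eth_tx_burst(port, queueid,
 *             m_table + sent, n - sent);
 *
 * at the cost of potentially stalling the lcore on a stuck queue.
 */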
1675 
1676 /* Enqueue a single packet, and send burst if queue is filled */
1677 static inline int
1678 send_single_packet(struct rte_mbuf *m, uint16_t port)
1679 {
1680     uint16_t len;
1681     struct lcore_conf *qconf;
1682 
1683     qconf = &lcore_conf;
1684     len = qconf->tx_mbufs[port].len;
1685     qconf->tx_mbufs[port].m_table[len] = m;
1686     len++;
1687 
1688     /* enough packets queued: flush the burst */
1689     if (unlikely(len == MAX_PKT_BURST)) {
1690         send_burst(qconf, MAX_PKT_BURST, port);
1691         len = 0;
1692     }
1693 
1694     qconf->tx_mbufs[port].len = len;
1695     return 0;
1696 }
1697 
1698 int
1699 ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
1700     int total)
1701 {
1702 #ifdef FF_USE_PAGE_ARRAY
1703     struct lcore_conf *qconf = &lcore_conf;
1704     int len = 0;
1705 
1706     len = ff_if_send_onepkt(ctx, m, total);
1707     if (unlikely(len == MAX_PKT_BURST)) {
1708         send_burst(qconf, MAX_PKT_BURST, ctx->port_id);
1709         len = 0;
1710     }
1711     qconf->tx_mbufs[ctx->port_id].len = len;
1712     return 0;
1713 #endif
1714     struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
1715     struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
1716     if (head == NULL) {
1717         ff_mbuf_free(m);
1718         return -1;
1719     }
1720 
1721     head->pkt_len = total;
1722     head->nb_segs = 0;
1723 
1724     int off = 0;
1725     struct rte_mbuf *cur = head, *prev = NULL;
1726     while (total > 0) {
1727         if (cur == NULL) {
1728             cur = rte_pktmbuf_alloc(mbuf_pool);
1729             if (cur == NULL) {
1730                 rte_pktmbuf_free(head);
1731                 ff_mbuf_free(m);
1732                 return -1;
1733             }
1734         }
1735 
1736         if (prev != NULL) {
1737             prev->next = cur;
1738         }
1739         head->nb_segs++;
1740 
1741         prev = cur;
1742         void *data = rte_pktmbuf_mtod(cur, void*);
             /* fill at most one segment; assumes the pool uses the default data room size */
1743         int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
1744         int ret = ff_mbuf_copydata(m, data, off, len);
1745         if (ret < 0) {
1746             rte_pktmbuf_free(head);
1747             ff_mbuf_free(m);
1748             return -1;
1749         }
1750 
1752         cur->data_len = len;
1753         off += len;
1754         total -= len;
1755         cur = NULL;
1756     }
1757 
1758     struct ff_tx_offload offload = {0};
1759     ff_mbuf_tx_offload(m, &offload);
1760 
1761     void *data = rte_pktmbuf_mtod(head, void*);
1762 
1763     if (offload.ip_csum) {
1764         /* ipv6 not supported yet */
1765         struct rte_ipv4_hdr *iph;
1766         int iph_len;
1767         iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN);
1768         iph_len = (iph->version_ihl & 0x0f) << 2;
1769 
1770         head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
1771         head->l2_len = RTE_ETHER_HDR_LEN;
1772         head->l3_len = iph_len;
1773     }
1774 
1775     if (ctx->hw_features.tx_csum_l4) {
1776         struct rte_ipv4_hdr *iph;
1777         int iph_len;
1778         iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN);
1779         iph_len = (iph->version_ihl & 0x0f) << 2;
1780 
1781         if (offload.tcp_csum) {
1782             head->ol_flags |= PKT_TX_TCP_CKSUM;
1783             head->l2_len = RTE_ETHER_HDR_LEN;
1784             head->l3_len = iph_len;
1785         }
1786 
1787         /*
1788          *  TCP segmentation offload.
1789          *
1790          *  - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
1791          *    implies PKT_TX_TCP_CKSUM)
1792          *  - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
1793          *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
1794          *    write the IP checksum to 0 in the packet
1795          *  - fill the mbuf offload information: l2_len,
1796          *    l3_len, l4_len, tso_segsz
1797          *  - calculate the pseudo header checksum without taking ip_len
1798          *    in account, and set it in the TCP header. Refer to
1799          *    rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
1800          *    used as helpers.
1801          */
1802         if (offload.tso_seg_size) {
1803             struct rte_tcp_hdr *tcph;
1804             int tcph_len;
1805             tcph = (struct rte_tcp_hdr *)((char *)iph + iph_len);
1806             tcph_len = (tcph->data_off & 0xf0) >> 2;
1807             tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);
1808 
1809             head->ol_flags |= PKT_TX_TCP_SEG;
1810             head->l4_len = tcph_len;
1811             head->tso_segsz = offload.tso_seg_size;
1812         }
1813 
1814         if (offload.udp_csum) {
1815             head->ol_flags |= PKT_TX_UDP_CKSUM;
1816             head->l2_len = RTE_ETHER_HDR_LEN;
1817             head->l3_len = iph_len;
1818         }
1819     }
1820 
1821     ff_mbuf_free(m);
1822 
1823     return send_single_packet(head, ctx->port_id);
1824 }
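
/*
 * Worked example of the header-length decoding used above (both fields
 * count 32-bit words, hence the net shift by 2, i.e. multiply by 4):
 * an option-less IPv4 header has version_ihl == 0x45, so
 * (0x45 & 0x0f) << 2 == 5 * 4 == 20 bytes; an option-less TCP header
 * has data_off == 0x50, so (0x50 & 0xf0) >> 2 == 20 bytes.
 */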
1825 
1826 static int
1827 main_loop(void *arg)
1828 {
1829     struct loop_routine *lr = (struct loop_routine *)arg;
1830 
1831     struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1832     uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc;
1833     int i, j, nb_rx, idle;
1834     uint16_t port_id, queue_id;
1835     struct lcore_conf *qconf;
1836     uint64_t drain_tsc = 0;
1837     struct ff_dpdk_if_context *ctx;
1838 
1839     if (pkt_tx_delay) {
1840         drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * pkt_tx_delay;
1841     }
1842 
1843     prev_tsc = 0;
1844     usch_tsc = 0;
1845 
1846     qconf = &lcore_conf;
1847 
1848     while (1) {
1849         cur_tsc = rte_rdtsc();
1850         if (unlikely(freebsd_clock.expire < cur_tsc)) {
1851             rte_timer_manage();
1852         }
1853 
1854         idle = 1;
1855         sys_tsc = 0;
1856         usr_tsc = 0;
1857 
1858         /*
1859          * TX burst queue drain
1860          */
1861         diff_tsc = cur_tsc - prev_tsc;
1862         if (unlikely(diff_tsc >= drain_tsc)) {
1863             for (i = 0; i < qconf->nb_tx_port; i++) {
1864                 port_id = qconf->tx_port_id[i];
1865                 if (qconf->tx_mbufs[port_id].len == 0)
1866                     continue;
1867 
1868                 idle = 0;
1869 
1870                 send_burst(qconf,
1871                     qconf->tx_mbufs[port_id].len,
1872                     port_id);
1873                 qconf->tx_mbufs[port_id].len = 0;
1874             }
1875 
1876             prev_tsc = cur_tsc;
1877         }
1878 
1879         /*
1880          * Read packet from RX queues
1881          */
1882         for (i = 0; i < qconf->nb_rx_queue; ++i) {
1883             port_id = qconf->rx_queue_list[i].port_id;
1884             queue_id = qconf->rx_queue_list[i].queue_id;
1885             ctx = veth_ctx[port_id];
1886 
1887 #ifdef FF_KNI
1888             if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
1889                 ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
1890             }
1891 #endif
1892 
1893             process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);
1894 
1895             nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
1896                 MAX_PKT_BURST);
1897             if (nb_rx == 0)
1898                 continue;
1899 
1900             idle = 0;
1901 
1902             /* Prefetch first packets */
1903             for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
1904                 rte_prefetch0(rte_pktmbuf_mtod(
1905                         pkts_burst[j], void *));
1906             }
1907 
1908             /* Prefetch and handle already prefetched packets */
1909             for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
1910                 rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
1911                         j + PREFETCH_OFFSET], void *));
1912                 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
1913             }
1914 
1915             /* Handle remaining prefetched packets */
1916             for (; j < nb_rx; j++) {
1917                 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
1918             }
1919         }
1920 
1921         process_msg_ring(qconf->proc_id, pkts_burst);
1922 
1923         div_tsc = rte_rdtsc();
1924 
1925         if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc >= drain_tsc))) {
1926             usch_tsc = cur_tsc;
1927             lr->loop(lr->arg);
1928         }
1929 
1930         idle_sleep_tsc = rte_rdtsc();
1931         if (likely(idle && idle_sleep)) {
1932             usleep(idle_sleep);
1933             end_tsc = rte_rdtsc();
1934         } else {
1935             end_tsc = idle_sleep_tsc;
1936         }
1937 
1938         if (usch_tsc == cur_tsc) {
1939             usr_tsc = idle_sleep_tsc - div_tsc;
1940         }
1941 
1942         if (!idle) {
1943             sys_tsc = div_tsc - cur_tsc;
1944             ff_top_status.sys_tsc += sys_tsc;
1945         }
1946 
1947         ff_top_status.usr_tsc += usr_tsc;
1948         ff_top_status.work_tsc += end_tsc - cur_tsc;
1949         ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;
1950 
1951         ff_top_status.loops++;
1952     }
1953 
1954     return 0;
1955 }
1956 
1957 int
1958 ff_dpdk_if_up(void)
     {
1959     int i;
1960     struct lcore_conf *qconf = &lcore_conf;
1961     for (i = 0; i < qconf->nb_tx_port; i++) {
1962         uint16_t port_id = qconf->tx_port_id[i];
1963 
1964         struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
1965         veth_ctx[port_id] = ff_veth_attach(pconf);
1966         if (veth_ctx[port_id] == NULL) {
1967             rte_exit(EXIT_FAILURE, "ff_veth_attach failed");
1968         }
1969     }
1970 
1971     return 0;
1972 }
1973 
1974 void
1975 ff_dpdk_run(loop_func_t loop, void *arg)
     {
1976     struct loop_routine *lr = rte_malloc(NULL,
1977         sizeof(struct loop_routine), 0);
         if (lr == NULL) {
             rte_exit(EXIT_FAILURE, "rte_malloc loop_routine failed\n");
         }
1978     lr->loop = loop;
1979     lr->arg = arg;
1980     rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
1981     rte_eal_mp_wait_lcore();
1982     rte_free(lr);
1983 }
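
/*
 * Illustrative sketch (hypothetical application code, not part of this
 * file): registering a loop callback.  main_loop() above invokes
 * lr->loop(lr->arg) on busy iterations, so the callback should poll
 * application events and return quickly.  ff_init() is assumed to be
 * the usual f-stack startup call.
 */
#if 0
static int
my_loop(void *arg)
{
    /* e.g. drive ff_epoll_wait()/ff_kevent() and application logic */
    return 0;
}

int
main(int argc, char *argv[])
{
    ff_init(argc, argv);
    ff_dpdk_run(my_loop, NULL);    /* blocks until all lcores return */
    return 0;
}
#endif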
1984 
1985 void
1986 ff_dpdk_pktmbuf_free(void *m)
1987 {
1988     rte_pktmbuf_free_seg((struct rte_mbuf *)m);
1989 }
1990 
1991 static uint32_t
1992 toeplitz_hash(unsigned keylen, const uint8_t *key,
1993     unsigned datalen, const uint8_t *data)
1994 {
1995     uint32_t hash = 0, v;
1996     u_int i, b;
1997 
1998     /* XXXRW: Perhaps an assertion about key length vs. data length? */
1999 
2000     v = (key[0] << 24) + (key[1] << 16) + (key[2] << 8) + key[3];
2001     for (i = 0; i < datalen; i++) {
2002         for (b = 0; b < 8; b++) {
2003             if (data[i] & (1<<(7-b)))
2004                 hash ^= v;
2005             v <<= 1;
2006             if ((i + 4) < keylen &&
2007                 (key[i+4] & (1<<(7-b))))
2008                 v |= 1;
2009         }
2010     }
2011     return (hash);
2012 }
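
/*
 * Equivalent-result note: <rte_thash.h> (already included) provides
 * rte_softrss() for the same Toeplitz computation.  A rough sketch,
 * assuming the 4-tuple is repacked as host-order 32-bit words:
 *
 *     uint32_t tuple[3] = {
 *         rte_be_to_cpu_32(saddr),
 *         rte_be_to_cpu_32(daddr),
 *         ((uint32_t)rte_be_to_cpu_16(sport) << 16) |
 *             rte_be_to_cpu_16(dport),
 *     };
 *     uint32_t h = rte_softrss(tuple, RTE_DIM(tuple), rsskey);
 *
 * The byte-wise version above is kept since it handles arbitrary key
 * and data lengths (e.g. the 52-byte key) directly.
 */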
2013 
2014 int
2015 ff_in_pcbladdr(uint16_t family, void *faddr, uint16_t fport, void *laddr)
2016 {
2017     int ret = 0;
2018     uint16_t fa;
2019 
2020     if (!pcblddr_fun)
2021         return ret;
2022 
2023     if (family == AF_INET)
2024         fa = AF_INET;
2025     else if (family == AF_INET6_FREEBSD)
2026         fa = AF_INET6_LINUX;
2027     else
2028         return EADDRNOTAVAIL;
2029 
2030     ret = (*pcblddr_fun)(fa, faddr, fport, laddr);
2031 
2032     return ret;
2033 }
2034 
2035 void
2036 ff_regist_pcblddr_fun(pcblddr_func_t func)
2037 {
2038     pcblddr_fun = func;
2039 }
2040 
2041 int
2042 ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
2043     uint16_t sport, uint16_t dport)
2044 {
2045     struct lcore_conf *qconf = &lcore_conf;
2046     struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
2047     uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];
2048 
2049     if (nb_queues <= 1) {
2050         return 1;
2051     }
2052 
2053     uint16_t reta_size = rss_reta_size[ctx->port_id];
2054     uint16_t queueid = qconf->tx_queue_id[ctx->port_id];
2055 
2056     uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
2057         sizeof(dport)];
2058 
2059     unsigned datalen = 0;
2060 
2061     bcopy(&saddr, &data[datalen], sizeof(saddr));
2062     datalen += sizeof(saddr);
2063 
2064     bcopy(&daddr, &data[datalen], sizeof(daddr));
2065     datalen += sizeof(daddr);
2066 
2067     bcopy(&sport, &data[datalen], sizeof(sport));
2068     datalen += sizeof(sport);
2069 
2070     bcopy(&dport, &data[datalen], sizeof(dport));
2071     datalen += sizeof(dport);
2072 
2073     uint32_t hash = 0;
2074     hash = toeplitz_hash(rsskey_len, rsskey, datalen, data);
2075 
2076     return ((hash & (reta_size - 1)) % nb_queues) == queueid;
2077 }
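
/*
 * Worked example for the return expression: with reta_size == 128 and
 * nb_queues == 4, hash 0x2a7 gives (0x2a7 & 127) == 39 and 39 % 4 == 3,
 * so only the lcore that owns queue 3 treats this 4-tuple as local.
 * This mirrors the NIC's RSS redirection table, assuming the table was
 * populated round-robin across the configured queues.
 */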
2078 
2079 void
2080 ff_regist_packet_dispatcher(dispatch_func_t func)
2081 {
2082     packet_dispatcher = func;
2083 }
2084 
2085 uint64_t
2086 ff_get_tsc_ns()
2087 ff_get_tsc_ns(void)
2088     uint64_t cur_tsc = rte_rdtsc();
2089     uint64_t hz = rte_get_tsc_hz();
2090     return ((double)cur_tsc/(double)hz) * NS_PER_S;
2091 }
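
/*
 * Precision note: the double intermediate above carries a 53-bit
 * mantissa, so once cur_tsc exceeds 2^53 (roughly a month of uptime at
 * 3 GHz) nanosecond granularity is lost.  A pure-integer sketch that
 * avoids this (rem * NS_PER_S stays below 2^64 for any realistic hz):
 *
 *     uint64_t sec = cur_tsc / hz;
 *     uint64_t rem = cur_tsc % hz;
 *     return sec * NS_PER_S + rem * NS_PER_S / hz;
 */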
2092 
2093