xref: /f-stack/lib/ff_dpdk_if.c (revision ebf5cedb)
/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_eth_bond.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"
#include "ff_memory.h"

#ifdef FF_KNI
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

int enable_kni;
static int kni_accept;
static int knictl_action = FF_KNICTL_ACTION_DEFAULT;
#endif

static int numa_on;

static unsigned idle_sleep;
static unsigned pkt_tx_delay;

static struct rte_timer freebsd_clock;

/* Default RSS hash key, as used by Mellanox's Linux driver. */
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

static uint8_t default_rsskey_52bytes[52] = {
    0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23,
    0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30,
    0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02,
    0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c,
    0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55,
    0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e,
    0x81, 0x15, 0x03, 0x66
};

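/*
 * Symmetric RSS key: the repeating 0x6d5a pattern makes the Toeplitz hash
 * invariant under swapping source/destination address and port, so both
 * directions of a TCP/UDP flow land on the same queue.
 */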
static uint8_t symmetric_rsskey[52] = {
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a
};

static int rsskey_len = sizeof(default_rsskey_40bytes);
static uint8_t *rsskey = default_rsskey_40bytes;

struct lcore_conf lcore_conf;

struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

#define BOND_DRIVER_NAME    "net_bonding"

static inline int send_single_packet(struct rte_mbuf *m, uint8_t port);

struct ff_msg_ring {
    char ring_name[FF_MSG_NUM][RTE_RING_NAMESIZE];
    /* ring[0]: other processes enqueue requests, this lcore dequeues them */
    /* ring[1..]: this lcore enqueues replies, other processes dequeue them,
       one ring per message type */
    struct rte_ring *ring[FF_MSG_NUM];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;
static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

static struct ff_top_args ff_top_status;
static struct ff_traffic_args ff_traffic;
extern void ff_hardclock(void);

static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
    ff_update_current_ts();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
    #define CHECK_INTERVAL 100 /* 100ms */
    #define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u "
                        "Mbps - %s\n", (int)portid,
                        (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

static int
init_lcore_conf(void)
{
    uint8_t nb_dev_ports = rte_eth_dev_count_avail();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
                 ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    if (!rte_lcore_is_enabled(lcore_id)) {
        rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
    }

    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %d\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        /* Enable pcap dump */
        if (ff_global_cfg.pcap.enable) {
            ff_enable_pcap(ff_global_cfg.pcap.save_path, ff_global_cfg.pcap.snap_len);
        }

        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}

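/*
 * Rough sizing of the per-socket mbuf pool: RX descriptors for every
 * queue, in-flight TX bursts, TX descriptors, per-lcore mempool caches,
 * KNI queues (if enabled) and dispatch-ring slots, rounded up to a
 * multiple of 8192. The (max_portid + 1) * 2 factors appear to
 * over-provision for sparse port id assignments.
 */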
static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;
    uint16_t max_portid = ff_global_cfg.dpdk.max_portid;

    unsigned nb_mbuf = RTE_ALIGN_CEIL(
        (nb_rx_queue * (max_portid + 1) * 2 * RX_QUEUE_SIZE +
        nb_ports * (max_portid + 1) * 2 * nb_lcores * MAX_PKT_BURST +
        nb_ports * (max_portid + 1) * 2 * nb_tx_queue * TX_QUEUE_SIZE +
        nb_lcores * MEMPOOL_CACHE_SIZE +
#ifdef FF_KNI
        nb_ports * KNI_MBUF_MAX +
        nb_ports * KNI_QUEUE_SIZE +
#endif
        nb_lcores * nb_ports * DISPATCH_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %u of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }

#ifdef FF_USE_PAGE_ARRAY
        nb_mbuf = RTE_ALIGN_CEIL(
            nb_ports * nb_lcores * MAX_PKT_BURST +
            nb_ports * nb_tx_queue * TX_QUEUE_SIZE +
            nb_lcores * MEMPOOL_CACHE_SIZE,
            (unsigned)4096);
        ff_init_ref_pool(nb_mbuf, socketid);
#endif
    }

    return 0;
}

static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL) {
        rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ring = rte_ring_create(name, count, socket_id, flags);
    } else {
        ring = rte_ring_lookup(name);
    }

    if (ring == NULL) {
        rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
    }

    return ring;
}

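/*
 * Dispatch rings: one single-consumer ring per (port, queue) pair. They
 * carry packets that must be handled by another queue's lcore, e.g.
 * ARP/NDP packets cloned to every queue, or packets redirected by a
 * user-registered dispatch function (see ff_regist_packet_dispatcher()).
 */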
static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create ring according to ports actually being used. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

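/*
 * Each message-pool element is a struct ff_msg header immediately
 * followed by its payload buffer:
 *
 *   |<- sizeof(struct ff_msg) ->|<- elt_size - sizeof(struct ff_msg) ->|
 *   | header                    | data, pointed to by msg->buf_addr    |
 */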
static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
}

static int
init_msg_ring(void)
{
    uint16_t i, j;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
           MSG_RING_SIZE * 2 * nb_procs,
           MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
           NULL, NULL, ff_msg_init, NULL,
           socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[0]);

        for (j = FF_SYSCTL; j < FF_MSG_NUM; j++) {
            snprintf(msg_ring[i].ring_name[j], RTE_RING_NAMESIZE,
                "%s%u_%u", FF_MSG_RING_OUT, i, j);
            msg_ring[i].ring[j] = create_ring(msg_ring[i].ring_name[j],
                MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
            if (msg_ring[i].ring[j] == NULL)
                rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[j]);
        }
    }

    return 0;
}

#ifdef FF_KNI

static enum FF_KNICTL_CMD
get_kni_action(const char *c)
{
    if (!c)
        return FF_KNICTL_ACTION_DEFAULT;
    if (strcasecmp(c, "alltokni") == 0) {
        return FF_KNICTL_ACTION_ALL_TO_KNI;
    } else if (strcasecmp(c, "alltoff") == 0) {
        return FF_KNICTL_ACTION_ALL_TO_FF;
    } else {
        return FF_KNICTL_ACTION_DEFAULT;
    }
}

static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count_avail();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    knictl_action = get_kni_action(ff_global_cfg.kni.kni_action);

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
#endif

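/*
 * Fill the RSS redirection table round-robin over the configured queues;
 * e.g. with reta_size = 128 and nb_queues = 4 the table becomes
 * 0,1,2,3,0,1,2,3,... so hash values spread evenly across all queues.
 */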
static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}

static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i, j;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id, u_port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[u_port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        for (j = 0; j <= pconf->nb_slaves; j++) {
            if (j < pconf->nb_slaves) {
                port_id = pconf->slave_portid_list[j];
                printf("Initializing %s's slave %d: port[%d]\n",
                        ff_global_cfg.dpdk.bond_cfgs->name,
                        j, port_id);
            } else {
                port_id = u_port_id;
            }

            struct rte_eth_dev_info dev_info;
            struct rte_eth_conf port_conf = {0};
            struct rte_eth_rxconf rxq_conf;
            struct rte_eth_txconf txq_conf;

            int ret = rte_eth_dev_info_get(port_id, &dev_info);
            if (ret != 0)
                rte_exit(EXIT_FAILURE,
                    "Error during getting device (port %u) info: %s\n",
                    port_id, strerror(-ret));

            if (nb_queues > dev_info.max_rx_queues) {
                rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_rx_queues);
            }

            if (nb_queues > dev_info.max_tx_queues) {
                rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_tx_queues);
            }

            struct rte_ether_addr addr;
            rte_eth_macaddr_get(port_id, &addr);
            printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                       " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                    (unsigned)port_id,
                    addr.addr_bytes[0], addr.addr_bytes[1],
                    addr.addr_bytes[2], addr.addr_bytes[3],
                    addr.addr_bytes[4], addr.addr_bytes[5]);

            rte_memcpy(pconf->mac,
                addr.addr_bytes, RTE_ETHER_ADDR_LEN);

            /* Set RSS mode */
            uint64_t default_rss_hf = ETH_RSS_PROTO_MASK;
            port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
            port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf;
            if (dev_info.hash_key_size == 52) {
                rsskey = default_rsskey_52bytes;
                rsskey_len = 52;
            }
            if (ff_global_cfg.dpdk.symmetric_rss) {
                printf("Use symmetric Receive-side Scaling (RSS) key\n");
                rsskey = symmetric_rsskey;
            }
            port_conf.rx_adv_conf.rss_conf.rss_key = rsskey;
            port_conf.rx_adv_conf.rss_conf.rss_key_len = rsskey_len;
            port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads;
            if (port_conf.rx_adv_conf.rss_conf.rss_hf !=
                    ETH_RSS_PROTO_MASK) {
                printf("Port %u modified RSS hash function based on hardware support, "
                        "requested:%#"PRIx64" configured:%#"PRIx64"\n",
                        port_id, default_rss_hf,
                        port_conf.rx_adv_conf.rss_conf.rss_hf);
            }

            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) {
                port_conf.txmode.offloads |=
                    DEV_TX_OFFLOAD_MBUF_FAST_FREE;
            }

            /* Set Rx VLAN stripping */
            if (ff_global_cfg.dpdk.vlan_strip) {
                if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                    port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
                }
            }

            /* Enable HW CRC stripping */
            port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_KEEP_CRC;

            /* FIXME: enable TCP LRO? */
            #if 0
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
                printf("LRO is supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
                pconf->hw_features.rx_lro = 1;
            }
            #endif

            /* Set Rx checksum checking */
            if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
                printf("RX checksum offload supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
                pconf->hw_features.rx_csum = 1;
            }

            if (ff_global_cfg.dpdk.tx_csum_offoad_skip == 0) {
                if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
                    printf("TX ip checksum offload supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
                    pconf->hw_features.tx_csum_ip = 1;
                }

                if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
                    (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
                    printf("TX TCP&UDP checksum offload supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM;
                    pconf->hw_features.tx_csum_l4 = 1;
                }
            } else {
                printf("TX checksum offload is disabled\n");
            }

            if (ff_global_cfg.dpdk.tso) {
                if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                    printf("TSO is supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
                    pconf->hw_features.tx_tso = 1;
                }
            } else {
                printf("TSO is disabled\n");
            }

            if (dev_info.reta_size) {
                /* reta size must be power of 2 */
                assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

                rss_reta_size[port_id] = dev_info.reta_size;
                printf("port[%d]: rss table size: %d\n", port_id,
                    dev_info.reta_size);
            }

            if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
                continue;
            }

            ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
            if (ret != 0) {
                return ret;
            }

            static uint16_t nb_rxd = RX_QUEUE_SIZE;
            static uint16_t nb_txd = TX_QUEUE_SIZE;
            ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
            if (ret < 0)
                printf("Could not adjust number of descriptors "
                        "for port%u (%d)\n", (unsigned)port_id, ret);

            uint16_t q;
            for (q = 0; q < nb_queues; q++) {
                if (numa_on) {
                    uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                    socketid = rte_lcore_to_socket_id(lcore_id);
                }
                mbuf_pool = pktmbuf_pool[socketid];

                txq_conf = dev_info.default_txconf;
                txq_conf.offloads = port_conf.txmode.offloads;
                ret = rte_eth_tx_queue_setup(port_id, q, nb_txd,
                    socketid, &txq_conf);
                if (ret < 0) {
                    return ret;
                }

                rxq_conf = dev_info.default_rxconf;
                rxq_conf.offloads = port_conf.rxmode.offloads;
                ret = rte_eth_rx_queue_setup(port_id, q, nb_rxd,
                    socketid, &rxq_conf, mbuf_pool);
                if (ret < 0) {
                    return ret;
                }
            }

            if (strncmp(dev_info.driver_name, BOND_DRIVER_NAME,
                    strlen(dev_info.driver_name)) == 0) {

                rte_eth_macaddr_get(port_id, &addr);
                printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                           " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                        (unsigned)port_id,
                        addr.addr_bytes[0], addr.addr_bytes[1],
                        addr.addr_bytes[2], addr.addr_bytes[3],
                        addr.addr_bytes[4], addr.addr_bytes[5]);

                rte_memcpy(pconf->mac,
                    addr.addr_bytes, RTE_ETHER_ADDR_LEN);

                int mode, count, x;
                uint16_t slaves[RTE_MAX_ETHPORTS], len = RTE_MAX_ETHPORTS;

                mode = rte_eth_bond_mode_get(port_id);
                printf("Port %u, bond mode:%d\n", port_id, mode);

                count = rte_eth_bond_slaves_get(port_id, slaves, len);
                printf("Port %u, %s's slave ports count:%d\n", port_id,
                            ff_global_cfg.dpdk.bond_cfgs->name, count);
                for (x = 0; x < count; x++) {
                    printf("Port %u, %s's slave port[%u]\n", port_id,
                            ff_global_cfg.dpdk.bond_cfgs->name, slaves[x]);
                }
            }

            ret = rte_eth_dev_start(port_id);
            if (ret < 0) {
                return ret;
            }

            if (nb_queues > 1) {
                /* set HW rss hash function to Toeplitz. */
                if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                    struct rte_eth_hash_filter_info info = {0};
                    info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                    info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                    if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                        RTE_ETH_FILTER_SET, &info) < 0) {
                        rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                            port_id);
                    }
                }

                set_rss_table(port_id, dev_info.reta_size, nb_queues);
            }

            /* Enable RX in promiscuous mode for the Ethernet device. */
            if (ff_global_cfg.dpdk.promiscuous) {
                ret = rte_eth_promiscuous_enable(port_id);
                if (ret == 0) {
                    printf("set port %u to promiscuous mode ok\n", port_id);
                } else {
                    printf("set port %u to promiscuous mode error\n", port_id);
                }
            }
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

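/*
 * Drive the FreeBSD clock from a periodic DPDK timer. For example, with
 * freebsd.hz = 100 the tick interval is intrs = 1000 / 100 = 10 ms, and
 * the timer period is ceil(tsc_hz / 1000) * 10 TSC cycles, i.e. one
 * ff_hardclock() call every 10 ms.
 */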
static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}

837 
838 int
839 ff_dpdk_init(int argc, char **argv)
840 {
841     if (ff_global_cfg.dpdk.nb_procs < 1 ||
842         ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
843         ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
844         ff_global_cfg.dpdk.proc_id < 0) {
845         printf("param num_procs[%d] or proc_id[%d] error!\n",
846             ff_global_cfg.dpdk.nb_procs,
847             ff_global_cfg.dpdk.proc_id);
848         exit(1);
849     }
850 
851     int ret = rte_eal_init(argc, argv);
852     if (ret < 0) {
853         rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
854     }
855 
856     numa_on = ff_global_cfg.dpdk.numa_on;
857 
858     idle_sleep = ff_global_cfg.dpdk.idle_sleep;
859     pkt_tx_delay = ff_global_cfg.dpdk.pkt_tx_delay > BURST_TX_DRAIN_US ? \
860         BURST_TX_DRAIN_US : ff_global_cfg.dpdk.pkt_tx_delay;
861 
862     init_lcore_conf();
863 
864     init_mem_pool();
865 
866     init_dispatch_ring();
867 
868     init_msg_ring();
869 
870 #ifdef FF_KNI
871     enable_kni = ff_global_cfg.kni.enable;
872     if (enable_kni) {
873         init_kni();
874     }
875 #endif
876 
877 #ifdef FF_USE_PAGE_ARRAY
878     ff_mmap_init();
879 #endif
880 
881     ret = init_port_start();
882     if (ret < 0) {
883         rte_exit(EXIT_FAILURE, "init_port_start failed\n");
884     }
885 
886     init_clock();
887 
888     return 0;
889 }
890 
891 static void
892 ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
893 {
894     uint8_t rx_csum = ctx->hw_features.rx_csum;
895     if (rx_csum) {
896         if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
897             rte_pktmbuf_free(pkt);
898             return;
899         }
900     }
901 
902     void *data = rte_pktmbuf_mtod(pkt, void*);
903     uint16_t len = rte_pktmbuf_data_len(pkt);
904 
905     void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
906     if (hdr == NULL) {
907         rte_pktmbuf_free(pkt);
908         return;
909     }
910 
911     if (pkt->ol_flags & PKT_RX_VLAN_STRIPPED) {
912         ff_mbuf_set_vlan_info(hdr, pkt->vlan_tci);
913     }
914 
915     struct rte_mbuf *pn = pkt->next;
916     void *prev = hdr;
917     while(pn != NULL) {
918         data = rte_pktmbuf_mtod(pn, void*);
919         len = rte_pktmbuf_data_len(pn);
920 
921         void *mb = ff_mbuf_get(prev, data, len);
922         if (mb == NULL) {
923             ff_mbuf_free(hdr);
924             rte_pktmbuf_free(pkt);
925             return;
926         }
927         pn = pn->next;
928         prev = mb;
929     }
930 
931     ff_veth_process_packet(ctx->ifp, hdr);
932 }
933 
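/*
 * Classify a frame by its ethertype, skipping one optional 802.1Q VLAN
 * tag. ARP is always returned as FILTER_ARP; IPv6 (with INET6) and, when
 * KNI is enabled, IPv4 are classified further by ff_kni_proto_filter();
 * everything else is FILTER_UNKNOWN.
 */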
static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < RTE_ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct rte_ether_hdr *hdr;
    const struct rte_vlan_hdr *vlanhdr;
    hdr = (const struct rte_ether_hdr *)data;
    uint16_t ether_type = rte_be_to_cpu_16(hdr->ether_type);
    data += RTE_ETHER_HDR_LEN;
    len -= RTE_ETHER_HDR_LEN;

    if (ether_type == RTE_ETHER_TYPE_VLAN) {
        vlanhdr = (const struct rte_vlan_hdr *)data;
        ether_type = rte_be_to_cpu_16(vlanhdr->eth_proto);
        data += sizeof(struct rte_vlan_hdr);
        len -= sizeof(struct rte_vlan_hdr);
    }

    if (ether_type == RTE_ETHER_TYPE_ARP)
        return FILTER_ARP;

#ifdef INET6
    if (ether_type == RTE_ETHER_TYPE_IPV6) {
        return ff_kni_proto_filter(data,
            len, ether_type);
    }
#endif

#ifndef FF_KNI
    return FILTER_UNKNOWN;
#else
    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ether_type != RTE_ETHER_TYPE_IPV4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data,
        len, ether_type);
#endif
}

static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}

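/*
 * A deep copy (rather than rte_pktmbuf_clone()'s reference-sharing
 * attach) is used because the copy may be enqueued to another lcore's
 * dispatch ring or to KNI and freed independently of the original.
 */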
/* copied from rte_pktmbuf_clone */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of a new segment failed */
    if (unlikely(mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}

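/*
 * Per-packet receive path. When a dispatch function is registered, its
 * return value decides the fate of each packet:
 *   FF_DISPATCH_RESPONSE - the packet was rewritten in place; transmit it;
 *   FF_DISPATCH_ERROR or an out-of-range queue id - drop it;
 *   another queue's id - enqueue it to that queue's dispatch ring;
 *   this queue's id - continue with the normal protocol filtering below.
 */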
static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(ff_global_cfg.pcap.enable)) {
            if (!pkts_from_ring) {
                ff_dump_packets(ff_global_cfg.pcap.save_path, rtem,
                    ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void *);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring) {
            ff_traffic.rx_packets++;
            ff_traffic.rx_bytes += len;
        }

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues);
            if (ret == FF_DISPATCH_RESPONSE) {
                rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len;

                /*
                 * Outer VLAN tag stripping is not supported yet, so
                 * re-insert the VLAN header before sending the response.
                 */
                if (rtem->vlan_tci) {
                    data = rte_pktmbuf_prepend(rtem, sizeof(struct rte_vlan_hdr));
                    if (data != NULL) {
                        memmove(data, data + sizeof(struct rte_vlan_hdr), RTE_ETHER_HDR_LEN);
                        struct rte_ether_hdr *etherhdr = (struct rte_ether_hdr *)data;
                        struct rte_vlan_hdr *vlanhdr = (struct rte_vlan_hdr *)(data + RTE_ETHER_HDR_LEN);
                        vlanhdr->vlan_tci = rte_cpu_to_be_16(rtem->vlan_tci);
                        vlanhdr->eth_proto = etherhdr->ether_type;
                        etherhdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN);
                    }
                }
                send_single_packet(rtem, port_id);
                continue;
            }

            if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
#ifdef INET6
        if (filter == FILTER_ARP || filter == FILTER_NDP) {
#else
        if (filter == FILTER_ARP) {
#endif
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni) {
            if (knictl_action == FF_KNICTL_ACTION_ALL_TO_KNI) {
                ff_kni_enqueue(port_id, rtem);
            } else if (knictl_action == FF_KNICTL_ACTION_ALL_TO_FF) {
                ff_veth_input(ctx, rtem);
            } else if (knictl_action == FF_KNICTL_ACTION_DEFAULT) {
                if ((filter == FILTER_KNI && kni_accept) ||
                        (filter == FILTER_UNKNOWN && !kni_accept)) {
                    ff_kni_enqueue(port_id, rtem);
                } else {
                    ff_veth_input(ctx, rtem);
                }
            } else {
                ff_veth_input(ctx, rtem);
            }
#endif
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* Read packets from the dispatch ring and process them. */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
#ifdef INET6
    if (msg->msg_type == FF_IOCTL6) {
        fd = ff_socket(AF_INET6, SOCK_DGRAM, 0);
    } else
#endif
        fd = ff_socket(AF_INET, SOCK_DGRAM, 0);

    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_top_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_traffic_msg(struct ff_msg *msg)
{
    msg->traffic = ff_traffic;
    msg->result = 0;
}

#ifdef FF_KNI
static inline void
handle_knictl_msg(struct ff_msg *msg)
{
    if (msg->knictl.kni_cmd == FF_KNICTL_CMD_SET) {
        switch (msg->knictl.kni_action) {
            case FF_KNICTL_ACTION_ALL_TO_FF:
                knictl_action = FF_KNICTL_ACTION_ALL_TO_FF;
                msg->result = 0;
                printf("new kni action: alltoff\n");
                break;
            case FF_KNICTL_ACTION_ALL_TO_KNI:
                knictl_action = FF_KNICTL_ACTION_ALL_TO_KNI;
                msg->result = 0;
                printf("new kni action: alltokni\n");
                break;
            case FF_KNICTL_ACTION_DEFAULT:
                knictl_action = FF_KNICTL_ACTION_DEFAULT;
                msg->result = 0;
                printf("new kni action: default\n");
                break;
            default:
                msg->result = -1;
        }
    } else if (msg->knictl.kni_cmd == FF_KNICTL_CMD_GET) {
        msg->knictl.kni_action = knictl_action;
        msg->result = 0;
    } else {
        msg->result = -2;
    }
}
#endif

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
#ifdef INET6
        case FF_IOCTL6:
#endif
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        case FF_TRAFFIC:
            handle_traffic_msg(msg);
            break;
#ifdef FF_KNI
        case FF_KNICTL:
            handle_knictl_msg(msg);
            break;
#endif
        default:
            handle_default_msg(msg);
            break;
    }
    rte_ring_enqueue(msg_ring[proc_id].ring[msg->msg_type], msg);
}

static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(ff_global_cfg.pcap.enable)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(ff_global_cfg.pcap.save_path, m_table[i],
               ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
        }
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    ff_traffic.tx_packets += ret;
    uint16_t i;
    for (i = 0; i < ret; i++) {
        ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]);
#ifdef FF_USE_PAGE_ARRAY
        if (qconf->tx_mbufs[port].bsd_m_table[i])
            ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs);
#endif
    }
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
#ifdef FF_USE_PAGE_ARRAY
            if (qconf->tx_mbufs[port].bsd_m_table[ret])
                ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]);
#endif
        } while (++ret < n);
    }
    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}

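/*
 * Transmit path: copy the FreeBSD mbuf chain into a chain of rte_mbufs
 * (up to RTE_MBUF_DEFAULT_DATAROOM bytes per segment), translate the
 * requested checksum/TSO offloads into ol_flags on the head mbuf, and
 * queue it via send_single_packet(). With FF_USE_PAGE_ARRAY the copy is
 * skipped and the bsd mbuf is handed to ff_if_send_onepkt() instead.
 */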
int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
#ifdef FF_USE_PAGE_ARRAY
    struct lcore_conf *qconf = &lcore_conf;
    int len = 0;

    len = ff_if_send_onepkt(ctx, m, total);
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, ctx->port_id);
        len = 0;
    }
    qconf->tx_mbufs[ctx->port_id].len = len;
    return 0;
#endif
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        head->nb_segs++;

        prev = cur;
        void *data = rte_pktmbuf_mtod(cur, void *);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        cur->data_len = len;
        off += len;
        total -= len;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void *);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct rte_ipv4_hdr *iph;
        int iph_len;
        iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = RTE_ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct rte_ipv4_hdr *iph;
        int iph_len;
        iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = RTE_ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         *  TCP segmentation offload.
         *
         *  - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *    implies PKT_TX_TCP_CKSUM)
         *  - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *    write the IP checksum to 0 in the packet
         *  - fill the mbuf offload information: l2_len,
         *    l3_len, l4_len, tso_segsz
         *  - calculate the pseudo header checksum without taking ip_len
         *    in account, and set it in the TCP header. Refer to
         *    rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *    used as helpers.
         */
        if (offload.tso_seg_size) {
            struct rte_tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct rte_tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = RTE_ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

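/*
 * Per-lcore event loop: run expired timers, drain pending TX bursts at
 * most every pkt_tx_delay microseconds, poll the dispatch ring and the
 * NIC RX queues, service the message ring, invoke the user loop
 * callback, and optionally usleep() when a pass did no work. The
 * rte_rdtsc() samples split each pass into sys/usr/idle time for
 * ff_top_status.
 */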
static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc,
        sys_tsc, end_tsc, idle_sleep_tsc;
    int i, j, nb_rx, idle;
    uint16_t port_id, queue_id;
    struct lcore_conf *qconf;
    uint64_t drain_tsc = 0;
    struct ff_dpdk_if_context *ctx;

    if (pkt_tx_delay) {
        drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * pkt_tx_delay;
    }

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc >= drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packet from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }
#endif

            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                        pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                        j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc >= drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        idle_sleep_tsc = rte_rdtsc();
        if (likely(idle && idle_sleep)) {
            usleep(idle_sleep);
            end_tsc = rte_rdtsc();
        } else {
            end_tsc = idle_sleep_tsc;
        }

        if (usch_tsc == cur_tsc) {
            usr_tsc = idle_sleep_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_top_status.sys_tsc += sys_tsc;
        }

        ff_top_status.usr_tsc += usr_tsc;
        ff_top_status.work_tsc += end_tsc - cur_tsc;
        ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_top_status.loops++;
    }

    return 0;
}

int
ff_dpdk_if_up(void) {
    int i;
    struct lcore_conf *qconf = &lcore_conf;
    for (i = 0; i < qconf->nb_tx_port; i++) {
        uint16_t port_id = qconf->tx_port_id[i];

        struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
        veth_ctx[port_id] = ff_veth_attach(pconf);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed\n");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg) {
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1<<(7-b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i+4] & (1<<(7-b))))
                v |= 1;
        }
    }
    return (hash);
}

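/*
 * Check whether the NIC would steer a locally chosen 4-tuple back to
 * this lcore's own queue. This mirrors set_rss_table(): the hardware
 * picks reta[hash % reta_size] and the table was filled round-robin
 * with i % nb_queues, so the target queue is
 * (hash & (reta_size - 1)) % nb_queues (reta_size is a power of two,
 * see the assertion in init_port_start()).
 */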
int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;
    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];

    if (nb_queues <= 1) {
        return 1;
    }

    uint16_t reta_size = rss_reta_size[ctx->port_id];
    uint16_t queueid = qconf->tx_queue_id[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = 0;
    hash = toeplitz_hash(rsskey_len, rsskey, datalen, data);

    return ((hash & (reta_size - 1)) % nb_queues) == queueid;
}

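/*
 * A minimal sketch (hypothetical example) of a custom dispatcher that
 * steers packets by a value parsed from the frame, e.g. a UDP port:
 *
 *   static int
 *   my_dispatch(void *data, uint16_t *len, uint16_t queue_id,
 *       uint16_t nb_queues)
 *   {
 *       // parse headers from data (*len bytes); pick a target queue,
 *       // or return FF_DISPATCH_ERROR to drop the packet.
 *       return some_hash % nb_queues;
 *   }
 *
 *   ff_regist_packet_dispatcher(my_dispatch);
 */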
void
ff_regist_packet_dispatcher(dispatch_func_t func)
{
    packet_dispatcher = func;
}

uint64_t
ff_get_tsc_ns(void)
{
    uint64_t cur_tsc = rte_rdtsc();
    uint64_t hz = rte_get_tsc_hz();
    return ((double)cur_tsc / (double)hz) * NS_PER_S;
}