xref: /f-stack/lib/ff_dpdk_if.c (revision 32ff8fda)
/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_eth_bond.h>
#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"
#include "ff_memory.h"

#ifdef FF_KNI
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

int enable_kni;
static int kni_accept;
static int knictl_action = FF_KNICTL_ACTION_DEFAULT;
#endif

static int numa_on;

static unsigned idle_sleep;
static unsigned pkt_tx_delay;

static struct rte_timer freebsd_clock;
/* Default RSS key: the 40-byte key used by Mellanox's Linux driver */
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

static uint8_t default_rsskey_52bytes[52] = {
    0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23,
    0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30,
    0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02,
    0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c,
    0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55,
    0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e,
    0x81, 0x15, 0x03, 0x66
};

static uint8_t symmetric_rsskey[52] = {
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a
};

static int rsskey_len = sizeof(default_rsskey_40bytes);
static uint8_t *rsskey = default_rsskey_40bytes;

struct lcore_conf lcore_conf;

struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

#define BOND_DRIVER_NAME    "net_bonding"

static inline int send_single_packet(struct rte_mbuf *m, uint8_t port);

struct ff_msg_ring {
    char ring_name[FF_MSG_NUM][RTE_RING_NAMESIZE];
    /* ring[0]: other processes enqueue requests, the f-stack lcore dequeues them */
    /* ring[1..]: the f-stack lcore enqueues replies, other processes dequeue them */
    struct rte_ring *ring[FF_MSG_NUM];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;
static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

static struct ff_top_args ff_top_status;
static struct ff_traffic_args ff_traffic;
extern void ff_hardclock(void);

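/*
 * Periodic timer callback: drives the FreeBSD network stack's
 * hardclock() tick and refreshes f-stack's cached wall-clock time.
 */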
static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
    ff_update_current_ts();
}

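/*
 * Called by the FreeBSD veth layer when an interface is created: binds
 * the stack's softc/ifp pointers to a DPDK port id and its HW features.
 */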
struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

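/*
 * Poll all configured ports until every link reports up, or until the
 * ~9 s timeout expires; print the final status of each port either way.
 */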
static void
check_all_ports_link_status(void)
{
    #define CHECK_INTERVAL 100 /* 100ms */
    #define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u "
                        "Mbps - %s\n", (int)portid,
                        (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

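/*
 * Map this process (proc_id) to its lcore and record which port/queue
 * pairs it owns; a queue index is the lcore's position in the port's
 * lcore_list, so queue N of a port is always served by the same process.
 */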
static int
init_lcore_conf(void)
{
    uint8_t nb_dev_ports = rte_eth_dev_count_avail();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
                 ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    if (!rte_lcore_is_enabled(lcore_id)) {
        rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
    }

    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %d\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        /* Enable pcap dump */
        if (ff_global_cfg.pcap.enable) {
            ff_enable_pcap(ff_global_cfg.pcap.save_path, ff_global_cfg.pcap.snap_len);
        }

        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}

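/*
 * Create (primary process) or look up (secondary) the per-NUMA-socket
 * mbuf pool. The pool is sized for the worst case: all RX/TX descriptors
 * in flight, per-lcore bursts and mempool caches, dispatch-ring slots,
 * and (with FF_KNI) the KNI queues, rounded up to at least 8192 mbufs.
 */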
static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;
    uint16_t max_portid = ff_global_cfg.dpdk.max_portid;

    unsigned nb_mbuf = RTE_ALIGN_CEIL(
        (nb_rx_queue * (max_portid + 1) * 2 * RX_QUEUE_SIZE          +
        nb_ports * (max_portid + 1) * 2 * nb_lcores * MAX_PKT_BURST    +
        nb_ports * (max_portid + 1) * 2 * nb_tx_queue * TX_QUEUE_SIZE  +
        nb_lcores * MEMPOOL_CACHE_SIZE +
#ifdef FF_KNI
        nb_ports * KNI_MBUF_MAX +
        nb_ports * KNI_QUEUE_SIZE +
#endif
        nb_lcores * nb_ports * DISPATCH_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }

#ifdef FF_USE_PAGE_ARRAY
        nb_mbuf = RTE_ALIGN_CEIL(
            nb_ports*nb_lcores*MAX_PKT_BURST    +
            nb_ports*nb_tx_queue*TX_QUEUE_SIZE  +
            nb_lcores*MEMPOOL_CACHE_SIZE,
            (unsigned)4096);
        ff_init_ref_pool(nb_mbuf, socketid);
#endif
    }

    return 0;
}

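/*
 * Ring helper following DPDK's multi-process pattern: the primary
 * process creates the named ring, secondaries attach to it by lookup.
 */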
static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL) {
        rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ring = rte_ring_create(name, count, socket_id, flags);
    } else {
        ring = rte_ring_lookup(name);
    }

    if (ring == NULL) {
        rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
    }

    return ring;
}

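/*
 * One dispatch ring per (port, queue): when a packet's dispatch result
 * says it belongs to another queue, it is handed to the owning lcore
 * through these single-consumer rings.
 */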
static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create ring according to ports actually being used. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

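/*
 * Mempool object constructor for control messages: each ff_msg carries
 * its payload buffer inline, immediately after the message header.
 */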
static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
    msg->original_buf = NULL;
    msg->original_buf_len = 0;
}

static int
init_msg_ring(void)
{
    uint16_t i, j;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
           MSG_RING_SIZE * 2 * nb_procs,
           MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
           NULL, NULL, ff_msg_init, NULL,
           socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[0]);

        for (j = FF_SYSCTL; j < FF_MSG_NUM; j++) {
            snprintf(msg_ring[i].ring_name[j], RTE_RING_NAMESIZE,
                "%s%u_%u", FF_MSG_RING_OUT, i, j);
            msg_ring[i].ring[j] = create_ring(msg_ring[i].ring_name[j],
                MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
            if (msg_ring[i].ring[j] == NULL)
                rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[j]);
        }
    }

    return 0;
}

#ifdef FF_KNI

static enum FF_KNICTL_CMD
get_kni_action(const char *c)
{
    if (!c)
        return FF_KNICTL_ACTION_DEFAULT;
    if (strcasecmp(c, "alltokni") == 0)
        return FF_KNICTL_ACTION_ALL_TO_KNI;
    if (strcasecmp(c, "alltoff") == 0)
        return FF_KNICTL_ACTION_ALL_TO_FF;
    return FF_KNICTL_ACTION_DEFAULT;
}

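/*
 * Set up KNI: parse the accept method and default action from the
 * config, then allocate one KNI device per configured port.
 */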
static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count_avail();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    knictl_action = get_kni_action(ff_global_cfg.kni.kni_action);

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
#endif

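/*
 * Fill the NIC's RSS redirection table (RETA) so that its reta_size
 * buckets are spread round-robin across the configured RX queues.
 */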
static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}

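/*
 * Per-port bring-up: query device capabilities, pick the RSS key, enable
 * whatever offloads the hardware supports (checksum, VLAN strip, TSO),
 * configure RX/TX queues, then start the port. In a bonding setup the
 * slave ports are initialized first, then the bonded device itself.
 */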
static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i, j;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id, u_port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[u_port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        for (j = 0; j <= pconf->nb_slaves; j++) {
            if (j < pconf->nb_slaves) {
                port_id = pconf->slave_portid_list[j];
                printf("To init %s's slave %d, port[%d]\n",
                        ff_global_cfg.dpdk.bond_cfgs->name,
                        j, port_id);
            } else {
                port_id = u_port_id;
            }

            struct rte_eth_dev_info dev_info;
            struct rte_eth_conf port_conf = {0};
            struct rte_eth_rxconf rxq_conf;
            struct rte_eth_txconf txq_conf;

            int ret = rte_eth_dev_info_get(port_id, &dev_info);
            if (ret != 0)
                rte_exit(EXIT_FAILURE,
                    "Error during getting device (port %u) info: %s\n",
                    port_id, strerror(-ret));

            if (nb_queues > dev_info.max_rx_queues) {
                rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_rx_queues);
            }

            if (nb_queues > dev_info.max_tx_queues) {
                rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_tx_queues);
            }

            struct rte_ether_addr addr;
            rte_eth_macaddr_get(port_id, &addr);
            printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                       " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                    (unsigned)port_id,
                    addr.addr_bytes[0], addr.addr_bytes[1],
                    addr.addr_bytes[2], addr.addr_bytes[3],
                    addr.addr_bytes[4], addr.addr_bytes[5]);

            rte_memcpy(pconf->mac,
                addr.addr_bytes, RTE_ETHER_ADDR_LEN);

            /* Set RSS mode */
            uint64_t default_rss_hf = ETH_RSS_PROTO_MASK;
            port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
            port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf;
            if (dev_info.hash_key_size == 52) {
                rsskey = default_rsskey_52bytes;
                rsskey_len = 52;
            }
            if (ff_global_cfg.dpdk.symmetric_rss) {
                printf("Use symmetric Receive-side Scaling (RSS) key\n");
                rsskey = symmetric_rsskey;
            }
            port_conf.rx_adv_conf.rss_conf.rss_key = rsskey;
            port_conf.rx_adv_conf.rss_conf.rss_key_len = rsskey_len;
            port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads;
            if (port_conf.rx_adv_conf.rss_conf.rss_hf !=
                    ETH_RSS_PROTO_MASK) {
                printf("Port %u modified RSS hash function based on hardware support, "
                        "requested:%#"PRIx64" configured:%#"PRIx64"\n",
                        port_id, default_rss_hf,
                        port_conf.rx_adv_conf.rss_conf.rss_hf);
            }

            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) {
                port_conf.txmode.offloads |=
                    DEV_TX_OFFLOAD_MBUF_FAST_FREE;
            }

            /* Set Rx VLAN stripping */
            if (ff_global_cfg.dpdk.vlan_strip) {
                if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                    port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
                }
            }

            /* Enable HW CRC stripping */
            port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_KEEP_CRC;

            /* FIXME: enable TCP LRO? */
            #if 0
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
                printf("LRO is supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
                pconf->hw_features.rx_lro = 1;
            }
            #endif

            /* Set Rx checksum checking */
            if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
                printf("RX checksum offload supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
                pconf->hw_features.rx_csum = 1;
            }

            if (ff_global_cfg.dpdk.tx_csum_offoad_skip == 0) {
                if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
                    printf("TX ip checksum offload supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
                    pconf->hw_features.tx_csum_ip = 1;
                }

                if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
                    (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
                    printf("TX TCP&UDP checksum offload supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM;
                    pconf->hw_features.tx_csum_l4 = 1;
                }
            } else {
                printf("TX checksum offload is disabled\n");
            }

            if (ff_global_cfg.dpdk.tso) {
                if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                    printf("TSO is supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
                    pconf->hw_features.tx_tso = 1;
                }
            } else {
                printf("TSO is disabled\n");
            }

            if (dev_info.reta_size) {
                /* reta size must be power of 2 */
                assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

                rss_reta_size[port_id] = dev_info.reta_size;
                printf("port[%d]: rss table size: %d\n", port_id,
                    dev_info.reta_size);
            }

            if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
                continue;
            }

            ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
            if (ret != 0) {
                return ret;
            }

            static uint16_t nb_rxd = RX_QUEUE_SIZE;
            static uint16_t nb_txd = TX_QUEUE_SIZE;
            ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
            if (ret < 0)
                printf("Could not adjust number of descriptors "
                        "for port%u (%d)\n", (unsigned)port_id, ret);

            uint16_t q;
            for (q = 0; q < nb_queues; q++) {
                if (numa_on) {
                    uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                    socketid = rte_lcore_to_socket_id(lcore_id);
                }
                mbuf_pool = pktmbuf_pool[socketid];

                txq_conf = dev_info.default_txconf;
                txq_conf.offloads = port_conf.txmode.offloads;
                ret = rte_eth_tx_queue_setup(port_id, q, nb_txd,
                    socketid, &txq_conf);
                if (ret < 0) {
                    return ret;
                }

                rxq_conf = dev_info.default_rxconf;
                rxq_conf.offloads = port_conf.rxmode.offloads;
                ret = rte_eth_rx_queue_setup(port_id, q, nb_rxd,
                    socketid, &rxq_conf, mbuf_pool);
                if (ret < 0) {
                    return ret;
                }
            }

            if (strncmp(dev_info.driver_name, BOND_DRIVER_NAME,
                    strlen(dev_info.driver_name)) == 0) {

                rte_eth_macaddr_get(port_id, &addr);
                printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                           " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                        (unsigned)port_id,
                        addr.addr_bytes[0], addr.addr_bytes[1],
                        addr.addr_bytes[2], addr.addr_bytes[3],
                        addr.addr_bytes[4], addr.addr_bytes[5]);

                rte_memcpy(pconf->mac,
                    addr.addr_bytes, RTE_ETHER_ADDR_LEN);

                int mode, count, x;
                uint16_t slaves[RTE_MAX_ETHPORTS], len = RTE_MAX_ETHPORTS;

                mode = rte_eth_bond_mode_get(port_id);
                printf("Port %u, bond mode:%d\n", port_id, mode);

                count = rte_eth_bond_slaves_get(port_id, slaves, len);
                printf("Port %u, %s's slave ports count:%d\n", port_id,
                            ff_global_cfg.dpdk.bond_cfgs->name, count);
                for (x = 0; x < count; x++) {
                    printf("Port %u, %s's slave port[%u]\n", port_id,
                            ff_global_cfg.dpdk.bond_cfgs->name, slaves[x]);
                }
            }

            ret = rte_eth_dev_start(port_id);
            if (ret < 0) {
                return ret;
            }

            if (nb_queues > 1) {
                /* set HW rss hash function to Toeplitz. */
                if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                    struct rte_eth_hash_filter_info info = {0};
                    info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                    info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                    if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                        RTE_ETH_FILTER_SET, &info) < 0) {
                        rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                            port_id);
                    }
                }

                set_rss_table(port_id, dev_info.reta_size, nb_queues);
            }

            /* Enable RX in promiscuous mode for the Ethernet device. */
            if (ff_global_cfg.dpdk.promiscuous) {
                ret = rte_eth_promiscuous_enable(port_id);
                if (ret == 0) {
                    printf("set port %u to promiscuous mode ok\n", port_id);
                } else {
                    printf("set port %u to promiscuous mode error\n", port_id);
                }
            }
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

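/*
 * Arm a periodic DPDK timer that fires once per FreeBSD tick: with
 * freebsd.hz = 100, intrs is 10 ms and tsc is the equivalent number of
 * TSC cycles, so ff_hardclock_job() runs every 10 ms.
 */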
static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    idle_sleep = ff_global_cfg.dpdk.idle_sleep;
    pkt_tx_delay = ff_global_cfg.dpdk.pkt_tx_delay > BURST_TX_DRAIN_US ?
        BURST_TX_DRAIN_US : ff_global_cfg.dpdk.pkt_tx_delay;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

#ifdef FF_KNI
    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }
#endif

#ifdef FF_USE_PAGE_ARRAY
    ff_mmap_init();
#endif

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

    return 0;
}

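/*
 * Hand a received mbuf (possibly a chain) to the FreeBSD stack: wrap
 * each DPDK segment in a BSD mbuf without copying, drop packets whose
 * hardware checksum validation failed, and propagate stripped VLAN tags.
 */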
static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    void *data = rte_pktmbuf_mtod(pkt, void*);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    if (pkt->ol_flags & PKT_RX_VLAN_STRIPPED) {
        ff_mbuf_set_vlan_info(hdr, pkt->vlan_tci);
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void*);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

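/*
 * Classify a frame by EtherType (looking through one VLAN tag) to
 * decide whether it goes to the stack, to KNI, or is broadcast to all
 * queues (ARP/NDP).
 */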
static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < RTE_ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct rte_ether_hdr *hdr;
    const struct rte_vlan_hdr *vlanhdr;
    hdr = (const struct rte_ether_hdr *)data;
    uint16_t ether_type = rte_be_to_cpu_16(hdr->ether_type);
    data += RTE_ETHER_HDR_LEN;
    len -= RTE_ETHER_HDR_LEN;

    if (ether_type == RTE_ETHER_TYPE_VLAN) {
        vlanhdr = (const struct rte_vlan_hdr *)data;
        ether_type = rte_be_to_cpu_16(vlanhdr->eth_proto);
        data += sizeof(struct rte_vlan_hdr);
        len -= sizeof(struct rte_vlan_hdr);
    }

    if (ether_type == RTE_ETHER_TYPE_ARP)
        return FILTER_ARP;

#ifdef INET6
    if (ether_type == RTE_ETHER_TYPE_IPV6) {
        return ff_kni_proto_filter(data,
            len, ether_type);
    }
#endif

#ifndef FF_KNI
    return FILTER_UNKNOWN;
#else
    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ether_type != RTE_ETHER_TYPE_IPV4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data,
        len, ether_type);
#endif
}

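/*
 * Copy one segment's payload and metadata from m into mi; unlike
 * rte_pktmbuf_attach() this is a deep copy, so the clone does not share
 * memory with (and can outlive) the original mbuf.
 */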
static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}

/* Deep-copy clone, adapted from rte_pktmbuf_clone() */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of a new segment failed */
    if (unlikely(mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}

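/*
 * Main RX path. For each packet: optionally dump to pcap, account
 * traffic, run the user dispatcher (which may answer directly, re-queue
 * to another lcore, or drop), then classify: ARP/NDP is deep-cloned to
 * every other queue (and KNI) so each process can update its own
 * neighbor tables, everything else goes to the stack or to KNI
 * according to the filter result and the current knictl action.
 */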
static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(ff_global_cfg.pcap.enable)) {
            if (!pkts_from_ring) {
                ff_dump_packets(ff_global_cfg.pcap.save_path, rtem,
                    ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void*);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring) {
            ff_traffic.rx_packets++;
            ff_traffic.rx_bytes += len;
        }

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues);
            if (ret == FF_DISPATCH_RESPONSE) {
                rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len;

                /*
                 * VLAN tag re-insertion on TX is not supported yet, so
                 * put the stripped VLAN header back by hand.
                 */
                if (rtem->vlan_tci) {
                    data = rte_pktmbuf_prepend(rtem, sizeof(struct rte_vlan_hdr));
                    if (data != NULL) {
                        memmove(data, data + sizeof(struct rte_vlan_hdr), RTE_ETHER_HDR_LEN);
                        struct rte_ether_hdr *etherhdr = (struct rte_ether_hdr *)data;
                        struct rte_vlan_hdr *vlanhdr = (struct rte_vlan_hdr *)(data + RTE_ETHER_HDR_LEN);
                        vlanhdr->vlan_tci = rte_cpu_to_be_16(rtem->vlan_tci);
                        vlanhdr->eth_proto = etherhdr->ether_type;
                        etherhdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN);
                    }
                }
                send_single_packet(rtem, port_id);
                continue;
            }

            if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
#ifdef INET6
        if (filter == FILTER_ARP || filter == FILTER_NDP) {
#else
        if (filter == FILTER_ARP) {
#endif
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni) {
            if (knictl_action == FF_KNICTL_ACTION_ALL_TO_KNI) {
                ff_kni_enqueue(port_id, rtem);
            } else if (knictl_action == FF_KNICTL_ACTION_ALL_TO_FF) {
                ff_veth_input(ctx, rtem);
            } else if (knictl_action == FF_KNICTL_ACTION_DEFAULT) {
                if ((filter == FILTER_KNI && kni_accept) ||
                        (filter == FILTER_UNKNOWN && !kni_accept)) {
                    ff_kni_enqueue(port_id, rtem);
                } else {
                    ff_veth_input(ctx, rtem);
                }
            } else {
                ff_veth_input(ctx, rtem);
            }
#endif
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* Dequeue packets that other lcores redirected to this queue
     * and process them as a normal burst. */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
#ifdef INET6
    if (msg->msg_type == FF_IOCTL6) {
        fd = ff_socket(AF_INET6, SOCK_DGRAM, 0);
    } else
#endif
        fd = ff_socket(AF_INET, SOCK_DGRAM, 0);

    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_top_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_traffic_msg(struct ff_msg *msg)
{
    msg->traffic = ff_traffic;
    msg->result = 0;
}

#ifdef FF_KNI
static inline void
handle_knictl_msg(struct ff_msg *msg)
{
    if (msg->knictl.kni_cmd == FF_KNICTL_CMD_SET) {
        switch (msg->knictl.kni_action) {
            case FF_KNICTL_ACTION_ALL_TO_FF:
                knictl_action = FF_KNICTL_ACTION_ALL_TO_FF;
                msg->result = 0;
                printf("new kni action: alltoff\n");
                break;
            case FF_KNICTL_ACTION_ALL_TO_KNI:
                knictl_action = FF_KNICTL_ACTION_ALL_TO_KNI;
                msg->result = 0;
                printf("new kni action: alltokni\n");
                break;
            case FF_KNICTL_ACTION_DEFAULT:
                knictl_action = FF_KNICTL_ACTION_DEFAULT;
                msg->result = 0;
                printf("new kni action: default\n");
                break;
            default:
                msg->result = -1;
        }
    } else if (msg->knictl.kni_cmd == FF_KNICTL_CMD_GET) {
        msg->knictl.kni_action = knictl_action;
        msg->result = 0;
    } else {
        msg->result = -2;
    }
}
#endif

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
#ifdef INET6
        case FF_IOCTL6:
#endif
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        case FF_TRAFFIC:
            handle_traffic_msg(msg);
            break;
#ifdef FF_KNI
        case FF_KNICTL:
            handle_knictl_msg(msg);
            break;
#endif
        default:
            handle_default_msg(msg);
            break;
    }
    rte_ring_enqueue(msg_ring[proc_id].ring[msg->msg_type], msg);
}

static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(ff_global_cfg.pcap.enable)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(ff_global_cfg.pcap.save_path, m_table[i],
               ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
        }
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    ff_traffic.tx_packets += ret;
    uint16_t i;
    for (i = 0; i < ret; i++) {
        ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]);
#ifdef FF_USE_PAGE_ARRAY
        if (qconf->tx_mbufs[port].bsd_m_table[i])
            ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs);
#endif
    }
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
#ifdef FF_USE_PAGE_ARRAY
            if (qconf->tx_mbufs[port].bsd_m_table[ret])
                ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]);
#endif
        } while (++ret < n);
    }
    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}

int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
#ifdef FF_USE_PAGE_ARRAY
    struct lcore_conf *qconf = &lcore_conf;
    int len = 0;

    len = ff_if_send_onepkt(ctx, m, total);
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, ctx->port_id);
        len = 0;
    }
    qconf->tx_mbufs[ctx->port_id].len = len;
    return 0;
#endif
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        head->nb_segs++;

        prev = cur;
        void *data = rte_pktmbuf_mtod(cur, void*);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        cur->data_len = len;
        off += len;
        total -= len;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void*);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct rte_ipv4_hdr *iph;
        int iph_len;
        iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = RTE_ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct rte_ipv4_hdr *iph;
        int iph_len;
        iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = RTE_ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         *  TCP segmentation offload.
         *
         *  - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *    implies PKT_TX_TCP_CKSUM)
         *  - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *    write the IP checksum to 0 in the packet
         *  - fill the mbuf offload information: l2_len,
         *    l3_len, l4_len, tso_segsz
         *  - calculate the pseudo header checksum without taking ip_len
         *    in account, and set it in the TCP header. Refer to
         *    rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *    used as helpers.
         */
        if (offload.tso_seg_size) {
            struct rte_tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct rte_tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = RTE_ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

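/*
 * Per-lcore event loop: manage timers, drain pending TX bursts after
 * pkt_tx_delay microseconds, poll every owned RX queue (plus the
 * dispatch ring and, on the primary, KNI), run the user's loop
 * callback, and optionally usleep() when a pass did no work. The tail
 * of the loop accounts sys/usr/idle TSC cycles for ff_top.
 */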
static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc;
    int i, j, nb_rx, idle;
    uint16_t port_id, queue_id;
    struct lcore_conf *qconf;
    uint64_t drain_tsc = 0;
    struct ff_dpdk_if_context *ctx;

    if (pkt_tx_delay) {
        drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * pkt_tx_delay;
    }

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc >= drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packet from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }
#endif

            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                        pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                        j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc >= drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        idle_sleep_tsc = rte_rdtsc();
        if (likely(idle && idle_sleep)) {
            usleep(idle_sleep);
            end_tsc = rte_rdtsc();
        } else {
            end_tsc = idle_sleep_tsc;
        }

        if (usch_tsc == cur_tsc) {
            usr_tsc = idle_sleep_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_top_status.sys_tsc += sys_tsc;
        }

        ff_top_status.usr_tsc += usr_tsc;
        ff_top_status.work_tsc += end_tsc - cur_tsc;
        ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_top_status.loops++;
    }

    return 0;
}

int
ff_dpdk_if_up(void)
{
    int i;
    struct lcore_conf *qconf = &lcore_conf;
    for (i = 0; i < qconf->nb_tx_port; i++) {
        uint16_t port_id = qconf->tx_port_id[i];

        struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
        veth_ctx[port_id] = ff_veth_attach(pconf);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg)
{
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    if (lr == NULL) {
        rte_exit(EXIT_FAILURE, "rte_malloc loop_routine failed\n");
    }
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

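/*
 * Software Toeplitz hash (the same algorithm the NIC uses for RSS),
 * taken from FreeBSD: for every set bit of the input, XOR in a 32-bit
 * window of the key that slides one bit per input bit.
 */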
static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1<<(7-b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i+4] & (1<<(7-b))))
                v |= 1;
        }
    }
    return (hash);
}

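/*
 * Recompute the RSS hash of a 4-tuple in software and check whether the
 * NIC's redirection table would steer it to this lcore's own queue, so
 * the stack can pick local ports whose return traffic lands on the
 * core that owns the connection.
 */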
int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;
    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];

    if (nb_queues <= 1) {
        return 1;
    }

    uint16_t reta_size = rss_reta_size[ctx->port_id];
    uint16_t queueid = qconf->tx_queue_id[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = 0;
    hash = toeplitz_hash(rsskey_len, rsskey, datalen, data);

    return ((hash & (reta_size - 1)) % nb_queues) == queueid;
}

void
ff_regist_packet_dispatcher(dispatch_func_t func)
{
    packet_dispatcher = func;
}

uint64_t
ff_get_tsc_ns(void)
{
    uint64_t cur_tsc = rte_rdtsc();
    uint64_t hz = rte_get_tsc_hz();
    return ((double)cur_tsc / (double)hz) * NS_PER_S;
}
1818