xref: /f-stack/lib/ff_dpdk_if.c (revision e2391e5e)
/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"
#include "ff_memory.h"

#ifdef FF_KNI
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

int enable_kni;
static int kni_accept;
#endif

static int numa_on;

static unsigned idle_sleep;
static unsigned pkt_tx_delay;

static struct rte_timer freebsd_clock;

/* Default 40-byte RSS hash key, as used by Mellanox's Linux driver. */
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

/* 52-byte RSS hash key for NICs whose PMD reports hash_key_size == 52. */
static int use_rsskey_52bytes = 0;
static uint8_t default_rsskey_52bytes[52] = {
    0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23,
    0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30,
    0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02,
    0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c,
    0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55,
    0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e,
    0x81, 0x15, 0x03, 0x66
};
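
/*
 * Whichever key the NIC is configured with must also be used for the
 * software Toeplitz hash in ff_rss_check() below; otherwise the hash
 * computed on the host would disagree with the one computed by the NIC
 * and packets would be attributed to the wrong queue. init_port_start()
 * selects the 52-byte key when the PMD reports hash_key_size == 52, and
 * the 40-byte key otherwise.
 */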

struct lcore_conf lcore_conf;

struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

static inline int send_single_packet(struct rte_mbuf *m, uint8_t port);

struct ff_msg_ring {
    char ring_name[FF_MSG_NUM][RTE_RING_NAMESIZE];
    /* ring[0]: the lcore receives requests, other processes send.   */
    /* ring[1..]: the lcore sends replies, other processes read them. */
    struct rte_ring *ring[FF_MSG_NUM];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;
static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

static struct ff_top_args ff_top_status;
static struct ff_traffic_args ff_traffic;
extern void ff_hardclock(void);

static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
    ff_update_current_ts();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
    #define CHECK_INTERVAL 100 /* 100ms */
    #define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u "
                        "Mbps - %s\n", (int)portid,
                        (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

static int
init_lcore_conf(void)
{
    uint8_t nb_dev_ports = rte_eth_dev_count_avail();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
                 ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    uint16_t proc_id;
    for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) {
        uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id];
        if (!lcore_config[lcore_id].detected) {
            rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
        }
    }

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %d\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        lcore_conf.pcap[port_id] = pconf->pcap;
        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}
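
/*
 * Illustration (hypothetical config): if port 0 is bound to lcores {1, 2}
 * and this process runs on lcore 2, queueid resolves to 1, so the process
 * polls rx queue 1 and owns tx queue 1 of port 0. Each process thus gets
 * exactly one rx/tx queue pair per port it serves.
 */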

static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;

    unsigned nb_mbuf = RTE_ALIGN_CEIL(
        (nb_rx_queue*RX_QUEUE_SIZE          +
        nb_ports*nb_lcores*MAX_PKT_BURST    +
        nb_ports*nb_tx_queue*TX_QUEUE_SIZE  +
        nb_lcores*MEMPOOL_CACHE_SIZE +
#ifdef FF_KNI
        nb_ports*KNI_MBUF_MAX +
        nb_ports*KNI_QUEUE_SIZE +
#endif
        nb_lcores*nb_ports*DISPATCH_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %u of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %u\n", socketid);
        } else {
            printf("create mbuf pool on socket %u\n", socketid);
        }

#ifdef FF_USE_PAGE_ARRAY
        nb_mbuf = RTE_ALIGN_CEIL(
            nb_ports*nb_lcores*MAX_PKT_BURST    +
            nb_ports*nb_tx_queue*TX_QUEUE_SIZE  +
            nb_lcores*MEMPOOL_CACHE_SIZE,
            (unsigned)4096);
        ff_init_ref_pool(nb_mbuf, socketid);
#endif
    }

    return 0;
}
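
/*
 * Sizing sketch (hypothetical constants): with 1 port, 2 processes,
 * RX_QUEUE_SIZE = TX_QUEUE_SIZE = 512, MAX_PKT_BURST = 32,
 * MEMPOOL_CACHE_SIZE = 256 and DISPATCH_RING_SIZE = 2048, the sum above
 * (without KNI) is 2*512 + 1*2*32 + 1*2*512 + 2*256 + 2*1*2048 = 6720
 * mbufs, which RTE_ALIGN_CEIL rounds up to 8192.
 */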

static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL) {
        rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ring = rte_ring_create(name, count, socket_id, flags);
    } else {
        ring = rte_ring_lookup(name);
    }

    if (ring == NULL) {
        rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
    }

    return ring;
}
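
/*
 * Every shared DPDK object in this file (mbuf pools, dispatch rings,
 * message pool and rings) follows the same primary/secondary pattern:
 * the primary process creates the object, secondary processes attach to
 * it by name via the matching *_lookup() call, so the names must be
 * derived deterministically from port/queue/proc ids in every process.
 */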

static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create rings only for the ports actually in use. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
}

static int
init_msg_ring(void)
{
    uint16_t i, j;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
           MSG_RING_SIZE * 2 * nb_procs,
           MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
           NULL, NULL, ff_msg_init, NULL,
           socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[0]);

        for (j = FF_SYSCTL; j < FF_MSG_NUM; j++) {
            snprintf(msg_ring[i].ring_name[j], RTE_RING_NAMESIZE,
                "%s%u_%u", FF_MSG_RING_OUT, i, j);
            msg_ring[i].ring[j] = create_ring(msg_ring[i].ring_name[j],
                MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
            if (msg_ring[i].ring[j] == NULL)
                rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[j]);
        }
    }

    return 0;
}
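
/*
 * Per-process message plumbing: ring[0] (FF_MSG_RING_IN + proc_id)
 * carries requests from control tools into the lcore, and one reply ring
 * per message type (FF_MSG_RING_OUT + proc_id + type) carries results
 * back, so a tool waiting for an FF_TOP reply can never dequeue another
 * tool's FF_ROUTE reply. handle_msg() picks the reply ring by msg_type.
 */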

#ifdef FF_KNI
static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count_avail();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
#endif

static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}
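
/*
 * Example (hypothetical sizes): with reta_size = 128 and nb_queues = 3,
 * the indirection table becomes 0,1,2,0,1,2,..., spreading hash buckets
 * round-robin over the queues. ff_rss_check() reproduces this mapping in
 * software as ((hash & (reta_size - 1)) % nb_queues).
 */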

static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        struct rte_eth_dev_info dev_info;
        struct rte_eth_conf port_conf = {0};
        struct rte_eth_rxconf rxq_conf;
        struct rte_eth_txconf txq_conf;

        rte_eth_dev_info_get(port_id, &dev_info);

        if (nb_queues > dev_info.max_rx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] greater than max_rx_queues[%d]\n",
                nb_queues,
                dev_info.max_rx_queues);
        }

        if (nb_queues > dev_info.max_tx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] greater than max_tx_queues[%d]\n",
                nb_queues,
                dev_info.max_tx_queues);
        }

        struct ether_addr addr;
        rte_eth_macaddr_get(port_id, &addr);
        printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                   " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                (unsigned)port_id,
                addr.addr_bytes[0], addr.addr_bytes[1],
                addr.addr_bytes[2], addr.addr_bytes[3],
                addr.addr_bytes[4], addr.addr_bytes[5]);

        rte_memcpy(pconf->mac,
            addr.addr_bytes, ETHER_ADDR_LEN);

        /* Set RSS mode */
        uint64_t default_rss_hf = ETH_RSS_PROTO_MASK;
        port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
        port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf;
        if (dev_info.hash_key_size == 52) {
            port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_52bytes;
            port_conf.rx_adv_conf.rss_conf.rss_key_len = 52;
            use_rsskey_52bytes = 1;
        } else {
            port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
            port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;
        }
        port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads;
        if (port_conf.rx_adv_conf.rss_conf.rss_hf !=
                ETH_RSS_PROTO_MASK) {
            printf("Port %u modified RSS hash function based on hardware support, "
                    "requested:%#"PRIx64" configured:%#"PRIx64"\n",
                    port_id, default_rss_hf,
                    port_conf.rx_adv_conf.rss_conf.rss_hf);
        }

        if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) {
            port_conf.txmode.offloads |=
                DEV_TX_OFFLOAD_MBUF_FAST_FREE;
        }

        /* Set Rx VLAN stripping */
        if (ff_global_cfg.dpdk.vlan_strip) {
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
            }
        }

        /* Enable HW CRC stripping */
        port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_KEEP_CRC;

        /* FIXME: enable TCP LRO? */
        #if 0
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
            printf("LRO is supported\n");
            port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
            pconf->hw_features.rx_lro = 1;
        }
        #endif

        /* Set Rx checksum checking */
        if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
            printf("RX checksum offload supported\n");
            port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
            pconf->hw_features.rx_csum = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
            printf("TX ip checksum offload supported\n");
            port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
            pconf->hw_features.tx_csum_ip = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            printf("TX TCP&UDP checksum offload supported\n");
            port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM;
            pconf->hw_features.tx_csum_l4 = 1;
        }

        if (ff_global_cfg.dpdk.tso) {
            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                printf("TSO is supported\n");
                port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
                pconf->hw_features.tx_tso = 1;
            }
        } else {
            printf("TSO is disabled\n");
        }

        if (dev_info.reta_size) {
            /* reta size must be power of 2 */
            assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

            rss_reta_size[port_id] = dev_info.reta_size;
            printf("port[%d]: rss table size: %d\n", port_id,
                dev_info.reta_size);
        }

        if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
            continue;
        }

        int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
        if (ret != 0) {
            return ret;
        }

        static uint16_t nb_rxd = RX_QUEUE_SIZE;
        static uint16_t nb_txd = TX_QUEUE_SIZE;
        ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
        if (ret < 0)
            printf("Could not adjust number of descriptors "
                    "for port%u (%d)\n", (unsigned)port_id, ret);

        uint16_t q;
        for (q = 0; q < nb_queues; q++) {
            if (numa_on) {
                uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                socketid = rte_lcore_to_socket_id(lcore_id);
            }
            mbuf_pool = pktmbuf_pool[socketid];

            txq_conf = dev_info.default_txconf;
            txq_conf.offloads = port_conf.txmode.offloads;
            ret = rte_eth_tx_queue_setup(port_id, q, nb_txd,
                socketid, &txq_conf);
            if (ret < 0) {
                return ret;
            }

            rxq_conf = dev_info.default_rxconf;
            rxq_conf.offloads = port_conf.rxmode.offloads;
            ret = rte_eth_rx_queue_setup(port_id, q, nb_rxd,
                socketid, &rxq_conf, mbuf_pool);
            if (ret < 0) {
                return ret;
            }
        }

        ret = rte_eth_dev_start(port_id);
        if (ret < 0) {
            return ret;
        }

        if (nb_queues > 1) {
            /* set HW rss hash function to Toeplitz. */
            if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                struct rte_eth_hash_filter_info info = {0};
                info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                    RTE_ETH_FILTER_SET, &info) < 0) {
                    rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                        port_id);
                }
            }

            set_rss_table(port_id, dev_info.reta_size, nb_queues);
        }

        /* Enable RX in promiscuous mode for the Ethernet device. */
        if (ff_global_cfg.dpdk.promiscuous) {
            rte_eth_promiscuous_enable(port_id);
            ret = rte_eth_promiscuous_get(port_id);
            if (ret == 1) {
                printf("set port %u to promiscuous mode ok\n", port_id);
            } else {
                printf("set port %u to promiscuous mode error\n", port_id);
            }
        }

        /* Enable pcap dump */
        if (pconf->pcap) {
            ff_enable_pcap(pconf->pcap);
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}
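
/*
 * Worked example: with freebsd.hz = 100 the tick interval is
 * 1000 / 100 = 10 ms; on a 2 GHz TSC (hypothetical) that is
 * 2,000,000 cycles/ms * 10 = 20,000,000 cycles between expiries, and
 * main_loop() invokes rte_timer_manage() once freebsd_clock.expire has
 * passed.
 */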

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    idle_sleep = ff_global_cfg.dpdk.idle_sleep;
    pkt_tx_delay = ff_global_cfg.dpdk.pkt_tx_delay > BURST_TX_DRAIN_US ?
        BURST_TX_DRAIN_US : ff_global_cfg.dpdk.pkt_tx_delay;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

#ifdef FF_KNI
    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }
#endif

#ifdef FF_USE_PAGE_ARRAY
    ff_mmap_init();
#endif

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    void *data = rte_pktmbuf_mtod(pkt, void*);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    if (pkt->ol_flags & PKT_RX_VLAN_STRIPPED) {
        ff_mbuf_set_vlan_info(hdr, pkt->vlan_tci);
    }

    /* Attach the remaining segments of the rte_mbuf chain. */
    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void*);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct ether_hdr *hdr;
    const struct vlan_hdr *vlanhdr;
    hdr = (const struct ether_hdr *)data;
    uint16_t ether_type = rte_be_to_cpu_16(hdr->ether_type);
    data += ETHER_HDR_LEN;
    len -= ETHER_HDR_LEN;

    if (ether_type == ETHER_TYPE_VLAN) {
        vlanhdr = (const struct vlan_hdr *)data;
        ether_type = rte_be_to_cpu_16(vlanhdr->eth_proto);
        data += sizeof(struct vlan_hdr);
        len -= sizeof(struct vlan_hdr);
    }

    if (ether_type == ETHER_TYPE_ARP)
        return FILTER_ARP;

#ifdef INET6
    if (ether_type == ETHER_TYPE_IPv6) {
        return ff_kni_proto_filter(data,
            len, ether_type);
    }
#endif

#ifndef FF_KNI
    return FILTER_UNKNOWN;
#else
    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ether_type != ETHER_TYPE_IPv4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data,
        len, ether_type);
#endif
}

static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}

/* Modeled on rte_pktmbuf_clone(), but copies the payload segment by segment. */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of a new segment failed */
    if (unlikely(mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}
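
/*
 * A deep copy is used here rather than an indirect rte_pktmbuf_clone():
 * the clones produced in process_packets() are handed to the other
 * queues' dispatch rings and to KNI, which hold and free them
 * independently, so sharing one underlying data buffer would tie the
 * original mbuf's lifetime to every consumer.
 */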

static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(qconf->pcap[port_id] != NULL)) {
            if (!pkts_from_ring) {
                ff_dump_packets(qconf->pcap[port_id], rtem);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void*);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring) {
            ff_traffic.rx_packets++;
            ff_traffic.rx_bytes += len;
        }

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues);
            if (ret == FF_DISPATCH_RESPONSE) {
                rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len;

                /*
                 * VLAN tag re-insertion on TX is not offloaded yet, so
                 * rebuild the VLAN header in software before sending the
                 * frame back out.
                 */
                if (rtem->vlan_tci) {
                    data = rte_pktmbuf_prepend(rtem, sizeof(struct vlan_hdr));
                    if (data != NULL) {
                        memmove(data, data + sizeof(struct vlan_hdr), ETHER_HDR_LEN);
                        struct ether_hdr *etherhdr = (struct ether_hdr *)data;
                        struct vlan_hdr *vlanhdr = (struct vlan_hdr *)(data + ETHER_HDR_LEN);
                        vlanhdr->vlan_tci = rte_cpu_to_be_16(rtem->vlan_tci);
                        vlanhdr->eth_proto = etherhdr->ether_type;
                        etherhdr->ether_type = rte_cpu_to_be_16(ETHER_TYPE_VLAN);
                    }
                }
                send_single_packet(rtem, port_id);
                continue;
            }

            if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
#ifdef INET6
        if (filter == FILTER_ARP || filter == FILTER_NDP) {
#else
        if (filter == FILTER_ARP) {
#endif
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                /* Broadcast ARP/NDP to every other queue's dispatch ring. */
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni &&
            ((filter == FILTER_KNI && kni_accept) ||
            (filter == FILTER_UNKNOWN && !kni_accept))) {
            ff_kni_enqueue(port_id, rtem);
#endif
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* Drain packets that other queues steered to us and process them. */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
#ifdef INET6
    if (msg->msg_type == FF_IOCTL6) {
        fd = ff_socket(AF_INET6, SOCK_DGRAM, 0);
    } else
#endif
        fd = ff_socket(AF_INET, SOCK_DGRAM, 0);

    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_top_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_traffic_msg(struct ff_msg *msg)
{
    msg->traffic = ff_traffic;
    msg->result = 0;
}

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
#ifdef INET6
        case FF_IOCTL6:
#endif
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        case FF_TRAFFIC:
            handle_traffic_msg(msg);
            break;
        default:
            handle_default_msg(msg);
            break;
    }
    rte_ring_enqueue(msg_ring[proc_id].ring[msg->msg_type], msg);
}

static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(qconf->pcap[port] != NULL)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(qconf->pcap[port], m_table[i]);
        }
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    ff_traffic.tx_packets += ret;
    uint16_t i;
    for (i = 0; i < ret; i++) {
        ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]);
#ifdef FF_USE_PAGE_ARRAY
        if (qconf->tx_mbufs[port].bsd_m_table[i])
            ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs);
#endif
    }
    /* Free whatever the NIC did not accept. */
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
#ifdef FF_USE_PAGE_ARRAY
            if (qconf->tx_mbufs[port].bsd_m_table[ret])
                ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]);
#endif
        } while (++ret < n);
    }
    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}

int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
#ifdef FF_USE_PAGE_ARRAY
    struct lcore_conf *qconf = &lcore_conf;
    int len = 0;

    len = ff_if_send_onepkt(ctx, m, total);
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, ctx->port_id);
        len = 0;
    }
    qconf->tx_mbufs[ctx->port_id].len = len;
    return 0;
#endif
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    /* Copy the bsd mbuf chain into a (possibly multi-segment) rte_mbuf. */
    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        head->nb_segs++;

        prev = cur;
        void *data = rte_pktmbuf_mtod(cur, void*);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        cur->data_len = len;
        off += len;
        total -= len;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void*);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         *  TCP segmentation offload.
         *
         *  - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *    implies PKT_TX_TCP_CKSUM)
         *  - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *    write the IP checksum to 0 in the packet
         *  - fill the mbuf offload information: l2_len,
         *    l3_len, l4_len, tso_segsz
         *  - calculate the pseudo header checksum without taking ip_len
         *    in account, and set it in the TCP header. Refer to
         *    rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *    used as helpers.
         */
        if (offload.tso_seg_size) {
            struct tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc,
        sys_tsc, end_tsc, idle_sleep_tsc;
    int i, j, nb_rx, idle;
    uint16_t port_id, queue_id;
    struct lcore_conf *qconf;
    uint64_t drain_tsc = 0;
    struct ff_dpdk_if_context *ctx;

    if (pkt_tx_delay) {
        drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * pkt_tx_delay;
    }

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc >= drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packet from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }
#endif

            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                        pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                        j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc >= drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        idle_sleep_tsc = rte_rdtsc();
        if (likely(idle && idle_sleep)) {
            usleep(idle_sleep);
            end_tsc = rte_rdtsc();
        } else {
            end_tsc = idle_sleep_tsc;
        }

        if (usch_tsc == cur_tsc) {
            usr_tsc = idle_sleep_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_top_status.sys_tsc += sys_tsc;
        }

        ff_top_status.usr_tsc += usr_tsc;
        ff_top_status.work_tsc += end_tsc - cur_tsc;
        ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_top_status.loops++;
    }

    return 0;
}

int
ff_dpdk_if_up(void) {
    int i;
    struct lcore_conf *qconf = &lcore_conf;
    for (i = 0; i < qconf->nb_tx_port; i++) {
        uint16_t port_id = qconf->tx_port_id[i];

        struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
        veth_ctx[port_id] = ff_veth_attach(pconf);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed\n");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg) {
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1<<(7-b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i+4] & (1<<(7-b))))
                v |= 1;
        }
    }
    return (hash);
}
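
/*
 * The Toeplitz hash slides a 32-bit window over the key: whenever an
 * input bit is set, the current window is XORed into the hash; the
 * window then shifts left one bit, pulling the next key bit in from the
 * right. E.g. for a single input byte 0x80, only the first iteration
 * XORs, so the result is simply key[0..3] read as a big-endian word.
 * This mirrors the RSS hash that Toeplitz-capable NICs compute over the
 * {saddr, daddr, sport, dport} tuple.
 */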

int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;
    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];

    if (nb_queues <= 1) {
        return 1;
    }

    uint16_t reta_size = rss_reta_size[ctx->port_id];
    uint16_t queueid = qconf->tx_queue_id[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = 0;
    if (!use_rsskey_52bytes)
        hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
            default_rsskey_40bytes, datalen, data);
    else
        hash = toeplitz_hash(sizeof(default_rsskey_52bytes),
            default_rsskey_52bytes, datalen, data);
    return ((hash & (reta_size - 1)) % nb_queues) == queueid;
}

void
ff_regist_packet_dispatcher(dispatch_func_t func)
{
    packet_dispatcher = func;
}
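
/*
 * Sketch of a user-supplied dispatcher (hypothetical example, not part
 * of this file). process_packets() interprets the return value as: a
 * queue id to steer the packet to, FF_DISPATCH_RESPONSE to send the
 * (possibly rewritten) frame straight back out, or FF_DISPATCH_ERROR to
 * drop it.
 *
 *     static int
 *     my_dispatch(void *data, uint16_t *len, uint16_t queue_id,
 *         uint16_t nb_queues)
 *     {
 *         // keep the NIC's RSS queue choice for this packet
 *         return queue_id;
 *     }
 *
 *     ff_regist_packet_dispatcher(my_dispatch);
 */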

uint64_t
ff_get_tsc_ns(void)
{
    uint64_t cur_tsc = rte_rdtsc();
    uint64_t hz = rte_get_tsc_hz();
    return ((double)cur_tsc / (double)hz) * NS_PER_S;
}
1703