xref: /f-stack/lib/ff_dpdk_if.c (revision 03df98de)
1 /*
2  * Copyright (C) 2017 THL A29 Limited, a Tencent company.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice, this
9  *   list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright notice,
11  *   this list of conditions and the following disclaimer in the documentation
12  *   and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
18  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
21  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
23  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  *
25  */
26 #include <assert.h>
27 
28 #include <rte_common.h>
29 #include <rte_byteorder.h>
30 #include <rte_log.h>
31 #include <rte_memory.h>
32 #include <rte_memcpy.h>
33 #include <rte_memzone.h>
34 #include <rte_config.h>
35 #include <rte_eal.h>
36 #include <rte_pci.h>
37 #include <rte_mbuf.h>
38 #include <rte_memory.h>
39 #include <rte_lcore.h>
40 #include <rte_launch.h>
41 #include <rte_ethdev.h>
42 #include <rte_debug.h>
43 #include <rte_common.h>
44 #include <rte_ether.h>
45 #include <rte_malloc.h>
46 #include <rte_cycles.h>
47 #include <rte_timer.h>
48 #include <rte_thash.h>
49 #include <rte_ip.h>
50 #include <rte_tcp.h>
51 #include <rte_udp.h>
52 
53 #include "ff_dpdk_if.h"
54 #include "ff_dpdk_pcap.h"
55 #include "ff_dpdk_kni.h"
56 #include "ff_config.h"
57 #include "ff_veth.h"
58 #include "ff_host_interface.h"
59 #include "ff_msg.h"
60 #include "ff_api.h"
61 
#define MEMPOOL_CACHE_SIZE 256

#define DISPATCH_RING_SIZE 2048

#define MSG_RING_SIZE 32

/*
 * Configurable number of RX/TX ring descriptors
 */
#define RX_QUEUE_SIZE 512
#define TX_QUEUE_SIZE 512

#define MAX_PKT_BURST 32
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

/*
 * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send.
 */
#define MAX_TX_BURST    (MAX_PKT_BURST / 2)

/* Upper bound on NUMA sockets the mbuf-pool array can describe. */
#define NB_SOCKETS 8

/* Configure how many packets ahead to prefetch, when reading packets */
#define PREFETCH_OFFSET    3

#define MAX_RX_QUEUE_PER_LCORE 16
#define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
#define MAX_RX_QUEUE_PER_PORT 128

#ifdef FF_KNI
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

static int enable_kni;  /* copied from ff_global_cfg.kni.enable in ff_dpdk_init() */
static int kni_accept;  /* 1 when kni.method config string is "accept" */
#endif

/* Non-zero when NUMA-aware socket selection is enabled (dpdk.numa_on). */
static int numa_on;

/* Periodic rte_timer that drives the FreeBSD stack's hardclock tick. */
static struct rte_timer freebsd_clock;

// Mellanox Linux's driver key
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};
111 
/*
 * Default port configuration: RSS receive with the Mellanox Toeplitz key,
 * all hardware offloads off.
 * NOTE(review): not referenced anywhere in this file's visible code --
 * init_port_start() builds its own rte_eth_conf; confirm whether this
 * template is still needed.
 */
static struct rte_eth_conf default_port_conf = {
    .rxmode = {
        .mq_mode = ETH_MQ_RX_RSS,
        .max_rx_pkt_len = ETHER_MAX_LEN,
        .split_hdr_size = 0, /**< hdr buf size */
        .header_split   = 0, /**< Header Split disabled */
        .hw_ip_checksum = 0, /**< IP checksum offload disabled */
        .hw_vlan_filter = 0, /**< VLAN filtering disabled */
        .hw_vlan_strip  = 0, /**< VLAN strip disabled. */
        .hw_vlan_extend = 0, /**< Extended VLAN disabled. */
        .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
        .hw_strip_crc   = 0, /**< CRC stripped by hardware */
        .enable_lro     = 0, /**< LRO disabled */
    },
    .rx_adv_conf = {
        .rss_conf = {
            .rss_key = default_rsskey_40bytes,
            .rss_key_len = 40,
            .rss_hf = ETH_RSS_PROTO_MASK,
        },
    },
    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
};
137 
/* Per-port TX buffer of not-yet-flushed mbufs. */
struct mbuf_table {
    uint16_t len;                            /* number of valid entries */
    struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* One (port, queue) pair this lcore polls for RX. */
struct lcore_rx_queue {
    uint16_t port_id;
    uint16_t queue_id;
} __rte_cache_aligned;

/* Everything the single worker lcore of this process needs at runtime. */
struct lcore_conf {
    uint16_t proc_id;                              /* f-stack process index */
    uint16_t socket_id;                            /* NUMA socket of this lcore */
    uint16_t nb_queue_list[RTE_MAX_ETHPORTS];      /* queues per port */
    struct ff_port_cfg *port_cfgs;

    uint16_t nb_rx_queue;
    struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
    uint16_t nb_tx_port;
    uint16_t tx_port_id[RTE_MAX_ETHPORTS];
    uint16_t tx_queue_id[RTE_MAX_ETHPORTS];        /* TX queue owned per port */
    struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];  /* pending TX bursts */
    char *pcap[RTE_MAX_ETHPORTS];                  /* pcap dump path, or NULL */
} __rte_cache_aligned;

static struct lcore_conf lcore_conf;

/* One mbuf pool per NUMA socket. */
static struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

/* dispatch_ring[port][queue]: rings used to hand packets between lcores. */
static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;  /* optional user RSS dispatch hook */

/* RSS redirection table size reported by each port's hardware. */
static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

/* Bidirectional message channel between an f-stack lcore and tool procs. */
struct ff_msg_ring {
    char ring_name[2][RTE_RING_NAMESIZE];
    /* ring[0] for lcore recv msg, other send */
    /* ring[1] for lcore send msg, other read */
    struct rte_ring *ring[2];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;

/* Glue between a DPDK port and the freebsd stack's ifnet. */
struct ff_dpdk_if_context {
    void *sc;                          /* freebsd softc */
    void *ifp;                         /* freebsd ifnet */
    uint16_t port_id;
    struct ff_hw_features hw_features; /* offloads negotiated at init */
} __rte_cache_aligned;

static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

static struct ff_top_args ff_top_status;
static struct ff_traffic_args ff_traffic;

extern void ff_hardclock(void);
195 
/*
 * rte_timer callback: advance the FreeBSD stack clock one tick and
 * refresh the cached wall-clock timestamp.
 */
static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
    ff_update_current_ts();
}
202 
203 struct ff_dpdk_if_context *
204 ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
205 {
206     struct ff_dpdk_if_context *ctx;
207 
208     ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
209     if (ctx == NULL)
210         return NULL;
211 
212     ctx->sc = sc;
213     ctx->ifp = ifp;
214     ctx->port_id = cfg->port_id;
215     ctx->hw_features = cfg->hw_features;
216 
217     return ctx;
218 }
219 
/*
 * Release a context created by ff_dpdk_register_if().
 * free(NULL) is a no-op, so a NULL ctx is harmless.
 */
void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}
225 
226 static void
227 check_all_ports_link_status(void)
228 {
229     #define CHECK_INTERVAL 100 /* 100ms */
230     #define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */
231 
232     uint16_t portid;
233     uint8_t count, all_ports_up, print_flag = 0;
234     struct rte_eth_link link;
235 
236     printf("\nChecking link status");
237     fflush(stdout);
238 
239     int i, nb_ports;
240     nb_ports = ff_global_cfg.dpdk.nb_ports;
241     for (count = 0; count <= MAX_CHECK_TIME; count++) {
242         all_ports_up = 1;
243         for (i = 0; i < nb_ports; i++) {
244             uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
245             memset(&link, 0, sizeof(link));
246             rte_eth_link_get_nowait(portid, &link);
247 
248             /* print link status if flag set */
249             if (print_flag == 1) {
250                 if (link.link_status) {
251                     printf("Port %d Link Up - speed %u "
252                         "Mbps - %s\n", (int)portid,
253                         (unsigned)link.link_speed,
254                         (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
255                         ("full-duplex") : ("half-duplex\n"));
256                 } else {
257                     printf("Port %d Link Down\n", (int)portid);
258                 }
259                 continue;
260             }
261             /* clear all_ports_up flag if any link down */
262             if (link.link_status == 0) {
263                 all_ports_up = 0;
264                 break;
265             }
266         }
267 
268         /* after finally printing all link status, get out */
269         if (print_flag == 1)
270             break;
271 
272         if (all_ports_up == 0) {
273             printf(".");
274             fflush(stdout);
275             rte_delay_ms(CHECK_INTERVAL);
276         }
277 
278         /* set the print_flag if all ports up or timeout */
279         if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
280             print_flag = 1;
281             printf("done\n");
282         }
283     }
284 }
285 
/*
 * Build this process's lcore_conf from ff_global_cfg: validate that the
 * configured ports and lcores exist, record the NUMA socket, and for
 * every port that lists this process's lcore, claim one RX queue and
 * one TX queue (queue index = position of this lcore in the port's
 * lcore_list). Exits the process on any configuration error.
 */
static int
init_lcore_conf(void)
{
    uint8_t nb_dev_ports = rte_eth_dev_count();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    /* Every configured port id must exist on this machine. */
    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
                 ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    /* All lcores named in the config must have been detected by EAL. */
    uint16_t proc_id;
    for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) {
        uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id];
        if (!lcore_config[lcore_id].detected) {
            rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
        }
    }

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    /* Claim queues on each port that assigns work to this lcore. */
    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        /* Queue id is this lcore's index within the port's lcore list. */
        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %u\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        lcore_conf.pcap[port_id] = pconf->pcap;
        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    /* A worker with no RX queue would spin uselessly -- refuse to start. */
    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}
353 
354 static int
355 init_mem_pool(void)
356 {
357     uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
358     uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
359     uint32_t nb_tx_queue = nb_lcores;
360     uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;
361 
362     unsigned nb_mbuf = RTE_MAX (
363         (nb_rx_queue*RX_QUEUE_SIZE          +
364         nb_ports*nb_lcores*MAX_PKT_BURST    +
365         nb_ports*nb_tx_queue*TX_QUEUE_SIZE  +
366         nb_lcores*MEMPOOL_CACHE_SIZE +
367 #ifdef FF_KNI
368         nb_ports*KNI_MBUF_MAX +
369         nb_ports*KNI_QUEUE_SIZE +
370 #endif
371         nb_lcores*nb_ports*DISPATCH_RING_SIZE),
372         (unsigned)8192);
373 
374     unsigned socketid = 0;
375     uint16_t i, lcore_id;
376     char s[64];
377 
378     for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
379         lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
380         if (numa_on) {
381             socketid = rte_lcore_to_socket_id(lcore_id);
382         }
383 
384         if (socketid >= NB_SOCKETS) {
385             rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
386                 socketid, i, NB_SOCKETS);
387         }
388 
389         if (pktmbuf_pool[socketid] != NULL) {
390             continue;
391         }
392 
393         if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
394             snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
395             pktmbuf_pool[socketid] =
396                 rte_pktmbuf_pool_create(s, nb_mbuf,
397                     MEMPOOL_CACHE_SIZE, 0,
398                     RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
399         } else {
400             snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
401             pktmbuf_pool[socketid] = rte_mempool_lookup(s);
402         }
403 
404         if (pktmbuf_pool[socketid] == NULL) {
405             rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
406         } else {
407             printf("create mbuf pool on socket %d\n", socketid);
408         }
409     }
410 
411     return 0;
412 }
413 
414 static struct rte_ring *
415 create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
416 {
417     struct rte_ring *ring;
418 
419     if (name == NULL)
420         return NULL;
421 
422     /* If already create, just attached it */
423     if (likely((ring = rte_ring_lookup(name)) != NULL))
424         return ring;
425 
426     if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
427         return rte_ring_create(name, count, socket_id, flags);
428     } else {
429         return rte_ring_lookup(name);
430     }
431 }
432 
/*
 * For every configured port, allocate the per-port array of ring
 * pointers (one slot per queue/lcore) and create or attach the
 * single-consumer dispatch ring "dispatch_ring_p<port>_q<queue>" used
 * to pass packets between worker lcores. Panics on any failure.
 */
static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create ring according to ports actually being used. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        /* Pointer array lives in hugepage memory, shared with secondaries. */
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        /* One single-consumer ring per queue; the owning lcore dequeues. */
        for(queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}
476 
477 static void
478 ff_msg_init(struct rte_mempool *mp,
479     __attribute__((unused)) void *opaque_arg,
480     void *obj, __attribute__((unused)) unsigned i)
481 {
482     struct ff_msg *msg = (struct ff_msg *)obj;
483     msg->msg_type = FF_UNKNOWN;
484     msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
485     msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
486 }
487 
488 static int
489 init_msg_ring(void)
490 {
491     uint16_t i;
492     uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
493     unsigned socketid = lcore_conf.socket_id;
494 
495     /* Create message buffer pool */
496     if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
497         message_pool = rte_mempool_create(FF_MSG_POOL,
498            MSG_RING_SIZE * 2 * nb_procs,
499            MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
500            NULL, NULL, ff_msg_init, NULL,
501            socketid, 0);
502     } else {
503         message_pool = rte_mempool_lookup(FF_MSG_POOL);
504     }
505 
506     if (message_pool == NULL) {
507         rte_panic("Create msg mempool failed\n");
508     }
509 
510     for(i = 0; i < nb_procs; ++i) {
511         snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
512             "%s%u", FF_MSG_RING_IN, i);
513         snprintf(msg_ring[i].ring_name[1], RTE_RING_NAMESIZE,
514             "%s%u", FF_MSG_RING_OUT, i);
515 
516         msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
517             MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
518         if (msg_ring[i].ring[0] == NULL)
519             rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[0]);
520 
521         msg_ring[i].ring[1] = create_ring(msg_ring[i].ring_name[1],
522             MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
523         if (msg_ring[i].ring[1] == NULL)
524             rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[0]);
525     }
526 
527     return 0;
528 }
529 
530 #ifdef FF_KNI
531 static int
532 init_kni(void)
533 {
534     int nb_ports = rte_eth_dev_count();
535     kni_accept = 0;
536     if(strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
537         kni_accept = 1;
538 
539     ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
540         ff_global_cfg.kni.udp_port);
541 
542     unsigned socket_id = lcore_conf.socket_id;
543     struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];
544 
545     nb_ports = ff_global_cfg.dpdk.nb_ports;
546     int i, ret;
547     for (i = 0; i < nb_ports; i++) {
548         uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
549         ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
550     }
551 
552     return 0;
553 }
554 #endif
555 
556 static void
557 set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
558 {
559     if (reta_size == 0) {
560         return;
561     }
562 
563     int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
564     struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];
565 
566     /* config HW indirection table */
567     unsigned i, j, hash=0;
568     for (i = 0; i < reta_conf_size; i++) {
569         reta_conf[i].mask = ~0ULL;
570         for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
571             reta_conf[i].reta[j] = hash++ % nb_queues;
572         }
573     }
574 
575     if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
576         rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
577             port_id);
578     }
579 }
580 
/*
 * Configure and start every port in ff_global_cfg.dpdk: validate queue
 * counts against the device, publish the MAC into the port config,
 * derive TX/RX offload settings from the device's capabilities, then
 * (primary process only) set up the queues, start the port, program the
 * RSS hash function and redirection table, and optionally enable
 * promiscuous mode and pcap dumping.
 *
 * Returns 0 on success or a negative rte_* error code.
 */
static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        struct rte_eth_dev_info dev_info;
        rte_eth_dev_info_get(port_id, &dev_info);

        /* One RX/TX queue pair per assigned lcore; must fit the device. */
        if (nb_queues > dev_info.max_rx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                nb_queues,
                dev_info.max_rx_queues);
        }

        if (nb_queues > dev_info.max_tx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                nb_queues,
                dev_info.max_tx_queues);
        }

        struct ether_addr addr;
        rte_eth_macaddr_get(port_id, &addr);
        printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                   " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                (unsigned)port_id,
                addr.addr_bytes[0], addr.addr_bytes[1],
                addr.addr_bytes[2], addr.addr_bytes[3],
                addr.addr_bytes[4], addr.addr_bytes[5]);

        /* Make the hardware MAC visible to the freebsd stack. */
        rte_memcpy(pconf->mac,
            addr.addr_bytes, ETHER_ADDR_LEN);

        /* Clear txq_flags - we do not need multi-mempool and refcnt */
        dev_info.default_txconf.txq_flags = ETH_TXQ_FLAGS_NOMULTMEMP |
            ETH_TXQ_FLAGS_NOREFCOUNT;

        /* Disable features that are not supported by port's HW */
        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMUDP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMTCP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMSCTP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL;
        }

        /* Without TSO, multi-segment TX mbufs are never produced. */
        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) &&
            !(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_TSO)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
        }

        struct rte_eth_conf port_conf = {0};

        /* Set RSS mode */
        port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
        port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK;
        port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
        port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;

        /* Set Rx VLAN stripping */
        if (ff_global_cfg.dpdk.vlan_strip) {
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                port_conf.rxmode.hw_vlan_strip = 1;
            }
        }

        /* Enable HW CRC stripping */
        port_conf.rxmode.hw_strip_crc = 1;

        /* FIXME: Enable TCP LRO ?*/
        #if 0
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
            printf("LRO is supported\n");
            port_conf.rxmode.enable_lro = 1;
            pconf->hw_features.rx_lro = 1;
        }
        #endif

        /* Set Rx checksum checking */
        if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
            printf("RX checksum offload supported\n");
            port_conf.rxmode.hw_ip_checksum = 1;
            pconf->hw_features.rx_csum = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
            printf("TX ip checksum offload supported\n");
            pconf->hw_features.tx_csum_ip = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            printf("TX TCP&UDP checksum offload supported\n");
            pconf->hw_features.tx_csum_l4 = 1;
        }

        if (ff_global_cfg.dpdk.tso) {
            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                printf("TSO is supported\n");
                pconf->hw_features.tx_tso = 1;
            }
        } else {
            printf("TSO is disabled\n");
        }

        if (dev_info.reta_size) {
            /* reta size must be power of 2 */
            assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

            rss_reta_size[port_id] = dev_info.reta_size;
            printf("port[%d]: rss table size: %d\n", port_id,
                dev_info.reta_size);
        }

        /* Only the primary process configures and starts the hardware. */
        if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
            continue;
        }

        int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
        if (ret != 0) {
            return ret;
        }
        uint16_t q;
        for (q = 0; q < nb_queues; q++) {
            /* Each queue's descriptors live on its owning lcore's socket. */
            if (numa_on) {
                uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                socketid = rte_lcore_to_socket_id(lcore_id);
            }
            mbuf_pool = pktmbuf_pool[socketid];

            ret = rte_eth_tx_queue_setup(port_id, q, TX_QUEUE_SIZE,
                socketid, &dev_info.default_txconf);
            if (ret < 0) {
                return ret;
            }

            ret = rte_eth_rx_queue_setup(port_id, q, RX_QUEUE_SIZE,
                socketid, &dev_info.default_rxconf, mbuf_pool);
            if (ret < 0) {
                return ret;
            }
        }

        ret = rte_eth_dev_start(port_id);
        if (ret < 0) {
            return ret;
        }

        if (nb_queues > 1) {
            /* set HW rss hash function to Toeplitz. */
            if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                struct rte_eth_hash_filter_info info = {0};
                info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                    RTE_ETH_FILTER_SET, &info) < 0) {
                    rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                        port_id);
                }
            }

            set_rss_table(port_id, dev_info.reta_size, nb_queues);
        }

        /* Enable RX in promiscuous mode for the Ethernet device. */
        if (ff_global_cfg.dpdk.promiscuous) {
            rte_eth_promiscuous_enable(port_id);
            ret = rte_eth_promiscuous_get(port_id);
            if (ret == 1) {
                printf("set port %u to promiscuous mode ok\n", port_id);
            } else {
                printf("set port %u to promiscuous mode error\n", port_id);
            }
        }

        /* Enable pcap dump */
        if (pconf->pcap) {
            ff_enable_pcap(pconf->pcap);
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}
786 
787 static int
788 init_clock(void)
789 {
790     rte_timer_subsystem_init();
791     uint64_t hz = rte_get_timer_hz();
792     uint64_t intrs = MS_PER_S/ff_global_cfg.freebsd.hz;
793     uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S*intrs;
794 
795     rte_timer_init(&freebsd_clock);
796     rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
797         rte_lcore_id(), &ff_hardclock_job, NULL);
798 
799     ff_update_current_ts();
800 
801     return 0;
802 }
803 
/*
 * Top-level f-stack DPDK initialisation: validate the process layout
 * from ff_global_cfg, bring up EAL, then initialise the lcore
 * configuration, mbuf pools, dispatch/message rings, optional KNI,
 * ethernet ports and the stack clock, in that order.
 *
 * Returns 0 on success; exits the process on any failure.
 */
int
ff_dpdk_init(int argc, char **argv)
{
    /* Sanity-check nb_procs/proc_id before touching EAL. */
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    /* The init_* helpers rte_exit/rte_panic on failure themselves. */
    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

#ifdef FF_KNI
    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }
#endif

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

    return 0;
}
848 
/*
 * Deliver a received rte_mbuf (possibly multi-segment) to the freebsd
 * stack attached to ctx->ifp. Segments are wrapped in freebsd mbufs
 * without copying payload. Ownership of `pkt` passes to the stack on
 * success; on any failure path both the freebsd chain and the rte_mbuf
 * are freed here.
 */
static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        /* HW verified IP/L4 checksums: drop anything flagged bad. */
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    /*
     * FIXME: should we save pkt->vlan_tci
     * if (pkt->ol_flags & PKT_RX_VLAN_PKT)
     */

    void *data = rte_pktmbuf_mtod(pkt, void*);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    /* Head freebsd mbuf wraps the first DPDK segment. */
    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    /* Chain one freebsd mbuf per remaining DPDK segment. */
    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while(pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void*);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}
892 
893 static enum FilterReturn
894 protocol_filter(const void *data, uint16_t len)
895 {
896     if(len < ETHER_HDR_LEN)
897         return FILTER_UNKNOWN;
898 
899     const struct ether_hdr *hdr;
900     hdr = (const struct ether_hdr *)data;
901 
902     if(ntohs(hdr->ether_type) == ETHER_TYPE_ARP)
903         return FILTER_ARP;
904 
905 #ifndef FF_KNI
906     return FILTER_UNKNOWN;
907 #else
908     if (!enable_kni) {
909         return FILTER_UNKNOWN;
910     }
911 
912     if(ntohs(hdr->ether_type) != ETHER_TYPE_IPv4)
913         return FILTER_UNKNOWN;
914 
915     return ff_kni_proto_filter(data + ETHER_HDR_LEN,
916         len - ETHER_HDR_LEN);
917 #endif
918 }
919 
920 static inline void
921 pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
922 {
923     struct rte_mbuf *md;
924     void *src, *dst;
925 
926     dst = rte_pktmbuf_mtod(mi, void *);
927     src = rte_pktmbuf_mtod(m, void *);
928 
929     mi->data_len = m->data_len;
930     rte_memcpy(dst, src, m->data_len);
931 
932     mi->port = m->port;
933     mi->vlan_tci = m->vlan_tci;
934     mi->vlan_tci_outer = m->vlan_tci_outer;
935     mi->tx_offload = m->tx_offload;
936     mi->hash = m->hash;
937     mi->ol_flags = m->ol_flags;
938     mi->packet_type = m->packet_type;
939 }
940 
/* copied from rte_pktmbuf_clone */
/*
 * Deep-clone an mbuf chain into fresh mbufs from mp: every segment of
 * md is copied (payload and metadata) via pktmbuf_deep_attach(), so the
 * clone shares no buffers with the source. Returns the new head, or
 * NULL if any allocation fails (partial chain is freed).
 */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely ((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;       /* where the next segment pointer is stored */
    pktlen = md->pkt_len;
    nseg = 0;

    /* Copy each source segment into a freshly allocated mbuf. */
    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;           /* terminate the cloned chain */
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of new indirect segment failed */
    if (unlikely (mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}
979 
/*
 * Core RX handler: classify and deliver a burst of received packets.
 *
 * For each packet: optionally capture to pcap, account traffic stats,
 * let the user dispatcher redirect it to another queue, then either
 * broadcast it (ARP), hand it to the kernel (KNI), or feed it to the
 * f-stack network stack.  Ownership of every mbuf in bufs[] is consumed.
 *
 * pkts_from_ring != 0 means the packets were redirected to us via the
 * dispatch ring: capture, stats and dispatching were already done by
 * the lcore that first received them, so those steps are skipped.
 */
static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        /* Capture only on first reception, never on re-dispatched pkts. */
        if (unlikely(qconf->pcap[port_id] != NULL)) {
            if (!pkts_from_ring) {
                ff_dump_packets(qconf->pcap[port_id], rtem);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void*);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring) {
            ff_traffic.rx_packets++;
            ff_traffic.rx_bytes += len;
        }

        /* Optional user dispatcher chooses the destination queue; an
         * out-of-range return drops the packet. */
        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, len, queue_id, nb_queues);
            if (ret < 0 || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                /* Belongs to another lcore: hand it over via its ring. */
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
        if (filter == FILTER_ARP) {
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            /* Broadcast ARP to every other queue so each lcore's stack
             * learns the neighbor; clone from the target lcore's NUMA
             * socket pool when NUMA awareness is on. */
            if (!pkts_from_ring) {
                uint16_t j;
                for(j = 0; j < nb_queues; ++j) {
                    if(j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if(mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            /* Mirror ARP to the kernel through KNI (primary proc only). */
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if(mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni &&
            ((filter == FILTER_KNI && kni_accept) ||
            (filter == FILTER_UNKNOWN && !kni_accept)) ) {
            /* Packet is for the kernel, not f-stack. */
            ff_kni_enqueue(port_id, rtem);
#endif
        } else {
            /* Everything else goes into the f-stack network stack. */
            ff_veth_input(ctx, rtem);
        }
    }
}
1068 
1069 static inline int
1070 process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
1071     struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
1072 {
1073     /* read packet from ring buf and to process */
1074     uint16_t nb_rb;
1075     nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
1076         (void **)pkts_burst, MAX_PKT_BURST, NULL);
1077 
1078     if(nb_rb > 0) {
1079         process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
1080     }
1081 
1082     return 0;
1083 }
1084 
1085 static inline void
1086 handle_sysctl_msg(struct ff_msg *msg)
1087 {
1088     int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
1089         msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
1090         msg->sysctl.newlen);
1091 
1092     if (ret < 0) {
1093         msg->result = errno;
1094     } else {
1095         msg->result = 0;
1096     }
1097 }
1098 
1099 static inline void
1100 handle_ioctl_msg(struct ff_msg *msg)
1101 {
1102     int fd, ret;
1103     fd = ff_socket(AF_INET, SOCK_DGRAM, 0);
1104     if (fd < 0) {
1105         ret = -1;
1106         goto done;
1107     }
1108 
1109     ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);
1110 
1111     ff_close(fd);
1112 
1113 done:
1114     if (ret < 0) {
1115         msg->result = errno;
1116     } else {
1117         msg->result = 0;
1118     }
1119 }
1120 
1121 static inline void
1122 handle_route_msg(struct ff_msg *msg)
1123 {
1124     int ret = ff_rtioctl(msg->route.fib, msg->route.data,
1125         &msg->route.len, msg->route.maxlen);
1126     if (ret < 0) {
1127         msg->result = errno;
1128     } else {
1129         msg->result = 0;
1130     }
1131 }
1132 
1133 static struct ff_top_args ff_status;
1134 static inline void
1135 handle_top_msg(struct ff_msg *msg)
1136 {
1137     msg->top = ff_status;
1138     msg->result = 0;
1139 }
1140 
#ifdef FF_NETGRAPH
/*
 * Service a netgraph control request.  On success the command's return
 * value is passed back in msg->ngctl.ret and msg->result is 0;
 * otherwise msg->result carries errno.
 */
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret >= 0) {
        msg->ngctl.ret = ret;
        msg->result = 0;
    } else {
        msg->result = errno;
    }
}
#endif
1154 
#ifdef FF_IPFW
/*
 * Service an ipfw get/set request via a raw socket inside the freebsd
 * stack.  msg->result is 0 on success, otherwise the errno of the
 * failing step.
 */
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret, saved_errno = 0;

    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        saved_errno = errno;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    if (ret < 0) {
        /* ff_close() may overwrite errno; preserve the real failure. */
        saved_errno = errno;
    }

    ff_close(fd);

done:
    msg->result = (ret < 0) ? saved_errno : 0;
}
#endif
1193 
1194 static inline void
1195 handle_traffic_msg(struct ff_msg *msg)
1196 {
1197     msg->traffic = ff_traffic;
1198     msg->result = 0;
1199 }
1200 
/* Fallback for unrecognized message types: report "not supported". */
static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}
1206 
/*
 * Dispatch one control message from an f-stack tool process by type,
 * then enqueue the (now result-bearing) message on the reply ring.
 *
 * NOTE(review): the rte_ring_enqueue() return value is ignored — if the
 * reply ring were ever full the message would be lost and the requester
 * left waiting; confirm ring sizing makes that impossible.
 */
static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        case FF_TRAFFIC:
            handle_traffic_msg(msg);
            break;
        default:
            handle_default_msg(msg);
            break;
    }
    /* ring[1] is the reply direction back to the tool process. */
    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}
1242 
1243 static inline int
1244 process_msg_ring(uint16_t proc_id)
1245 {
1246     void *msg;
1247     int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);
1248 
1249     if (unlikely(ret == 0)) {
1250         handle_msg((struct ff_msg *)msg, proc_id);
1251     }
1252 
1253     return 0;
1254 }
1255 
/* Send burst of packets on an output interface; frees what the NIC
 * refuses.  Always returns 0. */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    /* Mirror outgoing packets to pcap when capture is enabled. */
    if (unlikely(qconf->pcap[port] != NULL)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(qconf->pcap[port], m_table[i]);
        }
    }

    /* Stats must be taken before the burst: the driver frees mbufs it
     * accepts, so they cannot be inspected afterwards.
     * NOTE(review): packets the NIC rejects below are still counted —
     * tx stats over-report on ring-full; confirm this is intended. */
    ff_traffic.tx_packets += n;
    uint16_t i;
    for (i = 0; i < n; i++) {
        ff_traffic.tx_bytes += rte_pktmbuf_data_len(m_table[i]);
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    if (unlikely(ret < n)) {
        /* Drop whatever the NIC did not accept. */
        do {
            rte_pktmbuf_free(m_table[ret]);
        } while (++ret < n);
    }

    return 0;
}
1289 
1290 /* Enqueue a single packet, and send burst if queue is filled */
1291 static inline int
1292 send_single_packet(struct rte_mbuf *m, uint8_t port)
1293 {
1294     uint16_t len;
1295     struct lcore_conf *qconf;
1296 
1297     qconf = &lcore_conf;
1298     len = qconf->tx_mbufs[port].len;
1299     qconf->tx_mbufs[port].m_table[len] = m;
1300     len++;
1301 
1302     /* enough pkts to be sent */
1303     if (unlikely(len == MAX_PKT_BURST)) {
1304         send_burst(qconf, MAX_PKT_BURST, port);
1305         len = 0;
1306     }
1307 
1308     qconf->tx_mbufs[port].len = len;
1309     return 0;
1310 }
1311 
/*
 * Transmit an f-stack (freebsd) mbuf: copy its payload into a DPDK mbuf
 * chain, translate checksum/TSO offload requests into rte ol_flags, and
 * queue the result for TX.  The bsd mbuf 'm' is consumed on every path.
 * Returns send_single_packet()'s result, or -1 on allocation/copy error.
 */
int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    /* Copy 'total' bytes out of the bsd mbuf, one DPDK segment at a
     * time.  NOTE(review): per-segment capacity is assumed to be
     * RTE_MBUF_DEFAULT_DATAROOM — confirm the pool's actual data room
     * (headroom is not accounted for here). */
    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while(total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                /* Frees the whole partial chain, not just head. */
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        void *data = rte_pktmbuf_mtod(cur, void*);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        prev = cur;

        cur->data_len = len;
        off += len;
        total -= len;
        head->nb_segs++;
        cur = NULL;
    }

    /* Translate the stack's offload requests into DPDK ol_flags. */
    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void*);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         *  TCP segmentation offload.
         *
         *  - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *    implies PKT_TX_TCP_CKSUM)
         *  - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *    write the IP checksum to 0 in the packet
         *  - fill the mbuf offload information: l2_len,
         *    l3_len, l4_len, tso_segsz
         *  - calculate the pseudo header checksum without taking ip_len
         *    in account, and set it in the TCP header. Refer to
         *    rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *    used as helpers.
         */
        if (offload.tso_seg_size) {
            struct tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    /* Payload fully copied — the bsd mbuf can be released before TX. */
    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}
1426 
/*
 * Per-lcore event loop: fires stack timers, drains buffered TX, polls
 * KNI, the dispatch ring and the NIC RX queues, services the control
 * message ring, and invokes the user loop callback — while accumulating
 * usr/sys/idle tsc buckets for `ff top'.  'arg' is the struct
 * loop_routine supplied by ff_dpdk_run().  Loops forever.
 */
static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc;
    int i, j, nb_rx, idle;
    uint16_t port_id, queue_id;
    struct lcore_conf *qconf;
    /* TX drain period: BURST_TX_DRAIN_US worth of tsc ticks. */
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
        US_PER_S * BURST_TX_DRAIN_US;
    struct ff_dpdk_if_context *ctx;

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

    while (1) {
        cur_tsc = rte_rdtsc();
        /* Run freebsd stack timers when their deadline has passed. */
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc > drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packet from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }
#endif

            /* Packets other lcores redirected to this queue. */
            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                        pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                        j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        /* Run the user loop when there was work, or at most once per
         * drain period while idle. */
        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc > drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        end_tsc = rte_rdtsc();

        /* usch_tsc == cur_tsc only if the callback ran this iteration. */
        if (usch_tsc == cur_tsc) {
            usr_tsc = end_tsc - div_tsc;
        }

        /* Split the elapsed ticks into sys (stack work), usr (callback)
         * and idle buckets for the `ff top' display. */
        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_top_status.sys_tsc += sys_tsc;
        }

        ff_top_status.usr_tsc += usr_tsc;
        ff_top_status.work_tsc += end_tsc - cur_tsc;
        ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_top_status.loops++;
    }

    return 0;
}
1548 
1549 int
1550 ff_dpdk_if_up(void) {
1551     int i;
1552     struct lcore_conf *qconf = &lcore_conf;
1553     for (i = 0; i < qconf->nb_tx_port; i++) {
1554         uint16_t port_id = qconf->tx_port_id[i];
1555 
1556         struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
1557         veth_ctx[port_id] = ff_veth_attach(pconf);
1558         if (veth_ctx[port_id] == NULL) {
1559             rte_exit(EXIT_FAILURE, "ff_veth_attach failed");
1560         }
1561     }
1562 
1563     return 0;
1564 }
1565 
1566 void
1567 ff_dpdk_run(loop_func_t loop, void *arg) {
1568     struct loop_routine *lr = rte_malloc(NULL,
1569         sizeof(struct loop_routine), 0);
1570     lr->loop = loop;
1571     lr->arg = arg;
1572     rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
1573     rte_eal_mp_wait_lcore();
1574     rte_free(lr);
1575 }
1576 
/* Return a DPDK mbuf (passed as an opaque pointer) to its mempool. */
void
ff_dpdk_pktmbuf_free(void *m)
{
    struct rte_mbuf *mb = (struct rte_mbuf *)m;
    rte_pktmbuf_free(mb);
}
1582 
/*
 * Software Toeplitz hash over 'data' using the given RSS secret key —
 * the same function RSS-capable NICs use for receive queue selection.
 * Bits are consumed MSB-first; the 32-bit key window slides one bit per
 * data bit, pulling fresh key bits in from the low end.
 */
static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t result = 0;
    uint32_t window = ((uint32_t)key[0] << 24) | ((uint32_t)key[1] << 16) |
        ((uint32_t)key[2] << 8) | key[3];
    unsigned byte, bit;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */
    for (byte = 0; byte < datalen; byte++) {
        for (bit = 0; bit < 8; bit++) {
            /* XOR in the current window for every set data bit. */
            if (data[byte] & (1u << (7 - bit)))
                result ^= window;
            /* Slide the window, refilling from the key when available. */
            window <<= 1;
            if ((byte + 4) < keylen &&
                (key[byte + 4] & (1u << (7 - bit))))
                window |= 1;
        }
    }
    return result;
}
1605 
1606 int
1607 ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
1608     uint16_t sport, uint16_t dport)
1609 {
1610     struct lcore_conf *qconf = &lcore_conf;
1611     struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
1612     uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];
1613 
1614     if (nb_queues <= 1) {
1615         return 1;
1616     }
1617 
1618     uint16_t reta_size = rss_reta_size[ctx->port_id];
1619     uint16_t queueid = qconf->tx_queue_id[ctx->port_id];
1620 
1621     uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
1622         sizeof(dport)];
1623 
1624     unsigned datalen = 0;
1625 
1626     bcopy(&saddr, &data[datalen], sizeof(saddr));
1627     datalen += sizeof(saddr);
1628 
1629     bcopy(&daddr, &data[datalen], sizeof(daddr));
1630     datalen += sizeof(daddr);
1631 
1632     bcopy(&sport, &data[datalen], sizeof(sport));
1633     datalen += sizeof(sport);
1634 
1635     bcopy(&dport, &data[datalen], sizeof(dport));
1636     datalen += sizeof(dport);
1637 
1638     uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
1639         default_rsskey_40bytes, datalen, data);
1640 
1641     return ((hash & (reta_size - 1)) % nb_queues) == queueid;
1642 }
1643 
/*
 * Register a user callback that picks the destination queue for each
 * received packet (consulted in process_packets(); a NULL pointer
 * disables custom dispatch there).
 */
void
ff_regist_packet_dispatcher(dispatch_func_t func)
{
    packet_dispatcher = func;
}
1649 
1650 uint64_t
1651 ff_get_tsc_ns()
1652 {
1653     uint64_t cur_tsc = rte_rdtsc();
1654     uint64_t hz = rte_get_tsc_hz();
1655     return ((double)cur_tsc/(double)hz) * NS_PER_S;
1656 }
1657 
1658