xref: /f-stack/lib/ff_dpdk_if.c (revision d7140ab7)
/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"
#include "ff_memory.h"

#ifdef FF_KNI
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

static int enable_kni;
static int kni_accept;
#endif

static int numa_on;

static unsigned idle_sleep;
static unsigned pkt_tx_delay;

static struct rte_timer freebsd_clock;

/* Default RSS key, taken from the Mellanox Linux driver. */
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};
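
/*
 * Note (derived from this file): the same key is deliberately used for
 * both the NIC RSS configuration in init_port_start() and the software
 * toeplitz_hash() near the bottom of this file, so ff_rss_check() can
 * predict in software which RX queue the hardware will pick for a given
 * 4-tuple.
 */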

struct lcore_conf lcore_conf;

struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

static inline int send_single_packet(struct rte_mbuf *m, uint8_t port);

struct ff_msg_ring {
    char ring_name[2][RTE_RING_NAMESIZE];
    /* ring[0]: the lcore receives requests that other processes enqueue */
    /* ring[1]: the lcore enqueues replies that other processes read */
    struct rte_ring *ring[2];
} __rte_cache_aligned;
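
/*
 * A hedged usage sketch (control-process side, not part of this file):
 * a tool process is assumed to talk to an f-stack lcore roughly like
 * this, using the ring pair above:
 *
 *   struct ff_msg *msg;
 *   if (rte_mempool_get(message_pool, (void **)&msg) == 0) {
 *       msg->msg_type = FF_TOP;                         // fill in request
 *       rte_ring_enqueue(msg_ring[proc_id].ring[0], msg);
 *       while (rte_ring_dequeue(msg_ring[proc_id].ring[1],
 *              (void **)&msg) != 0)
 *           ;                                            // poll for reply
 *       // ... read msg->result, then rte_mempool_put(message_pool, msg);
 *   }
 *
 * The lcore side of this exchange is process_msg_ring()/handle_msg()
 * further down in this file.
 */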

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;
static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

static struct ff_top_args ff_top_status;
static struct ff_traffic_args ff_traffic;

extern void ff_hardclock(void);

static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg)
{
    ff_hardclock();
    ff_update_current_ts();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
    #define CHECK_INTERVAL 100 /* 100ms */
    #define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u "
                        "Mbps - %s\n", (int)portid,
                        (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

static int
init_lcore_conf(void)
{
    uint16_t nb_dev_ports = rte_eth_dev_count_avail();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
                 ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    uint16_t proc_id;
    for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) {
        uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id];
        if (!lcore_config[lcore_id].detected) {
            rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
        }
    }

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %d\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        lcore_conf.pcap[port_id] = pconf->pcap;
        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}

static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;

    unsigned nb_mbuf = RTE_ALIGN_CEIL(
        (nb_rx_queue*RX_QUEUE_SIZE          +
        nb_ports*nb_lcores*MAX_PKT_BURST    +
        nb_ports*nb_tx_queue*TX_QUEUE_SIZE  +
        nb_lcores*MEMPOOL_CACHE_SIZE +
#ifdef FF_KNI
        nb_ports*KNI_MBUF_MAX +
        nb_ports*KNI_QUEUE_SIZE +
#endif
        nb_lcores*nb_ports*DISPATCH_RING_SIZE),
        (unsigned)8192);
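
    /*
     * A worked sizing example (illustrative numbers only): with one port,
     * one lcore, RX_QUEUE_SIZE = TX_QUEUE_SIZE = 512, MAX_PKT_BURST = 32,
     * MEMPOOL_CACHE_SIZE = 256, DISPATCH_RING_SIZE = 2048 and FF_KNI off,
     * the sum is 512 + 32 + 512 + 256 + 2048 = 3360 mbufs, which
     * RTE_ALIGN_CEIL(..., 8192) then rounds up to 8192. The actual values
     * of these constants are defined elsewhere in f-stack and may differ.
     */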

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %u of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }

#ifdef FF_USE_PAGE_ARRAY
        nb_mbuf = RTE_ALIGN_CEIL(
            nb_ports*nb_lcores*MAX_PKT_BURST    +
            nb_ports*nb_tx_queue*TX_QUEUE_SIZE  +
            nb_lcores*MEMPOOL_CACHE_SIZE,
            (unsigned)4096);
        ff_init_ref_pool(nb_mbuf, socketid);
#endif
    }

    return 0;
}

static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL) {
        rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ring = rte_ring_create(name, count, socket_id, flags);
    } else {
        ring = rte_ring_lookup(name);
    }

    if (ring == NULL) {
        rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
    }

    return ring;
}
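
/*
 * A minimal sketch of the primary/secondary naming convention used above
 * (illustrative only): because rings and pools are found by name in
 * secondary processes, both sides must derive identical names, e.g.
 *
 *   char name[RTE_RING_NAMESIZE];
 *   snprintf(name, sizeof(name), "dispatch_ring_p%d_q%d", portid, queueid);
 *   struct rte_ring *r = create_ring(name, DISPATCH_RING_SIZE,
 *       rte_socket_id(), RING_F_SC_DEQ);
 *
 * The primary process creates the object; any secondary started with the
 * same config resolves the same name via rte_ring_lookup().
 */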

static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create rings according to the ports actually being used. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
}
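
/*
 * Layout of each object in message_pool after ff_msg_init() runs
 * (derived from the code above):
 *
 *   +-----------------------+------------------------------------+
 *   |    struct ff_msg      | buf_addr: payload of buf_len bytes |
 *   +-----------------------+------------------------------------+
 *   <- sizeof(struct ff_msg) -><- elt_size - sizeof(ff_msg) ----->
 *
 * The header and its variable-length payload share one mempool element,
 * so a single get/put moves both.
 */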

static int
init_msg_ring(void)
{
    uint16_t i;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
           MSG_RING_SIZE * 2 * nb_procs,
           MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
           NULL, NULL, ff_msg_init, NULL,
           socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        snprintf(msg_ring[i].ring_name[1], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_OUT, i);

        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[0]);

        msg_ring[i].ring[1] = create_ring(msg_ring[i].ring_name[1],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[1] == NULL)
            rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[1]);
    }

    return 0;
}

#ifdef FF_KNI
static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count_avail();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
#endif

static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}
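
/*
 * A worked example of the table written above (illustrative): with
 * reta_size = 128 and nb_queues = 4, the indirection table becomes
 *
 *   reta[0..127] = 0, 1, 2, 3, 0, 1, 2, 3, ...
 *
 * so a packet whose RSS hash is h maps to RETA entry (h % 128) and lands
 * on queue (h % 128) % 4. ff_rss_check() at the bottom of this file
 * relies on exactly this round-robin layout to predict the queue for a
 * 4-tuple in software.
 */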

static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        struct rte_eth_dev_info dev_info;
        struct rte_eth_conf port_conf = {0};
        struct rte_eth_rxconf rxq_conf;
        struct rte_eth_txconf txq_conf;

        rte_eth_dev_info_get(port_id, &dev_info);

        if (nb_queues > dev_info.max_rx_queues) {
            rte_exit(EXIT_FAILURE, "nb_queues[%d] bigger than max_rx_queues[%d]\n",
                nb_queues,
                dev_info.max_rx_queues);
        }

        if (nb_queues > dev_info.max_tx_queues) {
            rte_exit(EXIT_FAILURE, "nb_queues[%d] bigger than max_tx_queues[%d]\n",
                nb_queues,
                dev_info.max_tx_queues);
        }

        struct ether_addr addr;
        rte_eth_macaddr_get(port_id, &addr);
        printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                   " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                (unsigned)port_id,
                addr.addr_bytes[0], addr.addr_bytes[1],
                addr.addr_bytes[2], addr.addr_bytes[3],
                addr.addr_bytes[4], addr.addr_bytes[5]);

        rte_memcpy(pconf->mac,
            addr.addr_bytes, ETHER_ADDR_LEN);

        /* Set RSS mode */
        uint64_t default_rss_hf = ETH_RSS_PROTO_MASK;
        port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
        port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf;
        port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
        port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;
        port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads;
        if (port_conf.rx_adv_conf.rss_conf.rss_hf !=
                ETH_RSS_PROTO_MASK) {
            printf("Port %u modified RSS hash function based on hardware support, "
                    "requested:%#"PRIx64" configured:%#"PRIx64"\n",
                    port_id, default_rss_hf,
                    port_conf.rx_adv_conf.rss_conf.rss_hf);
        }

        if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) {
            port_conf.txmode.offloads |=
                DEV_TX_OFFLOAD_MBUF_FAST_FREE;
        }

        /* Set Rx VLAN stripping */
        if (ff_global_cfg.dpdk.vlan_strip) {
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
            }
        }

        /* Keep the Ethernet CRC (i.e. disable HW CRC stripping) if supported */
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_KEEP_CRC) {
            port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_KEEP_CRC;
        }

        /* FIXME: Enable TCP LRO? */
        #if 0
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
            printf("LRO is supported\n");
            port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
            pconf->hw_features.rx_lro = 1;
        }
        #endif

        /* Set Rx checksum checking */
        if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
            printf("RX checksum offload supported\n");
            port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
            pconf->hw_features.rx_csum = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
            printf("TX ip checksum offload supported\n");
            port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
            pconf->hw_features.tx_csum_ip = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            printf("TX TCP&UDP checksum offload supported\n");
            port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM;
            pconf->hw_features.tx_csum_l4 = 1;
        }

        if (ff_global_cfg.dpdk.tso) {
            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                printf("TSO is supported\n");
                port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
                pconf->hw_features.tx_tso = 1;
            }
        } else {
            printf("TSO is disabled\n");
        }

        if (dev_info.reta_size) {
            /* reta size must be power of 2 */
            assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

            rss_reta_size[port_id] = dev_info.reta_size;
            printf("port[%d]: rss table size: %d\n", port_id,
                dev_info.reta_size);
        }

        if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
            continue;
        }

        int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
        if (ret != 0) {
            return ret;
        }

        static uint16_t nb_rxd = RX_QUEUE_SIZE;
        static uint16_t nb_txd = TX_QUEUE_SIZE;
        ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
        if (ret < 0)
            printf("Could not adjust number of descriptors "
                    "for port%u (%d)\n", (unsigned)port_id, ret);

        uint16_t q;
        for (q = 0; q < nb_queues; q++) {
            if (numa_on) {
                uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                socketid = rte_lcore_to_socket_id(lcore_id);
            }
            mbuf_pool = pktmbuf_pool[socketid];

            txq_conf = dev_info.default_txconf;
            txq_conf.offloads = port_conf.txmode.offloads;
            ret = rte_eth_tx_queue_setup(port_id, q, nb_txd,
                socketid, &txq_conf);
            if (ret < 0) {
                return ret;
            }

            rxq_conf = dev_info.default_rxconf;
            rxq_conf.offloads = port_conf.rxmode.offloads;
            ret = rte_eth_rx_queue_setup(port_id, q, nb_rxd,
                socketid, &rxq_conf, mbuf_pool);
            if (ret < 0) {
                return ret;
            }
        }

        ret = rte_eth_dev_start(port_id);
        if (ret < 0) {
            return ret;
        }

        if (nb_queues > 1) {
            /* set HW rss hash function to Toeplitz. */
            if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                struct rte_eth_hash_filter_info info = {0};
                info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                    RTE_ETH_FILTER_SET, &info) < 0) {
                    rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                        port_id);
                }
            }

            set_rss_table(port_id, dev_info.reta_size, nb_queues);
        }

        /* Enable RX in promiscuous mode for the Ethernet device. */
        if (ff_global_cfg.dpdk.promiscuous) {
            rte_eth_promiscuous_enable(port_id);
            ret = rte_eth_promiscuous_get(port_id);
            if (ret == 1) {
                printf("set port %u to promiscuous mode ok\n", port_id);
            } else {
                printf("set port %u to promiscuous mode error\n", port_id);
            }
        }

        /* Enable pcap dump */
        if (pconf->pcap) {
            ff_enable_pcap(pconf->pcap);
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}
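
/*
 * Worked example of the timer arithmetic above (illustrative numbers):
 * with freebsd.hz = 100 the hardclock interval is intrs = 1000 / 100 =
 * 10 ms; on a 2.5 GHz TSC (hz = 2.5e9) that gives tsc = 2,500,000 * 10 =
 * 25,000,000 cycles between PERIODICAL firings of ff_hardclock_job().
 */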

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    idle_sleep = ff_global_cfg.dpdk.idle_sleep;
    pkt_tx_delay = ff_global_cfg.dpdk.pkt_tx_delay > BURST_TX_DRAIN_US ?
        BURST_TX_DRAIN_US : ff_global_cfg.dpdk.pkt_tx_delay;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

#ifdef FF_KNI
    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }
#endif

#ifdef FF_USE_PAGE_ARRAY
    ff_mmap_init();
#endif

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    /*
     * FIXME: should we save pkt->vlan_tci
     * if (pkt->ol_flags & PKT_RX_VLAN_PKT)
     */

    void *data = rte_pktmbuf_mtod(pkt, void*);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void*);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct ether_hdr *hdr;
    hdr = (const struct ether_hdr *)data;
    uint16_t eth_frame_type = rte_be_to_cpu_16(hdr->ether_type);

    if (eth_frame_type == ETHER_TYPE_ARP)
        return FILTER_ARP;

#ifndef FF_KNI
    return FILTER_UNKNOWN;
#else
    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (eth_frame_type != ETHER_TYPE_IPv4
#ifdef INET6
            && eth_frame_type != ETHER_TYPE_IPv6
#endif
            )
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter((const char *)data + ETHER_HDR_LEN,
        len - ETHER_HDR_LEN, eth_frame_type);
#endif
}

static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}

/* modeled on rte_pktmbuf_clone(), but performs a deep copy */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of a new segment failed */
    if (unlikely(mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}
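
/*
 * Design note (a reading of the code above, not an external spec):
 * rte_pktmbuf_clone() would create indirect mbufs that share the
 * original's data via refcounting, so every consumer would see the same
 * read-only bytes. The deep clone instead gives each dispatch queue (and
 * KNI) its own writable copy of a broadcast ARP frame, at the cost of
 * one memcpy per segment. A hedged usage sketch:
 *
 *   struct rte_mbuf *copy = pktmbuf_deep_clone(pkt, pktmbuf_pool[sock]);
 *   if (copy != NULL && rte_ring_enqueue(ring, copy) < 0)
 *       rte_pktmbuf_free(copy);
 */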

static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(qconf->pcap[port_id] != NULL)) {
            if (!pkts_from_ring) {
                ff_dump_packets(qconf->pcap[port_id], rtem);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void*);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring) {
            ff_traffic.rx_packets++;
            ff_traffic.rx_bytes += len;
        }

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues);
            if (ret == FF_DISPATCH_RESPONSE) {
                rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len;
                send_single_packet(rtem, port_id);
                continue;
            }

            if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
        if (filter == FILTER_ARP) {
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni &&
            ((filter == FILTER_KNI && kni_accept) ||
            (filter == FILTER_UNKNOWN && !kni_accept))) {
            ff_kni_enqueue(port_id, rtem);
#endif
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* Read packets from the dispatch ring and process them. */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
#ifdef INET6
    if (msg->msg_type == FF_IOCTL6) {
        fd = ff_socket(AF_INET6, SOCK_DGRAM, 0);
    } else
#endif
        fd = ff_socket(AF_INET, SOCK_DGRAM, 0);

    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_top_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_traffic_msg(struct ff_msg *msg)
{
    msg->traffic = ff_traffic;
    msg->result = 0;
}

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
#ifdef INET6
        case FF_IOCTL6:
#endif
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        case FF_TRAFFIC:
            handle_traffic_msg(msg);
            break;
        default:
            handle_default_msg(msg);
            break;
    }
    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(qconf->pcap[port] != NULL)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(qconf->pcap[port], m_table[i]);
        }
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    ff_traffic.tx_packets += ret;
    uint16_t i;
    for (i = 0; i < ret; i++) {
        ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]);
#ifdef FF_USE_PAGE_ARRAY
        if (qconf->tx_mbufs[port].bsd_m_table[i])
            ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs);
#endif
    }
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
#ifdef FF_USE_PAGE_ARRAY
            if (qconf->tx_mbufs[port].bsd_m_table[ret])
                ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]);
#endif
        } while (++ret < n);
    }
    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}
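
/*
 * Batching note (derived from the two functions above): packets queue up
 * in tx_mbufs[port] until either MAX_PKT_BURST of them accumulate here,
 * or the drain timer in main_loop() fires. Assuming the config defaults
 * of pkt_tx_delay = 100 us on a 2.5 GHz TSC, a lone packet therefore
 * waits at most ~250,000 cycles before send_burst() pushes it to the NIC.
 */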

int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
#ifdef FF_USE_PAGE_ARRAY
    struct lcore_conf *qconf = &lcore_conf;
    int len = 0;

    len = ff_if_send_onepkt(ctx, m, total);
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, ctx->port_id);
        len = 0;
    }
    qconf->tx_mbufs[ctx->port_id].len = len;
    return 0;
#endif
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        head->nb_segs++;

        prev = cur;
        void *data = rte_pktmbuf_mtod(cur, void*);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        cur->data_len = len;
        off += len;
        total -= len;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void*);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         *  TCP segmentation offload.
         *
         *  - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *    implies PKT_TX_TCP_CKSUM)
         *  - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *    write the IP checksum to 0 in the packet
         *  - fill the mbuf offload information: l2_len,
         *    l3_len, l4_len, tso_segsz
         *  - calculate the pseudo header checksum without taking ip_len
         *    in account, and set it in the TCP header. Refer to
         *    rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *    used as helpers.
         */
        if (offload.tso_seg_size) {
            struct tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc;
    int i, j, nb_rx, idle;
    uint16_t port_id, queue_id;
    struct lcore_conf *qconf;
    uint64_t drain_tsc = 0;
    struct ff_dpdk_if_context *ctx;

    if (pkt_tx_delay) {
        drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * pkt_tx_delay;
    }

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc >= drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packets from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }
#endif

            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                        pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                        j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc >= drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        idle_sleep_tsc = rte_rdtsc();
        if (likely(idle && idle_sleep)) {
            usleep(idle_sleep);
            end_tsc = rte_rdtsc();
        } else {
            end_tsc = idle_sleep_tsc;
        }

        if (usch_tsc == cur_tsc) {
            usr_tsc = idle_sleep_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_top_status.sys_tsc += sys_tsc;
        }

        ff_top_status.usr_tsc += usr_tsc;
        ff_top_status.work_tsc += end_tsc - cur_tsc;
        ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_top_status.loops++;
    }

    return 0;
}

int
ff_dpdk_if_up(void)
{
    int i;
    struct lcore_conf *qconf = &lcore_conf;
    for (i = 0; i < qconf->nb_tx_port; i++) {
        uint16_t port_id = qconf->tx_port_id[i];

        struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
        veth_ctx[port_id] = ff_veth_attach(pconf);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed\n");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg)
{
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    if (lr == NULL) {
        rte_exit(EXIT_FAILURE, "rte_malloc(loop_routine) failed\n");
    }
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}
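
/*
 * A hedged usage sketch (application side, not part of this file):
 * after configuration and ff_dpdk_init()/ff_dpdk_if_up(), an application
 * drives the stack by handing a callback to ff_dpdk_run(), which
 * main_loop() invokes between polling rounds. Assuming the loop_func_t
 * signature used here:
 *
 *   static int
 *   my_loop(void *arg)
 *   {
 *       // poll application sockets, e.g. via ff_epoll_wait()/ff_kevent()
 *       return 0;
 *   }
 *
 *   ff_dpdk_run(my_loop, NULL);   // does not return in normal operation
 *
 * In the public f-stack API this path is typically reached through
 * ff_init() and ff_run(), which wrap the functions in this file.
 */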

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1<<(7-b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i+4] & (1<<(7-b))))
                v |= 1;
        }
    }
    return (hash);
}
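
/*
 * A tiny worked example of the sliding-window behaviour above: v starts
 * as the first 32 key bits, which for default_rsskey_40bytes is
 * 0xd181c62c. Hashing the single byte 0x80 (only the top bit set) XORs
 * v into hash exactly once, before any shift, so
 *
 *   toeplitz_hash(40, default_rsskey_40bytes, 1, (uint8_t *)"\x80")
 *       == 0xd181c62c
 *
 * Each subsequent data bit XORs in the key window shifted one bit
 * further along; this is the standard Toeplitz construction from the
 * Microsoft RSS specification.
 */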

int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;
    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];

    if (nb_queues <= 1) {
        return 1;
    }

    uint16_t reta_size = rss_reta_size[ctx->port_id];
    uint16_t queueid = qconf->tx_queue_id[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
        default_rsskey_40bytes, datalen, data);

    return ((hash & (reta_size - 1)) % nb_queues) == queueid;
}
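
/*
 * How the check above ties the pieces together (a reading of this file,
 * not an external spec): the NIC was given default_rsskey_40bytes and a
 * round-robin RETA in init_port_start()/set_rss_table(), so recomputing
 * the Toeplitz hash over the 4-tuple in software predicts the RX queue
 * the hardware will choose. When the stack picks a local port for an
 * outgoing connection it can therefore keep only tuples that hash back
 * to its own queue; e.g. a hash of 0x00000005 with reta_size = 128 and
 * nb_queues = 4 selects RETA entry 5 and thus queue 5 % 4 = 1.
 */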

void
ff_regist_packet_dispatcher(dispatch_func_t func)
{
    packet_dispatcher = func;
}

uint64_t
ff_get_tsc_ns(void)
{
    uint64_t cur_tsc = rte_rdtsc();
    uint64_t hz = rte_get_tsc_hz();
    return ((double)cur_tsc / (double)hz) * NS_PER_S;
}