/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_eth_bond.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"
#include "ff_memory.h"
#ifdef FF_KNI
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

int enable_kni;
static int kni_accept;
#endif

static int numa_on;

static unsigned idle_sleep;
static unsigned pkt_tx_delay;

static struct rte_timer freebsd_clock;

/* Default 40-byte RSS hash key, taken from the Mellanox Linux driver. */
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

/* 52-byte RSS key, for NICs whose hash key size is 52 bytes (e.g. Intel i40e). */
static int use_rsskey_52bytes = 0;
static uint8_t default_rsskey_52bytes[52] = {
    0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23,
    0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30,
    0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02,
    0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c,
    0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55,
    0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e,
    0x81, 0x15, 0x03, 0x66
};
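
/*
 * These Toeplitz keys are used in two places that must stay in sync:
 * init_port_start() programs the chosen key into the NIC via rss_conf,
 * and ff_rss_check() below reruns the same hash in software to predict
 * which queue the hardware will pick for a given 4-tuple.
 */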

struct lcore_conf lcore_conf;

struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

#define BOND_DRIVER_NAME    "net_bonding"

static inline int send_single_packet(struct rte_mbuf *m, uint8_t port);

struct ff_msg_ring {
    char ring_name[FF_MSG_NUM][RTE_RING_NAMESIZE];
    /* ring[0]: other processes enqueue requests, this lcore dequeues them */
    /* ring[1..]: this lcore enqueues replies, other processes read them */
    struct rte_ring *ring[FF_MSG_NUM];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;
static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

static struct ff_top_args ff_top_status;
static struct ff_traffic_args ff_traffic;
extern void ff_hardclock(void);

static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
    ff_update_current_ts();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
    #define CHECK_INTERVAL 100 /* 100ms */
    #define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u "
                        "Mbps - %s\n", (int)portid,
                        (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

static int
init_lcore_conf(void)
{
    uint8_t nb_dev_ports = rte_eth_dev_count_avail();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
                 ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    uint16_t proc_id;
    for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) {
        uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id];
        if (!lcore_config[lcore_id].detected) {
            rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
        }
    }

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %d\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        /* Enable pcap dump */
        if (ff_global_cfg.pcap.enable) {
            ff_enable_pcap(ff_global_cfg.pcap.save_path, ff_global_cfg.pcap.snap_len);
        }

        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}
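
/*
 * Mapping sketch: each f-stack process runs on one lcore, and its queue id
 * on a port is simply the position of that lcore in the port's lcore_list.
 * E.g. with port 0 configured with lcore_list = {2, 3}, the process on
 * lcore 2 owns RX/TX queue 0 and the process on lcore 3 owns queue 1; no
 * queue is ever shared between processes.
 */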

static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;
    uint16_t max_portid = ff_global_cfg.dpdk.max_portid;

    unsigned nb_mbuf = RTE_ALIGN_CEIL(
        (nb_rx_queue * (max_portid + 1) * 2 * RX_QUEUE_SIZE           +
        nb_ports * (max_portid + 1) * 2 * nb_lcores * MAX_PKT_BURST   +
        nb_ports * (max_portid + 1) * 2 * nb_tx_queue * TX_QUEUE_SIZE +
        nb_lcores * MEMPOOL_CACHE_SIZE +
#ifdef FF_KNI
        nb_ports * KNI_MBUF_MAX +
        nb_ports * KNI_QUEUE_SIZE +
#endif
        nb_lcores * nb_ports * DISPATCH_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }

#ifdef FF_USE_PAGE_ARRAY
        nb_mbuf = RTE_ALIGN_CEIL(
            nb_ports * nb_lcores * MAX_PKT_BURST   +
            nb_ports * nb_tx_queue * TX_QUEUE_SIZE +
            nb_lcores * MEMPOOL_CACHE_SIZE,
            (unsigned)4096);
        ff_init_ref_pool(nb_mbuf, socketid);
#endif
    }

    return 0;
}
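
/*
 * Sizing note: nb_mbuf is a deliberate upper bound covering RX/TX queue
 * descriptors, per-lcore bursts in flight, mempool caches, optional KNI
 * queues and the dispatch rings, rounded up to a multiple of 8192. The
 * primary process creates one pool per NUMA socket; secondaries attach to
 * the same pool by name via rte_mempool_lookup(), so every f-stack process
 * allocates packets from shared hugepage memory.
 */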

static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL) {
        rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ring = rte_ring_create(name, count, socket_id, flags);
    } else {
        ring = rte_ring_lookup(name);
    }

    if (ring == NULL) {
        rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
    }

    return ring;
}
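
/*
 * Usage sketch: the primary process creates the ring in shared hugepage
 * memory; secondaries attach to it purely by name, e.g.
 *
 *     struct rte_ring *r = create_ring("dispatch_ring_p0_q1",
 *         DISPATCH_RING_SIZE, rte_socket_id(), RING_F_SC_DEQ);
 *
 * Both sides must therefore agree on the naming scheme used below.
 */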

static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create ring according to ports actually being used. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
}

static int
init_msg_ring(void)
{
    uint16_t i, j;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
           MSG_RING_SIZE * 2 * nb_procs,
           MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
           NULL, NULL, ff_msg_init, NULL,
           socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[0]);

        for (j = FF_SYSCTL; j < FF_MSG_NUM; j++) {
            snprintf(msg_ring[i].ring_name[j], RTE_RING_NAMESIZE,
                "%s%u_%u", FF_MSG_RING_OUT, i, j);
            msg_ring[i].ring[j] = create_ring(msg_ring[i].ring_name[j],
                MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
            if (msg_ring[i].ring[j] == NULL)
                rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[j]);
        }
    }

    return 0;
}
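
/*
 * Wiring sketch: a control client (e.g. the ff_top/ff_sysctl tools)
 * allocates an ff_msg from FF_MSG_POOL, enqueues it on this process's
 * inbound ring (named by FF_MSG_RING_IN plus the proc id), then polls the
 * per-type outbound ring (FF_MSG_RING_OUT plus proc id and msg type).
 * process_msg_ring() below picks the request up on the next loop iteration
 * and answers on the matching outbound ring.
 */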

#ifdef FF_KNI
static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count_avail();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
#endif

static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}
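
/*
 * Worked example: with reta_size = 128 and nb_queues = 4 the indirection
 * table is filled round-robin as 0,1,2,3,0,1,2,3,... so each queue owns
 * every fourth RETA bucket. ff_rss_check() below relies on exactly this
 * layout when it computes (hash & (reta_size - 1)) % nb_queues in software.
 */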

static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i, j;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id, u_port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[u_port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        for (j = 0; j <= pconf->nb_slaves; j++) {
            if (j < pconf->nb_slaves) {
                port_id = pconf->slave_portid_list[j];
                printf("To init %s's slave %d: port[%d]\n",
                        ff_global_cfg.dpdk.bond_cfgs->name,
                        j, port_id);
            } else {
                port_id = u_port_id;
            }

            struct rte_eth_dev_info dev_info;
            struct rte_eth_conf port_conf = {0};
            struct rte_eth_rxconf rxq_conf;
            struct rte_eth_txconf txq_conf;

            rte_eth_dev_info_get(port_id, &dev_info);

            if (nb_queues > dev_info.max_rx_queues) {
                rte_exit(EXIT_FAILURE, "num_procs[%d] exceeds max_rx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_rx_queues);
            }

            if (nb_queues > dev_info.max_tx_queues) {
                rte_exit(EXIT_FAILURE, "num_procs[%d] exceeds max_tx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_tx_queues);
            }

            struct ether_addr addr;
            rte_eth_macaddr_get(port_id, &addr);
            printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                       " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                    (unsigned)port_id,
                    addr.addr_bytes[0], addr.addr_bytes[1],
                    addr.addr_bytes[2], addr.addr_bytes[3],
                    addr.addr_bytes[4], addr.addr_bytes[5]);

            rte_memcpy(pconf->mac,
                addr.addr_bytes, ETHER_ADDR_LEN);

            /* Set RSS mode */
            uint64_t default_rss_hf = ETH_RSS_PROTO_MASK;
            port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
            port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf;
            if (dev_info.hash_key_size == 52) {
                port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_52bytes;
                port_conf.rx_adv_conf.rss_conf.rss_key_len = 52;
                use_rsskey_52bytes = 1;
            } else {
                port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
                port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;
            }
            port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads;
            if (port_conf.rx_adv_conf.rss_conf.rss_hf !=
                    ETH_RSS_PROTO_MASK) {
                printf("Port %u modified RSS hash function based on hardware support, "
                        "requested:%#"PRIx64" configured:%#"PRIx64"\n",
                        port_id, default_rss_hf,
                        port_conf.rx_adv_conf.rss_conf.rss_hf);
            }

            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) {
                port_conf.txmode.offloads |=
                    DEV_TX_OFFLOAD_MBUF_FAST_FREE;
            }

            /* Set Rx VLAN stripping */
            if (ff_global_cfg.dpdk.vlan_strip) {
                if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                    port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
                }
            }

            /* Enable HW CRC stripping */
            port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_KEEP_CRC;

            /* FIXME: enable TCP LRO? */
            #if 0
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
                printf("LRO is supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
                pconf->hw_features.rx_lro = 1;
            }
            #endif

            /* Set Rx checksum checking */
            if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
                printf("RX checksum offload supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
                pconf->hw_features.rx_csum = 1;
            }

            if (ff_global_cfg.dpdk.tx_csum_offoad_skip == 0) {
                if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
                    printf("TX ip checksum offload supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
                    pconf->hw_features.tx_csum_ip = 1;
                }

                if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
                    (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
                    printf("TX TCP&UDP checksum offload supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM;
                    pconf->hw_features.tx_csum_l4 = 1;
                }
            } else {
                printf("TX checksum offload is disabled\n");
            }

            if (ff_global_cfg.dpdk.tso) {
                if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                    printf("TSO is supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
                    pconf->hw_features.tx_tso = 1;
                }
            } else {
                printf("TSO is disabled\n");
            }

            if (dev_info.reta_size) {
                /* reta size must be power of 2 */
                assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

                rss_reta_size[port_id] = dev_info.reta_size;
                printf("port[%d]: rss table size: %d\n", port_id,
                    dev_info.reta_size);
            }

            if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
                continue;
            }

            int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
            if (ret != 0) {
                return ret;
            }

            static uint16_t nb_rxd = RX_QUEUE_SIZE;
            static uint16_t nb_txd = TX_QUEUE_SIZE;
            ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
            if (ret < 0)
                printf("Could not adjust number of descriptors "
                        "for port%u (%d)\n", (unsigned)port_id, ret);

            uint16_t q;
            for (q = 0; q < nb_queues; q++) {
                if (numa_on) {
                    uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                    socketid = rte_lcore_to_socket_id(lcore_id);
                }
                mbuf_pool = pktmbuf_pool[socketid];

                txq_conf = dev_info.default_txconf;
                txq_conf.offloads = port_conf.txmode.offloads;
                ret = rte_eth_tx_queue_setup(port_id, q, nb_txd,
                    socketid, &txq_conf);
                if (ret < 0) {
                    return ret;
                }

                rxq_conf = dev_info.default_rxconf;
                rxq_conf.offloads = port_conf.rxmode.offloads;
                ret = rte_eth_rx_queue_setup(port_id, q, nb_rxd,
                    socketid, &rxq_conf, mbuf_pool);
                if (ret < 0) {
                    return ret;
                }
            }

            if (strncmp(dev_info.driver_name, BOND_DRIVER_NAME,
                    strlen(dev_info.driver_name)) == 0) {

                rte_eth_macaddr_get(port_id, &addr);
                printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                           " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                        (unsigned)port_id,
                        addr.addr_bytes[0], addr.addr_bytes[1],
                        addr.addr_bytes[2], addr.addr_bytes[3],
                        addr.addr_bytes[4], addr.addr_bytes[5]);

                rte_memcpy(pconf->mac,
                    addr.addr_bytes, ETHER_ADDR_LEN);

                int mode, count, x;
                uint16_t slaves[RTE_MAX_ETHPORTS], len = RTE_MAX_ETHPORTS;

                mode = rte_eth_bond_mode_get(port_id);
                printf("Port %u, bond mode:%d\n", port_id, mode);

                count = rte_eth_bond_slaves_get(port_id, slaves, len);
                printf("Port %u, %s's slave ports count:%d\n", port_id,
                            ff_global_cfg.dpdk.bond_cfgs->name, count);
                for (x = 0; x < count; x++) {
                    printf("Port %u, %s's slave port[%u]\n", port_id,
                            ff_global_cfg.dpdk.bond_cfgs->name, slaves[x]);
                }
            }

            ret = rte_eth_dev_start(port_id);
            if (ret < 0) {
                return ret;
            }

            if (nb_queues > 1) {
                /* set HW rss hash function to Toeplitz. */
                if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                    struct rte_eth_hash_filter_info info = {0};
                    info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                    info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                    if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                        RTE_ETH_FILTER_SET, &info) < 0) {
                        rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                            port_id);
                    }
                }

                set_rss_table(port_id, dev_info.reta_size, nb_queues);
            }

            /* Enable RX in promiscuous mode for the Ethernet device. */
            if (ff_global_cfg.dpdk.promiscuous) {
                rte_eth_promiscuous_enable(port_id);
                ret = rte_eth_promiscuous_get(port_id);
                if (ret == 1) {
                    printf("set port %u to promiscuous mode ok\n", port_id);
                } else {
                    printf("set port %u to promiscuous mode error\n", port_id);
                }
            }
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}
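
/*
 * Worked example: with freebsd.hz = 100 (the usual config default) the
 * tick interval is intrs = 1000 / 100 = 10 ms, and on a 2.5 GHz TSC the
 * timer period is ceil(2.5e9 / 1000) * 10 = 25,000,000 cycles, so
 * ff_hardclock_job() fires every 10 ms from main_loop()'s
 * rte_timer_manage() call.
 */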

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("invalid config: num_procs[%d] or proc_id[%d]!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    idle_sleep = ff_global_cfg.dpdk.idle_sleep;
    pkt_tx_delay = ff_global_cfg.dpdk.pkt_tx_delay > BURST_TX_DRAIN_US ?
        BURST_TX_DRAIN_US : ff_global_cfg.dpdk.pkt_tx_delay;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

#ifdef FF_KNI
    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }
#endif

#ifdef FF_USE_PAGE_ARRAY
    ff_mmap_init();
#endif

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    void *data = rte_pktmbuf_mtod(pkt, void*);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    if (pkt->ol_flags & PKT_RX_VLAN_STRIPPED) {
        ff_mbuf_set_vlan_info(hdr, pkt->vlan_tci);
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void*);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct ether_hdr *hdr;
    const struct vlan_hdr *vlanhdr;
    hdr = (const struct ether_hdr *)data;
    uint16_t ether_type = rte_be_to_cpu_16(hdr->ether_type);
    data += ETHER_HDR_LEN;
    len -= ETHER_HDR_LEN;

    if (ether_type == ETHER_TYPE_VLAN) {
        vlanhdr = (struct vlan_hdr *)data;
        ether_type = rte_be_to_cpu_16(vlanhdr->eth_proto);
        data += sizeof(struct vlan_hdr);
        len -= sizeof(struct vlan_hdr);
    }

    if (ether_type == ETHER_TYPE_ARP)
        return FILTER_ARP;

#ifdef INET6
    if (ether_type == ETHER_TYPE_IPv6) {
        return ff_kni_proto_filter(data,
            len, ether_type);
    }
#endif

#ifndef FF_KNI
    return FILTER_UNKNOWN;
#else
    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ether_type != ETHER_TYPE_IPv4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data,
        len, ether_type);
#endif
}

static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}

/* modeled on rte_pktmbuf_clone(), but copies the data instead of attaching */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of a new segment failed */
    if (unlikely(mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}
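
/*
 * Note the deep copy rather than rte_pktmbuf_clone(): the copies made in
 * process_packets() are pushed to other lcores' dispatch rings and to KNI,
 * so giving each consumer its own buffer avoids cross-lcore refcount
 * traffic and any lifetime coupling with the original mbuf, which this
 * lcore frees on its own schedule.
 */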

static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(ff_global_cfg.pcap.enable)) {
            if (!pkts_from_ring) {
                ff_dump_packets(ff_global_cfg.pcap.save_path, rtem,
                    ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void*);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring) {
            ff_traffic.rx_packets++;
            ff_traffic.rx_bytes += len;
        }

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues);
            if (ret == FF_DISPATCH_RESPONSE) {
                rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len;

                /*
                 * VLAN TX re-insertion offload is not supported yet,
                 * so rebuild the VLAN header by hand before sending.
                 */
                if (rtem->vlan_tci) {
                    data = rte_pktmbuf_prepend(rtem, sizeof(struct vlan_hdr));
                    if (data != NULL) {
                        memmove(data, data + sizeof(struct vlan_hdr), ETHER_HDR_LEN);
                        struct ether_hdr *etherhdr = (struct ether_hdr *)data;
                        struct vlan_hdr *vlanhdr = (struct vlan_hdr *)(data + ETHER_HDR_LEN);
                        vlanhdr->vlan_tci = rte_cpu_to_be_16(rtem->vlan_tci);
                        vlanhdr->eth_proto = etherhdr->ether_type;
                        etherhdr->ether_type = rte_cpu_to_be_16(ETHER_TYPE_VLAN);
                    }
                }
                send_single_packet(rtem, port_id);
                continue;
            }

            if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
#ifdef INET6
        if (filter == FILTER_ARP || filter == FILTER_NDP) {
#else
        if (filter == FILTER_ARP) {
#endif
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni &&
            ((filter == FILTER_KNI && kni_accept) ||
            (filter == FILTER_UNKNOWN && !kni_accept))) {
            ff_kni_enqueue(port_id, rtem);
#endif
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* Read packets from the dispatch ring and process them. */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
#ifdef INET6
    if (msg->msg_type == FF_IOCTL6) {
        fd = ff_socket(AF_INET6, SOCK_DGRAM, 0);
    } else
#endif
        fd = ff_socket(AF_INET, SOCK_DGRAM, 0);

    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_top_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_traffic_msg(struct ff_msg *msg)
{
    msg->traffic = ff_traffic;
    msg->result = 0;
}

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
#ifdef INET6
        case FF_IOCTL6:
#endif
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        case FF_TRAFFIC:
            handle_traffic_msg(msg);
            break;
        default:
            handle_default_msg(msg);
            break;
    }
    rte_ring_enqueue(msg_ring[proc_id].ring[msg->msg_type], msg);
}

static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(ff_global_cfg.pcap.enable)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(ff_global_cfg.pcap.save_path, m_table[i],
               ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
        }
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    ff_traffic.tx_packets += ret;
    uint16_t i;
    for (i = 0; i < ret; i++) {
        ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]);
#ifdef FF_USE_PAGE_ARRAY
        if (qconf->tx_mbufs[port].bsd_m_table[i])
            ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs);
#endif
    }
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
#ifdef FF_USE_PAGE_ARRAY
            if (qconf->tx_mbufs[port].bsd_m_table[ret])
                ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]);
#endif
        } while (++ret < n);
    }
    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}
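
/*
 * TX is therefore batched twice: packets accumulate in tx_mbufs[] until
 * either MAX_PKT_BURST is reached (flushed right here) or the pkt_tx_delay
 * drain timer in main_loop() expires, which bounds how long a queued
 * packet can sit unsent.
 */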

int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
#ifdef FF_USE_PAGE_ARRAY
    struct lcore_conf *qconf = &lcore_conf;
    int len = 0;

    len = ff_if_send_onepkt(ctx, m, total);
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, ctx->port_id);
        len = 0;
    }
    qconf->tx_mbufs[ctx->port_id].len = len;
    return 0;
#endif
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        head->nb_segs++;

        prev = cur;
        void *data = rte_pktmbuf_mtod(cur, void*);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        cur->data_len = len;
        off += len;
        total -= len;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void*);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         *  TCP segmentation offload.
         *
         *  - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *    implies PKT_TX_TCP_CKSUM)
         *  - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *    write the IP checksum to 0 in the packet
         *  - fill the mbuf offload information: l2_len,
         *    l3_len, l4_len, tso_segsz
         *  - calculate the pseudo header checksum without taking ip_len
         *    in account, and set it in the TCP header. Refer to
         *    rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *    used as helpers.
         */
        if (offload.tso_seg_size) {
            struct tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc;
    int i, j, nb_rx, idle;
    uint16_t port_id, queue_id;
    struct lcore_conf *qconf;
    uint64_t drain_tsc = 0;
    struct ff_dpdk_if_context *ctx;

    if (pkt_tx_delay) {
        drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * pkt_tx_delay;
    }

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc >= drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packet from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }
#endif

            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                        pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                        j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc >= drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        idle_sleep_tsc = rte_rdtsc();
        if (likely(idle && idle_sleep)) {
            usleep(idle_sleep);
            end_tsc = rte_rdtsc();
        } else {
            end_tsc = idle_sleep_tsc;
        }

        if (usch_tsc == cur_tsc) {
            usr_tsc = idle_sleep_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_top_status.sys_tsc += sys_tsc;
        }

        ff_top_status.usr_tsc += usr_tsc;
        ff_top_status.work_tsc += end_tsc - cur_tsc;
        ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_top_status.loops++;
    }

    return 0;
}

int
ff_dpdk_if_up(void) {
    int i;
    struct lcore_conf *qconf = &lcore_conf;
    for (i = 0; i < qconf->nb_tx_port; i++) {
        uint16_t port_id = qconf->tx_port_id[i];

        struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
        veth_ctx[port_id] = ff_veth_attach(pconf);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed\n");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg) {
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    if (lr == NULL) {
        rte_exit(EXIT_FAILURE, "rte_malloc loop_routine failed\n");
    }
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}
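
/*
 * Typical call sequence from an application (a sketch; the public wrappers
 * ff_init()/ff_run() declared in ff_api.h end up here):
 *
 *     ff_init(argc, argv);      // parses config, calls ff_dpdk_init()
 *     ff_run(my_loop, my_arg);  // calls ff_dpdk_run(), never returns
 *
 * main_loop() polls forever, so rte_eal_mp_wait_lcore() only returns if
 * the lcores are terminated externally.
 */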

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1<<(7-b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i+4] & (1<<(7-b))))
                v |= 1;
        }
    }
    return (hash);
}

int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;
    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];

    if (nb_queues <= 1) {
        return 1;
    }

    uint16_t reta_size = rss_reta_size[ctx->port_id];
    uint16_t queueid = qconf->tx_queue_id[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = 0;
    if (!use_rsskey_52bytes)
        hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
            default_rsskey_40bytes, datalen, data);
    else
        hash = toeplitz_hash(sizeof(default_rsskey_52bytes),
            default_rsskey_52bytes, datalen, data);
    return ((hash & (reta_size - 1)) % nb_queues) == queueid;
}
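
/*
 * Worked example: when the stack picks a local ephemeral port it consults
 * ff_rss_check() with the candidate 4-tuple. With reta_size = 128 and
 * nb_queues = 4, a software Toeplitz hash of, say, 0x1234567a selects RETA
 * bucket 0x7a & 0x7f = 122, i.e. queue 122 % 4 = 2; the tuple is accepted
 * only if 2 is this process's own queue, so the NIC will steer the return
 * traffic back to the same lcore.
 */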

void
ff_regist_packet_dispatcher(dispatch_func_t func)
{
    packet_dispatcher = func;
}
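
/*
 * Dispatcher contract (see process_packets() above): the callback gets the
 * frame data, its length, the current queue id and the number of queues,
 * and returns a target queue id, FF_DISPATCH_RESPONSE to transmit the
 * (possibly rewritten) packet straight back out, or FF_DISPATCH_ERROR to
 * drop it. A minimal sketch that keeps every packet on the local queue:
 *
 *     static int
 *     my_dispatch(void *data, uint16_t *len, uint16_t queue_id,
 *         uint16_t nb_queues)
 *     {
 *         return queue_id;   // no cross-queue steering
 *     }
 *
 *     ff_regist_packet_dispatcher(my_dispatch);
 */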

uint64_t
ff_get_tsc_ns(void)
{
    uint64_t cur_tsc = rte_rdtsc();
    uint64_t hz = rte_get_tsc_hz();
    return ((double)cur_tsc / (double)hz) * NS_PER_S;
}