1 /*
2  * Copyright (C) 2017 THL A29 Limited, a Tencent company.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice, this
9  *   list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright notice,
11  *   this list of conditions and the following disclaimer in the documentation
12  *   and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
18  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
21  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
23  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  *
25  */
26 #include <assert.h>
27 #include <unistd.h>
28 #include <sys/mman.h>
29 #include <errno.h>
30 
31 #include <rte_common.h>
32 #include <rte_byteorder.h>
33 #include <rte_log.h>
34 #include <rte_memory.h>
35 #include <rte_memcpy.h>
36 #include <rte_memzone.h>
37 #include <rte_config.h>
38 #include <rte_eal.h>
39 #include <rte_pci.h>
40 #include <rte_mbuf.h>
42 #include <rte_lcore.h>
43 #include <rte_launch.h>
44 #include <rte_ethdev.h>
45 #include <rte_debug.h>
47 #include <rte_ether.h>
48 #include <rte_malloc.h>
49 #include <rte_cycles.h>
50 #include <rte_timer.h>
51 #include <rte_thash.h>
52 #include <rte_ip.h>
53 #include <rte_tcp.h>
54 #include <rte_udp.h>
55 #include <rte_eth_bond.h>
56 
57 #include "ff_dpdk_if.h"
58 #include "ff_dpdk_pcap.h"
59 #include "ff_dpdk_kni.h"
60 #include "ff_config.h"
61 #include "ff_veth.h"
62 #include "ff_host_interface.h"
63 #include "ff_msg.h"
64 #include "ff_api.h"
65 #include "ff_memory.h"
66 
67 #ifdef FF_KNI
68 #define KNI_MBUF_MAX 2048
69 #define KNI_QUEUE_SIZE 2048
70 
71 int enable_kni;
72 static int kni_accept;
73 static int knictl_action = FF_KNICTL_ACTION_DEFAULT;
74 #endif
75 
76 static int numa_on;
77 
78 static unsigned idle_sleep;
79 static unsigned pkt_tx_delay;
80 
81 static struct rte_timer freebsd_clock;
82 
83 /* Default 40-byte RSS hash key, taken from Mellanox's Linux driver. */
84 static uint8_t default_rsskey_40bytes[40] = {
85     0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
86     0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
87     0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
88     0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
89     0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
90 };
91 
92 static int use_rsskey_52bytes = 0;
93 static uint8_t default_rsskey_52bytes[52] = {
94     0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23,
95     0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30,
96     0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02,
97     0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c,
98     0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55,
99     0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e,
100     0x81, 0x15, 0x03, 0x66
101 };
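
/*
 * Note on the keys above: both are Toeplitz keys. init_port_start() hands
 * the selected key to the NIC via port_conf.rx_adv_conf.rss_conf, and
 * ff_rss_check() feeds the same bytes to the software toeplitz_hash(), so
 * software and hardware agree on which queue a given 4-tuple hashes to.
 * The 52-byte key is chosen when dev_info.hash_key_size reports 52 (some
 * NICs require the longer key; the exact set of such NICs is not stated in
 * this file), otherwise the 40-byte key is used.
 */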
102 
103 struct lcore_conf lcore_conf;
104 
105 struct rte_mempool *pktmbuf_pool[NB_SOCKETS];
106 
107 static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
108 static dispatch_func_t packet_dispatcher;
109 
110 static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];
111 
112 #define BOND_DRIVER_NAME    "net_bonding"
113 
114 static inline int send_single_packet(struct rte_mbuf *m, uint8_t port);
115 
116 struct ff_msg_ring {
117     char ring_name[FF_MSG_NUM][RTE_RING_NAMESIZE];
118     /* ring[0]: the f-stack lcore dequeues requests that other processes enqueue */
119     /* ring[1..]: the f-stack lcore enqueues replies that other processes dequeue */
120     struct rte_ring *ring[FF_MSG_NUM];
121 } __rte_cache_aligned;
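
/*
 * A minimal usage sketch (assumptions: a DPDK secondary process that has
 * already called rte_eal_init(), talking to f-stack proc_id 0; error
 * handling omitted). Ring names are built exactly as init_msg_ring()
 * builds them below:
 *
 *   char name[RTE_RING_NAMESIZE];
 *   struct ff_msg *msg, *reply;
 *   struct rte_mempool *mp = rte_mempool_lookup(FF_MSG_POOL);
 *   snprintf(name, sizeof(name), "%s%u", FF_MSG_RING_IN, 0);
 *   struct rte_ring *in = rte_ring_lookup(name);
 *   snprintf(name, sizeof(name), "%s%u_%u", FF_MSG_RING_OUT, 0, FF_TOP);
 *   struct rte_ring *out = rte_ring_lookup(name);
 *
 *   rte_mempool_get(mp, (void **)&msg);
 *   msg->msg_type = FF_TOP;
 *   rte_ring_enqueue(in, msg);
 *   while (rte_ring_dequeue(out, (void **)&reply) != 0)
 *       ;   // poll until handle_msg() enqueues the reply
 *   rte_mempool_put(mp, reply);
 */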
122 
123 static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
124 static struct rte_mempool *message_pool;
125 static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];
126 
127 static struct ff_top_args ff_top_status;
128 static struct ff_traffic_args ff_traffic;
129 extern void ff_hardclock(void);
130 
131 static void
132 ff_hardclock_job(__rte_unused struct rte_timer *timer,
133     __rte_unused void *arg) {
134     ff_hardclock();
135     ff_update_current_ts();
136 }
137 
138 struct ff_dpdk_if_context *
139 ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
140 {
141     struct ff_dpdk_if_context *ctx;
142 
143     ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
144     if (ctx == NULL)
145         return NULL;
146 
147     ctx->sc = sc;
148     ctx->ifp = ifp;
149     ctx->port_id = cfg->port_id;
150     ctx->hw_features = cfg->hw_features;
151 
152     return ctx;
153 }
154 
155 void
156 ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
157 {
158     free(ctx);
159 }
160 
161 static void
162 check_all_ports_link_status(void)
163 {
164     #define CHECK_INTERVAL 100 /* 100ms */
165     #define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */
166 
167     uint16_t portid;
168     uint8_t count, all_ports_up, print_flag = 0;
169     struct rte_eth_link link;
170 
171     printf("\nChecking link status");
172     fflush(stdout);
173 
174     int i, nb_ports;
175     nb_ports = ff_global_cfg.dpdk.nb_ports;
176     for (count = 0; count <= MAX_CHECK_TIME; count++) {
177         all_ports_up = 1;
178         for (i = 0; i < nb_ports; i++) {
179             portid = ff_global_cfg.dpdk.portid_list[i];
180             memset(&link, 0, sizeof(link));
181             rte_eth_link_get_nowait(portid, &link);
182 
183             /* print link status if flag set */
184             if (print_flag == 1) {
185                 if (link.link_status) {
186                     printf("Port %d Link Up - speed %u "
187                         "Mbps - %s\n", (int)portid,
188                         (unsigned)link.link_speed,
189                         (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
190                         ("full-duplex") : ("half-duplex"));
191                 } else {
192                     printf("Port %d Link Down\n", (int)portid);
193                 }
194                 continue;
195             }
196             /* clear all_ports_up flag if any link down */
197             if (link.link_status == 0) {
198                 all_ports_up = 0;
199                 break;
200             }
201         }
202 
203         /* after finally printing all link status, get out */
204         if (print_flag == 1)
205             break;
206 
207         if (all_ports_up == 0) {
208             printf(".");
209             fflush(stdout);
210             rte_delay_ms(CHECK_INTERVAL);
211         }
212 
213         /* set the print_flag if all ports up or timeout */
214         if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
215             print_flag = 1;
216             printf("done\n");
217         }
218     }
219 }
220 
221 static int
222 init_lcore_conf(void)
223 {
224     uint8_t nb_dev_ports = rte_eth_dev_count_avail();
225     if (nb_dev_ports == 0) {
226         rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
227     }
228 
229     if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
230         rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
231                  ff_global_cfg.dpdk.max_portid);
232     }
233 
234     lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
235     lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;
236 
237     uint16_t proc_id;
238     for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) {
239         uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id];
240         if (!lcore_config[lcore_id].detected) {
241             rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
242         }
243     }
244 
245     uint16_t socket_id = 0;
246     if (numa_on) {
247         socket_id = rte_lcore_to_socket_id(rte_lcore_id());
248     }
249 
250     lcore_conf.socket_id = socket_id;
251 
252     uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
253     int j;
254     for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
255         uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
256         struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
257 
258         int queueid = -1;
259         int i;
260         for (i = 0; i < pconf->nb_lcores; i++) {
261             if (pconf->lcore_list[i] == lcore_id) {
262                 queueid = i;
263             }
264         }
265         if (queueid < 0) {
266             continue;
267         }
268         printf("lcore: %u, port: %u, queue: %d\n", lcore_id, port_id, queueid);
269         uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
270         lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
271         lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
272         lcore_conf.nb_rx_queue++;
273 
274         lcore_conf.tx_queue_id[port_id] = queueid;
275         lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
276         lcore_conf.nb_tx_port++;
277 
278         /* Enable pcap dump */
279         if (ff_global_cfg.pcap.enable) {
280             ff_enable_pcap(ff_global_cfg.pcap.save_path, ff_global_cfg.pcap.snap_len);
281         }
282 
283         lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
284     }
285 
286     if (lcore_conf.nb_rx_queue == 0) {
287         rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
288     }
289 
290     return 0;
291 }
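
/*
 * Worked example of the mapping just built (numbers are illustrative):
 * suppose port 0 is configured with lcore_list = {2, 3} and this process
 * runs on lcore 3. The loop above finds queueid = 1, so this lcore polls
 * RX queue 1 and owns TX queue 1 of port 0, and nb_queue_list[0] = 2
 * records how many queues the dispatcher may spread packets across.
 */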
292 
293 static int
294 init_mem_pool(void)
295 {
296     uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
297     uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
298     uint32_t nb_tx_queue = nb_lcores;
299     uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;
300     uint16_t max_portid = ff_global_cfg.dpdk.max_portid;
301 
302     unsigned nb_mbuf = RTE_ALIGN_CEIL(
303         (nb_rx_queue * (max_portid + 1) * 2 * RX_QUEUE_SIZE +
304         nb_ports * (max_portid + 1) * 2 * nb_lcores * MAX_PKT_BURST +
305         nb_ports * (max_portid + 1) * 2 * nb_tx_queue * TX_QUEUE_SIZE +
306         nb_lcores * MEMPOOL_CACHE_SIZE +
307 #ifdef FF_KNI
308         nb_ports * KNI_MBUF_MAX +
309         nb_ports * KNI_QUEUE_SIZE +
310 #endif
311         nb_lcores * nb_ports * DISPATCH_RING_SIZE),
312         (unsigned)8192);
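
    /*
     * Sizing sketch with illustrative values (the real constants live in
     * the headers): 1 port (max_portid = 0), 2 lcores, RX_QUEUE_SIZE =
     * TX_QUEUE_SIZE = 512, MAX_PKT_BURST = 32, MEMPOOL_CACHE_SIZE = 256,
     * DISPATCH_RING_SIZE = 2048, FF_KNI off:
     *   rx:       2 * 1 * 2 * 512 = 2048
     *   burst:    1 * 1 * 2 * 2 * 32 = 128
     *   tx:       1 * 1 * 2 * 2 * 512 = 2048
     *   cache:    2 * 256 = 512
     *   dispatch: 2 * 1 * 2048 = 4096
     * Sum 8832, which RTE_ALIGN_CEIL rounds up to a 16384-mbuf pool.
     */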
313 
314     unsigned socketid = 0;
315     uint16_t i, lcore_id;
316     char s[64];
317 
318     for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
319         lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
320         if (numa_on) {
321             socketid = rte_lcore_to_socket_id(lcore_id);
322         }
323 
324         if (socketid >= NB_SOCKETS) {
325             rte_exit(EXIT_FAILURE, "Socket %u of lcore %u is out of range %d\n",
326                 socketid, lcore_id, NB_SOCKETS);
327         }
328 
329         if (pktmbuf_pool[socketid] != NULL) {
330             continue;
331         }
332 
333         if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
334             snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
335             pktmbuf_pool[socketid] =
336                 rte_pktmbuf_pool_create(s, nb_mbuf,
337                     MEMPOOL_CACHE_SIZE, 0,
338                     RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
339         } else {
340             snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
341             pktmbuf_pool[socketid] = rte_mempool_lookup(s);
342         }
343 
344         if (pktmbuf_pool[socketid] == NULL) {
345             rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
346         } else {
347             printf("create mbuf pool on socket %d\n", socketid);
348         }
349 
350 #ifdef FF_USE_PAGE_ARRAY
351         nb_mbuf = RTE_ALIGN_CEIL(
352             nb_ports * nb_lcores * MAX_PKT_BURST +
353             nb_ports * nb_tx_queue * TX_QUEUE_SIZE +
354             nb_lcores * MEMPOOL_CACHE_SIZE,
355             (unsigned)4096);
356         ff_init_ref_pool(nb_mbuf, socketid);
357 #endif
358     }
359 
360     return 0;
361 }
362 
363 static struct rte_ring *
364 create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
365 {
366     struct rte_ring *ring;
367 
368     if (name == NULL) {
369         rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
370     }
371 
372     if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
373         ring = rte_ring_create(name, count, socket_id, flags);
374     } else {
375         ring = rte_ring_lookup(name);
376     }
377 
378     if (ring == NULL) {
379         rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
380     }
381 
382     return ring;
383 }
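
/*
 * Primary/secondary convention used throughout this file: the primary
 * process creates shared objects (rings, mempools) and secondary
 * processes attach to the same objects by name via the *_lookup() calls,
 * so every f-stack process operates on one set of DPDK structures.
 */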
384 
385 static int
386 init_dispatch_ring(void)
387 {
388     int j;
389     char name_buf[RTE_RING_NAMESIZE];
390     int queueid;
391 
392     unsigned socketid = lcore_conf.socket_id;
393 
394     /* Create ring according to ports actually being used. */
395     int nb_ports = ff_global_cfg.dpdk.nb_ports;
396     for (j = 0; j < nb_ports; j++) {
397         uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
398         struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
399         int nb_queues = pconf->nb_lcores;
400         if (dispatch_ring[portid] == NULL) {
401             snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);
402 
403             dispatch_ring[portid] = rte_zmalloc(name_buf,
404                 sizeof(struct rte_ring *) * nb_queues,
405                 RTE_CACHE_LINE_SIZE);
406             if (dispatch_ring[portid] == NULL) {
407                 rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
408                     "failed\n", name_buf);
409             }
410         }
411 
412         for(queueid = 0; queueid < nb_queues; ++queueid) {
413             snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
414                 portid, queueid);
415             dispatch_ring[portid][queueid] = create_ring(name_buf,
416                 DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);
417 
418             if (dispatch_ring[portid][queueid] == NULL)
419                 rte_panic("create ring:%s failed!\n", name_buf);
420 
421             printf("created ring %s, %u ring entries are now free\n",
422                 name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
423         }
424     }
425 
426     return 0;
427 }
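
/*
 * Why these rings exist: RSS pins a flow to one RX queue, but ARP/NDP
 * packets (and packets redirected by a user-registered dispatcher) must
 * reach the other lcores' stack instances as well. process_packets()
 * clones or forwards such mbufs onto dispatch_ring[port][queue] of the
 * target queues, and each lcore drains only its own ring in
 * process_dispatch_ring(), which is why RING_F_SC_DEQ is safe here.
 */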
428 
429 static void
430 ff_msg_init(struct rte_mempool *mp,
431     __attribute__((unused)) void *opaque_arg,
432     void *obj, __attribute__((unused)) unsigned i)
433 {
434     struct ff_msg *msg = (struct ff_msg *)obj;
435     msg->msg_type = FF_UNKNOWN;
436     msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
437     msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
438 }
439 
440 static int
441 init_msg_ring(void)
442 {
443     uint16_t i, j;
444     uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
445     unsigned socketid = lcore_conf.socket_id;
446 
447     /* Create message buffer pool */
448     if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
449         message_pool = rte_mempool_create(FF_MSG_POOL,
450            MSG_RING_SIZE * 2 * nb_procs,
451            MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
452            NULL, NULL, ff_msg_init, NULL,
453            socketid, 0);
454     } else {
455         message_pool = rte_mempool_lookup(FF_MSG_POOL);
456     }
457 
458     if (message_pool == NULL) {
459         rte_panic("Create msg mempool failed\n");
460     }
461 
462     for(i = 0; i < nb_procs; ++i) {
463         snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
464             "%s%u", FF_MSG_RING_IN, i);
465         msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
466             MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
467         if (msg_ring[i].ring[0] == NULL)
468             rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[0]);
469 
470         for (j = FF_SYSCTL; j < FF_MSG_NUM; j++) {
471             snprintf(msg_ring[i].ring_name[j], RTE_RING_NAMESIZE,
472                 "%s%u_%u", FF_MSG_RING_OUT, i, j);
473             msg_ring[i].ring[j] = create_ring(msg_ring[i].ring_name[j],
474                 MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
475             if (msg_ring[i].ring[j] == NULL)
476                 rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[j]);
477         }
478     }
479 
480     return 0;
481 }
482 
483 #ifdef FF_KNI
484 
485 static enum FF_KNICTL_CMD get_kni_action(const char *c){
486     if (!c)
487         return FF_KNICTL_ACTION_DEFAULT;
488     if (0 == strcasecmp(c, "alltokni")){
489         return FF_KNICTL_ACTION_ALL_TO_KNI;
490     } else if (0 == strcasecmp(c, "alltoff")){
491         return FF_KNICTL_ACTION_ALL_TO_FF;
492     } else if (0 == strcasecmp(c, "default")){
493         return FF_KNICTL_ACTION_DEFAULT;
494     } else {
495         return FF_KNICTL_ACTION_DEFAULT;
496     }
497 }
498 
499 static int
500 init_kni(void)
501 {
502     int nb_ports = rte_eth_dev_count_avail();
503     kni_accept = 0;
504     if(strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
505         kni_accept = 1;
506 
507     knictl_action = get_kni_action(ff_global_cfg.kni.kni_action);
508 
509     ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
510         ff_global_cfg.kni.udp_port);
511 
512     unsigned socket_id = lcore_conf.socket_id;
513     struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];
514 
515     nb_ports = ff_global_cfg.dpdk.nb_ports;
516     int i, ret;
517     for (i = 0; i < nb_ports; i++) {
518         uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
519         ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
520     }
521 
522     return 0;
523 }
524 #endif
525 
526 static void
527 set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
528 {
529     if (reta_size == 0) {
530         return;
531     }
532 
533     int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
534     struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];
535 
536     /* config HW indirection table */
537     unsigned i, j, hash=0;
538     for (i = 0; i < reta_conf_size; i++) {
539         reta_conf[i].mask = ~0ULL;
540         for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
541             reta_conf[i].reta[j] = hash++ % nb_queues;
542         }
543     }
544 
545     if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
546         rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
547             port_id);
548     }
549 }
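
/*
 * Example of the table written above, assuming reta_size = 128 and
 * nb_queues = 4: the entries become 0,1,2,3,0,1,2,3,... so the NIC
 * spreads RSS hash values round-robin over the configured queues.
 * ff_rss_check() mirrors this as (hash & (reta_size - 1)) % nb_queues,
 * which relies on reta_size being a power of two (asserted in
 * init_port_start()).
 */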
550 
551 static int
552 init_port_start(void)
553 {
554     int nb_ports = ff_global_cfg.dpdk.nb_ports;
555     unsigned socketid = 0;
556     struct rte_mempool *mbuf_pool;
557     uint16_t i, j;
558 
559     for (i = 0; i < nb_ports; i++) {
560         uint16_t port_id, u_port_id = ff_global_cfg.dpdk.portid_list[i];
561         struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[u_port_id];
562         uint16_t nb_queues = pconf->nb_lcores;
563 
564         for (j = 0; j <= pconf->nb_slaves; j++) {
565             if (j < pconf->nb_slaves) {
566                 port_id = pconf->slave_portid_list[j];
567                 printf("Initializing %s's slave %d, port[%d]\n",
568                         ff_global_cfg.dpdk.bond_cfgs->name,
569                         j, port_id);
570             } else {
571                 port_id = u_port_id;
572             }
573 
574             struct rte_eth_dev_info dev_info;
575             struct rte_eth_conf port_conf = {0};
576             struct rte_eth_rxconf rxq_conf;
577             struct rte_eth_txconf txq_conf;
578 
579             rte_eth_dev_info_get(port_id, &dev_info);
580 
581             if (nb_queues > dev_info.max_rx_queues) {
582                 rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
583                     nb_queues,
584                     dev_info.max_rx_queues);
585             }
586 
587             if (nb_queues > dev_info.max_tx_queues) {
588                 rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
589                     nb_queues,
590                     dev_info.max_tx_queues);
591             }
592 
593             struct ether_addr addr;
594             rte_eth_macaddr_get(port_id, &addr);
595             printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
596                        " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
597                     (unsigned)port_id,
598                     addr.addr_bytes[0], addr.addr_bytes[1],
599                     addr.addr_bytes[2], addr.addr_bytes[3],
600                     addr.addr_bytes[4], addr.addr_bytes[5]);
601 
602             rte_memcpy(pconf->mac,
603                 addr.addr_bytes, ETHER_ADDR_LEN);
604 
605             /* Set RSS mode */
606             uint64_t default_rss_hf = ETH_RSS_PROTO_MASK;
607             port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
608             port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf;
609             if (dev_info.hash_key_size == 52) {
610                 port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_52bytes;
611                 port_conf.rx_adv_conf.rss_conf.rss_key_len = 52;
612                 use_rsskey_52bytes = 1;
613             } else {
614                 port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
615                 port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;
616             }
617             port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads;
618             if (port_conf.rx_adv_conf.rss_conf.rss_hf !=
619                     ETH_RSS_PROTO_MASK) {
620                 printf("Port %u modified RSS hash function based on hardware support, "
621                         "requested:%#"PRIx64" configured:%#"PRIx64"\n",
622                         port_id, default_rss_hf,
623                         port_conf.rx_adv_conf.rss_conf.rss_hf);
624             }
625 
626             if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) {
627                 port_conf.txmode.offloads |=
628                     DEV_TX_OFFLOAD_MBUF_FAST_FREE;
629             }
630 
631             /* Set Rx VLAN stripping */
632             if (ff_global_cfg.dpdk.vlan_strip) {
633                 if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
634                     port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
635                 }
636             }
637 
638             /* Enable HW CRC stripping */
639             port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_KEEP_CRC;
640 
641             /* FIXME: Enable TCP LRO? */
642             #if 0
643             if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
644                 printf("LRO is supported\n");
645                 port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
646                 pconf->hw_features.rx_lro = 1;
647             }
648             #endif
649 
650             /* Set Rx checksum checking */
651             if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
652                 (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
653                 (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
654                 printf("RX checksum offload supported\n");
655                 port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
656                 pconf->hw_features.rx_csum = 1;
657             }
658 
659             if (ff_global_cfg.dpdk.tx_csum_offoad_skip == 0) {
660                 if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
661                     printf("TX ip checksum offload supported\n");
662                     port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
663                     pconf->hw_features.tx_csum_ip = 1;
664                 }
665 
666                 if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
667                     (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
668                     printf("TX TCP&UDP checksum offload supported\n");
669                     port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM;
670                     pconf->hw_features.tx_csum_l4 = 1;
671                 }
672             } else {
673                 printf("TX checksum offload is disabled\n");
674             }
675 
676             if (ff_global_cfg.dpdk.tso) {
677                 if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
678                     printf("TSO is supported\n");
679                     port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
680                     pconf->hw_features.tx_tso = 1;
681                 }
682             } else {
683                 printf("TSO is disabled\n");
684             }
685 
686             if (dev_info.reta_size) {
687                 /* reta size must be power of 2 */
688                 assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);
689 
690                 rss_reta_size[port_id] = dev_info.reta_size;
691                 printf("port[%d]: rss table size: %d\n", port_id,
692                     dev_info.reta_size);
693             }
694 
695             if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
696                 continue;
697             }
698 
699             int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
700             if (ret != 0) {
701                 return ret;
702             }
703 
704             static uint16_t nb_rxd = RX_QUEUE_SIZE;
705             static uint16_t nb_txd = TX_QUEUE_SIZE;
706             ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
707             if (ret < 0)
708                 printf("Could not adjust number of descriptors "
709                         "for port%u (%d)\n", (unsigned)port_id, ret);
710 
711             uint16_t q;
712             for (q = 0; q < nb_queues; q++) {
713                 if (numa_on) {
714                     uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
715                     socketid = rte_lcore_to_socket_id(lcore_id);
716                 }
717                 mbuf_pool = pktmbuf_pool[socketid];
718 
719                 txq_conf = dev_info.default_txconf;
720                 txq_conf.offloads = port_conf.txmode.offloads;
721                 ret = rte_eth_tx_queue_setup(port_id, q, nb_txd,
722                     socketid, &txq_conf);
723                 if (ret < 0) {
724                     return ret;
725                 }
726 
727                 rxq_conf = dev_info.default_rxconf;
728                 rxq_conf.offloads = port_conf.rxmode.offloads;
729                 ret = rte_eth_rx_queue_setup(port_id, q, nb_rxd,
730                     socketid, &rxq_conf, mbuf_pool);
731                 if (ret < 0) {
732                     return ret;
733                 }
734             }
735 
736 
737             if (strncmp(dev_info.driver_name, BOND_DRIVER_NAME,
738                     strlen(dev_info.driver_name)) == 0) {
739 
740                 rte_eth_macaddr_get(port_id, &addr);
741                 printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
742                            " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
743                         (unsigned)port_id,
744                         addr.addr_bytes[0], addr.addr_bytes[1],
745                         addr.addr_bytes[2], addr.addr_bytes[3],
746                         addr.addr_bytes[4], addr.addr_bytes[5]);
747 
748                 rte_memcpy(pconf->mac,
749                     addr.addr_bytes, ETHER_ADDR_LEN);
750 
751                 int mode, count, x;
752                 uint16_t slaves[RTE_MAX_ETHPORTS], len = RTE_MAX_ETHPORTS;
753 
754                 mode = rte_eth_bond_mode_get(port_id);
755                 printf("Port %u, bond mode:%d\n", port_id, mode);
756 
757                 count = rte_eth_bond_slaves_get(port_id, slaves, len);
758                 printf("Port %u, %s's slave ports count:%d\n", port_id,
759                             ff_global_cfg.dpdk.bond_cfgs->name, count);
760                 for (x=0; x<count; x++) {
761                     printf("Port %u, %s's slave port[%u]\n", port_id,
762                             ff_global_cfg.dpdk.bond_cfgs->name, slaves[x]);
763                 }
764             }
765 
766             ret = rte_eth_dev_start(port_id);
767             if (ret < 0) {
768                 return ret;
769             }
770 
771             if (nb_queues > 1) {
772                 /* set HW rss hash function to Toeplitz. */
773                 if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
774                     struct rte_eth_hash_filter_info info = {0};
775                     info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
776                     info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;
777 
778                     if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
779                         RTE_ETH_FILTER_SET, &info) < 0) {
780                         rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
781                             port_id);
782                     }
783                 }
784 
785                 set_rss_table(port_id, dev_info.reta_size, nb_queues);
786             }
787 
788             /* Enable RX in promiscuous mode for the Ethernet device. */
789             if (ff_global_cfg.dpdk.promiscuous) {
790                 rte_eth_promiscuous_enable(port_id);
791                 ret = rte_eth_promiscuous_get(port_id);
792                 if (ret == 1) {
793                     printf("set port %u to promiscuous mode ok\n", port_id);
794                 } else {
795                     printf("set port %u to promiscuous mode error\n", port_id);
796                 }
797             }
798         }
799     }
800 
801     if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
802         check_all_ports_link_status();
803     }
804 
805     return 0;
806 }
807 
808 static int
809 init_clock(void)
810 {
811     rte_timer_subsystem_init();
812     uint64_t hz = rte_get_timer_hz();
813     uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
814     uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;
815 
816     rte_timer_init(&freebsd_clock);
817     rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
818         rte_lcore_id(), &ff_hardclock_job, NULL);
819 
820     ff_update_current_ts();
821 
822     return 0;
823 }
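
/*
 * Arithmetic sketch, assuming freebsd.hz = 100 (a typical setting, not
 * read from this file) and a 2.5 GHz TSC: intrs = 1000 / 100 = 10 ms,
 * tsc = ((2500000000 + 999) / 1000) * 10 = 25000000 cycles, so
 * ff_hardclock_job() fires every 10 ms when main_loop() calls
 * rte_timer_manage().
 */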
824 
825 int
826 ff_dpdk_init(int argc, char **argv)
827 {
828     if (ff_global_cfg.dpdk.nb_procs < 1 ||
829         ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
830         ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
831         ff_global_cfg.dpdk.proc_id < 0) {
832         printf("param num_procs[%d] or proc_id[%d] error!\n",
833             ff_global_cfg.dpdk.nb_procs,
834             ff_global_cfg.dpdk.proc_id);
835         exit(1);
836     }
837 
838     int ret = rte_eal_init(argc, argv);
839     if (ret < 0) {
840         rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
841     }
842 
843     numa_on = ff_global_cfg.dpdk.numa_on;
844 
845     idle_sleep = ff_global_cfg.dpdk.idle_sleep;
846     pkt_tx_delay = ff_global_cfg.dpdk.pkt_tx_delay > BURST_TX_DRAIN_US ?
847         BURST_TX_DRAIN_US : ff_global_cfg.dpdk.pkt_tx_delay;
848 
849     init_lcore_conf();
850 
851     init_mem_pool();
852 
853     init_dispatch_ring();
854 
855     init_msg_ring();
856 
857 #ifdef FF_KNI
858     enable_kni = ff_global_cfg.kni.enable;
859     if (enable_kni) {
860         init_kni();
861     }
862 #endif
863 
864 #ifdef FF_USE_PAGE_ARRAY
865     ff_mmap_init();
866 #endif
867 
868     ret = init_port_start();
869     if (ret < 0) {
870         rte_exit(EXIT_FAILURE, "init_port_start failed\n");
871     }
872 
873     init_clock();
874 
875     return 0;
876 }
877 
878 static void
879 ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
880 {
881     uint8_t rx_csum = ctx->hw_features.rx_csum;
882     if (rx_csum) {
883         if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
884             rte_pktmbuf_free(pkt);
885             return;
886         }
887     }
888 
889     void *data = rte_pktmbuf_mtod(pkt, void*);
890     uint16_t len = rte_pktmbuf_data_len(pkt);
891 
892     void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
893     if (hdr == NULL) {
894         rte_pktmbuf_free(pkt);
895         return;
896     }
897 
898     if (pkt->ol_flags & PKT_RX_VLAN_STRIPPED) {
899         ff_mbuf_set_vlan_info(hdr, pkt->vlan_tci);
900     }
901 
902     struct rte_mbuf *pn = pkt->next;
903     void *prev = hdr;
904     while(pn != NULL) {
905         data = rte_pktmbuf_mtod(pn, void*);
906         len = rte_pktmbuf_data_len(pn);
907 
908         void *mb = ff_mbuf_get(prev, data, len);
909         if (mb == NULL) {
910             ff_mbuf_free(hdr);
911             rte_pktmbuf_free(pkt);
912             return;
913         }
914         pn = pn->next;
915         prev = mb;
916     }
917 
918     ff_veth_process_packet(ctx->ifp, hdr);
919 }
920 
921 static enum FilterReturn
922 protocol_filter(const void *data, uint16_t len)
923 {
924     if(len < ETHER_HDR_LEN)
925         return FILTER_UNKNOWN;
926 
927     const struct ether_hdr *hdr;
928     const struct vlan_hdr *vlanhdr;
929     hdr = (const struct ether_hdr *)data;
930     uint16_t ether_type = rte_be_to_cpu_16(hdr->ether_type);
931     data += ETHER_HDR_LEN;
932     len -= ETHER_HDR_LEN;
933 
934     if (ether_type == ETHER_TYPE_VLAN) {
935         vlanhdr = (struct vlan_hdr *)data;
936         ether_type = rte_be_to_cpu_16(vlanhdr->eth_proto);
937         data += sizeof(struct vlan_hdr);
938         len -= sizeof(struct vlan_hdr);
939     }
940 
941     if(ether_type == ETHER_TYPE_ARP)
942         return FILTER_ARP;
943 
944 #ifdef INET6
945     if (ether_type == ETHER_TYPE_IPv6) {
946         return ff_kni_proto_filter(data,
947             len, ether_type);
948     }
949 #endif
950 
951 #ifndef FF_KNI
952     return FILTER_UNKNOWN;
953 #else
954     if (!enable_kni) {
955         return FILTER_UNKNOWN;
956     }
957 
958     if(ether_type != ETHER_TYPE_IPv4)
959         return FILTER_UNKNOWN;
960 
961     return ff_kni_proto_filter(data,
962         len, ether_type);
963 #endif
964 }
965 
966 static inline void
967 pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
968 {
969     struct rte_mbuf *md;
970     void *src, *dst;
971 
972     dst = rte_pktmbuf_mtod(mi, void *);
973     src = rte_pktmbuf_mtod(m, void *);
974 
975     mi->data_len = m->data_len;
976     rte_memcpy(dst, src, m->data_len);
977 
978     mi->port = m->port;
979     mi->vlan_tci = m->vlan_tci;
980     mi->vlan_tci_outer = m->vlan_tci_outer;
981     mi->tx_offload = m->tx_offload;
982     mi->hash = m->hash;
983     mi->ol_flags = m->ol_flags;
984     mi->packet_type = m->packet_type;
985 }
986 
987 /* copied from rte_pktmbuf_clone */
988 static inline struct rte_mbuf *
989 pktmbuf_deep_clone(const struct rte_mbuf *md,
990     struct rte_mempool *mp)
991 {
992     struct rte_mbuf *mc, *mi, **prev;
993     uint32_t pktlen;
994     uint8_t nseg;
995 
996     if (unlikely ((mc = rte_pktmbuf_alloc(mp)) == NULL))
997         return NULL;
998 
999     mi = mc;
1000     prev = &mi->next;
1001     pktlen = md->pkt_len;
1002     nseg = 0;
1003 
1004     do {
1005         nseg++;
1006         pktmbuf_deep_attach(mi, md);
1007         *prev = mi;
1008         prev = &mi->next;
1009     } while ((md = md->next) != NULL &&
1010         (mi = rte_pktmbuf_alloc(mp)) != NULL);
1011 
1012     *prev = NULL;
1013     mc->nb_segs = nseg;
1014     mc->pkt_len = pktlen;
1015 
1016     /* Allocation of new indirect segment failed */
1017     if (unlikely (mi == NULL)) {
1018         rte_pktmbuf_free(mc);
1019         return NULL;
1020     }
1021 
1022     __rte_mbuf_sanity_check(mc, 1);
1023     return mc;
1024 }
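
/*
 * Unlike rte_pktmbuf_clone(), which builds indirect mbufs that share the
 * original buffers, this helper copies every segment into freshly
 * allocated mbufs. The copy costs cycles but gives each dispatch ring
 * (and KNI) an independent packet, avoiding cross-lcore reference
 * counting on the RX mbuf.
 */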
1025 
1026 static inline void
1027 process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
1028     uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
1029 {
1030     struct lcore_conf *qconf = &lcore_conf;
1031     uint16_t nb_queues = qconf->nb_queue_list[port_id];
1032 
1033     uint16_t i;
1034     for (i = 0; i < count; i++) {
1035         struct rte_mbuf *rtem = bufs[i];
1036 
1037         if (unlikely(ff_global_cfg.pcap.enable)) {
1038             if (!pkts_from_ring) {
1039                 ff_dump_packets(ff_global_cfg.pcap.save_path, rtem, ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
1040             }
1041         }
1042 
1043         void *data = rte_pktmbuf_mtod(rtem, void*);
1044         uint16_t len = rte_pktmbuf_data_len(rtem);
1045 
1046         if (!pkts_from_ring) {
1047             ff_traffic.rx_packets++;
1048             ff_traffic.rx_bytes += len;
1049         }
1050 
1051         if (!pkts_from_ring && packet_dispatcher) {
1052             int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues);
1053             if (ret == FF_DISPATCH_RESPONSE) {
1054                 rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len;
1055 
1056                 /*
1057                  * Outgoing VLAN insertion is not offloaded, so re-insert the header manually.
1058                  */
1059                 if (rtem->vlan_tci) {
1060                     data = rte_pktmbuf_prepend(rtem, sizeof(struct vlan_hdr));
1061                     if (data != NULL) {
1062                         memmove(data, data + sizeof(struct vlan_hdr), ETHER_HDR_LEN);
1063                         struct ether_hdr *etherhdr = (struct ether_hdr *)data;
1064                         struct vlan_hdr *vlanhdr = (struct vlan_hdr *)(data + ETHER_HDR_LEN);
1065                         vlanhdr->vlan_tci = rte_cpu_to_be_16(rtem->vlan_tci);
1066                         vlanhdr->eth_proto = etherhdr->ether_type;
1067                         etherhdr->ether_type = rte_cpu_to_be_16(ETHER_TYPE_VLAN);
1068                     }
1069                 }
1070                 send_single_packet(rtem, port_id);
1071                 continue;
1072             }
1073 
1074             if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) {
1075                 rte_pktmbuf_free(rtem);
1076                 continue;
1077             }
1078 
1079             if (ret != queue_id) {
1080                 ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
1081                 if (ret < 0)
1082                     rte_pktmbuf_free(rtem);
1083 
1084                 continue;
1085             }
1086         }
1087 
1088         enum FilterReturn filter = protocol_filter(data, len);
1089 #ifdef INET6
1090         if (filter == FILTER_ARP || filter == FILTER_NDP) {
1091 #else
1092         if (filter == FILTER_ARP) {
1093 #endif
1094             struct rte_mempool *mbuf_pool;
1095             struct rte_mbuf *mbuf_clone;
1096             if (!pkts_from_ring) {
1097                 uint16_t j;
1098                 for(j = 0; j < nb_queues; ++j) {
1099                     if(j == queue_id)
1100                         continue;
1101 
1102                     unsigned socket_id = 0;
1103                     if (numa_on) {
1104                         uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
1105                         socket_id = rte_lcore_to_socket_id(lcore_id);
1106                     }
1107                     mbuf_pool = pktmbuf_pool[socket_id];
1108                     mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
1109                     if(mbuf_clone) {
1110                         int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
1111                             mbuf_clone);
1112                         if (ret < 0)
1113                             rte_pktmbuf_free(mbuf_clone);
1114                     }
1115                 }
1116             }
1117 
1118 #ifdef FF_KNI
1119             if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
1120                 mbuf_pool = pktmbuf_pool[qconf->socket_id];
1121                 mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
1122                 if(mbuf_clone) {
1123                     ff_kni_enqueue(port_id, mbuf_clone);
1124                 }
1125             }
1126 #endif
1127             ff_veth_input(ctx, rtem);
1128 #ifdef FF_KNI
1129         } else if (enable_kni) {
1130             if (knictl_action == FF_KNICTL_ACTION_ALL_TO_KNI){
1131                 ff_kni_enqueue(port_id, rtem);
1132             } else if (knictl_action == FF_KNICTL_ACTION_ALL_TO_FF){
1133                 ff_veth_input(ctx, rtem);
1134             } else if (knictl_action == FF_KNICTL_ACTION_DEFAULT){
1135                 if (enable_kni &&
1136                         ((filter == FILTER_KNI && kni_accept) ||
1137                         (filter == FILTER_UNKNOWN && !kni_accept)) ) {
1138                         ff_kni_enqueue(port_id, rtem);
1139                 } else {
1140                     ff_veth_input(ctx, rtem);
1141                 }
1142             } else {
1143                 ff_veth_input(ctx, rtem);
1144             }
1145 #endif
1146         } else {
1147             ff_veth_input(ctx, rtem);
1148         }
1149     }
1150 }
1151 
1152 static inline int
1153 process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
1154     struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
1155 {
1156     /* Read packets from the dispatch ring and process them */
1157     uint16_t nb_rb;
1158     nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
1159         (void **)pkts_burst, MAX_PKT_BURST, NULL);
1160 
1161     if(nb_rb > 0) {
1162         process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
1163     }
1164 
1165     return 0;
1166 }
1167 
1168 static inline void
1169 handle_sysctl_msg(struct ff_msg *msg)
1170 {
1171     int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
1172         msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
1173         msg->sysctl.newlen);
1174 
1175     if (ret < 0) {
1176         msg->result = errno;
1177     } else {
1178         msg->result = 0;
1179     }
1180 }
1181 
1182 static inline void
1183 handle_ioctl_msg(struct ff_msg *msg)
1184 {
1185     int fd, ret;
1186 #ifdef INET6
1187     if (msg->msg_type == FF_IOCTL6) {
1188         fd = ff_socket(AF_INET6, SOCK_DGRAM, 0);
1189     } else
1190 #endif
1191         fd = ff_socket(AF_INET, SOCK_DGRAM, 0);
1192 
1193     if (fd < 0) {
1194         ret = -1;
1195         goto done;
1196     }
1197 
1198     ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);
1199 
1200     ff_close(fd);
1201 
1202 done:
1203     if (ret < 0) {
1204         msg->result = errno;
1205     } else {
1206         msg->result = 0;
1207     }
1208 }
1209 
1210 static inline void
1211 handle_route_msg(struct ff_msg *msg)
1212 {
1213     int ret = ff_rtioctl(msg->route.fib, msg->route.data,
1214         &msg->route.len, msg->route.maxlen);
1215     if (ret < 0) {
1216         msg->result = errno;
1217     } else {
1218         msg->result = 0;
1219     }
1220 }
1221 
1222 static inline void
1223 handle_top_msg(struct ff_msg *msg)
1224 {
1225     msg->top = ff_top_status;
1226     msg->result = 0;
1227 }
1228 
1229 #ifdef FF_NETGRAPH
1230 static inline void
1231 handle_ngctl_msg(struct ff_msg *msg)
1232 {
1233     int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
1234     if (ret < 0) {
1235         msg->result = errno;
1236     } else {
1237         msg->result = 0;
1238         msg->ngctl.ret = ret;
1239     }
1240 }
1241 #endif
1242 
1243 #ifdef FF_IPFW
1244 static inline void
1245 handle_ipfw_msg(struct ff_msg *msg)
1246 {
1247     int fd, ret;
1248     fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
1249     if (fd < 0) {
1250         ret = -1;
1251         goto done;
1252     }
1253 
1254     switch (msg->ipfw.cmd) {
1255         case FF_IPFW_GET:
1256             ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
1257                 msg->ipfw.optname, msg->ipfw.optval,
1258                 msg->ipfw.optlen);
1259             break;
1260         case FF_IPFW_SET:
1261             ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
1262                 msg->ipfw.optname, msg->ipfw.optval,
1263                 *(msg->ipfw.optlen));
1264             break;
1265         default:
1266             ret = -1;
1267             errno = ENOTSUP;
1268             break;
1269     }
1270 
1271     ff_close(fd);
1272 
1273 done:
1274     if (ret < 0) {
1275         msg->result = errno;
1276     } else {
1277         msg->result = 0;
1278     }
1279 }
1280 #endif
1281 
1282 static inline void
1283 handle_traffic_msg(struct ff_msg *msg)
1284 {
1285     msg->traffic = ff_traffic;
1286     msg->result = 0;
1287 }
1288 
1289 #ifdef FF_KNI
1290 static inline void
1291 handle_knictl_msg(struct ff_msg *msg)
1292 {
1293     if (msg->knictl.kni_cmd == FF_KNICTL_CMD_SET){
1294         switch (msg->knictl.kni_action){
1295             case FF_KNICTL_ACTION_ALL_TO_FF: knictl_action = FF_KNICTL_ACTION_ALL_TO_FF; msg->result = 0; printf("new kni action: alltoff\n"); break;
1296             case FF_KNICTL_ACTION_ALL_TO_KNI: knictl_action = FF_KNICTL_ACTION_ALL_TO_KNI; msg->result = 0; printf("new kni action: alltokni\n"); break;
1297             case FF_KNICTL_ACTION_DEFAULT: knictl_action = FF_KNICTL_ACTION_DEFAULT; msg->result = 0; printf("new kni action: default\n"); break;
1298             default: msg->result = -1;
1299         }
1300     }
1301     else if (msg->knictl.kni_cmd == FF_KNICTL_CMD_GET){
1302         msg->knictl.kni_action = knictl_action; msg->result = 0;
1303     } else {
1304         msg->result = -2;
1305     }
1306 }
1307 #endif
1308 
1309 static inline void
1310 handle_default_msg(struct ff_msg *msg)
1311 {
1312     msg->result = ENOTSUP;
1313 }
1314 
1315 static inline void
1316 handle_msg(struct ff_msg *msg, uint16_t proc_id)
1317 {
1318     switch (msg->msg_type) {
1319         case FF_SYSCTL:
1320             handle_sysctl_msg(msg);
1321             break;
1322         case FF_IOCTL:
1323 #ifdef INET6
1324         case FF_IOCTL6:
1325 #endif
1326             handle_ioctl_msg(msg);
1327             break;
1328         case FF_ROUTE:
1329             handle_route_msg(msg);
1330             break;
1331         case FF_TOP:
1332             handle_top_msg(msg);
1333             break;
1334 #ifdef FF_NETGRAPH
1335         case FF_NGCTL:
1336             handle_ngctl_msg(msg);
1337             break;
1338 #endif
1339 #ifdef FF_IPFW
1340         case FF_IPFW_CTL:
1341             handle_ipfw_msg(msg);
1342             break;
1343 #endif
1344         case FF_TRAFFIC:
1345             handle_traffic_msg(msg);
1346             break;
1347 #ifdef FF_KNI
1348         case FF_KNICTL:
1349             handle_knictl_msg(msg);
1350             break;
1351 #endif
1352         default:
1353             handle_default_msg(msg);
1354             break;
1355     }
1356     rte_ring_enqueue(msg_ring[proc_id].ring[msg->msg_type], msg);
1357 }
1358 
1359 static inline int
1360 process_msg_ring(uint16_t proc_id)
1361 {
1362     void *msg;
1363     int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);
1364 
1365     if (unlikely(ret == 0)) {
1366         handle_msg((struct ff_msg *)msg, proc_id);
1367     }
1368 
1369     return 0;
1370 }
1371 
1372 /* Send burst of packets on an output interface */
1373 static inline int
1374 send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
1375 {
1376     struct rte_mbuf **m_table;
1377     int ret;
1378     uint16_t queueid;
1379 
1380     queueid = qconf->tx_queue_id[port];
1381     m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;
1382 
1383     if (unlikely(ff_global_cfg.pcap.enable)) {
1384         uint16_t i;
1385         for (i = 0; i < n; i++) {
1386             ff_dump_packets(ff_global_cfg.pcap.save_path, m_table[i],
1387                ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
1388         }
1389     }
1390 
1391     ret = rte_eth_tx_burst(port, queueid, m_table, n);
1392     ff_traffic.tx_packets += ret;
1393     uint16_t i;
1394     for (i = 0; i < ret; i++) {
1395         ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]);
1396 #ifdef FF_USE_PAGE_ARRAY
1397         if (qconf->tx_mbufs[port].bsd_m_table[i])
1398             ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs);
1399 #endif
1400     }
1401     if (unlikely(ret < n)) {
1402         do {
1403             rte_pktmbuf_free(m_table[ret]);
1404 #ifdef FF_USE_PAGE_ARRAY
1405             if ( qconf->tx_mbufs[port].bsd_m_table[ret] )
1406                 ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]);
1407 #endif
1408         } while (++ret < n);
1409     }
1410     return 0;
1411 }
1412 
1413 /* Enqueue a single packet, and send burst if queue is filled */
1414 static inline int
1415 send_single_packet(struct rte_mbuf *m, uint8_t port)
1416 {
1417     uint16_t len;
1418     struct lcore_conf *qconf;
1419 
1420     qconf = &lcore_conf;
1421     len = qconf->tx_mbufs[port].len;
1422     qconf->tx_mbufs[port].m_table[len] = m;
1423     len++;
1424 
1425     /* enough pkts to be sent */
1426     if (unlikely(len == MAX_PKT_BURST)) {
1427         send_burst(qconf, MAX_PKT_BURST, port);
1428         len = 0;
1429     }
1430 
1431     qconf->tx_mbufs[port].len = len;
1432     return 0;
1433 }
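
/*
 * Net effect: packets leave the port either when a queue fills to
 * MAX_PKT_BURST here, or when main_loop()'s drain path (bounded by
 * pkt_tx_delay, itself capped at BURST_TX_DRAIN_US) flushes a partially
 * filled queue. Batching amortizes the per-burst cost of
 * rte_eth_tx_burst().
 */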
1434 
1435 int
1436 ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
1437     int total)
1438 {
1439 #ifdef FF_USE_PAGE_ARRAY
1440     struct lcore_conf *qconf = &lcore_conf;
1441     int    len = 0;
1442 
1443     len = ff_if_send_onepkt(ctx, m, total);
1444     if (unlikely(len == MAX_PKT_BURST)) {
1445         send_burst(qconf, MAX_PKT_BURST, ctx->port_id);
1446         len = 0;
1447     }
1448     qconf->tx_mbufs[ctx->port_id].len = len;
1449     return 0;
1450 #endif
1451     struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
1452     struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
1453     if (head == NULL) {
1454         ff_mbuf_free(m);
1455         return -1;
1456     }
1457 
1458     head->pkt_len = total;
1459     head->nb_segs = 0;
1460 
1461     int off = 0;
1462     struct rte_mbuf *cur = head, *prev = NULL;
1463     while(total > 0) {
1464         if (cur == NULL) {
1465             cur = rte_pktmbuf_alloc(mbuf_pool);
1466             if (cur == NULL) {
1467                 rte_pktmbuf_free(head);
1468                 ff_mbuf_free(m);
1469                 return -1;
1470             }
1471         }
1472 
1473         if (prev != NULL) {
1474             prev->next = cur;
1475         }
1476         head->nb_segs++;
1477 
1478         prev = cur;
1479         void *data = rte_pktmbuf_mtod(cur, void*);
1480         int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
1481         int ret = ff_mbuf_copydata(m, data, off, len);
1482         if (ret < 0) {
1483             rte_pktmbuf_free(head);
1484             ff_mbuf_free(m);
1485             return -1;
1486         }
1487 
1488 
1489         cur->data_len = len;
1490         off += len;
1491         total -= len;
1492         cur = NULL;
1493     }
1494 
1495     struct ff_tx_offload offload = {0};
1496     ff_mbuf_tx_offload(m, &offload);
1497 
1498     void *data = rte_pktmbuf_mtod(head, void*);
1499 
1500     if (offload.ip_csum) {
1501         /* IPv6 checksum offload not supported yet */
1502         struct ipv4_hdr *iph;
1503         int iph_len;
1504         iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
1505         iph_len = (iph->version_ihl & 0x0f) << 2;
1506 
1507         head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
1508         head->l2_len = ETHER_HDR_LEN;
1509         head->l3_len = iph_len;
1510     }
1511 
1512     if (ctx->hw_features.tx_csum_l4) {
1513         struct ipv4_hdr *iph;
1514         int iph_len;
1515         iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
1516         iph_len = (iph->version_ihl & 0x0f) << 2;
1517 
1518         if (offload.tcp_csum) {
1519             head->ol_flags |= PKT_TX_TCP_CKSUM;
1520             head->l2_len = ETHER_HDR_LEN;
1521             head->l3_len = iph_len;
1522         }
1523 
1524         /*
1525          *  TCP segmentation offload.
1526          *
1527          *  - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
1528          *    implies PKT_TX_TCP_CKSUM)
1529          *  - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
1530          *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
1531          *    write the IP checksum to 0 in the packet
1532          *  - fill the mbuf offload information: l2_len,
1533          *    l3_len, l4_len, tso_segsz
1534          *  - calculate the pseudo header checksum without taking ip_len
1535          *    in account, and set it in the TCP header. Refer to
1536          *    rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
1537          *    used as helpers.
1538          */
1539         if (offload.tso_seg_size) {
1540             struct tcp_hdr *tcph;
1541             int tcph_len;
1542             tcph = (struct tcp_hdr *)((char *)iph + iph_len);
1543             tcph_len = (tcph->data_off & 0xf0) >> 2;
1544             tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);
1545 
1546             head->ol_flags |= PKT_TX_TCP_SEG;
1547             head->l4_len = tcph_len;
1548             head->tso_segsz = offload.tso_seg_size;
1549         }
1550 
1551         if (offload.udp_csum) {
1552             head->ol_flags |= PKT_TX_UDP_CKSUM;
1553             head->l2_len = ETHER_HDR_LEN;
1554             head->l3_len = iph_len;
1555         }
1556     }
1557 
1558     ff_mbuf_free(m);
1559 
1560     return send_single_packet(head, ctx->port_id);
1561 }
1562 
1563 static int
1564 main_loop(void *arg)
1565 {
1566     struct loop_routine *lr = (struct loop_routine *)arg;
1567 
1568     struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1569     uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc;
1570     int i, j, nb_rx, idle;
1571     uint16_t port_id, queue_id;
1572     struct lcore_conf *qconf;
1573     uint64_t drain_tsc = 0;
1574     struct ff_dpdk_if_context *ctx;
1575 
1576     if (pkt_tx_delay) {
1577         drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * pkt_tx_delay;
1578     }
1579 
1580     prev_tsc = 0;
1581     usch_tsc = 0;
1582 
1583     qconf = &lcore_conf;
1584 
1585     while (1) {
1586         cur_tsc = rte_rdtsc();
1587         if (unlikely(freebsd_clock.expire < cur_tsc)) {
1588             rte_timer_manage();
1589         }
1590 
1591         idle = 1;
1592         sys_tsc = 0;
1593         usr_tsc = 0;
1594 
1595         /*
1596          * TX burst queue drain
1597          */
1598         diff_tsc = cur_tsc - prev_tsc;
1599         if (unlikely(diff_tsc >= drain_tsc)) {
1600             for (i = 0; i < qconf->nb_tx_port; i++) {
1601                 port_id = qconf->tx_port_id[i];
1602                 if (qconf->tx_mbufs[port_id].len == 0)
1603                     continue;
1604 
1605                 idle = 0;
1606 
1607                 send_burst(qconf,
1608                     qconf->tx_mbufs[port_id].len,
1609                     port_id);
1610                 qconf->tx_mbufs[port_id].len = 0;
1611             }
1612 
1613             prev_tsc = cur_tsc;
1614         }
1615 
1616         /*
1617          * Read packet from RX queues
1618          */
1619         for (i = 0; i < qconf->nb_rx_queue; ++i) {
1620             port_id = qconf->rx_queue_list[i].port_id;
1621             queue_id = qconf->rx_queue_list[i].queue_id;
1622             ctx = veth_ctx[port_id];
1623 
1624 #ifdef FF_KNI
1625             if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
1626                 ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
1627             }
1628 #endif
1629 
1630             process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);
1631 
1632             nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
1633                 MAX_PKT_BURST);
1634             if (nb_rx == 0)
1635                 continue;
1636 
1637             idle = 0;
1638 
1639             /* Prefetch first packets */
1640             for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
1641                 rte_prefetch0(rte_pktmbuf_mtod(
1642                         pkts_burst[j], void *));
1643             }
1644 
1645             /* Prefetch and handle already prefetched packets */
1646             for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
1647                 rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
1648                         j + PREFETCH_OFFSET], void *));
1649                 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
1650             }
1651 
1652             /* Handle remaining prefetched packets */
1653             for (; j < nb_rx; j++) {
1654                 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
1655             }
1656         }
1657 
1658         process_msg_ring(qconf->proc_id);
1659 
1660         div_tsc = rte_rdtsc();
1661 
1662         if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc >= drain_tsc))) {
1663             usch_tsc = cur_tsc;
1664             lr->loop(lr->arg);
1665         }
1666 
1667         idle_sleep_tsc = rte_rdtsc();
1668         if (likely(idle && idle_sleep)) {
1669             usleep(idle_sleep);
1670             end_tsc = rte_rdtsc();
1671         } else {
1672             end_tsc = idle_sleep_tsc;
1673         }
1674 
1675         if (usch_tsc == cur_tsc) {
1676             usr_tsc = idle_sleep_tsc - div_tsc;
1677         }
1678 
1679         if (!idle) {
1680             sys_tsc = div_tsc - cur_tsc;
1681             ff_top_status.sys_tsc += sys_tsc;
1682         }
1683 
1684         ff_top_status.usr_tsc += usr_tsc;
1685         ff_top_status.work_tsc += end_tsc - cur_tsc;
1686         ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;
1687 
1688         ff_top_status.loops++;
1689     }
1690 
1691     return 0;
1692 }
1693 
1694 int
1695 ff_dpdk_if_up(void) {
1696     int i;
1697     struct lcore_conf *qconf = &lcore_conf;
1698     for (i = 0; i < qconf->nb_tx_port; i++) {
1699         uint16_t port_id = qconf->tx_port_id[i];
1700 
1701         struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
1702         veth_ctx[port_id] = ff_veth_attach(pconf);
1703         if (veth_ctx[port_id] == NULL) {
1704             rte_exit(EXIT_FAILURE, "ff_veth_attach failed");
1705         }
1706     }
1707 
1708     return 0;
1709 }
1710 
1711 void
1712 ff_dpdk_run(loop_func_t loop, void *arg) {
1713     struct loop_routine *lr = rte_malloc(NULL,
1714         sizeof(struct loop_routine), 0);
1715     lr->loop = loop;
1716     lr->arg = arg;
1717     rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
1718     rte_eal_mp_wait_lcore();
1719     rte_free(lr);
1720 }
1721 
1722 void
1723 ff_dpdk_pktmbuf_free(void *m)
1724 {
1725     rte_pktmbuf_free((struct rte_mbuf *)m);
1726 }
1727 
1728 static uint32_t
1729 toeplitz_hash(unsigned keylen, const uint8_t *key,
1730     unsigned datalen, const uint8_t *data)
1731 {
1732     uint32_t hash = 0, v;
1733     u_int i, b;
1734 
1735     /* XXXRW: Perhaps an assertion about key length vs. data length? */
1736 
1737     v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
1738     for (i = 0; i < datalen; i++) {
1739         for (b = 0; b < 8; b++) {
1740             if (data[i] & (1<<(7-b)))
1741                 hash ^= v;
1742             v <<= 1;
1743             if ((i + 4) < keylen &&
1744                 (key[i+4] & (1<<(7-b))))
1745                 v |= 1;
1746         }
1747     }
1748     return (hash);
1749 }
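
/*
 * How the loop works: v is a 32-bit window sliding bit-by-bit along the
 * key; whenever the current input bit is set, the window is XORed into
 * the hash. Tiny worked case (illustrative key, not one of the defaults
 * above): key = {0x80, 0x00, 0x00, 0x00, 0x00} and data = {0x80} set only
 * the first input bit, so hash = 0x80000000. Because the NIC computes the
 * same Toeplitz function over the same default_rsskey_* bytes, software
 * and hardware queue choices agree.
 */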
1750 
1751 int
1752 ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
1753     uint16_t sport, uint16_t dport)
1754 {
1755     struct lcore_conf *qconf = &lcore_conf;
1756     struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
1757     uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];
1758 
1759     if (nb_queues <= 1) {
1760         return 1;
1761     }
1762 
1763     uint16_t reta_size = rss_reta_size[ctx->port_id];
1764     uint16_t queueid = qconf->tx_queue_id[ctx->port_id];
1765 
1766     uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
1767         sizeof(dport)];
1768 
1769     unsigned datalen = 0;
1770 
1771     bcopy(&saddr, &data[datalen], sizeof(saddr));
1772     datalen += sizeof(saddr);
1773 
1774     bcopy(&daddr, &data[datalen], sizeof(daddr));
1775     datalen += sizeof(daddr);
1776 
1777     bcopy(&sport, &data[datalen], sizeof(sport));
1778     datalen += sizeof(sport);
1779 
1780     bcopy(&dport, &data[datalen], sizeof(dport));
1781     datalen += sizeof(dport);
1782 
1783     uint32_t hash = 0;
1784     if (!use_rsskey_52bytes)
1785         hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
1786             default_rsskey_40bytes, datalen, data);
1787     else
1788         hash = toeplitz_hash(sizeof(default_rsskey_52bytes),
1789             default_rsskey_52bytes, datalen, data);
1790     return ((hash & (reta_size - 1)) % nb_queues) == queueid;
1791 }
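
/*
 * Usage note: the FreeBSD stack side uses this check when choosing a
 * local port for an outbound 4-tuple, so that the NIC's RSS later steers
 * the return traffic to this lcore's own queue; a return of 0 tells the
 * caller to try another port number. This is what keeps each flow
 * lcore-local.
 */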
1792 
1793 void
1794 ff_regist_packet_dispatcher(dispatch_func_t func)
1795 {
1796     packet_dispatcher = func;
1797 }
1798 
1799 uint64_t
1800 ff_get_tsc_ns(void)
1801 {
1802     uint64_t cur_tsc = rte_rdtsc();
1803     uint64_t hz = rte_get_tsc_hz();
1804     return ((double)cur_tsc / (double)hz) * NS_PER_S;
1805 }
1806 
1807