/* f-stack: lib/ff_dpdk_if.c (revision 3da8d17d) */
/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_eth_bond.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"
#include "ff_memory.h"

#ifdef FF_KNI
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

int enable_kni;
static int kni_accept;
#endif

static int numa_on;

static unsigned idle_sleep;
static unsigned pkt_tx_delay;

static struct rte_timer freebsd_clock;

/* Default RSS key, as used by the Mellanox Linux driver. */
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

/* Alternative key for NICs that report a 52-byte RSS hash key. */
static int use_rsskey_52bytes = 0;
static uint8_t default_rsskey_52bytes[52] = {
    0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23,
    0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30,
    0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02,
    0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c,
    0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55,
    0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e,
    0x81, 0x15, 0x03, 0x66
};

struct lcore_conf lcore_conf;

struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

#define BOND_DRIVER_NAME    "net_bonding"

static inline int send_single_packet(struct rte_mbuf *m, uint8_t port);

struct ff_msg_ring {
    char ring_name[FF_MSG_NUM][RTE_RING_NAMESIZE];
    /* ring[0]: other processes enqueue requests, the lcore dequeues them */
    /* ring[1..]: the lcore enqueues replies (one ring per msg type), other processes dequeue */
    struct rte_ring *ring[FF_MSG_NUM];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;
static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

static struct ff_top_args ff_top_status;
static struct ff_traffic_args ff_traffic;

extern void ff_hardclock(void);

static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
    ff_update_current_ts();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
    #define CHECK_INTERVAL 100 /* 100ms */
    #define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u "
                        "Mbps - %s\n", (int)portid,
                        (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

static int
init_lcore_conf(void)
{
    uint8_t nb_dev_ports = rte_eth_dev_count_avail();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
                 ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    uint16_t proc_id;
    for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) {
        uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id];
        if (!lcore_config[lcore_id].detected) {
            rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
        }
    }

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %d\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        lcore_conf.pcap[port_id] = pconf->pcap;
        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}

static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;
    uint16_t max_portid = ff_global_cfg.dpdk.max_portid;

    unsigned nb_mbuf = RTE_ALIGN_CEIL(
        (nb_rx_queue * (max_portid + 1) * 2 * RX_QUEUE_SIZE +
        nb_ports * (max_portid + 1) * 2 * nb_lcores * MAX_PKT_BURST +
        nb_ports * (max_portid + 1) * 2 * nb_tx_queue * TX_QUEUE_SIZE +
        nb_lcores * MEMPOOL_CACHE_SIZE +
#ifdef FF_KNI
        nb_ports * KNI_MBUF_MAX +
        nb_ports * KNI_QUEUE_SIZE +
#endif
        nb_lcores * nb_ports * DISPATCH_RING_SIZE),
        (unsigned)8192);
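    /*
     * Worst-case sizing: RX/TX descriptor rings, per-lcore bursts in
     * flight, the per-lcore mempool caches, the KNI queues and the
     * dispatch rings can each pin mbufs at the same time, so the pool
     * is sized for their sum and rounded up to a multiple of 8192.
     */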

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %u of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%u", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%u", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %u\n", socketid);
        } else {
            printf("create mbuf pool on socket %u\n", socketid);
        }

#ifdef FF_USE_PAGE_ARRAY
        nb_mbuf = RTE_ALIGN_CEIL(
            nb_ports * nb_lcores * MAX_PKT_BURST +
            nb_ports * nb_tx_queue * TX_QUEUE_SIZE +
            nb_lcores * MEMPOOL_CACHE_SIZE,
            (unsigned)4096);
        ff_init_ref_pool(nb_mbuf, socketid);
#endif
    }

    return 0;
}

static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL) {
        rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ring = rte_ring_create(name, count, socket_id, flags);
    } else {
        ring = rte_ring_lookup(name);
    }

    if (ring == NULL) {
        rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
    }

    return ring;
}

static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create rings according to the ports actually in use. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
}

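/*
 * Build the inter-process message channel: one shared pool of ff_msg
 * buffers plus, per f-stack process, a request ring
 * ("FF_MSG_RING_IN<proc>") and one response ring per message type
 * ("FF_MSG_RING_OUT<proc>_<type>"). The primary process creates the
 * objects, secondary processes only look them up by name.
 */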
static int
init_msg_ring(void)
{
    uint16_t i, j;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
           MSG_RING_SIZE * 2 * nb_procs,
           MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
           NULL, NULL, ff_msg_init, NULL,
           socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[0]);

        for (j = FF_SYSCTL; j < FF_MSG_NUM; j++) {
            snprintf(msg_ring[i].ring_name[j], RTE_RING_NAMESIZE,
                "%s%u_%u", FF_MSG_RING_OUT, i, j);
            msg_ring[i].ring[j] = create_ring(msg_ring[i].ring_name[j],
                MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
            if (msg_ring[i].ring[j] == NULL)
                rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[j]);
        }
    }

    return 0;
}

#ifdef FF_KNI
static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count_avail();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
#endif

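/*
 * Fill the RSS redirection table round-robin over the configured
 * queues; with nb_queues == 3, for example, the reta_size entries
 * become 0, 1, 2, 0, 1, 2, ...
 */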
static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}

static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i, j;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id, u_port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[u_port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        /* Init the slave ports first (if bonding is used), then the port itself. */
        for (j = 0; j <= pconf->nb_slaves; j++) {
            if (j < pconf->nb_slaves) {
                port_id = pconf->slave_portid_list[j];
                printf("To init %s's slave %d, port[%d]\n",
                        ff_global_cfg.dpdk.bond_cfgs->name,
                        j, port_id);
            } else {
                port_id = u_port_id;
            }

            struct rte_eth_dev_info dev_info;
            struct rte_eth_conf port_conf = {0};
            struct rte_eth_rxconf rxq_conf;
            struct rte_eth_txconf txq_conf;

            rte_eth_dev_info_get(port_id, &dev_info);

            if (nb_queues > dev_info.max_rx_queues) {
                rte_exit(EXIT_FAILURE, "nb_queues[%d] is bigger than max_rx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_rx_queues);
            }

            if (nb_queues > dev_info.max_tx_queues) {
                rte_exit(EXIT_FAILURE, "nb_queues[%d] is bigger than max_tx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_tx_queues);
            }

            struct ether_addr addr;
            rte_eth_macaddr_get(port_id, &addr);
            printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                       " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                    (unsigned)port_id,
                    addr.addr_bytes[0], addr.addr_bytes[1],
                    addr.addr_bytes[2], addr.addr_bytes[3],
                    addr.addr_bytes[4], addr.addr_bytes[5]);

            rte_memcpy(pconf->mac,
                addr.addr_bytes, ETHER_ADDR_LEN);

            /* Set RSS mode */
            uint64_t default_rss_hf = ETH_RSS_PROTO_MASK;
            port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
            port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf;
            if (dev_info.hash_key_size == 52) {
                port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_52bytes;
                port_conf.rx_adv_conf.rss_conf.rss_key_len = 52;
                use_rsskey_52bytes = 1;
            } else {
                port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
                port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;
            }
            port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads;
            if (port_conf.rx_adv_conf.rss_conf.rss_hf !=
                    ETH_RSS_PROTO_MASK) {
                printf("Port %u modified RSS hash function based on hardware support, "
                        "requested:%#"PRIx64" configured:%#"PRIx64"\n",
                        port_id, default_rss_hf,
                        port_conf.rx_adv_conf.rss_conf.rss_hf);
            }

            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) {
                port_conf.txmode.offloads |=
                    DEV_TX_OFFLOAD_MBUF_FAST_FREE;
            }

            /* Set Rx VLAN stripping */
            if (ff_global_cfg.dpdk.vlan_strip) {
                if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                    port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
                }
            }

            /* Enable HW CRC stripping */
            port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_KEEP_CRC;

            /* FIXME: enable TCP LRO? */
            #if 0
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
                printf("LRO is supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
                pconf->hw_features.rx_lro = 1;
            }
            #endif

            /* Set Rx checksum checking */
            if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
                printf("RX checksum offload supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
                pconf->hw_features.rx_csum = 1;
            }

            if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
                printf("TX ip checksum offload supported\n");
                port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
                pconf->hw_features.tx_csum_ip = 1;
            }

            if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
                (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
                printf("TX TCP&UDP checksum offload supported\n");
                port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM;
                pconf->hw_features.tx_csum_l4 = 1;
            }

            if (ff_global_cfg.dpdk.tso) {
                if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                    printf("TSO is supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
                    pconf->hw_features.tx_tso = 1;
                }
            } else {
                printf("TSO is disabled\n");
            }

            if (dev_info.reta_size) {
                /* reta size must be power of 2 */
                assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

                rss_reta_size[port_id] = dev_info.reta_size;
                printf("port[%d]: rss table size: %d\n", port_id,
                    dev_info.reta_size);
            }

            if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
                continue;
            }

            int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
            if (ret != 0) {
                return ret;
            }

            static uint16_t nb_rxd = RX_QUEUE_SIZE;
            static uint16_t nb_txd = TX_QUEUE_SIZE;
            ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
            if (ret < 0)
                printf("Could not adjust number of descriptors "
                        "for port%u (%d)\n", (unsigned)port_id, ret);

            uint16_t q;
            for (q = 0; q < nb_queues; q++) {
                if (numa_on) {
                    uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                    socketid = rte_lcore_to_socket_id(lcore_id);
                }
                mbuf_pool = pktmbuf_pool[socketid];

                txq_conf = dev_info.default_txconf;
                txq_conf.offloads = port_conf.txmode.offloads;
                ret = rte_eth_tx_queue_setup(port_id, q, nb_txd,
                    socketid, &txq_conf);
                if (ret < 0) {
                    return ret;
                }

                rxq_conf = dev_info.default_rxconf;
                rxq_conf.offloads = port_conf.rxmode.offloads;
                ret = rte_eth_rx_queue_setup(port_id, q, nb_rxd,
                    socketid, &rxq_conf, mbuf_pool);
                if (ret < 0) {
                    return ret;
                }
            }

            if (strncmp(dev_info.driver_name, BOND_DRIVER_NAME,
                    strlen(dev_info.driver_name)) == 0) {

                rte_eth_macaddr_get(port_id, &addr);
                printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                           " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                        (unsigned)port_id,
                        addr.addr_bytes[0], addr.addr_bytes[1],
                        addr.addr_bytes[2], addr.addr_bytes[3],
                        addr.addr_bytes[4], addr.addr_bytes[5]);

                rte_memcpy(pconf->mac,
                    addr.addr_bytes, ETHER_ADDR_LEN);

                int mode, count, x;
                uint16_t slaves[RTE_MAX_ETHPORTS], len = RTE_MAX_ETHPORTS;

                mode = rte_eth_bond_mode_get(port_id);
                printf("Port %u, bond mode:%d\n", port_id, mode);

                count = rte_eth_bond_slaves_get(port_id, slaves, len);
                printf("Port %u, %s's slave ports count:%d\n", port_id,
                            ff_global_cfg.dpdk.bond_cfgs->name, count);
                for (x = 0; x < count; x++) {
                    printf("Port %u, %s's slave port[%u]\n", port_id,
                            ff_global_cfg.dpdk.bond_cfgs->name, slaves[x]);
                }
            }

            ret = rte_eth_dev_start(port_id);
            if (ret < 0) {
                return ret;
            }

            if (nb_queues > 1) {
                /* set the HW RSS hash function to Toeplitz */
                if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                    struct rte_eth_hash_filter_info info = {0};
                    info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                    info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                    if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                        RTE_ETH_FILTER_SET, &info) < 0) {
                        rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                            port_id);
                    }
                }

                set_rss_table(port_id, dev_info.reta_size, nb_queues);
            }

            /* Enable RX in promiscuous mode for the Ethernet device. */
            if (ff_global_cfg.dpdk.promiscuous) {
                rte_eth_promiscuous_enable(port_id);
                ret = rte_eth_promiscuous_get(port_id);
                if (ret == 1) {
                    printf("set port %u to promiscuous mode ok\n", port_id);
                } else {
                    printf("set port %u to promiscuous mode error\n", port_id);
                }
            }

            /* Enable pcap dump */
            if (pconf->pcap) {
                ff_enable_pcap(pconf->pcap);
            }
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

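/*
 * Drive the FreeBSD stack clock from a periodic DPDK timer. The period
 * below is TSC cycles per millisecond times milliseconds per FreeBSD
 * tick, i.e. one freebsd.hz tick expressed in TSC cycles.
 */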
static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    idle_sleep = ff_global_cfg.dpdk.idle_sleep;
    pkt_tx_delay = ff_global_cfg.dpdk.pkt_tx_delay > BURST_TX_DRAIN_US ?
        BURST_TX_DRAIN_US : ff_global_cfg.dpdk.pkt_tx_delay;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

#ifdef FF_KNI
    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }
#endif

#ifdef FF_USE_PAGE_ARRAY
    ff_mmap_init();
#endif

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    void *data = rte_pktmbuf_mtod(pkt, void*);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    if (pkt->ol_flags & PKT_RX_VLAN_STRIPPED) {
        ff_mbuf_set_vlan_info(hdr, pkt->vlan_tci);
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void*);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

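/*
 * Classify an incoming frame by ethertype (one VLAN tag is skipped
 * first). ARP, and neighbor discovery when INET6 is enabled, must be
 * seen by every queue, so process_packets() broadcasts such frames;
 * with KNI enabled, IPv4 frames are further matched against the
 * configured KNI ports to choose between the kernel and the FreeBSD
 * stack.
 */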
static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct ether_hdr *hdr;
    const struct vlan_hdr *vlanhdr;
    hdr = (const struct ether_hdr *)data;
    uint16_t ether_type = rte_be_to_cpu_16(hdr->ether_type);
    data += ETHER_HDR_LEN;
    len -= ETHER_HDR_LEN;

    if (ether_type == ETHER_TYPE_VLAN) {
        vlanhdr = (struct vlan_hdr *)data;
        ether_type = rte_be_to_cpu_16(vlanhdr->eth_proto);
        data += sizeof(struct vlan_hdr);
        len -= sizeof(struct vlan_hdr);
    }

    if (ether_type == ETHER_TYPE_ARP)
        return FILTER_ARP;

#ifdef INET6
    if (ether_type == ETHER_TYPE_IPv6) {
        return ff_kni_proto_filter(data,
            len, ether_type);
    }
#endif

#ifndef FF_KNI
    return FILTER_UNKNOWN;
#else
    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ether_type != ETHER_TYPE_IPv4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data,
        len, ether_type);
#endif
}

static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}

/* Adapted from rte_pktmbuf_clone(), but deep-copies every segment. */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of a new segment failed */
    if (unlikely(mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}

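/*
 * Per-packet RX work: account traffic, run the optional user dispatch
 * hook (which may answer the packet directly, steer it to another
 * queue's dispatch ring, or drop it), deep-clone ARP/NDP frames to all
 * other queues and to the KNI so every stack instance learns the
 * neighbor, and finally hand the packet to the FreeBSD stack via
 * ff_veth_input().
 */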
static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(qconf->pcap[port_id] != NULL)) {
            if (!pkts_from_ring) {
                ff_dump_packets(qconf->pcap[port_id], rtem);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void*);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring) {
            ff_traffic.rx_packets++;
            ff_traffic.rx_bytes += len;
        }

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues);
            if (ret == FF_DISPATCH_RESPONSE) {
                rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len;

                /*
                 * Outbound VLAN insertion offload is not supported yet,
                 * so re-insert the VLAN header by hand before sending.
                 */
                if (rtem->vlan_tci) {
                    data = rte_pktmbuf_prepend(rtem, sizeof(struct vlan_hdr));
                    if (data != NULL) {
                        memmove(data, data + sizeof(struct vlan_hdr), ETHER_HDR_LEN);
                        struct ether_hdr *etherhdr = (struct ether_hdr *)data;
                        struct vlan_hdr *vlanhdr = (struct vlan_hdr *)(data + ETHER_HDR_LEN);
                        vlanhdr->vlan_tci = rte_cpu_to_be_16(rtem->vlan_tci);
                        vlanhdr->eth_proto = etherhdr->ether_type;
                        etherhdr->ether_type = rte_cpu_to_be_16(ETHER_TYPE_VLAN);
                    }
                }
                send_single_packet(rtem, port_id);
                continue;
            }

            if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
#ifdef INET6
        if (filter == FILTER_ARP || filter == FILTER_NDP) {
#else
        if (filter == FILTER_ARP) {
#endif
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni &&
            ((filter == FILTER_KNI && kni_accept) ||
            (filter == FILTER_UNKNOWN && !kni_accept))) {
            ff_kni_enqueue(port_id, rtem);
#endif
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* Read packets from the dispatch ring and process them. */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
#ifdef INET6
    if (msg->msg_type == FF_IOCTL6) {
        fd = ff_socket(AF_INET6, SOCK_DGRAM, 0);
    } else
#endif
        fd = ff_socket(AF_INET, SOCK_DGRAM, 0);

    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_top_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_traffic_msg(struct ff_msg *msg)
{
    msg->traffic = ff_traffic;
    msg->result = 0;
}

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
#ifdef INET6
        case FF_IOCTL6:
#endif
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        case FF_TRAFFIC:
            handle_traffic_msg(msg);
            break;
        default:
            handle_default_msg(msg);
            break;
    }
    rte_ring_enqueue(msg_ring[proc_id].ring[msg->msg_type], msg);
}

static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;
    uint16_t i;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(qconf->pcap[port] != NULL)) {
        for (i = 0; i < n; i++) {
            ff_dump_packets(qconf->pcap[port], m_table[i]);
        }
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    ff_traffic.tx_packets += ret;
    for (i = 0; i < ret; i++) {
        ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]);
#ifdef FF_USE_PAGE_ARRAY
        if (qconf->tx_mbufs[port].bsd_m_table[i])
            ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs);
#endif
    }
    if (unlikely(ret < n)) {
        /* Free the packets the NIC did not accept. */
        do {
            rte_pktmbuf_free(m_table[ret]);
#ifdef FF_USE_PAGE_ARRAY
            if (qconf->tx_mbufs[port].bsd_m_table[ret])
                ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]);
#endif
        } while (++ret < n);
    }
    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}

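/*
 * Transmit path: copy the FreeBSD mbuf chain into a DPDK mbuf chain
 * (or hand it to ff_if_send_onepkt() when FF_USE_PAGE_ARRAY is set),
 * translate the stack's offload requests into ol_flags, and queue the
 * packet for the next TX burst.
 */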
int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
#ifdef FF_USE_PAGE_ARRAY
    struct lcore_conf *qconf = &lcore_conf;
    int len = 0;

    len = ff_if_send_onepkt(ctx, m, total);
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, ctx->port_id);
        len = 0;
    }
    qconf->tx_mbufs[ctx->port_id].len = len;
    return 0;
#endif
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        head->nb_segs++;

        prev = cur;
        void *data = rte_pktmbuf_mtod(cur, void*);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        cur->data_len = len;
        off += len;
        total -= len;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void*);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         *  TCP segmentation offload.
         *
         *  - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *    implies PKT_TX_TCP_CKSUM)
         *  - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *    write the IP checksum to 0 in the packet
         *  - fill the mbuf offload information: l2_len,
         *    l3_len, l4_len, tso_segsz
         *  - calculate the pseudo header checksum without taking ip_len
         *    in account, and set it in the TCP header. Refer to
         *    rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *    used as helpers.
         */
        if (offload.tso_seg_size) {
            struct tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

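/*
 * Per-lcore event loop: run expired timers, drain pending TX bursts
 * every pkt_tx_delay microseconds, poll KNI, the dispatch ring and the
 * NIC RX queues, service the message ring, invoke the user loop
 * callback, and sleep briefly when a whole iteration was idle. The
 * rte_rdtsc() samples taken along the way feed the usr/sys/idle
 * accounting exposed through the FF_TOP message.
 */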
static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc;
    int i, j, nb_rx, idle;
    uint16_t port_id, queue_id;
    struct lcore_conf *qconf;
    uint64_t drain_tsc = 0;
    struct ff_dpdk_if_context *ctx;

    if (pkt_tx_delay) {
        drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * pkt_tx_delay;
    }

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc >= drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packet from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }
#endif

            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                        pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                        j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc >= drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        idle_sleep_tsc = rte_rdtsc();
        if (likely(idle && idle_sleep)) {
            usleep(idle_sleep);
            end_tsc = rte_rdtsc();
        } else {
            end_tsc = idle_sleep_tsc;
        }

        if (usch_tsc == cur_tsc) {
            usr_tsc = idle_sleep_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_top_status.sys_tsc += sys_tsc;
        }

        ff_top_status.usr_tsc += usr_tsc;
        ff_top_status.work_tsc += end_tsc - cur_tsc;
        ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_top_status.loops++;
    }

    return 0;
}

int
ff_dpdk_if_up(void) {
    int i;
    struct lcore_conf *qconf = &lcore_conf;
    for (i = 0; i < qconf->nb_tx_port; i++) {
        uint16_t port_id = qconf->tx_port_id[i];

        struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
        veth_ctx[port_id] = ff_veth_attach(pconf);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed\n");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg) {
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    if (lr == NULL) {
        rte_exit(EXIT_FAILURE, "rte_malloc loop_routine failed\n");
    }
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

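/*
 * Software Toeplitz hash (apparently borrowed from FreeBSD's RSS
 * code): for every set bit of the input, XOR in the 32-bit window of
 * the key starting at that bit position. Computing the same hash the
 * NIC computes lets ff_rss_check() predict which queue a flow lands
 * on.
 */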
static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0] << 24) + (key[1] << 16) + (key[2] << 8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1 << (7 - b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i + 4] & (1 << (7 - b))))
                v |= 1;
        }
    }
    return (hash);
}

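/*
 * Return nonzero when the (saddr, daddr, sport, dport) tuple hashes to
 * this lcore's own queue: the 12-byte tuple is hashed with the same
 * Toeplitz key the ports were configured with, then mapped through the
 * redirection table exactly as set_rss_table() filled it.
 */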
int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;
    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];

    if (nb_queues <= 1) {
        return 1;
    }

    uint16_t reta_size = rss_reta_size[ctx->port_id];
    uint16_t queueid = qconf->tx_queue_id[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = 0;
    if (!use_rsskey_52bytes)
        hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
            default_rsskey_40bytes, datalen, data);
    else
        hash = toeplitz_hash(sizeof(default_rsskey_52bytes),
            default_rsskey_52bytes, datalen, data);

    return ((hash & (reta_size - 1)) % nb_queues) == queueid;
}

void
ff_regist_packet_dispatcher(dispatch_func_t func)
{
    packet_dispatcher = func;
}

uint64_t
ff_get_tsc_ns(void)
{
    uint64_t cur_tsc = rte_rdtsc();
    uint64_t hz = rte_get_tsc_hz();
    return ((double)cur_tsc / (double)hz) * NS_PER_S;
}
1748