xref: /f-stack/lib/ff_dpdk_if.c (revision e7145e36)
1 /*
2  * Copyright (C) 2017 THL A29 Limited, a Tencent company.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice, this
9  *   list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright notice,
11  *   this list of conditions and the following disclaimer in the documentation
12  *   and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
18  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
21  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
23  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  *
25  */
26 #include <assert.h>
27 #include <unistd.h>
28 #include <sys/mman.h>
29 #include <errno.h>
30 
31 #include <rte_common.h>
32 #include <rte_byteorder.h>
33 #include <rte_log.h>
34 #include <rte_memory.h>
35 #include <rte_memcpy.h>
36 #include <rte_memzone.h>
37 #include <rte_config.h>
38 #include <rte_eal.h>
39 #include <rte_pci.h>
40 #include <rte_mbuf.h>
42 #include <rte_lcore.h>
43 #include <rte_launch.h>
44 #include <rte_ethdev.h>
45 #include <rte_debug.h>
47 #include <rte_ether.h>
48 #include <rte_malloc.h>
49 #include <rte_cycles.h>
50 #include <rte_timer.h>
51 #include <rte_thash.h>
52 #include <rte_ip.h>
53 #include <rte_tcp.h>
54 #include <rte_udp.h>
55 #include <rte_eth_bond.h>
56 
57 #include "ff_dpdk_if.h"
58 #include "ff_dpdk_pcap.h"
59 #include "ff_dpdk_kni.h"
60 #include "ff_config.h"
61 #include "ff_veth.h"
62 #include "ff_host_interface.h"
63 #include "ff_msg.h"
64 #include "ff_api.h"
65 #include "ff_memory.h"
66 
67 #ifdef FF_KNI
68 #define KNI_MBUF_MAX 2048
69 #define KNI_QUEUE_SIZE 2048
70 
71 int enable_kni;
72 static int kni_accept;
73 #endif
74 
75 static int numa_on;
76 
77 static unsigned idle_sleep;
78 static unsigned pkt_tx_delay;
79 
80 static struct rte_timer freebsd_clock;
81 
82 /* Default 40-byte RSS key, the one used by Mellanox's Linux driver. */
83 static uint8_t default_rsskey_40bytes[40] = {
84     0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
85     0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
86     0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
87     0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
88     0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
89 };
90 
91 static int use_rsskey_52bytes = 0;
92 static uint8_t default_rsskey_52bytes[52] = {
93     0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23,
94     0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30,
95     0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02,
96     0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c,
97     0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55,
98     0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e,
99     0x81, 0x15, 0x03, 0x66
100 };
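/*
 * Both keys are programmed into the NIC in init_port_start() (the 52-byte
 * key when the device reports hash_key_size == 52) and fed to the software
 * Toeplitz hash in ff_rss_check(), so software and hardware agree on RSS
 * queue placement.
 */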
101 
102 struct lcore_conf lcore_conf;
103 
104 struct rte_mempool *pktmbuf_pool[NB_SOCKETS];
105 
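/*
 * Per-port arrays of per-queue rings: packets whose dispatch decision
 * targets another queue are enqueued here and drained by the owning lcore
 * in process_dispatch_ring().
 */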
106 static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
107 static dispatch_func_t packet_dispatcher;
108 
109 static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];
110 
111 #define BOND_DRIVER_NAME    "net_bonding"
112 
113 static inline int send_single_packet(struct rte_mbuf *m, uint8_t port);
114 
115 struct ff_msg_ring {
116     char ring_name[FF_MSG_NUM][RTE_RING_NAMESIZE];
117     /* ring[0]: this lcore dequeues requests that other processes enqueue */
118     /* ring[1..]: this lcore enqueues replies that other processes dequeue */
119     struct rte_ring *ring[FF_MSG_NUM];
120 } __rte_cache_aligned;
121 
122 static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
123 static struct rte_mempool *message_pool;
124 static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];
125 
126 static struct ff_top_args ff_top_status;
127 static struct ff_traffic_args ff_traffic;
128 extern void ff_hardclock(void);
129 
130 static void
131 ff_hardclock_job(__rte_unused struct rte_timer *timer,
132     __rte_unused void *arg) {
133     ff_hardclock();
134     ff_update_current_ts();
135 }
136 
137 struct ff_dpdk_if_context *
138 ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
139 {
140     struct ff_dpdk_if_context *ctx;
141 
142     ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
143     if (ctx == NULL)
144         return NULL;
145 
146     ctx->sc = sc;
147     ctx->ifp = ifp;
148     ctx->port_id = cfg->port_id;
149     ctx->hw_features = cfg->hw_features;
150 
151     return ctx;
152 }
153 
154 void
155 ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
156 {
157     free(ctx);
158 }
159 
160 static void
161 check_all_ports_link_status(void)
162 {
163     #define CHECK_INTERVAL 100 /* 100ms */
164     #define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */
165 
167     uint8_t count, all_ports_up, print_flag = 0;
168     struct rte_eth_link link;
169 
170     printf("\nChecking link status");
171     fflush(stdout);
172 
173     int i, nb_ports;
174     nb_ports = ff_global_cfg.dpdk.nb_ports;
175     for (count = 0; count <= MAX_CHECK_TIME; count++) {
176         all_ports_up = 1;
177         for (i = 0; i < nb_ports; i++) {
178             uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
179             memset(&link, 0, sizeof(link));
180             rte_eth_link_get_nowait(portid, &link);
181 
182             /* print link status if flag set */
183             if (print_flag == 1) {
184                 if (link.link_status) {
185                     printf("Port %d Link Up - speed %u "
186                         "Mbps - %s\n", (int)portid,
187                         (unsigned)link.link_speed,
188                         (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
189                         ("full-duplex") : ("half-duplex"));
190                 } else {
191                     printf("Port %d Link Down\n", (int)portid);
192                 }
193                 continue;
194             }
195             /* clear all_ports_up flag if any link down */
196             if (link.link_status == 0) {
197                 all_ports_up = 0;
198                 break;
199             }
200         }
201 
202         /* after finally printing all link status, get out */
203         if (print_flag == 1)
204             break;
205 
206         if (all_ports_up == 0) {
207             printf(".");
208             fflush(stdout);
209             rte_delay_ms(CHECK_INTERVAL);
210         }
211 
212         /* set the print_flag if all ports up or timeout */
213         if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
214             print_flag = 1;
215             printf("done\n");
216         }
217     }
218 }
219 
220 static int
221 init_lcore_conf(void)
222 {
223     uint8_t nb_dev_ports = rte_eth_dev_count_avail();
224     if (nb_dev_ports == 0) {
225         rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
226     }
227 
228     if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
229         rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
230                  ff_global_cfg.dpdk.max_portid);
231     }
232 
233     lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
234     lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;
235 
236     uint16_t proc_id;
237     for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) {
238         uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id];
239         if (!lcore_config[lcore_id].detected) {
240             rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
241         }
242     }
243 
244     uint16_t socket_id = 0;
245     if (numa_on) {
246         socket_id = rte_lcore_to_socket_id(rte_lcore_id());
247     }
248 
249     lcore_conf.socket_id = socket_id;
250 
251     uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
252     int j;
253     for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
254         uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
255         struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
256 
257         int queueid = -1;
258         int i;
259         for (i = 0; i < pconf->nb_lcores; i++) {
260             if (pconf->lcore_list[i] == lcore_id) {
261                 queueid = i;
262             }
263         }
264         if (queueid < 0) {
265             continue;
266         }
267         printf("lcore: %u, port: %u, queue: %d\n", lcore_id, port_id, queueid);
268         uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
269         lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
270         lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
271         lcore_conf.nb_rx_queue++;
272 
273         lcore_conf.tx_queue_id[port_id] = queueid;
274         lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
275         lcore_conf.nb_tx_port++;
276 
277         lcore_conf.pcap[port_id] = pconf->pcap;
278         lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
279     }
280 
281     if (lcore_conf.nb_rx_queue == 0) {
282         rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
283     }
284 
285     return 0;
286 }
287 
288 static int
289 init_mem_pool(void)
290 {
291     uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
292     uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
293     uint32_t nb_tx_queue = nb_lcores;
294     uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;
295     uint16_t max_portid = ff_global_cfg.dpdk.max_portid;
296 
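    /*
     * Pool sizing heuristic: leave headroom for every RX descriptor,
     * in-flight TX burst, TX descriptor, per-lcore mempool cache, KNI queue
     * (when enabled) and dispatch-ring slot, rounded up to a multiple of 8192.
     */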
297     unsigned nb_mbuf = RTE_ALIGN_CEIL (
298         (nb_rx_queue * (max_portid + 1) * 2 * RX_QUEUE_SIZE          +
299         nb_ports * (max_portid + 1) * 2 * nb_lcores * MAX_PKT_BURST    +
300         nb_ports * (max_portid + 1) * 2 * nb_tx_queue * TX_QUEUE_SIZE  +
301         nb_lcores * MEMPOOL_CACHE_SIZE +
302 #ifdef FF_KNI
303         nb_ports * KNI_MBUF_MAX +
304         nb_ports * KNI_QUEUE_SIZE +
305 #endif
306         nb_lcores * nb_ports * DISPATCH_RING_SIZE),
307         (unsigned)8192);
308 
309     unsigned socketid = 0;
310     uint16_t i, lcore_id;
311     char s[64];
312 
313     for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
314         lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
315         if (numa_on) {
316             socketid = rte_lcore_to_socket_id(lcore_id);
317         }
318 
319         if (socketid >= NB_SOCKETS) {
320             rte_exit(EXIT_FAILURE, "Socket %u of lcore %u is out of range %d\n",
321                 socketid, lcore_id, NB_SOCKETS);
322         }
323 
324         if (pktmbuf_pool[socketid] != NULL) {
325             continue;
326         }
327 
328         if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
329             snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
330             pktmbuf_pool[socketid] =
331                 rte_pktmbuf_pool_create(s, nb_mbuf,
332                     MEMPOOL_CACHE_SIZE, 0,
333                     RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
334         } else {
335             snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
336             pktmbuf_pool[socketid] = rte_mempool_lookup(s);
337         }
338 
339         if (pktmbuf_pool[socketid] == NULL) {
340             rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
341         } else {
342             printf("create mbuf pool on socket %d\n", socketid);
343         }
344 
345 #ifdef FF_USE_PAGE_ARRAY
346         nb_mbuf = RTE_ALIGN_CEIL (
347             nb_ports*nb_lcores*MAX_PKT_BURST    +
348             nb_ports*nb_tx_queue*TX_QUEUE_SIZE  +
349             nb_lcores*MEMPOOL_CACHE_SIZE,
350             (unsigned)4096);
351         ff_init_ref_pool(nb_mbuf, socketid);
352 #endif
353     }
354 
355     return 0;
356 }
357 
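/*
 * In DPDK multi-process mode only the primary process may create shared
 * objects; secondary processes attach to them by name. This helper wraps
 * that pattern for rings and aborts if neither path yields one.
 */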
358 static struct rte_ring *
359 create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
360 {
361     struct rte_ring *ring;
362 
363     if (name == NULL) {
364         rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
365     }
366 
367     if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
368         ring = rte_ring_create(name, count, socket_id, flags);
369     } else {
370         ring = rte_ring_lookup(name);
371     }
372 
373     if (ring == NULL) {
374         rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
375     }
376 
377     return ring;
378 }
379 
380 static int
381 init_dispatch_ring(void)
382 {
383     int j;
384     char name_buf[RTE_RING_NAMESIZE];
385     int queueid;
386 
387     unsigned socketid = lcore_conf.socket_id;
388 
389     /* Create rings only for the ports actually in use. */
390     int nb_ports = ff_global_cfg.dpdk.nb_ports;
391     for (j = 0; j < nb_ports; j++) {
392         uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
393         struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
394         int nb_queues = pconf->nb_lcores;
395         if (dispatch_ring[portid] == NULL) {
396             snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);
397 
398             dispatch_ring[portid] = rte_zmalloc(name_buf,
399                 sizeof(struct rte_ring *) * nb_queues,
400                 RTE_CACHE_LINE_SIZE);
401             if (dispatch_ring[portid] == NULL) {
402                 rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
403                     "failed\n", name_buf);
404             }
405         }
406 
407         for(queueid = 0; queueid < nb_queues; ++queueid) {
408             snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
409                 portid, queueid);
410             dispatch_ring[portid][queueid] = create_ring(name_buf,
411                 DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);
412 
413             if (dispatch_ring[portid][queueid] == NULL)
414                 rte_panic("create ring:%s failed!\n", name_buf);
415 
416             printf("created ring %s, %u ring entries are now free\n",
417                 name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
418         }
419     }
420 
421     return 0;
422 }
423 
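/*
 * Mempool object constructor: each element is a struct ff_msg header
 * immediately followed by its payload buffer.
 */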
424 static void
425 ff_msg_init(struct rte_mempool *mp,
426     __attribute__((unused)) void *opaque_arg,
427     void *obj, __attribute__((unused)) unsigned i)
428 {
429     struct ff_msg *msg = (struct ff_msg *)obj;
430     msg->msg_type = FF_UNKNOWN;
431     msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
432     msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
433 }
434 
435 static int
436 init_msg_ring(void)
437 {
438     uint16_t i, j;
439     uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
440     unsigned socketid = lcore_conf.socket_id;
441 
442     /* Create message buffer pool */
443     if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
444         message_pool = rte_mempool_create(FF_MSG_POOL,
445            MSG_RING_SIZE * 2 * nb_procs,
446            MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
447            NULL, NULL, ff_msg_init, NULL,
448            socketid, 0);
449     } else {
450         message_pool = rte_mempool_lookup(FF_MSG_POOL);
451     }
452 
453     if (message_pool == NULL) {
454         rte_panic("Create msg mempool failed\n");
455     }
456 
457     for(i = 0; i < nb_procs; ++i) {
458         snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
459             "%s%u", FF_MSG_RING_IN, i);
460         msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
461             MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
462         if (msg_ring[i].ring[0] == NULL)
463             rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[0]);
464 
465         for (j = FF_SYSCTL; j < FF_MSG_NUM; j++) {
466             snprintf(msg_ring[i].ring_name[j], RTE_RING_NAMESIZE,
467                 "%s%u_%u", FF_MSG_RING_OUT, i, j);
468             msg_ring[i].ring[j] = create_ring(msg_ring[i].ring_name[j],
469                 MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
470             if (msg_ring[i].ring[j] == NULL)
471                 rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[j]);
472         }
473     }
474 
475     return 0;
476 }
477 
478 #ifdef FF_KNI
479 static int
480 init_kni(void)
481 {
482     int nb_ports = rte_eth_dev_count_avail();
483     kni_accept = 0;
484     if(strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
485         kni_accept = 1;
486 
487     ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
488         ff_global_cfg.kni.udp_port);
489 
490     unsigned socket_id = lcore_conf.socket_id;
491     struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];
492 
493     nb_ports = ff_global_cfg.dpdk.nb_ports;
494     int i, ret;
495     for (i = 0; i < nb_ports; i++) {
496         uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
497         ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
498     }
499 
500     return 0;
501 }
502 #endif
503 
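/*
 * Fill the NIC's RSS indirection table (RETA) round-robin over the RX
 * queues so hash buckets are spread evenly across all of them.
 */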
504 static void
505 set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
506 {
507     if (reta_size == 0) {
508         return;
509     }
510 
511     int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
512     struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];
513 
514     /* config HW indirection table */
515     unsigned i, j, hash=0;
516     for (i = 0; i < reta_conf_size; i++) {
517         reta_conf[i].mask = ~0ULL;
518         for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
519             reta_conf[i].reta[j] = hash++ % nb_queues;
520         }
521     }
522 
523     if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
524         rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
525             port_id);
526     }
527 }
528 
529 static int
530 init_port_start(void)
531 {
532     int nb_ports = ff_global_cfg.dpdk.nb_ports;
533     unsigned socketid = 0;
534     struct rte_mempool *mbuf_pool;
535     uint16_t i, j;
536 
537     for (i = 0; i < nb_ports; i++) {
538         uint16_t port_id, u_port_id = ff_global_cfg.dpdk.portid_list[i];
539         struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[u_port_id];
540         uint16_t nb_queues = pconf->nb_lcores;
541 
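        /*
         * One extra pass: slots [0, nb_slaves) configure the bonding slave
         * ports, the final pass (j == nb_slaves) configures the port itself.
         */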
542         for (j = 0; j <= pconf->nb_slaves; j++) {
543             if (j < pconf->nb_slaves) {
544                 port_id = pconf->slave_portid_list[j];
545                 printf("Initializing %s slave %d (port %d)\n",
546                         ff_global_cfg.dpdk.bond_cfgs->name,
547                         j, port_id);
548             } else {
549                 port_id = u_port_id;
550             }
551 
552             struct rte_eth_dev_info dev_info;
553             struct rte_eth_conf port_conf = {0};
554             struct rte_eth_rxconf rxq_conf;
555             struct rte_eth_txconf txq_conf;
556 
557             rte_eth_dev_info_get(port_id, &dev_info);
558 
559             if (nb_queues > dev_info.max_rx_queues) {
560                 rte_exit(EXIT_FAILURE, "nb_queues[%d] bigger than max_rx_queues[%d]\n",
561                     nb_queues,
562                     dev_info.max_rx_queues);
563             }
564 
565             if (nb_queues > dev_info.max_tx_queues) {
566                 rte_exit(EXIT_FAILURE, "nb_queues[%d] bigger than max_tx_queues[%d]\n",
567                     nb_queues,
568                     dev_info.max_tx_queues);
569             }
570 
571             struct ether_addr addr;
572             rte_eth_macaddr_get(port_id, &addr);
573             printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
574                        " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
575                     (unsigned)port_id,
576                     addr.addr_bytes[0], addr.addr_bytes[1],
577                     addr.addr_bytes[2], addr.addr_bytes[3],
578                     addr.addr_bytes[4], addr.addr_bytes[5]);
579 
580             rte_memcpy(pconf->mac,
581                 addr.addr_bytes, ETHER_ADDR_LEN);
582 
583             /* Set RSS mode */
584             uint64_t default_rss_hf = ETH_RSS_PROTO_MASK;
585             port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
586             port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf;
587             if (dev_info.hash_key_size == 52) {
588                 port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_52bytes;
589                 port_conf.rx_adv_conf.rss_conf.rss_key_len = 52;
590                 use_rsskey_52bytes = 1;
591             } else {
592                 port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
593                 port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;
594             }
595             port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads;
596             if (port_conf.rx_adv_conf.rss_conf.rss_hf !=
597                     ETH_RSS_PROTO_MASK) {
598                 printf("Port %u modified RSS hash function based on hardware support, "
599                         "requested:%#"PRIx64" configured:%#"PRIx64"\n",
600                         port_id, default_rss_hf,
601                         port_conf.rx_adv_conf.rss_conf.rss_hf);
602             }
603 
604             if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) {
605                 port_conf.txmode.offloads |=
606                     DEV_TX_OFFLOAD_MBUF_FAST_FREE;
607             }
608 
609             /* Set Rx VLAN stripping */
610             if (ff_global_cfg.dpdk.vlan_strip) {
611                 if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
612                     port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
613                 }
614             }
615 
616             /* Enable HW CRC stripping */
617             port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_KEEP_CRC;
618 
619             /* FIXME: enable TCP LRO? */
620             #if 0
621             if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
622                 printf("LRO is supported\n");
623                 port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
624                 pconf->hw_features.rx_lro = 1;
625             }
626             #endif
627 
628             /* Set Rx checksum checking */
629             if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
630                 (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
631                 (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
632                 printf("RX checksum offload supported\n");
633                 port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
634                 pconf->hw_features.rx_csum = 1;
635             }
636 
637             if (ff_global_cfg.dpdk.tx_csum_offoad_skip == 0) {
638                 if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
639                     printf("TX ip checksum offload supported\n");
640                     port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
641                     pconf->hw_features.tx_csum_ip = 1;
642                 }
643 
644                 if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
645                     (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
646                     printf("TX TCP&UDP checksum offload supported\n");
647                     port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM;
648                     pconf->hw_features.tx_csum_l4 = 1;
649                 }
650             } else {
651                 printf("TX checksum offload is disabled\n");
652             }
653 
654             if (ff_global_cfg.dpdk.tso) {
655                 if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
656                     printf("TSO is supported\n");
657                     port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
658                     pconf->hw_features.tx_tso = 1;
659                 }
660             } else {
661                 printf("TSO is disabled\n");
662             }
663 
664             if (dev_info.reta_size) {
665                 /* reta size must be power of 2 */
666                 assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);
667 
668                 rss_reta_size[port_id] = dev_info.reta_size;
669                 printf("port[%d]: rss table size: %d\n", port_id,
670                     dev_info.reta_size);
671             }
672 
673             if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
674                 continue;
675             }
676 
677             int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
678             if (ret != 0) {
679                 return ret;
680             }
681 
682             static uint16_t nb_rxd = RX_QUEUE_SIZE;
683             static uint16_t nb_txd = TX_QUEUE_SIZE;
684             ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
685             if (ret < 0)
686                 printf("Could not adjust number of descriptors "
687                         "for port%u (%d)\n", (unsigned)port_id, ret);
688 
689             uint16_t q;
690             for (q = 0; q < nb_queues; q++) {
691                 if (numa_on) {
692                     uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
693                     socketid = rte_lcore_to_socket_id(lcore_id);
694                 }
695                 mbuf_pool = pktmbuf_pool[socketid];
696 
697                 txq_conf = dev_info.default_txconf;
698                 txq_conf.offloads = port_conf.txmode.offloads;
699                 ret = rte_eth_tx_queue_setup(port_id, q, nb_txd,
700                     socketid, &txq_conf);
701                 if (ret < 0) {
702                     return ret;
703                 }
704 
705                 rxq_conf = dev_info.default_rxconf;
706                 rxq_conf.offloads = port_conf.rxmode.offloads;
707                 ret = rte_eth_rx_queue_setup(port_id, q, nb_rxd,
708                     socketid, &rxq_conf, mbuf_pool);
709                 if (ret < 0) {
710                     return ret;
711                 }
712             }
713 
714 
715             if (strncmp(dev_info.driver_name, BOND_DRIVER_NAME,
716                     strlen(BOND_DRIVER_NAME)) == 0) {
717 
718                 rte_eth_macaddr_get(port_id, &addr);
719                 printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
720                            " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
721                         (unsigned)port_id,
722                         addr.addr_bytes[0], addr.addr_bytes[1],
723                         addr.addr_bytes[2], addr.addr_bytes[3],
724                         addr.addr_bytes[4], addr.addr_bytes[5]);
725 
726                 rte_memcpy(pconf->mac,
727                     addr.addr_bytes, ETHER_ADDR_LEN);
728 
729                 int mode, count, x;
730                 uint16_t slaves[RTE_MAX_ETHPORTS], len = RTE_MAX_ETHPORTS;
731 
732                 mode = rte_eth_bond_mode_get(port_id);
733                 printf("Port %u, bond mode:%d\n", port_id, mode);
734 
735                 count = rte_eth_bond_slaves_get(port_id, slaves, len);
736                 printf("Port %u, %s's slave ports count:%d\n", port_id,
737                             ff_global_cfg.dpdk.bond_cfgs->name, count);
738                 for (x=0; x<count; x++) {
739                     printf("Port %u, %s's slave port[%u]\n", port_id,
740                             ff_global_cfg.dpdk.bond_cfgs->name, slaves[x]);
741                 }
742             }
743 
744             ret = rte_eth_dev_start(port_id);
745             if (ret < 0) {
746                 return ret;
747             }
748 
749             if (nb_queues > 1) {
750                 /* set HW rss hash function to Toeplitz. */
751                 if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
752                     struct rte_eth_hash_filter_info info = {0};
753                     info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
754                     info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;
755 
756                     if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
757                         RTE_ETH_FILTER_SET, &info) < 0) {
758                         rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
759                             port_id);
760                     }
761                 }
762 
763                 set_rss_table(port_id, dev_info.reta_size, nb_queues);
764             }
765 
766             /* Enable RX in promiscuous mode for the Ethernet device. */
767             if (ff_global_cfg.dpdk.promiscuous) {
768                 rte_eth_promiscuous_enable(port_id);
769                 ret = rte_eth_promiscuous_get(port_id);
770                 if (ret == 1) {
771                     printf("set port %u to promiscuous mode ok\n", port_id);
772                 } else {
773                     printf("set port %u to promiscuous mode error\n", port_id);
774                 }
775             }
776 
777             /* Enable pcap dump */
778             if (pconf->pcap) {
779                 ff_enable_pcap(pconf->pcap);
780             }
781         }
782     }
783 
784     if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
785         check_all_ports_link_status();
786     }
787 
788     return 0;
789 }
790 
791 static int
792 init_clock(void)
793 {
794     rte_timer_subsystem_init();
795     uint64_t hz = rte_get_timer_hz();
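    /* Convert the FreeBSD tick interval (MS_PER_S / hz milliseconds)
     * into TSC cycles for the periodic rte_timer. */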
796     uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
797     uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;
798 
799     rte_timer_init(&freebsd_clock);
800     rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
801         rte_lcore_id(), &ff_hardclock_job, NULL);
802 
803     ff_update_current_ts();
804 
805     return 0;
806 }
807 
808 int
809 ff_dpdk_init(int argc, char **argv)
810 {
811     if (ff_global_cfg.dpdk.nb_procs < 1 ||
812         ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
813         ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
814         ff_global_cfg.dpdk.proc_id < 0) {
815         printf("param num_procs[%d] or proc_id[%d] error!\n",
816             ff_global_cfg.dpdk.nb_procs,
817             ff_global_cfg.dpdk.proc_id);
818         exit(1);
819     }
820 
821     int ret = rte_eal_init(argc, argv);
822     if (ret < 0) {
823         rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
824     }
825 
826     numa_on = ff_global_cfg.dpdk.numa_on;
827 
828     idle_sleep = ff_global_cfg.dpdk.idle_sleep;
829     pkt_tx_delay = ff_global_cfg.dpdk.pkt_tx_delay > BURST_TX_DRAIN_US ? \
830         BURST_TX_DRAIN_US : ff_global_cfg.dpdk.pkt_tx_delay;
831 
832     init_lcore_conf();
833 
834     init_mem_pool();
835 
836     init_dispatch_ring();
837 
838     init_msg_ring();
839 
840 #ifdef FF_KNI
841     enable_kni = ff_global_cfg.kni.enable;
842     if (enable_kni) {
843         init_kni();
844     }
845 #endif
846 
847 #ifdef FF_USE_PAGE_ARRAY
848     ff_mmap_init();
849 #endif
850 
851     ret = init_port_start();
852     if (ret < 0) {
853         rte_exit(EXIT_FAILURE, "init_port_start failed\n");
854     }
855 
856     init_clock();
857 
858     return 0;
859 }
860 
861 static void
862 ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
863 {
864     uint8_t rx_csum = ctx->hw_features.rx_csum;
865     if (rx_csum) {
866         if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
867             rte_pktmbuf_free(pkt);
868             return;
869         }
870     }
871 
872     void *data = rte_pktmbuf_mtod(pkt, void*);
873     uint16_t len = rte_pktmbuf_data_len(pkt);
874 
875     void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
876     if (hdr == NULL) {
877         rte_pktmbuf_free(pkt);
878         return;
879     }
880 
881     if (pkt->ol_flags & PKT_RX_VLAN_STRIPPED) {
882         ff_mbuf_set_vlan_info(hdr, pkt->vlan_tci);
883     }
884 
885     struct rte_mbuf *pn = pkt->next;
886     void *prev = hdr;
887     while(pn != NULL) {
888         data = rte_pktmbuf_mtod(pn, void*);
889         len = rte_pktmbuf_data_len(pn);
890 
891         void *mb = ff_mbuf_get(prev, data, len);
892         if (mb == NULL) {
893             ff_mbuf_free(hdr);
894             rte_pktmbuf_free(pkt);
895             return;
896         }
897         pn = pn->next;
898         prev = mb;
899     }
900 
901     ff_veth_process_packet(ctx->ifp, hdr);
902 }
903 
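/*
 * Classify a raw Ethernet frame (skipping one optional VLAN tag): ARP is
 * broadcast to all queues, and the rest is matched by ff_kni_proto_filter()
 * to decide between the local stack and the KNI interface.
 */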
904 static enum FilterReturn
905 protocol_filter(const void *data, uint16_t len)
906 {
907     if(len < ETHER_HDR_LEN)
908         return FILTER_UNKNOWN;
909 
910     const struct ether_hdr *hdr;
911     const struct vlan_hdr *vlanhdr;
912     hdr = (const struct ether_hdr *)data;
913     uint16_t ether_type = rte_be_to_cpu_16(hdr->ether_type);
914     data += ETHER_HDR_LEN;
915     len -= ETHER_HDR_LEN;
916 
917     if (ether_type == ETHER_TYPE_VLAN) {
918         vlanhdr = (struct vlan_hdr *)data;
919         ether_type = rte_be_to_cpu_16(vlanhdr->eth_proto);
920         data += sizeof(struct vlan_hdr);
921         len -= sizeof(struct vlan_hdr);
922     }
923 
924     if(ether_type == ETHER_TYPE_ARP)
925         return FILTER_ARP;
926 
927 #ifdef INET6
928     if (ether_type == ETHER_TYPE_IPv6) {
929         return ff_kni_proto_filter(data,
930             len, ether_type);
931     }
932 #endif
933 
934 #ifndef FF_KNI
935     return FILTER_UNKNOWN;
936 #else
937     if (!enable_kni) {
938         return FILTER_UNKNOWN;
939     }
940 
941     if(ether_type != ETHER_TYPE_IPv4)
942         return FILTER_UNKNOWN;
943 
944     return ff_kni_proto_filter(data,
945         len, ether_type);
946 #endif
947 }
948 
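/*
 * Deep-copy counterpart of rte_pktmbuf_attach(): copies the segment's data
 * and metadata into mi instead of referencing m's buffer, so the copy stays
 * valid after the original mbuf is freed back to its pool.
 */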
949 static inline void
950 pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
951 {
952     struct rte_mbuf *md;
953     void *src, *dst;
954 
955     dst = rte_pktmbuf_mtod(mi, void *);
956     src = rte_pktmbuf_mtod(m, void *);
957 
958     mi->data_len = m->data_len;
959     rte_memcpy(dst, src, m->data_len);
960 
961     mi->port = m->port;
962     mi->vlan_tci = m->vlan_tci;
963     mi->vlan_tci_outer = m->vlan_tci_outer;
964     mi->tx_offload = m->tx_offload;
965     mi->hash = m->hash;
966     mi->ol_flags = m->ol_flags;
967     mi->packet_type = m->packet_type;
968 }
969 
970 /* Deep-copy clone, adapted from rte_pktmbuf_clone(). */
971 static inline struct rte_mbuf *
972 pktmbuf_deep_clone(const struct rte_mbuf *md,
973     struct rte_mempool *mp)
974 {
975     struct rte_mbuf *mc, *mi, **prev;
976     uint32_t pktlen;
977     uint8_t nseg;
978 
979     if (unlikely ((mc = rte_pktmbuf_alloc(mp)) == NULL))
980         return NULL;
981 
982     mi = mc;
983     prev = &mi->next;
984     pktlen = md->pkt_len;
985     nseg = 0;
986 
987     do {
988         nseg++;
989         pktmbuf_deep_attach(mi, md);
990         *prev = mi;
991         prev = &mi->next;
992     } while ((md = md->next) != NULL &&
993         (mi = rte_pktmbuf_alloc(mp)) != NULL);
994 
995     *prev = NULL;
996     mc->nb_segs = nseg;
997     mc->pkt_len = pktlen;
998 
999     /* Allocation of a new segment failed: free the whole chain. */
1000     if (unlikely (mi == NULL)) {
1001         rte_pktmbuf_free(mc);
1002         return NULL;
1003     }
1004 
1005     __rte_mbuf_sanity_check(mc, 1);
1006     return mc;
1007 }
1008 
1009 static inline void
1010 process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
1011     uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
1012 {
1013     struct lcore_conf *qconf = &lcore_conf;
1014     uint16_t nb_queues = qconf->nb_queue_list[port_id];
1015 
1016     uint16_t i;
1017     for (i = 0; i < count; i++) {
1018         struct rte_mbuf *rtem = bufs[i];
1019 
1020         if (unlikely(qconf->pcap[port_id] != NULL)) {
1021             if (!pkts_from_ring) {
1022                 ff_dump_packets(qconf->pcap[port_id], rtem);
1023             }
1024         }
1025 
1026         void *data = rte_pktmbuf_mtod(rtem, void*);
1027         uint16_t len = rte_pktmbuf_data_len(rtem);
1028 
1029         if (!pkts_from_ring) {
1030             ff_traffic.rx_packets++;
1031             ff_traffic.rx_bytes += len;
1032         }
1033 
1034         if (!pkts_from_ring && packet_dispatcher) {
1035             int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues);
1036             if (ret == FF_DISPATCH_RESPONSE) {
1037                 rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len;
1038 
1039                 /*
1040                  * VLAN TX offload is not supported yet; re-insert the stripped VLAN header in software.
1041                  */
1042                 if (rtem->vlan_tci) {
1043                     data = rte_pktmbuf_prepend(rtem, sizeof(struct vlan_hdr));
1044                     if (data != NULL) {
1045                         memmove(data, data + sizeof(struct vlan_hdr), ETHER_HDR_LEN);
1046                         struct ether_hdr *etherhdr = (struct ether_hdr *)data;
1047                         struct vlan_hdr *vlanhdr = (struct vlan_hdr *)(data + ETHER_HDR_LEN);
1048                         vlanhdr->vlan_tci = rte_cpu_to_be_16(rtem->vlan_tci);
1049                         vlanhdr->eth_proto = etherhdr->ether_type;
1050                         etherhdr->ether_type = rte_cpu_to_be_16(ETHER_TYPE_VLAN);
1051                     }
1052                 }
1053                 send_single_packet(rtem, port_id);
1054                 continue;
1055             }
1056 
1057             if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) {
1058                 rte_pktmbuf_free(rtem);
1059                 continue;
1060             }
1061 
1062             if (ret != queue_id) {
1063                 ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
1064                 if (ret < 0)
1065                     rte_pktmbuf_free(rtem);
1066 
1067                 continue;
1068             }
1069         }
1070 
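        /*
         * ARP (and, with INET6, NDP) packets must be seen by every process:
         * clone them to each sibling queue's dispatch ring, and to KNI on
         * the primary process, before handing the original to this stack.
         */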
1071         enum FilterReturn filter = protocol_filter(data, len);
1072 #ifdef INET6
1073         if (filter == FILTER_ARP || filter == FILTER_NDP) {
1074 #else
1075         if (filter == FILTER_ARP) {
1076 #endif
1077             struct rte_mempool *mbuf_pool;
1078             struct rte_mbuf *mbuf_clone;
1079             if (!pkts_from_ring) {
1080                 uint16_t j;
1081                 for(j = 0; j < nb_queues; ++j) {
1082                     if(j == queue_id)
1083                         continue;
1084 
1085                     unsigned socket_id = 0;
1086                     if (numa_on) {
1087                         uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
1088                         socket_id = rte_lcore_to_socket_id(lcore_id);
1089                     }
1090                     mbuf_pool = pktmbuf_pool[socket_id];
1091                     mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
1092                     if(mbuf_clone) {
1093                         int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
1094                             mbuf_clone);
1095                         if (ret < 0)
1096                             rte_pktmbuf_free(mbuf_clone);
1097                     }
1098                 }
1099             }
1100 
1101 #ifdef FF_KNI
1102             if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
1103                 mbuf_pool = pktmbuf_pool[qconf->socket_id];
1104                 mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
1105                 if(mbuf_clone) {
1106                     ff_kni_enqueue(port_id, mbuf_clone);
1107                 }
1108             }
1109 #endif
1110             ff_veth_input(ctx, rtem);
1111 #ifdef FF_KNI
1112         } else if (enable_kni &&
1113             ((filter == FILTER_KNI && kni_accept) ||
1114             (filter == FILTER_UNKNOWN && !kni_accept)) ) {
1115             ff_kni_enqueue(port_id, rtem);
1116 #endif
1117         } else {
1118             ff_veth_input(ctx, rtem);
1119         }
1120     }
1121 }
1122 
1123 static inline int
1124 process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
1125     struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
1126 {
1127     /* Dequeue packets from the dispatch ring and process them. */
1128     uint16_t nb_rb;
1129     nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
1130         (void **)pkts_burst, MAX_PKT_BURST, NULL);
1131 
1132     if(nb_rb > 0) {
1133         process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
1134     }
1135 
1136     return 0;
1137 }
1138 
1139 static inline void
1140 handle_sysctl_msg(struct ff_msg *msg)
1141 {
1142     int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
1143         msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
1144         msg->sysctl.newlen);
1145 
1146     if (ret < 0) {
1147         msg->result = errno;
1148     } else {
1149         msg->result = 0;
1150     }
1151 }
1152 
1153 static inline void
1154 handle_ioctl_msg(struct ff_msg *msg)
1155 {
1156     int fd, ret;
1157 #ifdef INET6
1158     if (msg->msg_type == FF_IOCTL6) {
1159         fd = ff_socket(AF_INET6, SOCK_DGRAM, 0);
1160     } else
1161 #endif
1162         fd = ff_socket(AF_INET, SOCK_DGRAM, 0);
1163 
1164     if (fd < 0) {
1165         ret = -1;
1166         goto done;
1167     }
1168 
1169     ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);
1170 
1171     ff_close(fd);
1172 
1173 done:
1174     if (ret < 0) {
1175         msg->result = errno;
1176     } else {
1177         msg->result = 0;
1178     }
1179 }
1180 
1181 static inline void
1182 handle_route_msg(struct ff_msg *msg)
1183 {
1184     int ret = ff_rtioctl(msg->route.fib, msg->route.data,
1185         &msg->route.len, msg->route.maxlen);
1186     if (ret < 0) {
1187         msg->result = errno;
1188     } else {
1189         msg->result = 0;
1190     }
1191 }
1192 
1193 static inline void
1194 handle_top_msg(struct ff_msg *msg)
1195 {
1196     msg->top = ff_top_status;
1197     msg->result = 0;
1198 }
1199 
1200 #ifdef FF_NETGRAPH
1201 static inline void
1202 handle_ngctl_msg(struct ff_msg *msg)
1203 {
1204     int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
1205     if (ret < 0) {
1206         msg->result = errno;
1207     } else {
1208         msg->result = 0;
1209         msg->ngctl.ret = ret;
1210     }
1211 }
1212 #endif
1213 
1214 #ifdef FF_IPFW
1215 static inline void
1216 handle_ipfw_msg(struct ff_msg *msg)
1217 {
1218     int fd, ret;
1219     fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
1220     if (fd < 0) {
1221         ret = -1;
1222         goto done;
1223     }
1224 
1225     switch (msg->ipfw.cmd) {
1226         case FF_IPFW_GET:
1227             ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
1228                 msg->ipfw.optname, msg->ipfw.optval,
1229                 msg->ipfw.optlen);
1230             break;
1231         case FF_IPFW_SET:
1232             ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
1233                 msg->ipfw.optname, msg->ipfw.optval,
1234                 *(msg->ipfw.optlen));
1235             break;
1236         default:
1237             ret = -1;
1238             errno = ENOTSUP;
1239             break;
1240     }
1241 
1242     ff_close(fd);
1243 
1244 done:
1245     if (ret < 0) {
1246         msg->result = errno;
1247     } else {
1248         msg->result = 0;
1249     }
1250 }
1251 #endif
1252 
1253 static inline void
1254 handle_traffic_msg(struct ff_msg *msg)
1255 {
1256     msg->traffic = ff_traffic;
1257     msg->result = 0;
1258 }
1259 
1260 static inline void
1261 handle_default_msg(struct ff_msg *msg)
1262 {
1263     msg->result = ENOTSUP;
1264 }
1265 
1266 static inline void
1267 handle_msg(struct ff_msg *msg, uint16_t proc_id)
1268 {
1269     switch (msg->msg_type) {
1270         case FF_SYSCTL:
1271             handle_sysctl_msg(msg);
1272             break;
1273         case FF_IOCTL:
1274 #ifdef INET6
1275         case FF_IOCTL6:
1276 #endif
1277             handle_ioctl_msg(msg);
1278             break;
1279         case FF_ROUTE:
1280             handle_route_msg(msg);
1281             break;
1282         case FF_TOP:
1283             handle_top_msg(msg);
1284             break;
1285 #ifdef FF_NETGRAPH
1286         case FF_NGCTL:
1287             handle_ngctl_msg(msg);
1288             break;
1289 #endif
1290 #ifdef FF_IPFW
1291         case FF_IPFW_CTL:
1292             handle_ipfw_msg(msg);
1293             break;
1294 #endif
1295         case FF_TRAFFIC:
1296             handle_traffic_msg(msg);
1297             break;
1298         default:
1299             handle_default_msg(msg);
1300             break;
1301     }
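    /* The reply travels back on the ring indexed by its message type. */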
1302     rte_ring_enqueue(msg_ring[proc_id].ring[msg->msg_type], msg);
1303 }
1304 
1305 static inline int
1306 process_msg_ring(uint16_t proc_id)
1307 {
1308     void *msg;
1309     int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);
1310 
1311     if (unlikely(ret == 0)) {
1312         handle_msg((struct ff_msg *)msg, proc_id);
1313     }
1314 
1315     return 0;
1316 }
1317 
1318 /* Send burst of packets on an output interface */
1319 static inline int
1320 send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
1321 {
1322     struct rte_mbuf **m_table;
1323     int ret;
1324     uint16_t queueid;
1325 
1326     queueid = qconf->tx_queue_id[port];
1327     m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;
1328 
1329     if (unlikely(qconf->pcap[port] != NULL)) {
1330         uint16_t i;
1331         for (i = 0; i < n; i++) {
1332             ff_dump_packets(qconf->pcap[port], m_table[i]);
1333         }
1334     }
1335 
1336     ret = rte_eth_tx_burst(port, queueid, m_table, n);
1337     ff_traffic.tx_packets += ret;
1338     uint16_t i;
1339     for (i = 0; i < ret; i++) {
1340         ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]);
1341 #ifdef FF_USE_PAGE_ARRAY
1342         if (qconf->tx_mbufs[port].bsd_m_table[i])
1343             ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs);
1344 #endif
1345     }
1346     if (unlikely(ret < n)) {
1347         do {
1348             rte_pktmbuf_free(m_table[ret]);
1349 #ifdef FF_USE_PAGE_ARRAY
1350             if ( qconf->tx_mbufs[port].bsd_m_table[ret] )
1351                 ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]);
1352 #endif
1353         } while (++ret < n);
1354     }
1355     return 0;
1356 }
1357 
1358 /* Enqueue a single packet, and send burst if queue is filled */
1359 static inline int
1360 send_single_packet(struct rte_mbuf *m, uint8_t port)
1361 {
1362     uint16_t len;
1363     struct lcore_conf *qconf;
1364 
1365     qconf = &lcore_conf;
1366     len = qconf->tx_mbufs[port].len;
1367     qconf->tx_mbufs[port].m_table[len] = m;
1368     len++;
1369 
1370     /* transmit once a full burst has accumulated */
1371     if (unlikely(len == MAX_PKT_BURST)) {
1372         send_burst(qconf, MAX_PKT_BURST, port);
1373         len = 0;
1374     }
1375 
1376     qconf->tx_mbufs[port].len = len;
1377     return 0;
1378 }
1379 
1380 int
1381 ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
1382     int total)
1383 {
1384 #ifdef FF_USE_PAGE_ARRAY
1385     struct lcore_conf *qconf = &lcore_conf;
1386     int    len = 0;
1387 
1388     len = ff_if_send_onepkt(ctx, m, total);
1389     if (unlikely(len == MAX_PKT_BURST)) {
1390         send_burst(qconf, MAX_PKT_BURST, ctx->port_id);
1391         len = 0;
1392     }
1393     qconf->tx_mbufs[ctx->port_id].len = len;
1394     return 0;
1395 #endif
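    /*
     * Default (non-page-array) path: copy the BSD mbuf chain into a freshly
     * allocated DPDK mbuf chain segment by segment, then translate the TX
     * offload requests into ol_flags on the head mbuf.
     */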
1396     struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
1397     struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
1398     if (head == NULL) {
1399         ff_mbuf_free(m);
1400         return -1;
1401     }
1402 
1403     head->pkt_len = total;
1404     head->nb_segs = 0;
1405 
1406     int off = 0;
1407     struct rte_mbuf *cur = head, *prev = NULL;
1408     while(total > 0) {
1409         if (cur == NULL) {
1410             cur = rte_pktmbuf_alloc(mbuf_pool);
1411             if (cur == NULL) {
1412                 rte_pktmbuf_free(head);
1413                 ff_mbuf_free(m);
1414                 return -1;
1415             }
1416         }
1417 
1418         if (prev != NULL) {
1419             prev->next = cur;
1420         }
1421         head->nb_segs++;
1422 
1423         prev = cur;
1424         void *data = rte_pktmbuf_mtod(cur, void*);
1425         int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
1426         int ret = ff_mbuf_copydata(m, data, off, len);
1427         if (ret < 0) {
1428             rte_pktmbuf_free(head);
1429             ff_mbuf_free(m);
1430             return -1;
1431         }
1432 
1433 
1434         cur->data_len = len;
1435         off += len;
1436         total -= len;
1437         cur = NULL;
1438     }
1439 
1440     struct ff_tx_offload offload = {0};
1441     ff_mbuf_tx_offload(m, &offload);
1442 
1443     void *data = rte_pktmbuf_mtod(head, void*);
1444 
1445     if (offload.ip_csum) {
1446         /* ipv6 not supported yet */
1447         struct ipv4_hdr *iph;
1448         int iph_len;
1449         iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
1450         iph_len = (iph->version_ihl & 0x0f) << 2;
1451 
1452         head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
1453         head->l2_len = ETHER_HDR_LEN;
1454         head->l3_len = iph_len;
1455     }
1456 
1457     if (ctx->hw_features.tx_csum_l4) {
1458         struct ipv4_hdr *iph;
1459         int iph_len;
1460         iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
1461         iph_len = (iph->version_ihl & 0x0f) << 2;
1462 
1463         if (offload.tcp_csum) {
1464             head->ol_flags |= PKT_TX_TCP_CKSUM;
1465             head->l2_len = ETHER_HDR_LEN;
1466             head->l3_len = iph_len;
1467         }
1468 
1469         /*
1470          *  TCP segmentation offload.
1471          *
1472          *  - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
1473          *    implies PKT_TX_TCP_CKSUM)
1474          *  - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
1475          *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
1476          *    write the IP checksum to 0 in the packet
1477          *  - fill the mbuf offload information: l2_len,
1478          *    l3_len, l4_len, tso_segsz
1479          *  - calculate the pseudo header checksum without taking ip_len
1480          *    in account, and set it in the TCP header. Refer to
1481          *    rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
1482          *    used as helpers.
1483          */
1484         if (offload.tso_seg_size) {
1485             struct tcp_hdr *tcph;
1486             int tcph_len;
1487             tcph = (struct tcp_hdr *)((char *)iph + iph_len);
1488             tcph_len = (tcph->data_off & 0xf0) >> 2;
1489             tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);
1490 
1491             head->ol_flags |= PKT_TX_TCP_SEG;
1492             head->l4_len = tcph_len;
1493             head->tso_segsz = offload.tso_seg_size;
1494         }
1495 
1496         if (offload.udp_csum) {
1497             head->ol_flags |= PKT_TX_UDP_CKSUM;
1498             head->l2_len = ETHER_HDR_LEN;
1499             head->l3_len = iph_len;
1500         }
1501     }
1502 
1503     ff_mbuf_free(m);
1504 
1505     return send_single_packet(head, ctx->port_id);
1506 }
1507 
1508 static int
1509 main_loop(void *arg)
1510 {
1511     struct loop_routine *lr = (struct loop_routine *)arg;
1512 
1513     struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1514     uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc;
1515     int i, j, nb_rx, idle;
1516     uint16_t port_id, queue_id;
1517     struct lcore_conf *qconf;
1518     uint64_t drain_tsc = 0;
1519     struct ff_dpdk_if_context *ctx;
1520 
1521     if (pkt_tx_delay) {
1522         drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * pkt_tx_delay;
1523     }
1524 
1525     prev_tsc = 0;
1526     usch_tsc = 0;
1527 
1528     qconf = &lcore_conf;
1529 
1530     while (1) {
1531         cur_tsc = rte_rdtsc();
1532         if (unlikely(freebsd_clock.expire < cur_tsc)) {
1533             rte_timer_manage();
1534         }
1535 
1536         idle = 1;
1537         sys_tsc = 0;
1538         usr_tsc = 0;
1539 
1540         /*
1541          * TX burst queue drain
1542          */
1543         diff_tsc = cur_tsc - prev_tsc;
1544         if (unlikely(diff_tsc >= drain_tsc)) {
1545             for (i = 0; i < qconf->nb_tx_port; i++) {
1546                 port_id = qconf->tx_port_id[i];
1547                 if (qconf->tx_mbufs[port_id].len == 0)
1548                     continue;
1549 
1550                 idle = 0;
1551 
1552                 send_burst(qconf,
1553                     qconf->tx_mbufs[port_id].len,
1554                     port_id);
1555                 qconf->tx_mbufs[port_id].len = 0;
1556             }
1557 
1558             prev_tsc = cur_tsc;
1559         }
1560 
1561         /*
1562          * Read packet from RX queues
1563          */
1564         for (i = 0; i < qconf->nb_rx_queue; ++i) {
1565             port_id = qconf->rx_queue_list[i].port_id;
1566             queue_id = qconf->rx_queue_list[i].queue_id;
1567             ctx = veth_ctx[port_id];
1568 
1569 #ifdef FF_KNI
1570             if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
1571                 ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
1572             }
1573 #endif
1574 
1575             process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);
1576 
1577             nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
1578                 MAX_PKT_BURST);
1579             if (nb_rx == 0)
1580                 continue;
1581 
1582             idle = 0;
1583 
1584             /* Prefetch first packets */
1585             for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
1586                 rte_prefetch0(rte_pktmbuf_mtod(
1587                         pkts_burst[j], void *));
1588             }
1589 
1590             /* Prefetch and handle already prefetched packets */
1591             for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
1592                 rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
1593                         j + PREFETCH_OFFSET], void *));
1594                 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
1595             }
1596 
1597             /* Handle remaining prefetched packets */
1598             for (; j < nb_rx; j++) {
1599                 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
1600             }
1601         }
1602 
1603         process_msg_ring(qconf->proc_id);
1604 
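        /*
         * CPU accounting for ff_top: [cur_tsc, div_tsc) is stack work
         * (sys), [div_tsc, idle_sleep_tsc) is the user loop callback (usr),
         * and time spent in usleep() below is counted as idle.
         */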
1605         div_tsc = rte_rdtsc();
1606 
1607         if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc >= drain_tsc))) {
1608             usch_tsc = cur_tsc;
1609             lr->loop(lr->arg);
1610         }
1611 
1612         idle_sleep_tsc = rte_rdtsc();
1613         if (likely(idle && idle_sleep)) {
1614             usleep(idle_sleep);
1615             end_tsc = rte_rdtsc();
1616         } else {
1617             end_tsc = idle_sleep_tsc;
1618         }
1619 
1620         if (usch_tsc == cur_tsc) {
1621             usr_tsc = idle_sleep_tsc - div_tsc;
1622         }
1623 
1624         if (!idle) {
1625             sys_tsc = div_tsc - cur_tsc;
1626             ff_top_status.sys_tsc += sys_tsc;
1627         }
1628 
1629         ff_top_status.usr_tsc += usr_tsc;
1630         ff_top_status.work_tsc += end_tsc - cur_tsc;
1631         ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;
1632 
1633         ff_top_status.loops++;
1634     }
1635 
1636     return 0;
1637 }
1638 
1639 int
1640 ff_dpdk_if_up(void) {
1641     int i;
1642     struct lcore_conf *qconf = &lcore_conf;
1643     for (i = 0; i < qconf->nb_tx_port; i++) {
1644         uint16_t port_id = qconf->tx_port_id[i];
1645 
1646         struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
1647         veth_ctx[port_id] = ff_veth_attach(pconf);
1648         if (veth_ctx[port_id] == NULL) {
1649             rte_exit(EXIT_FAILURE, "ff_veth_attach failed");
1650         }
1651     }
1652 
1653     return 0;
1654 }
1655 
1656 void
1657 ff_dpdk_run(loop_func_t loop, void *arg) {
1658     struct loop_routine *lr = rte_malloc(NULL,
1659         sizeof(struct loop_routine), 0);
1660     lr->loop = loop;
1661     lr->arg = arg;
1662     rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
1663     rte_eal_mp_wait_lcore();
1664     rte_free(lr);
1665 }
1666 
1667 void
1668 ff_dpdk_pktmbuf_free(void *m)
1669 {
1670     rte_pktmbuf_free((struct rte_mbuf *)m);
1671 }
1672 
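/*
 * Software Toeplitz hash, mirroring the algorithm NICs use for RSS; the
 * code (and the XXXRW note below) appears to come from FreeBSD.
 */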
1673 static uint32_t
1674 toeplitz_hash(unsigned keylen, const uint8_t *key,
1675     unsigned datalen, const uint8_t *data)
1676 {
1677     uint32_t hash = 0, v;
1678     u_int i, b;
1679 
1680     /* XXXRW: Perhaps an assertion about key length vs. data length? */
1681 
1682     v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
1683     for (i = 0; i < datalen; i++) {
1684         for (b = 0; b < 8; b++) {
1685             if (data[i] & (1<<(7-b)))
1686                 hash ^= v;
1687             v <<= 1;
1688             if ((i + 4) < keylen &&
1689                 (key[i+4] & (1<<(7-b))))
1690                 v |= 1;
1691         }
1692     }
1693     return (hash);
1694 }
1695 
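/*
 * Return nonzero if RSS would steer the connection 4-tuple to this lcore's
 * own queue. The FreeBSD stack consults this when choosing local ports so
 * that reply traffic arrives on the queue this process polls.
 */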
1696 int
1697 ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
1698     uint16_t sport, uint16_t dport)
1699 {
1700     struct lcore_conf *qconf = &lcore_conf;
1701     struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
1702     uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];
1703 
1704     if (nb_queues <= 1) {
1705         return 1;
1706     }
1707 
1708     uint16_t reta_size = rss_reta_size[ctx->port_id];
1709     uint16_t queueid = qconf->tx_queue_id[ctx->port_id];
1710 
1711     uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
1712         sizeof(dport)];
1713 
1714     unsigned datalen = 0;
1715 
1716     bcopy(&saddr, &data[datalen], sizeof(saddr));
1717     datalen += sizeof(saddr);
1718 
1719     bcopy(&daddr, &data[datalen], sizeof(daddr));
1720     datalen += sizeof(daddr);
1721 
1722     bcopy(&sport, &data[datalen], sizeof(sport));
1723     datalen += sizeof(sport);
1724 
1725     bcopy(&dport, &data[datalen], sizeof(dport));
1726     datalen += sizeof(dport);
1727 
1728     uint32_t hash = 0;
1729     if (!use_rsskey_52bytes)
1730         hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
1731             default_rsskey_40bytes, datalen, data);
1732     else
1733         hash = toeplitz_hash(sizeof(default_rsskey_52bytes),
1734             default_rsskey_52bytes, datalen, data);
1735     return ((hash & (reta_size - 1)) % nb_queues) == queueid;
1736 }
1737 
1738 void
1739 ff_regist_packet_dispatcher(dispatch_func_t func)
1740 {
1741     packet_dispatcher = func;
1742 }
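/*
 * A minimal sketch of a custom dispatcher (illustrative; the authoritative
 * dispatch_func_t typedef lives in ff_dpdk_if.h). Matching the call site in
 * process_packets(), it receives the frame and returns a target queue id,
 * FF_DISPATCH_ERROR to drop the packet, or FF_DISPATCH_RESPONSE to send the
 * (possibly rewritten) frame straight back out the same port:
 *
 *     static int
 *     my_dispatch(void *data, uint16_t *len, uint16_t queue_id,
 *         uint16_t nb_queues)
 *     {
 *         (void)data; (void)len; (void)queue_id; (void)nb_queues;
 *         return 0;    // steer everything to queue 0
 *     }
 *
 *     ff_regist_packet_dispatcher(my_dispatch);
 */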
1743 
1744 uint64_t
1745 ff_get_tsc_ns(void)
1746 {
1747     uint64_t cur_tsc = rte_rdtsc();
1748     uint64_t hz = rte_get_tsc_hz();
1749     return ((double)cur_tsc/(double)hz) * NS_PER_S;
1750 }
1751 
1752