xref: /f-stack/lib/ff_dpdk_if.c (revision 4418919f)
1 /*
2  * Copyright (C) 2017 THL A29 Limited, a Tencent company.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice, this
9  *   list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright notice,
11  *   this list of conditions and the following disclaimer in the documentation
12  *   and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
18  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
21  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
23  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  *
25  */
26 #include <assert.h>
27 #include <unistd.h>
28 #include <sys/mman.h>
29 #include <errno.h>
30 
31 #include <rte_common.h>
32 #include <rte_byteorder.h>
33 #include <rte_log.h>
34 #include <rte_memory.h>
35 #include <rte_memcpy.h>
36 #include <rte_memzone.h>
37 #include <rte_config.h>
38 #include <rte_eal.h>
39 #include <rte_pci.h>
40 #include <rte_mbuf.h>
42 #include <rte_lcore.h>
43 #include <rte_launch.h>
44 #include <rte_ethdev.h>
45 #include <rte_debug.h>
47 #include <rte_ether.h>
48 #include <rte_malloc.h>
49 #include <rte_cycles.h>
50 #include <rte_timer.h>
51 #include <rte_thash.h>
52 #include <rte_ip.h>
53 #include <rte_tcp.h>
54 #include <rte_udp.h>
55 #include <rte_eth_bond.h>
56 
57 #include "ff_dpdk_if.h"
58 #include "ff_dpdk_pcap.h"
59 #include "ff_dpdk_kni.h"
60 #include "ff_config.h"
61 #include "ff_veth.h"
62 #include "ff_host_interface.h"
63 #include "ff_msg.h"
64 #include "ff_api.h"
65 #include "ff_memory.h"
66 
67 #ifdef FF_KNI
68 #define KNI_MBUF_MAX 2048
69 #define KNI_QUEUE_SIZE 2048
70 
71 int enable_kni;
72 static int kni_accept;
73 static int knictl_action = FF_KNICTL_ACTION_DEFAULT;
74 #endif
75 
76 static int numa_on;
77 
78 static unsigned idle_sleep;
79 static unsigned pkt_tx_delay;
80 
81 static struct rte_timer freebsd_clock;
82 
83 /* Default 40-byte RSS hash key, as used by the Mellanox Linux driver */
84 static uint8_t default_rsskey_40bytes[40] = {
85     0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
86     0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
87     0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
88     0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
89     0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
90 };
91 
92 static int use_rsskey_52bytes = 0;
93 static uint8_t default_rsskey_52bytes[52] = {
94     0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23,
95     0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30,
96     0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02,
97     0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c,
98     0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55,
99     0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e,
100     0x81, 0x15, 0x03, 0x66
101 };
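/*
 * init_port_start() picks between the two keys at runtime: if the NIC
 * reports dev_info.hash_key_size == 52 (e.g. Intel i40e devices), the
 * 52-byte key is programmed and use_rsskey_52bytes is set; otherwise the
 * 40-byte key is used.  ff_rss_check() further below must hash with the
 * same key so its software Toeplitz result agrees with the NIC's steering.
 */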
102 
103 struct lcore_conf lcore_conf;
104 
105 struct rte_mempool *pktmbuf_pool[NB_SOCKETS];
106 
107 static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
108 static dispatch_func_t packet_dispatcher;
109 
110 static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];
111 
112 #define BOND_DRIVER_NAME    "net_bonding"
113 
114 static inline int send_single_packet(struct rte_mbuf *m, uint8_t port);
115 
116 struct ff_msg_ring {
117     char ring_name[FF_MSG_NUM][RTE_RING_NAMESIZE];
118     /* ring[0] for lcore recv msg, other send */
119     /* ring[1] for lcore send msg, other read */
120     struct rte_ring *ring[FF_MSG_NUM];
121 } __rte_cache_aligned;
122 
123 static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
124 static struct rte_mempool *message_pool;
125 static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];
126 
127 static struct ff_top_args ff_top_status;
128 static struct ff_traffic_args ff_traffic;
129 extern void ff_hardclock(void);
130 
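/*
 * Periodic rte_timer callback that drives the FreeBSD hardclock tick;
 * armed in init_clock() to fire every 1000/freebsd.hz milliseconds.
 */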
131 static void
132 ff_hardclock_job(__rte_unused struct rte_timer *timer,
133     __rte_unused void *arg) {
134     ff_hardclock();
135     ff_update_current_ts();
136 }
137 
138 struct ff_dpdk_if_context *
139 ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
140 {
141     struct ff_dpdk_if_context *ctx;
142 
143     ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
144     if (ctx == NULL)
145         return NULL;
146 
147     ctx->sc = sc;
148     ctx->ifp = ifp;
149     ctx->port_id = cfg->port_id;
150     ctx->hw_features = cfg->hw_features;
151 
152     return ctx;
153 }
154 
155 void
156 ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
157 {
158     free(ctx);
159 }
160 
161 static void
162 check_all_ports_link_status(void)
163 {
164     #define CHECK_INTERVAL 100 /* 100ms */
165     #define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */
166 
167     uint16_t portid;
168     uint8_t count, all_ports_up, print_flag = 0;
169     struct rte_eth_link link;
170 
171     printf("\nChecking link status");
172     fflush(stdout);
173 
174     int i, nb_ports;
175     nb_ports = ff_global_cfg.dpdk.nb_ports;
176     for (count = 0; count <= MAX_CHECK_TIME; count++) {
177         all_ports_up = 1;
178         for (i = 0; i < nb_ports; i++) {
179             portid = ff_global_cfg.dpdk.portid_list[i];
180             memset(&link, 0, sizeof(link));
181             rte_eth_link_get_nowait(portid, &link);
182 
183             /* print link status if flag set */
184             if (print_flag == 1) {
185                 if (link.link_status) {
186                     printf("Port %d Link Up - speed %u "
187                         "Mbps - %s\n", (int)portid,
188                         (unsigned)link.link_speed,
189                         (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
190                         ("full-duplex") : ("half-duplex"));
191                 } else {
192                     printf("Port %d Link Down\n", (int)portid);
193                 }
194                 continue;
195             }
196             /* clear all_ports_up flag if any link down */
197             if (link.link_status == 0) {
198                 all_ports_up = 0;
199                 break;
200             }
201         }
202 
203         /* after finally printing all link status, get out */
204         if (print_flag == 1)
205             break;
206 
207         if (all_ports_up == 0) {
208             printf(".");
209             fflush(stdout);
210             rte_delay_ms(CHECK_INTERVAL);
211         }
212 
213         /* set the print_flag if all ports up or timeout */
214         if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
215             print_flag = 1;
216             printf("done\n");
217         }
218     }
219 }
220 
221 static int
222 init_lcore_conf(void)
223 {
224     uint8_t nb_dev_ports = rte_eth_dev_count_avail();
225     if (nb_dev_ports == 0) {
226         rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
227     }
228 
229     if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
230         rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
231                  ff_global_cfg.dpdk.max_portid);
232     }
233 
234     lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
235     lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;
236 
237     uint16_t socket_id = 0;
238     if (numa_on) {
239         socket_id = rte_lcore_to_socket_id(rte_lcore_id());
240     }
241 
242     lcore_conf.socket_id = socket_id;
243 
244     uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
245     if (!rte_lcore_is_enabled(lcore_id)) {
246         rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
247     }
248 
249     int j;
250     for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
251         uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
252         struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
253 
254         int queueid = -1;
255         int i;
256         for (i = 0; i < pconf->nb_lcores; i++) {
257             if (pconf->lcore_list[i] == lcore_id) {
258                 queueid = i;
259             }
260         }
261         if (queueid < 0) {
262             continue;
263         }
264         printf("lcore: %u, port: %u, queue: %u\n", lcore_id, port_id, queueid);
265         uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
266         lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
267         lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
268         lcore_conf.nb_rx_queue++;
269 
270         lcore_conf.tx_queue_id[port_id] = queueid;
271         lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
272         lcore_conf.nb_tx_port++;
273 
274         /* Enable pcap dump */
275         if (ff_global_cfg.pcap.enable) {
276             ff_enable_pcap(ff_global_cfg.pcap.save_path, ff_global_cfg.pcap.snap_len);
277         }
278 
279         lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
280     }
281 
282     if (lcore_conf.nb_rx_queue == 0) {
283         rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
284     }
285 
286     return 0;
287 }
288 
289 static int
290 init_mem_pool(void)
291 {
292     uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
293     uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
294     uint32_t nb_tx_queue = nb_lcores;
295     uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;
296     uint16_t max_portid = ff_global_cfg.dpdk.max_portid;
297 
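    /*
     * Worst-case estimate of mbufs alive at once: packets sitting in every
     * RX/TX descriptor ring, in-flight bursts per lcore, per-lcore mempool
     * caches, the KNI queues (when enabled) and the dispatch rings, rounded
     * up to a multiple of 8192.
     */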
298     unsigned nb_mbuf = RTE_ALIGN_CEIL (
299         (nb_rx_queue * (max_portid + 1) * 2 * RX_QUEUE_SIZE          +
300         nb_ports * (max_portid + 1) * 2 * nb_lcores * MAX_PKT_BURST    +
301         nb_ports * (max_portid + 1) * 2 * nb_tx_queue * TX_QUEUE_SIZE  +
302         nb_lcores * MEMPOOL_CACHE_SIZE +
303 #ifdef FF_KNI
304         nb_ports * KNI_MBUF_MAX +
305         nb_ports * KNI_QUEUE_SIZE +
306 #endif
307         nb_lcores * nb_ports * DISPATCH_RING_SIZE),
308         (unsigned)8192);
309 
310     unsigned socketid = 0;
311     uint16_t i, lcore_id;
312     char s[64];
313 
314     for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
315         lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
316         if (numa_on) {
317             socketid = rte_lcore_to_socket_id(lcore_id);
318         }
319 
320         if (socketid >= NB_SOCKETS) {
321             rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
322                 socketid, i, NB_SOCKETS);
323         }
324 
325         if (pktmbuf_pool[socketid] != NULL) {
326             continue;
327         }
328 
329         if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
330             snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
331             pktmbuf_pool[socketid] =
332                 rte_pktmbuf_pool_create(s, nb_mbuf,
333                     MEMPOOL_CACHE_SIZE, 0,
334                     RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
335         } else {
336             snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
337             pktmbuf_pool[socketid] = rte_mempool_lookup(s);
338         }
339 
340         if (pktmbuf_pool[socketid] == NULL) {
341             rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
342         } else {
343             printf("create mbuf pool on socket %d\n", socketid);
344         }
345 
346 #ifdef FF_USE_PAGE_ARRAY
347         nb_mbuf = RTE_ALIGN_CEIL (
348             nb_ports*nb_lcores*MAX_PKT_BURST    +
349             nb_ports*nb_tx_queue*TX_QUEUE_SIZE  +
350             nb_lcores*MEMPOOL_CACHE_SIZE,
351             (unsigned)4096);
352         ff_init_ref_pool(nb_mbuf, socketid);
353 #endif
354     }
355 
356     return 0;
357 }
358 
359 static struct rte_ring *
360 create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
361 {
362     struct rte_ring *ring;
363 
364     if (name == NULL) {
365         rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
366     }
367 
368     if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
369         ring = rte_ring_create(name, count, socket_id, flags);
370     } else {
371         ring = rte_ring_lookup(name);
372     }
373 
374     if (ring == NULL) {
375         rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
376     }
377 
378     return ring;
379 }
380 
381 static int
382 init_dispatch_ring(void)
383 {
384     int j;
385     char name_buf[RTE_RING_NAMESIZE];
386     int queueid;
387 
388     unsigned socketid = lcore_conf.socket_id;
389 
390     /* Create ring according to ports actually being used. */
391     int nb_ports = ff_global_cfg.dpdk.nb_ports;
392     for (j = 0; j < nb_ports; j++) {
393         uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
394         struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
395         int nb_queues = pconf->nb_lcores;
396         if (dispatch_ring[portid] == NULL) {
397             snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);
398 
399             dispatch_ring[portid] = rte_zmalloc(name_buf,
400                 sizeof(struct rte_ring *) * nb_queues,
401                 RTE_CACHE_LINE_SIZE);
402             if (dispatch_ring[portid] == NULL) {
403                 rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
404                     "failed\n", name_buf);
405             }
406         }
407 
408         for(queueid = 0; queueid < nb_queues; ++queueid) {
409             snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
410                 portid, queueid);
411             dispatch_ring[portid][queueid] = create_ring(name_buf,
412                 DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);
413 
414             if (dispatch_ring[portid][queueid] == NULL)
415                 rte_panic("create ring:%s failed!\n", name_buf);
416 
417             printf("created ring:%s, %u ring entries are free\n",
418                 name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
419         }
420     }
421 
422     return 0;
423 }
424 
425 static void
426 ff_msg_init(struct rte_mempool *mp,
427     __attribute__((unused)) void *opaque_arg,
428     void *obj, __attribute__((unused)) unsigned i)
429 {
430     struct ff_msg *msg = (struct ff_msg *)obj;
431     msg->msg_type = FF_UNKNOWN;
432     msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
433     msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
434 }
435 
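/*
 * Per-process message rings (see handle_msg()): ring[0], named
 * FF_MSG_RING_IN<i>, carries requests from control tools to the f-stack
 * lcore; ring[FF_SYSCTL..FF_MSG_NUM-1], named FF_MSG_RING_OUT<i>_<type>,
 * carry one reply stream per message type.
 */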
436 static int
437 init_msg_ring(void)
438 {
439     uint16_t i, j;
440     uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
441     unsigned socketid = lcore_conf.socket_id;
442 
443     /* Create message buffer pool */
444     if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
445         message_pool = rte_mempool_create(FF_MSG_POOL,
446            MSG_RING_SIZE * 2 * nb_procs,
447            MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
448            NULL, NULL, ff_msg_init, NULL,
449            socketid, 0);
450     } else {
451         message_pool = rte_mempool_lookup(FF_MSG_POOL);
452     }
453 
454     if (message_pool == NULL) {
455         rte_panic("Create msg mempool failed\n");
456     }
457 
458     for(i = 0; i < nb_procs; ++i) {
459         snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
460             "%s%u", FF_MSG_RING_IN, i);
461         msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
462             MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
463         if (msg_ring[i].ring[0] == NULL)
464             rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[0]);
465 
466         for (j = FF_SYSCTL; j < FF_MSG_NUM; j++) {
467             snprintf(msg_ring[i].ring_name[j], RTE_RING_NAMESIZE,
468                 "%s%u_%u", FF_MSG_RING_OUT, i, j);
469             msg_ring[i].ring[j] = create_ring(msg_ring[i].ring_name[j],
470                 MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
471             if (msg_ring[i].ring[j] == NULL)
472                 rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[j]);
473         }
474     }
475 
476     return 0;
477 }
478 
479 #ifdef FF_KNI
480 
481 static enum FF_KNICTL_CMD get_kni_action(const char *c){
482     if (!c)
483         return FF_KNICTL_ACTION_DEFAULT;
484     if (0 == strcasecmp(c, "alltokni")){
485         return FF_KNICTL_ACTION_ALL_TO_KNI;
486     } else if (0 == strcasecmp(c, "alltoff")){
487         return FF_KNICTL_ACTION_ALL_TO_FF;
488     } else if (0 == strcasecmp(c, "default")){
489         return FF_KNICTL_ACTION_DEFAULT;
490     } else {
491         return FF_KNICTL_ACTION_DEFAULT;
492     }
493 }
494 
495 static int
496 init_kni(void)
497 {
498     int nb_ports = rte_eth_dev_count_avail();
499     kni_accept = 0;
500     if(strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
501         kni_accept = 1;
502 
503     knictl_action = get_kni_action(ff_global_cfg.kni.kni_action);
504 
505     ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
506         ff_global_cfg.kni.udp_port);
507 
508     unsigned socket_id = lcore_conf.socket_id;
509     struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];
510 
511     nb_ports = ff_global_cfg.dpdk.nb_ports;
512     int i, ret;
513     for (i = 0; i < nb_ports; i++) {
514         uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
515         ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
516     }
517 
518     return 0;
519 }
520 #endif
521 
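/*
 * Fill the NIC's RSS redirection table round-robin over nb_queues, so
 * RETA entry e steers to queue e % nb_queues.  ff_rss_check() assumes
 * exactly this layout when predicting which queue a 4-tuple hashes to.
 */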
522 static void
523 set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
524 {
525     if (reta_size == 0) {
526         return;
527     }
528 
529     int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
530     struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];
531 
532     /* config HW indirection table */
533     unsigned i, j, hash=0;
534     for (i = 0; i < reta_conf_size; i++) {
535         reta_conf[i].mask = ~0ULL;
536         for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
537             reta_conf[i].reta[j] = hash++ % nb_queues;
538         }
539     }
540 
541     if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
542         rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
543             port_id);
544     }
545 }
546 
547 static int
548 init_port_start(void)
549 {
550     int nb_ports = ff_global_cfg.dpdk.nb_ports;
551     unsigned socketid = 0;
552     struct rte_mempool *mbuf_pool;
553     uint16_t i, j;
554 
555     for (i = 0; i < nb_ports; i++) {
556         uint16_t port_id, u_port_id = ff_global_cfg.dpdk.portid_list[i];
557         struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[u_port_id];
558         uint16_t nb_queues = pconf->nb_lcores;
559 
560         for (j=0; j<=pconf->nb_slaves; j++) {
561             if (j < pconf->nb_slaves) {
562                 port_id = pconf->slave_portid_list[j];
563                 printf("To init %s's slave %d: port[%d]\n",
564                         ff_global_cfg.dpdk.bond_cfgs->name,
565                         j, port_id);
566             } else {
567                 port_id = u_port_id;
568             }
569 
570             struct rte_eth_dev_info dev_info;
571             struct rte_eth_conf port_conf = {0};
572             struct rte_eth_rxconf rxq_conf;
573             struct rte_eth_txconf txq_conf;
574 
575             int ret = rte_eth_dev_info_get(port_id, &dev_info);
576             if (ret != 0)
577                 rte_exit(EXIT_FAILURE,
578                     "Error during getting device (port %u) info: %s\n",
579                     port_id, strerror(-ret));
580 
581             if (nb_queues > dev_info.max_rx_queues) {
582                 rte_exit(EXIT_FAILURE, "nb_queues[%d] bigger than max_rx_queues[%d]\n",
583                     nb_queues,
584                     dev_info.max_rx_queues);
585             }
586 
587             if (nb_queues > dev_info.max_tx_queues) {
588                 rte_exit(EXIT_FAILURE, "nb_queues[%d] bigger than max_tx_queues[%d]\n",
589                     nb_queues,
590                     dev_info.max_tx_queues);
591             }
592 
593             struct rte_ether_addr addr;
594             rte_eth_macaddr_get(port_id, &addr);
595             printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
596                        " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
597                     (unsigned)port_id,
598                     addr.addr_bytes[0], addr.addr_bytes[1],
599                     addr.addr_bytes[2], addr.addr_bytes[3],
600                     addr.addr_bytes[4], addr.addr_bytes[5]);
601 
602             rte_memcpy(pconf->mac,
603                 addr.addr_bytes, RTE_ETHER_ADDR_LEN);
604 
605             /* Set RSS mode */
606             uint64_t default_rss_hf = ETH_RSS_PROTO_MASK;
607             port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
608             port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf;
609             if (dev_info.hash_key_size == 52) {
610                 port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_52bytes;
611                 port_conf.rx_adv_conf.rss_conf.rss_key_len = 52;
612                 use_rsskey_52bytes = 1;
613             } else {
614                 port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
615                 port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;
616             }
617             port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads;
618             if (port_conf.rx_adv_conf.rss_conf.rss_hf !=
619                     ETH_RSS_PROTO_MASK) {
620                 printf("Port %u modified RSS hash function based on hardware support,"
621                         " requested:%#"PRIx64" configured:%#"PRIx64"\n",
622                         port_id, default_rss_hf,
623                         port_conf.rx_adv_conf.rss_conf.rss_hf);
624             }
625 
626             if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) {
627                 port_conf.txmode.offloads |=
628                     DEV_TX_OFFLOAD_MBUF_FAST_FREE;
629             }
630 
631             /* Set Rx VLAN stripping */
632             if (ff_global_cfg.dpdk.vlan_strip) {
633                 if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
634                     port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
635                 }
636             }
637 
638             /* Enable HW CRC stripping */
639             port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_KEEP_CRC;
640 
641             /* FIXME: Enable TCP LRO ?*/
642             #if 0
643             if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
644                 printf("LRO is supported\n");
645                 port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
646                 pconf->hw_features.rx_lro = 1;
647             }
648             #endif
649 
650             /* Set Rx checksum checking */
651             if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
652                 (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
653                 (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
654                 printf("RX checksum offload supported\n");
655                 port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
656                 pconf->hw_features.rx_csum = 1;
657             }
658 
659             if (ff_global_cfg.dpdk.tx_csum_offoad_skip == 0) {
660                 if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
661                     printf("TX ip checksum offload supported\n");
662                     port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
663                     pconf->hw_features.tx_csum_ip = 1;
664                 }
665 
666                 if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
667                     (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
668                     printf("TX TCP&UDP checksum offload supported\n");
669                     port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM;
670                     pconf->hw_features.tx_csum_l4 = 1;
671                 }
672             } else {
673                 printf("TX checksum offload is disabled\n");
674             }
675 
676             if (ff_global_cfg.dpdk.tso) {
677                 if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
678                     printf("TSO is supported\n");
679                     port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
680                     pconf->hw_features.tx_tso = 1;
681                 }
682             } else {
683                 printf("TSO is disabled\n");
684             }
685 
686             if (dev_info.reta_size) {
687                 /* reta size must be power of 2 */
688                 assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);
689 
690                 rss_reta_size[port_id] = dev_info.reta_size;
691                 printf("port[%d]: rss table size: %d\n", port_id,
692                     dev_info.reta_size);
693             }
694 
695             if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
696                 continue;
697             }
698 
699             ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
700             if (ret != 0) {
701                 return ret;
702             }
703 
704             static uint16_t nb_rxd = RX_QUEUE_SIZE;
705             static uint16_t nb_txd = TX_QUEUE_SIZE;
706             ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
707             if (ret < 0)
708                 printf("Could not adjust number of descriptors "
709                         "for port %u (%d)\n", (unsigned)port_id, ret);
710 
711             uint16_t q;
712             for (q = 0; q < nb_queues; q++) {
713                 if (numa_on) {
714                     uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
715                     socketid = rte_lcore_to_socket_id(lcore_id);
716                 }
717                 mbuf_pool = pktmbuf_pool[socketid];
718 
719                 txq_conf = dev_info.default_txconf;
720                 txq_conf.offloads = port_conf.txmode.offloads;
721                 ret = rte_eth_tx_queue_setup(port_id, q, nb_txd,
722                     socketid, &txq_conf);
723                 if (ret < 0) {
724                     return ret;
725                 }
726 
727                 rxq_conf = dev_info.default_rxconf;
728                 rxq_conf.offloads = port_conf.rxmode.offloads;
729                 ret = rte_eth_rx_queue_setup(port_id, q, nb_rxd,
730                     socketid, &rxq_conf, mbuf_pool);
731                 if (ret < 0) {
732                     return ret;
733                 }
734             }
735 
736 
737             if (strncmp(dev_info.driver_name, BOND_DRIVER_NAME,
738                     strlen(dev_info.driver_name)) == 0) {
739 
740                 rte_eth_macaddr_get(port_id, &addr);
741                 printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
742                            " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
743                         (unsigned)port_id,
744                         addr.addr_bytes[0], addr.addr_bytes[1],
745                         addr.addr_bytes[2], addr.addr_bytes[3],
746                         addr.addr_bytes[4], addr.addr_bytes[5]);
747 
748                 rte_memcpy(pconf->mac,
749                     addr.addr_bytes, RTE_ETHER_ADDR_LEN);
750 
751                 int mode, count, x;
752                 uint16_t slaves[RTE_MAX_ETHPORTS], len = RTE_MAX_ETHPORTS;
753 
754                 mode = rte_eth_bond_mode_get(port_id);
755                 printf("Port %u, bond mode:%d\n", port_id, mode);
756 
757                 count = rte_eth_bond_slaves_get(port_id, slaves, len);
758                 printf("Port %u, %s's slave ports count:%d\n", port_id,
759                             ff_global_cfg.dpdk.bond_cfgs->name, count);
760                 for (x=0; x<count; x++) {
761                     printf("Port %u, %s's slave port[%u]\n", port_id,
762                             ff_global_cfg.dpdk.bond_cfgs->name, slaves[x]);
763                 }
764             }
765 
766             ret = rte_eth_dev_start(port_id);
767             if (ret < 0) {
768                 return ret;
769             }
770 
771             if (nb_queues > 1) {
772                 /* set HW rss hash function to Toeplitz. */
773                 if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
774                     struct rte_eth_hash_filter_info info = {0};
775                     info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
776                     info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;
777 
778                     if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
779                         RTE_ETH_FILTER_SET, &info) < 0) {
780                         rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
781                             port_id);
782                     }
783                 }
784 
785                 set_rss_table(port_id, dev_info.reta_size, nb_queues);
786             }
787 
788             /* Enable RX in promiscuous mode for the Ethernet device. */
789             if (ff_global_cfg.dpdk.promiscuous) {
790                 ret = rte_eth_promiscuous_enable(port_id);
791                 if (ret == 0) {
792                     printf("set port %u to promiscuous mode ok\n", port_id);
793                 } else {
794                     printf("set port %u to promiscuous mode error\n", port_id);
795                 }
796             }
797         }
798     }
799 
800     if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
801         check_all_ports_link_status();
802     }
803 
804     return 0;
805 }
806 
807 static int
808 init_clock(void)
809 {
810     rte_timer_subsystem_init();
811     uint64_t hz = rte_get_timer_hz();
812     uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;    /* ms per FreeBSD tick */
813     uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;   /* TSC cycles per tick */
814 
815     rte_timer_init(&freebsd_clock);
816     rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
817         rte_lcore_id(), &ff_hardclock_job, NULL);
818 
819     ff_update_current_ts();
820 
821     return 0;
822 }
823 
824 int
825 ff_dpdk_init(int argc, char **argv)
826 {
827     if (ff_global_cfg.dpdk.nb_procs < 1 ||
828         ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
829         ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
830         ff_global_cfg.dpdk.proc_id < 0) {
831         printf("param num_procs[%d] or proc_id[%d] error!\n",
832             ff_global_cfg.dpdk.nb_procs,
833             ff_global_cfg.dpdk.proc_id);
834         exit(1);
835     }
836 
837     int ret = rte_eal_init(argc, argv);
838     if (ret < 0) {
839         rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
840     }
841 
842     numa_on = ff_global_cfg.dpdk.numa_on;
843 
844     idle_sleep = ff_global_cfg.dpdk.idle_sleep;
845     pkt_tx_delay = ff_global_cfg.dpdk.pkt_tx_delay > BURST_TX_DRAIN_US ? \
846         BURST_TX_DRAIN_US : ff_global_cfg.dpdk.pkt_tx_delay;
847 
848     init_lcore_conf();
849 
850     init_mem_pool();
851 
852     init_dispatch_ring();
853 
854     init_msg_ring();
855 
856 #ifdef FF_KNI
857     enable_kni = ff_global_cfg.kni.enable;
858     if (enable_kni) {
859         init_kni();
860     }
861 #endif
862 
863 #ifdef FF_USE_PAGE_ARRAY
864     ff_mmap_init();
865 #endif
866 
867     ret = init_port_start();
868     if (ret < 0) {
869         rte_exit(EXIT_FAILURE, "init_port_start failed\n");
870     }
871 
872     init_clock();
873 
874     return 0;
875 }
876 
877 static void
878 ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
879 {
880     uint8_t rx_csum = ctx->hw_features.rx_csum;
881     if (rx_csum) {
882         if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
883             rte_pktmbuf_free(pkt);
884             return;
885         }
886     }
887 
888     void *data = rte_pktmbuf_mtod(pkt, void*);
889     uint16_t len = rte_pktmbuf_data_len(pkt);
890 
891     void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
892     if (hdr == NULL) {
893         rte_pktmbuf_free(pkt);
894         return;
895     }
896 
897     if (pkt->ol_flags & PKT_RX_VLAN_STRIPPED) {
898         ff_mbuf_set_vlan_info(hdr, pkt->vlan_tci);
899     }
900 
901     struct rte_mbuf *pn = pkt->next;
902     void *prev = hdr;
903     while(pn != NULL) {
904         data = rte_pktmbuf_mtod(pn, void*);
905         len = rte_pktmbuf_data_len(pn);
906 
907         void *mb = ff_mbuf_get(prev, data, len);
908         if (mb == NULL) {
909             ff_mbuf_free(hdr);
910             rte_pktmbuf_free(pkt);
911             return;
912         }
913         pn = pn->next;
914         prev = mb;
915     }
916 
917     ff_veth_process_packet(ctx->ifp, hdr);
918 }
919 
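/*
 * Classify a frame so process_packets() can broadcast ARP (and IPv6 NDP)
 * to every queue and to KNI, divert KNI-bound traffic, and hand the rest
 * to the local FreeBSD stack.
 */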
920 static enum FilterReturn
921 protocol_filter(const void *data, uint16_t len)
922 {
923     if(len < RTE_ETHER_HDR_LEN) /* need a full Ethernet header */
924         return FILTER_UNKNOWN;
925 
926     const struct rte_ether_hdr *hdr;
927     const struct rte_vlan_hdr *vlanhdr;
928     hdr = (const struct rte_ether_hdr *)data;
929     uint16_t ether_type = rte_be_to_cpu_16(hdr->ether_type);
930     data += RTE_ETHER_HDR_LEN;
931     len -= RTE_ETHER_HDR_LEN;
932 
933     if (ether_type == RTE_ETHER_TYPE_VLAN) {
934         vlanhdr = (struct rte_vlan_hdr *)data;
935         ether_type = rte_be_to_cpu_16(vlanhdr->eth_proto);
936         data += sizeof(struct rte_vlan_hdr);
937         len -= sizeof(struct rte_vlan_hdr);
938     }
939 
940     if(ether_type == RTE_ETHER_TYPE_ARP)
941         return FILTER_ARP;
942 
943 #ifdef INET6
944     if (ether_type == RTE_ETHER_TYPE_IPV6) {
945         return ff_kni_proto_filter(data,
946             len, ether_type);
947     }
948 #endif
949 
950 #ifndef FF_KNI
951     return FILTER_UNKNOWN;
952 #else
953     if (!enable_kni) {
954         return FILTER_UNKNOWN;
955     }
956 
957     if(ether_type != RTE_ETHER_TYPE_IPV4)
958         return FILTER_UNKNOWN;
959 
960     return ff_kni_proto_filter(data,
961         len, ether_type);
962 #endif
963 }
964 
965 static inline void
966 pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
967 {
968     struct rte_mbuf *md;
969     void *src, *dst;
970 
971     dst = rte_pktmbuf_mtod(mi, void *);
972     src = rte_pktmbuf_mtod(m, void *);
973 
974     mi->data_len = m->data_len;
975     rte_memcpy(dst, src, m->data_len);
976 
977     mi->port = m->port;
978     mi->vlan_tci = m->vlan_tci;
979     mi->vlan_tci_outer = m->vlan_tci_outer;
980     mi->tx_offload = m->tx_offload;
981     mi->hash = m->hash;
982     mi->ol_flags = m->ol_flags;
983     mi->packet_type = m->packet_type;
984 }
985 
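/*
 * A full copy (rather than an indirect rte_pktmbuf_clone()) appears to be
 * deliberate: the clones are enqueued to other lcores' dispatch rings and
 * to KNI, so sharing the underlying data buffer across lcores is avoided.
 */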
986 /* adapted from rte_pktmbuf_clone(), but each segment is deep-copied */
987 static inline struct rte_mbuf *
988 pktmbuf_deep_clone(const struct rte_mbuf *md,
989     struct rte_mempool *mp)
990 {
991     struct rte_mbuf *mc, *mi, **prev;
992     uint32_t pktlen;
993     uint8_t nseg;
994 
995     if (unlikely ((mc = rte_pktmbuf_alloc(mp)) == NULL))
996         return NULL;
997 
998     mi = mc;
999     prev = &mi->next;
1000     pktlen = md->pkt_len;
1001     nseg = 0;
1002 
1003     do {
1004         nseg++;
1005         pktmbuf_deep_attach(mi, md);
1006         *prev = mi;
1007         prev = &mi->next;
1008     } while ((md = md->next) != NULL &&
1009         (mi = rte_pktmbuf_alloc(mp)) != NULL);
1010 
1011     *prev = NULL;
1012     mc->nb_segs = nseg;
1013     mc->pkt_len = pktlen;
1014 
1015     /* Allocation of a new segment failed */
1016     if (unlikely (mi == NULL)) {
1017         rte_pktmbuf_free(mc);
1018         return NULL;
1019     }
1020 
1021     __rte_mbuf_sanity_check(mc, 1);
1022     return mc;
1023 }
1024 
1025 static inline void
1026 process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
1027     uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
1028 {
1029     struct lcore_conf *qconf = &lcore_conf;
1030     uint16_t nb_queues = qconf->nb_queue_list[port_id];
1031 
1032     uint16_t i;
1033     for (i = 0; i < count; i++) {
1034         struct rte_mbuf *rtem = bufs[i];
1035 
1036         if (unlikely(ff_global_cfg.pcap.enable)) {
1037             if (!pkts_from_ring) {
1038                 ff_dump_packets(ff_global_cfg.pcap.save_path, rtem, ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
1039             }
1040         }
1041 
1042         void *data = rte_pktmbuf_mtod(rtem, void*);
1043         uint16_t len = rte_pktmbuf_data_len(rtem);
1044 
1045         if (!pkts_from_ring) {
1046             ff_traffic.rx_packets++;
1047             ff_traffic.rx_bytes += len;
1048         }
1049 
1050         if (!pkts_from_ring && packet_dispatcher) {
1051             int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues);
1052             if (ret == FF_DISPATCH_RESPONSE) {
1053                 rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len;
1054 
1055                 /*
1056                  * VLAN tag re-insertion on TX is not supported yet, so rebuild the VLAN header in software.
1057                  */
1058                 if (rtem->vlan_tci) {
1059                     data = rte_pktmbuf_prepend(rtem, sizeof(struct rte_vlan_hdr));
1060                     if (data != NULL) {
1061                         memmove(data, data + sizeof(struct rte_vlan_hdr), RTE_ETHER_HDR_LEN);
1062                         struct rte_ether_hdr *etherhdr = (struct rte_ether_hdr *)data;
1063                         struct rte_vlan_hdr *vlanhdr = (struct rte_vlan_hdr *)(data + RTE_ETHER_HDR_LEN);
1064                         vlanhdr->vlan_tci = rte_cpu_to_be_16(rtem->vlan_tci);
1065                         vlanhdr->eth_proto = etherhdr->ether_type;
1066                         etherhdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN);
1067                     }
1068                 }
1069                 send_single_packet(rtem, port_id);
1070                 continue;
1071             }
1072 
1073             if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) {
1074                 rte_pktmbuf_free(rtem);
1075                 continue;
1076             }
1077 
1078             if (ret != queue_id) {
1079                 ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
1080                 if (ret < 0)
1081                     rte_pktmbuf_free(rtem);
1082 
1083                 continue;
1084             }
1085         }
1086 
1087         enum FilterReturn filter = protocol_filter(data, len);
1088 #ifdef INET6
1089         if (filter == FILTER_ARP || filter == FILTER_NDP) {
1090 #else
1091         if (filter == FILTER_ARP) {
1092 #endif
1093             struct rte_mempool *mbuf_pool;
1094             struct rte_mbuf *mbuf_clone;
1095             if (!pkts_from_ring) {
1096                 uint16_t j;
1097                 for(j = 0; j < nb_queues; ++j) {
1098                     if(j == queue_id)
1099                         continue;
1100 
1101                     unsigned socket_id = 0;
1102                     if (numa_on) {
1103                         uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
1104                         socket_id = rte_lcore_to_socket_id(lcore_id);
1105                     }
1106                     mbuf_pool = pktmbuf_pool[socket_id];
1107                     mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
1108                     if(mbuf_clone) {
1109                         int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
1110                             mbuf_clone);
1111                         if (ret < 0)
1112                             rte_pktmbuf_free(mbuf_clone);
1113                     }
1114                 }
1115             }
1116 
1117 #ifdef FF_KNI
1118             if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
1119                 mbuf_pool = pktmbuf_pool[qconf->socket_id];
1120                 mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
1121                 if(mbuf_clone) {
1122                     ff_kni_enqueue(port_id, mbuf_clone);
1123                 }
1124             }
1125 #endif
1126             ff_veth_input(ctx, rtem);
1127 #ifdef FF_KNI
1128         } else if (enable_kni) {
1129             if (knictl_action == FF_KNICTL_ACTION_ALL_TO_KNI){
1130                 ff_kni_enqueue(port_id, rtem);
1131             } else if (knictl_action == FF_KNICTL_ACTION_ALL_TO_FF){
1132                 ff_veth_input(ctx, rtem);
1133             } else if (knictl_action == FF_KNICTL_ACTION_DEFAULT){
1134                 if (enable_kni &&
1135                         ((filter == FILTER_KNI && kni_accept) ||
1136                         (filter == FILTER_UNKNOWN && !kni_accept)) ) {
1137                         ff_kni_enqueue(port_id, rtem);
1138                 } else {
1139                     ff_veth_input(ctx, rtem);
1140                 }
1141             } else {
1142                 ff_veth_input(ctx, rtem);
1143             }
1144 #endif
1145         } else {
1146             ff_veth_input(ctx, rtem);
1147         }
1148     }
1149 }
1150 
1151 static inline int
1152 process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
1153     struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
1154 {
1155     /* Read packets dispatched by other lcores from the ring and process them. */
1156     uint16_t nb_rb;
1157     nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
1158         (void **)pkts_burst, MAX_PKT_BURST, NULL);
1159 
1160     if(nb_rb > 0) {
1161         process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
1162     }
1163 
1164     return 0;
1165 }
1166 
1167 static inline void
1168 handle_sysctl_msg(struct ff_msg *msg)
1169 {
1170     int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
1171         msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
1172         msg->sysctl.newlen);
1173 
1174     if (ret < 0) {
1175         msg->result = errno;
1176     } else {
1177         msg->result = 0;
1178     }
1179 }
1180 
1181 static inline void
1182 handle_ioctl_msg(struct ff_msg *msg)
1183 {
1184     int fd, ret;
1185 #ifdef INET6
1186     if (msg->msg_type == FF_IOCTL6) {
1187         fd = ff_socket(AF_INET6, SOCK_DGRAM, 0);
1188     } else
1189 #endif
1190         fd = ff_socket(AF_INET, SOCK_DGRAM, 0);
1191 
1192     if (fd < 0) {
1193         ret = -1;
1194         goto done;
1195     }
1196 
1197     ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);
1198 
1199     ff_close(fd);
1200 
1201 done:
1202     if (ret < 0) {
1203         msg->result = errno;
1204     } else {
1205         msg->result = 0;
1206     }
1207 }
1208 
1209 static inline void
1210 handle_route_msg(struct ff_msg *msg)
1211 {
1212     int ret = ff_rtioctl(msg->route.fib, msg->route.data,
1213         &msg->route.len, msg->route.maxlen);
1214     if (ret < 0) {
1215         msg->result = errno;
1216     } else {
1217         msg->result = 0;
1218     }
1219 }
1220 
1221 static inline void
1222 handle_top_msg(struct ff_msg *msg)
1223 {
1224     msg->top = ff_top_status;
1225     msg->result = 0;
1226 }
1227 
1228 #ifdef FF_NETGRAPH
1229 static inline void
1230 handle_ngctl_msg(struct ff_msg *msg)
1231 {
1232     int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
1233     if (ret < 0) {
1234         msg->result = errno;
1235     } else {
1236         msg->result = 0;
1237         msg->ngctl.ret = ret;
1238     }
1239 }
1240 #endif
1241 
1242 #ifdef FF_IPFW
1243 static inline void
1244 handle_ipfw_msg(struct ff_msg *msg)
1245 {
1246     int fd, ret;
1247     fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
1248     if (fd < 0) {
1249         ret = -1;
1250         goto done;
1251     }
1252 
1253     switch (msg->ipfw.cmd) {
1254         case FF_IPFW_GET:
1255             ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
1256                 msg->ipfw.optname, msg->ipfw.optval,
1257                 msg->ipfw.optlen);
1258             break;
1259         case FF_IPFW_SET:
1260             ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
1261                 msg->ipfw.optname, msg->ipfw.optval,
1262                 *(msg->ipfw.optlen));
1263             break;
1264         default:
1265             ret = -1;
1266             errno = ENOTSUP;
1267             break;
1268     }
1269 
1270     ff_close(fd);
1271 
1272 done:
1273     if (ret < 0) {
1274         msg->result = errno;
1275     } else {
1276         msg->result = 0;
1277     }
1278 }
1279 #endif
1280 
1281 static inline void
1282 handle_traffic_msg(struct ff_msg *msg)
1283 {
1284     msg->traffic = ff_traffic;
1285     msg->result = 0;
1286 }
1287 
1288 #ifdef FF_KNI
1289 static inline void
1290 handle_knictl_msg(struct ff_msg *msg)
1291 {
1292     if (msg->knictl.kni_cmd == FF_KNICTL_CMD_SET){
1293         switch (msg->knictl.kni_action){
1294             case FF_KNICTL_ACTION_ALL_TO_FF: knictl_action = FF_KNICTL_ACTION_ALL_TO_FF; msg->result = 0; printf("new kni action: alltoff\n"); break;
1295             case FF_KNICTL_ACTION_ALL_TO_KNI: knictl_action = FF_KNICTL_ACTION_ALL_TO_KNI; msg->result = 0; printf("new kni action: alltokni\n"); break;
1296             case FF_KNICTL_ACTION_DEFAULT: knictl_action = FF_KNICTL_ACTION_DEFAULT; msg->result = 0; printf("new kni action: default\n"); break;
1297             default: msg->result = -1;
1298         }
1299     }
1300     else if (msg->knictl.kni_cmd == FF_KNICTL_CMD_GET){
1301         msg->knictl.kni_action = knictl_action; msg->result = 0;
1302     } else {
1303         msg->result = -2;
1304     }
1305 }
1306 #endif
1307 
1308 static inline void
1309 handle_default_msg(struct ff_msg *msg)
1310 {
1311     msg->result = ENOTSUP;
1312 }
1313 
1314 static inline void
1315 handle_msg(struct ff_msg *msg, uint16_t proc_id)
1316 {
1317     switch (msg->msg_type) {
1318         case FF_SYSCTL:
1319             handle_sysctl_msg(msg);
1320             break;
1321         case FF_IOCTL:
1322 #ifdef INET6
1323         case FF_IOCTL6:
1324 #endif
1325             handle_ioctl_msg(msg);
1326             break;
1327         case FF_ROUTE:
1328             handle_route_msg(msg);
1329             break;
1330         case FF_TOP:
1331             handle_top_msg(msg);
1332             break;
1333 #ifdef FF_NETGRAPH
1334         case FF_NGCTL:
1335             handle_ngctl_msg(msg);
1336             break;
1337 #endif
1338 #ifdef FF_IPFW
1339         case FF_IPFW_CTL:
1340             handle_ipfw_msg(msg);
1341             break;
1342 #endif
1343         case FF_TRAFFIC:
1344             handle_traffic_msg(msg);
1345             break;
1346 #ifdef FF_KNI
1347         case FF_KNICTL:
1348             handle_knictl_msg(msg);
1349             break;
1350 #endif
1351         default:
1352             handle_default_msg(msg);
1353             break;
1354     }
1355     rte_ring_enqueue(msg_ring[proc_id].ring[msg->msg_type], msg);
1356 }
1357 
1358 static inline int
1359 process_msg_ring(uint16_t proc_id)
1360 {
1361     void *msg;
1362     int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);
1363 
1364     if (unlikely(ret == 0)) {
1365         handle_msg((struct ff_msg *)msg, proc_id);
1366     }
1367 
1368     return 0;
1369 }
1370 
1371 /* Send burst of packets on an output interface */
1372 static inline int
1373 send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
1374 {
1375     struct rte_mbuf **m_table;
1376     int ret;
1377     uint16_t queueid;
1378 
1379     queueid = qconf->tx_queue_id[port];
1380     m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;
1381 
1382     if (unlikely(ff_global_cfg.pcap.enable)) {
1383         uint16_t i;
1384         for (i = 0; i < n; i++) {
1385             ff_dump_packets( ff_global_cfg.pcap.save_path, m_table[i],
1386                ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
1387         }
1388     }
1389 
1390     ret = rte_eth_tx_burst(port, queueid, m_table, n);
1391     ff_traffic.tx_packets += ret;
1392     uint16_t i;
1393     for (i = 0; i < ret; i++) {
1394         ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]);
1395 #ifdef FF_USE_PAGE_ARRAY
1396         if (qconf->tx_mbufs[port].bsd_m_table[i])
1397             ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs);
1398 #endif
1399     }
1400     if (unlikely(ret < n)) {
1401         do {
1402             rte_pktmbuf_free(m_table[ret]);
1403 #ifdef FF_USE_PAGE_ARRAY
1404             if ( qconf->tx_mbufs[port].bsd_m_table[ret] )
1405                 ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]);
1406 #endif
1407         } while (++ret < n);
1408     }
1409     return 0;
1410 }
1411 
1412 /* Enqueue a single packet, and send burst if queue is filled */
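/*
 * Note: packets buffered below are flushed either when the per-port buffer
 * reaches MAX_PKT_BURST or by the periodic TX drain in main_loop(), at most
 * pkt_tx_delay microseconds later.
 */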
1413 static inline int
1414 send_single_packet(struct rte_mbuf *m, uint8_t port)
1415 {
1416     uint16_t len;
1417     struct lcore_conf *qconf;
1418 
1419     qconf = &lcore_conf;
1420     len = qconf->tx_mbufs[port].len;
1421     qconf->tx_mbufs[port].m_table[len] = m;
1422     len++;
1423 
1424     /* enough pkts to be sent */
1425     if (unlikely(len == MAX_PKT_BURST)) {
1426         send_burst(qconf, MAX_PKT_BURST, port);
1427         len = 0;
1428     }
1429 
1430     qconf->tx_mbufs[port].len = len;
1431     return 0;
1432 }
1433 
1434 int
1435 ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
1436     int total)
1437 {
1438 #ifdef FF_USE_PAGE_ARRAY
1439     struct lcore_conf *qconf = &lcore_conf;
1440     int    len = 0;
1441 
1442     len = ff_if_send_onepkt(ctx, m, total);
1443     if (unlikely(len == MAX_PKT_BURST)) {
1444         send_burst(qconf, MAX_PKT_BURST, ctx->port_id);
1445         len = 0;
1446     }
1447     qconf->tx_mbufs[ctx->port_id].len = len;
1448     return 0;
1449 #endif
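    /*
     * Non-FF_USE_PAGE_ARRAY path: copy the BSD mbuf chain into a fresh
     * DPDK mbuf chain segment by segment, then apply TX offload flags.
     */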
1450     struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
1451     struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
1452     if (head == NULL) {
1453         ff_mbuf_free(m);
1454         return -1;
1455     }
1456 
1457     head->pkt_len = total;
1458     head->nb_segs = 0;
1459 
1460     int off = 0;
1461     struct rte_mbuf *cur = head, *prev = NULL;
1462     while(total > 0) {
1463         if (cur == NULL) {
1464             cur = rte_pktmbuf_alloc(mbuf_pool);
1465             if (cur == NULL) {
1466                 rte_pktmbuf_free(head);
1467                 ff_mbuf_free(m);
1468                 return -1;
1469             }
1470         }
1471 
1472         if (prev != NULL) {
1473             prev->next = cur;
1474         }
1475         head->nb_segs++;
1476 
1477         prev = cur;
1478         void *data = rte_pktmbuf_mtod(cur, void*);
1479         int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
1480         int ret = ff_mbuf_copydata(m, data, off, len);
1481         if (ret < 0) {
1482             rte_pktmbuf_free(head);
1483             ff_mbuf_free(m);
1484             return -1;
1485         }
1486 
1487 
1488         cur->data_len = len;
1489         off += len;
1490         total -= len;
1491         cur = NULL;
1492     }
1493 
1494     struct ff_tx_offload offload = {0};
1495     ff_mbuf_tx_offload(m, &offload);
1496 
1497     void *data = rte_pktmbuf_mtod(head, void*);
1498 
1499     if (offload.ip_csum) {
1500         /* ipv6 not supported yet */
1501         struct rte_ipv4_hdr *iph;
1502         int iph_len;
1503         iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN);
1504         iph_len = (iph->version_ihl & 0x0f) << 2;
1505 
1506         head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
1507         head->l2_len = RTE_ETHER_HDR_LEN;
1508         head->l3_len = iph_len;
1509     }
1510 
1511     if (ctx->hw_features.tx_csum_l4) {
1512         struct rte_ipv4_hdr *iph;
1513         int iph_len;
1514         iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN);
1515         iph_len = (iph->version_ihl & 0x0f) << 2;
1516 
1517         if (offload.tcp_csum) {
1518             head->ol_flags |= PKT_TX_TCP_CKSUM;
1519             head->l2_len = RTE_ETHER_HDR_LEN;
1520             head->l3_len = iph_len;
1521         }
1522 
1523         /*
1524          *  TCP segmentation offload.
1525          *
1526          *  - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
1527          *    implies PKT_TX_TCP_CKSUM)
1528          *  - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
1529          *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
1530          *    write the IP checksum to 0 in the packet
1531          *  - fill the mbuf offload information: l2_len,
1532          *    l3_len, l4_len, tso_segsz
1533          *  - calculate the pseudo header checksum without taking ip_len
1534          *    in account, and set it in the TCP header. Refer to
1535          *    rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
1536          *    used as helpers.
1537          */
1538         if (offload.tso_seg_size) {
1539             struct rte_tcp_hdr *tcph;
1540             int tcph_len;
1541             tcph = (struct rte_tcp_hdr *)((char *)iph + iph_len);
1542             tcph_len = (tcph->data_off & 0xf0) >> 2;
1543             tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);
1544 
1545             head->ol_flags |= PKT_TX_TCP_SEG;
1546             head->l4_len = tcph_len;
1547             head->tso_segsz = offload.tso_seg_size;
1548         }
1549 
1550         if (offload.udp_csum) {
1551             head->ol_flags |= PKT_TX_UDP_CKSUM;
1552             head->l2_len = RTE_ETHER_HDR_LEN;
1553             head->l3_len = iph_len;
1554         }
1555     }
1556 
1557     ff_mbuf_free(m);
1558 
1559     return send_single_packet(head, ctx->port_id);
1560 }
1561 
1562 static int
1563 main_loop(void *arg)
1564 {
1565     struct loop_routine *lr = (struct loop_routine *)arg;
1566 
1567     struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1568     uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc;
1569     int i, j, nb_rx, idle;
1570     uint16_t port_id, queue_id;
1571     struct lcore_conf *qconf;
1572     uint64_t drain_tsc = 0;
1573     struct ff_dpdk_if_context *ctx;
1574 
1575     if (pkt_tx_delay) {
1576         drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * pkt_tx_delay;
1577     }
1578 
1579     prev_tsc = 0;
1580     usch_tsc = 0;
1581 
1582     qconf = &lcore_conf;
1583 
1584     while (1) {
1585         cur_tsc = rte_rdtsc();
1586         if (unlikely(freebsd_clock.expire < cur_tsc)) {
1587             rte_timer_manage();
1588         }
1589 
1590         idle = 1;
1591         sys_tsc = 0;
1592         usr_tsc = 0;
1593 
1594         /*
1595          * TX burst queue drain
1596          */
1597         diff_tsc = cur_tsc - prev_tsc;
1598         if (unlikely(diff_tsc >= drain_tsc)) {
1599             for (i = 0; i < qconf->nb_tx_port; i++) {
1600                 port_id = qconf->tx_port_id[i];
1601                 if (qconf->tx_mbufs[port_id].len == 0)
1602                     continue;
1603 
1604                 idle = 0;
1605 
1606                 send_burst(qconf,
1607                     qconf->tx_mbufs[port_id].len,
1608                     port_id);
1609                 qconf->tx_mbufs[port_id].len = 0;
1610             }
1611 
1612             prev_tsc = cur_tsc;
1613         }
1614 
1615         /*
1616          * Read packet from RX queues
1617          */
1618         for (i = 0; i < qconf->nb_rx_queue; ++i) {
1619             port_id = qconf->rx_queue_list[i].port_id;
1620             queue_id = qconf->rx_queue_list[i].queue_id;
1621             ctx = veth_ctx[port_id];
1622 
1623 #ifdef FF_KNI
1624             if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
1625                 ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
1626             }
1627 #endif
1628 
1629             process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);
1630 
1631             nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
1632                 MAX_PKT_BURST);
1633             if (nb_rx == 0)
1634                 continue;
1635 
1636             idle = 0;
1637 
1638             /* Prefetch first packets */
1639             for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
1640                 rte_prefetch0(rte_pktmbuf_mtod(
1641                         pkts_burst[j], void *));
1642             }
1643 
1644             /* Prefetch and handle already prefetched packets */
1645             for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
1646                 rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
1647                         j + PREFETCH_OFFSET], void *));
1648                 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
1649             }
1650 
1651             /* Handle remaining prefetched packets */
1652             for (; j < nb_rx; j++) {
1653                 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
1654             }
1655         }
1656 
1657         process_msg_ring(qconf->proc_id);
1658 
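        /*
         * TSC sampling points for ff_top accounting: cur_tsc (loop start),
         * div_tsc (after packet and message processing), idle_sleep_tsc
         * (after the user loop), end_tsc (after the optional idle usleep).
         * Packet work counts as sys time, the user loop as usr time, and
         * the remainder of the iteration as idle.
         */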
1659         div_tsc = rte_rdtsc();
1660 
1661         if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc >= drain_tsc))) {
1662             usch_tsc = cur_tsc;
1663             lr->loop(lr->arg);
1664         }
1665 
1666         idle_sleep_tsc = rte_rdtsc();
1667         if (likely(idle && idle_sleep)) {
1668             usleep(idle_sleep);
1669             end_tsc = rte_rdtsc();
1670         } else {
1671             end_tsc = idle_sleep_tsc;
1672         }
1673 
1674         if (usch_tsc == cur_tsc) {
1675             usr_tsc = idle_sleep_tsc - div_tsc;
1676         }
1677 
1678         if (!idle) {
1679             sys_tsc = div_tsc - cur_tsc;
1680             ff_top_status.sys_tsc += sys_tsc;
1681         }
1682 
1683         ff_top_status.usr_tsc += usr_tsc;
1684         ff_top_status.work_tsc += end_tsc - cur_tsc;
1685         ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;
1686 
1687         ff_top_status.loops++;
1688     }
1689 
1690     return 0;
1691 }
1692 
1693 int
1694 ff_dpdk_if_up(void) {
1695     int i;
1696     struct lcore_conf *qconf = &lcore_conf;
1697     for (i = 0; i < qconf->nb_tx_port; i++) {
1698         uint16_t port_id = qconf->tx_port_id[i];
1699 
1700         struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
1701         veth_ctx[port_id] = ff_veth_attach(pconf);
1702         if (veth_ctx[port_id] == NULL) {
1703         rte_exit(EXIT_FAILURE, "ff_veth_attach failed\n");
1704         }
1705     }
1706 
1707     return 0;
1708 }
1709 
1710 void
1711 ff_dpdk_run(loop_func_t loop, void *arg) {
1712     struct loop_routine *lr = rte_malloc(NULL,
1713         sizeof(struct loop_routine), 0);
1714     lr->loop = loop;
1715     lr->arg = arg;
1716     rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
1717     rte_eal_mp_wait_lcore();
1718     rte_free(lr);
1719 }
1720 
1721 void
1722 ff_dpdk_pktmbuf_free(void *m)
1723 {
1724     rte_pktmbuf_free((struct rte_mbuf *)m);
1725 }
1726 
1727 static uint32_t
1728 toeplitz_hash(unsigned keylen, const uint8_t *key,
1729     unsigned datalen, const uint8_t *data)
1730 {
1731     uint32_t hash = 0, v;
1732     u_int i, b;
1733 
1734     /* XXXRW: Perhaps an assertion about key length vs. data length? */
1735 
1736     v = (key[0]<<24) + (key[1]<<16) + (key[2] <<8) + key[3];
1737     for (i = 0; i < datalen; i++) {
1738         for (b = 0; b < 8; b++) {
1739             if (data[i] & (1<<(7-b)))
1740                 hash ^= v;
1741             v <<= 1;
1742             if ((i + 4) < keylen &&
1743                 (key[i+4] & (1<<(7-b))))
1744                 v |= 1;
1745         }
1746     }
1747     return (hash);
1748 }
1749 
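/*
 * Return nonzero if NIC RSS would steer the given 4-tuple to this lcore's
 * own queue; f-stack uses this (e.g. when choosing local ports) so each
 * lcore only owns connections whose inbound traffic it will receive.
 */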
1750 int
1751 ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
1752     uint16_t sport, uint16_t dport)
1753 {
1754     struct lcore_conf *qconf = &lcore_conf;
1755     struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
1756     uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];
1757 
1758     if (nb_queues <= 1) {
1759         return 1;
1760     }
1761 
1762     uint16_t reta_size = rss_reta_size[ctx->port_id];
1763     uint16_t queueid = qconf->tx_queue_id[ctx->port_id];
1764 
1765     uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
1766         sizeof(dport)];
1767 
1768     unsigned datalen = 0;
1769 
1770     bcopy(&saddr, &data[datalen], sizeof(saddr));
1771     datalen += sizeof(saddr);
1772 
1773     bcopy(&daddr, &data[datalen], sizeof(daddr));
1774     datalen += sizeof(daddr);
1775 
1776     bcopy(&sport, &data[datalen], sizeof(sport));
1777     datalen += sizeof(sport);
1778 
1779     bcopy(&dport, &data[datalen], sizeof(dport));
1780     datalen += sizeof(dport);
1781 
1782     uint32_t hash = 0;
1783     if (!use_rsskey_52bytes)
1784         hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
1785             default_rsskey_40bytes, datalen, data);
1786     else
1787         hash = toeplitz_hash(sizeof(default_rsskey_52bytes),
1788             default_rsskey_52bytes, datalen, data);
1789     return ((hash & (reta_size - 1)) % nb_queues) == queueid;
1790 }
1791 
1792 void
1793 ff_regist_packet_dispatcher(dispatch_func_t func)
1794 {
1795     packet_dispatcher = func;
1796 }
1797 
1798 uint64_t
1799 ff_get_tsc_ns()
1800 {
1801     uint64_t cur_tsc = rte_rdtsc();
1802     uint64_t hz = rte_get_tsc_hz();
1803     return ((double)cur_tsc/(double)hz) * NS_PER_S;
1804 }
1805 
1806