/* f-stack: lib/ff_dpdk_if.c (revision ec61049c) */
/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <unistd.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"

#define MEMPOOL_CACHE_SIZE 256

#define DISPATCH_RING_SIZE 2048

#define MSG_RING_SIZE 32

/*
 * Configurable number of RX/TX ring descriptors
 */
#define RX_QUEUE_SIZE 512
#define TX_QUEUE_SIZE 512

#define MAX_PKT_BURST 32
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

/*
 * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send.
 */
#define MAX_TX_BURST    (MAX_PKT_BURST / 2)

#define NB_SOCKETS 8

/* Configure how many packets ahead to prefetch when reading packets */
#define PREFETCH_OFFSET    3

#define MAX_RX_QUEUE_PER_LCORE 16
#define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
#define MAX_RX_QUEUE_PER_PORT 128

#ifdef FF_KNI
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

static int enable_kni;
static int kni_accept;
#endif

static int numa_on;

static unsigned idle_sleep;

static struct rte_timer freebsd_clock;

/* Default RSS key: the one used by Mellanox's Linux driver. */
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

struct mbuf_table {
    uint16_t len;
    struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct lcore_rx_queue {
    uint16_t port_id;
    uint16_t queue_id;
} __rte_cache_aligned;

struct lcore_conf {
    uint16_t proc_id;
    uint16_t socket_id;
    uint16_t nb_queue_list[RTE_MAX_ETHPORTS];
    struct ff_port_cfg *port_cfgs;

    uint16_t nb_rx_queue;
    struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
    uint16_t nb_tx_port;
    uint16_t tx_port_id[RTE_MAX_ETHPORTS];
    uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
    struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
    char *pcap[RTE_MAX_ETHPORTS];
} __rte_cache_aligned;

static struct lcore_conf lcore_conf;

static struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

struct ff_msg_ring {
    char ring_name[2][RTE_RING_NAMESIZE];
    /* ring[0]: other processes send, this lcore receives */
    /* ring[1]: this lcore sends, other processes read */
    struct rte_ring *ring[2];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;

struct ff_dpdk_if_context {
    void *sc;
    void *ifp;
    uint16_t port_id;
    struct ff_hw_features hw_features;
} __rte_cache_aligned;

static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

static struct ff_top_args ff_top_status;
static struct ff_traffic_args ff_traffic;

extern void ff_hardclock(void);

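/* Timer callback: drive the FreeBSD stack's hardclock tick and refresh the cached timestamp. */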
static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
    ff_update_current_ts();
}

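/*
 * Allocate a per-interface context binding the f-stack softc/ifp to a DPDK
 * port and its hardware offload capabilities. Freed by ff_dpdk_deregister_if().
 */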
struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

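/* Poll link status of all configured ports until every link is up or the ~9s timeout expires. */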
static void
check_all_ports_link_status(void)
{
    #define CHECK_INTERVAL 100 /* 100ms */
    #define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u "
                        "Mbps - %s\n", (int)portid,
                        (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

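/*
 * Build this process's lcore_conf from the global config: validate the lcores,
 * then record the RX/TX queue of every port this lcore serves.
 */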
static int
init_lcore_conf(void)
{
    uint8_t nb_dev_ports = rte_eth_dev_count_avail();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
                 ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    uint16_t proc_id;
    for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) {
        uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id];
        if (!lcore_config[lcore_id].detected) {
            rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
        }
    }

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %d\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        lcore_conf.pcap[port_id] = pconf->pcap;
        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}

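/*
 * Create (primary process) or look up (secondary) one pktmbuf pool per NUMA
 * socket, sized from the configured queue, burst, KNI and dispatch-ring demands.
 */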
static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;

    unsigned nb_mbuf = RTE_MAX(
        (nb_rx_queue*RX_QUEUE_SIZE          +
        nb_ports*nb_lcores*MAX_PKT_BURST    +
        nb_ports*nb_tx_queue*TX_QUEUE_SIZE  +
        nb_lcores*MEMPOOL_CACHE_SIZE +
#ifdef FF_KNI
        nb_ports*KNI_MBUF_MAX +
        nb_ports*KNI_QUEUE_SIZE +
#endif
        nb_lcores*nb_ports*DISPATCH_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %u of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }
    }

    return 0;
}

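/* Create a ring in the primary process, or look up the existing one in a secondary. */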
static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL) {
        rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ring = rte_ring_create(name, count, socket_id, flags);
    } else {
        ring = rte_ring_lookup(name);
    }

    if (ring == NULL) {
        rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
    }

    return ring;
}

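/*
 * Set up the per-port, per-queue rings used to hand a packet over to the
 * lcore that owns its queue when the dispatcher redirects it.
 */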
static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create rings for the ports actually in use. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

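/* Mempool object constructor: point each ff_msg's buffer just past its header. */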
static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
}

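/* Create the message mempool and one in/out ring pair per f-stack process. */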
static int
init_msg_ring(void)
{
    uint16_t i;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
           MSG_RING_SIZE * 2 * nb_procs,
           MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
           NULL, NULL, ff_msg_init, NULL,
           socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        snprintf(msg_ring[i].ring_name[1], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_OUT, i);

        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[0]);

        msg_ring[i].ring[1] = create_ring(msg_ring[i].ring_name[1],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[1] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[1]);
    }

    return 0;
}

#ifdef FF_KNI
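/* Initialize the KNI subsystem and allocate one KNI device per configured port. */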
static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count_avail();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
#endif

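/* Spread the RSS redirection table entries round-robin across the RX queues. */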
static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}

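/*
 * Configure and start every port: query offload capabilities, set up RSS,
 * checksum/TSO offloads, RX/TX queues, and optional promiscuous mode and pcap.
 */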
static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        struct rte_eth_dev_info dev_info;
        rte_eth_dev_info_get(port_id, &dev_info);

        if (nb_queues > dev_info.max_rx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                nb_queues,
                dev_info.max_rx_queues);
        }

        if (nb_queues > dev_info.max_tx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                nb_queues,
                dev_info.max_tx_queues);
        }

        struct ether_addr addr;
        rte_eth_macaddr_get(port_id, &addr);
        printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                   " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                (unsigned)port_id,
                addr.addr_bytes[0], addr.addr_bytes[1],
                addr.addr_bytes[2], addr.addr_bytes[3],
                addr.addr_bytes[4], addr.addr_bytes[5]);

        rte_memcpy(pconf->mac,
            addr.addr_bytes, ETHER_ADDR_LEN);

        struct rte_eth_conf port_conf = {0};

        /* Set RSS mode */
        uint64_t default_rss_hf = ETH_RSS_PROTO_MASK;
        port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
        port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf;
        port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
        port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;
        port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads;
        if (port_conf.rx_adv_conf.rss_conf.rss_hf !=
                ETH_RSS_PROTO_MASK) {
            printf("Port %u modified RSS hash function based on hardware support, "
                    "requested:%#"PRIx64" configured:%#"PRIx64"\n",
                    port_id, default_rss_hf,
                    port_conf.rx_adv_conf.rss_conf.rss_hf);
        }

        if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
            port_conf.txmode.offloads |=
                DEV_TX_OFFLOAD_MBUF_FAST_FREE;

        /* Set Rx VLAN stripping */
        if (ff_global_cfg.dpdk.vlan_strip) {
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
            }
        }

        /* Enable HW CRC stripping */
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_KEEP_CRC) {
            port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_KEEP_CRC;
        }

        /* FIXME: Enable TCP LRO? */
        #if 0
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
            printf("LRO is supported\n");
            port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
            pconf->hw_features.rx_lro = 1;
        }
        #endif

        /* Set Rx checksum checking */
        if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
            printf("RX checksum offload supported\n");
            port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
            pconf->hw_features.rx_csum = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
            printf("TX ip checksum offload supported\n");
            port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
            pconf->hw_features.tx_csum_ip = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            printf("TX TCP&UDP checksum offload supported\n");
            port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM;
            pconf->hw_features.tx_csum_l4 = 1;
        }

        if (ff_global_cfg.dpdk.tso) {
            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                printf("TSO is supported\n");
                port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
                pconf->hw_features.tx_tso = 1;
            }
        } else {
            printf("TSO is disabled\n");
        }

        if (dev_info.reta_size) {
            /* reta size must be power of 2 */
            assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

            rss_reta_size[port_id] = dev_info.reta_size;
            printf("port[%d]: rss table size: %d\n", port_id,
                dev_info.reta_size);
        }

        if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
            continue;
        }

        int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
        if (ret != 0) {
            return ret;
        }
        uint16_t q;
        for (q = 0; q < nb_queues; q++) {
            if (numa_on) {
                uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                socketid = rte_lcore_to_socket_id(lcore_id);
            }
            mbuf_pool = pktmbuf_pool[socketid];

            ret = rte_eth_tx_queue_setup(port_id, q, TX_QUEUE_SIZE,
                socketid, &dev_info.default_txconf);
            if (ret < 0) {
                return ret;
            }

            ret = rte_eth_rx_queue_setup(port_id, q, RX_QUEUE_SIZE,
                socketid, &dev_info.default_rxconf, mbuf_pool);
            if (ret < 0) {
                return ret;
            }
        }

        ret = rte_eth_dev_start(port_id);
        if (ret < 0) {
            return ret;
        }

        if (nb_queues > 1) {
            /* set HW rss hash function to Toeplitz. */
            if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                struct rte_eth_hash_filter_info info = {0};
                info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                    RTE_ETH_FILTER_SET, &info) < 0) {
                    rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                        port_id);
                }
            }

            set_rss_table(port_id, dev_info.reta_size, nb_queues);
        }

        /* Enable RX in promiscuous mode for the Ethernet device. */
        if (ff_global_cfg.dpdk.promiscuous) {
            rte_eth_promiscuous_enable(port_id);
            ret = rte_eth_promiscuous_get(port_id);
            if (ret == 1) {
                printf("set port %u to promiscuous mode ok\n", port_id);
            } else {
                printf("set port %u to promiscuous mode error\n", port_id);
            }
        }

        /* Enable pcap dump */
        if (pconf->pcap) {
            ff_enable_pcap(pconf->pcap);
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

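/* Arm the periodic DPDK timer that emulates the FreeBSD hardclock at the configured hz. */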
static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}

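/* Entry point: validate the process config, init EAL, then all the subsystems above. */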
int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    idle_sleep = ff_global_cfg.dpdk.idle_sleep;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

#ifdef FF_KNI
    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }
#endif

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

    return 0;
}

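/*
 * Hand a received mbuf chain to the FreeBSD stack: drop it on a bad hardware
 * checksum, wrap each segment in an ff mbuf, then inject it into the veth.
 */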
static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    /*
     * FIXME: should we save pkt->vlan_tci
     * if (pkt->ol_flags & PKT_RX_VLAN_PKT)
     */

    void *data = rte_pktmbuf_mtod(pkt, void*);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void*);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

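/* Classify a frame by L2/L3 type so ARP (and, with KNI enabled, kernel-bound traffic) gets special handling. */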
static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct ether_hdr *hdr;
    hdr = (const struct ether_hdr *)data;

    if (ntohs(hdr->ether_type) == ETHER_TYPE_ARP)
        return FILTER_ARP;

#ifndef FF_KNI
    return FILTER_UNKNOWN;
#else
    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ntohs(hdr->ether_type) != ETHER_TYPE_IPv4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data + ETHER_HDR_LEN,
        len - ETHER_HDR_LEN);
#endif
}

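/* "Attach" by deep copy: duplicate one segment's data and metadata into mi. */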
static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}

/* Deep-copy variant of rte_pktmbuf_clone(): copies data instead of attaching. */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of a new segment failed */
    if (unlikely(mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}

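/*
 * Per-packet fast path: optionally dump to pcap, run the user dispatcher,
 * broadcast ARP clones to the other queues (and KNI), then deliver to the stack.
 */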
static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(qconf->pcap[port_id] != NULL)) {
            if (!pkts_from_ring) {
                ff_dump_packets(qconf->pcap[port_id], rtem);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void*);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring) {
            ff_traffic.rx_packets++;
            ff_traffic.rx_bytes += len;
        }

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, len, queue_id, nb_queues);
            if (ret < 0 || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
        if (filter == FILTER_ARP) {
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni &&
            ((filter == FILTER_KNI && kni_accept) ||
            (filter == FILTER_UNKNOWN && !kni_accept))) {
            ff_kni_enqueue(port_id, rtem);
#endif
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* Read packets from the dispatch ring and process them. */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_DGRAM, 0);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_top_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_traffic_msg(struct ff_msg *msg)
{
    msg->traffic = ff_traffic;
    msg->result = 0;
}

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

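/* Dispatch one control message to its handler, then post the reply on the out-ring. */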
static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        case FF_TRAFFIC:
            handle_traffic_msg(msg);
            break;
        default:
            handle_default_msg(msg);
            break;
    }
    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

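/* Poll this process's in-ring for one control message and handle it if present. */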
static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(qconf->pcap[port] != NULL)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(qconf->pcap[port], m_table[i]);
        }
    }

    ff_traffic.tx_packets += n;
    uint16_t i;
    for (i = 0; i < n; i++) {
        ff_traffic.tx_bytes += rte_pktmbuf_data_len(m_table[i]);
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
        } while (++ret < n);
    }

    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}

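/*
 * Transmit path: copy the stack's mbuf chain into DPDK mbufs, translate
 * checksum/TSO offload requests into ol_flags, and queue for burst TX.
 */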
int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        head->nb_segs++;

        prev = cur;
        void *data = rte_pktmbuf_mtod(cur, void*);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        cur->data_len = len;
        off += len;
        total -= len;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void*);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         *  TCP segmentation offload.
         *
         *  - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *    implies PKT_TX_TCP_CKSUM)
         *  - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *    write the IP checksum to 0 in the packet
         *  - fill the mbuf offload information: l2_len,
         *    l3_len, l4_len, tso_segsz
         *  - calculate the pseudo header checksum without taking ip_len
         *    in account, and set it in the TCP header. Refer to
         *    rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *    used as helpers.
         */
        if (offload.tso_seg_size) {
            struct tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

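/*
 * Per-lcore event loop: manage timers, drain pending TX bursts, poll the
 * dispatch ring and RX queues, run the user loop, and account busy/idle time.
 */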
static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc;
    int i, j, nb_rx, idle;
    uint16_t port_id, queue_id;
    struct lcore_conf *qconf;
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
        US_PER_S * BURST_TX_DRAIN_US;
    struct ff_dpdk_if_context *ctx;

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc > drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packet from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }
#endif

            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                        pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                        j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc > drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        idle_sleep_tsc = rte_rdtsc();
        if (likely(idle && idle_sleep)) {
            usleep(idle_sleep);
            end_tsc = rte_rdtsc();
        } else {
            end_tsc = idle_sleep_tsc;
        }

        if (usch_tsc == cur_tsc) {
            usr_tsc = idle_sleep_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_top_status.sys_tsc += sys_tsc;
        }

        ff_top_status.usr_tsc += usr_tsc;
        ff_top_status.work_tsc += end_tsc - cur_tsc;
        ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_top_status.loops++;
    }

    return 0;
}

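/* Attach a veth context to every port this lcore transmits on. */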
int
ff_dpdk_if_up(void) {
    int i;
    struct lcore_conf *qconf = &lcore_conf;
    for (i = 0; i < qconf->nb_tx_port; i++) {
        uint16_t port_id = qconf->tx_port_id[i];

        struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
        veth_ctx[port_id] = ff_veth_attach(pconf);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed\n");
        }
    }

    return 0;
}

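/* Launch main_loop with the user's loop callback on all configured lcores and wait. */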
void
ff_dpdk_run(loop_func_t loop, void *arg) {
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    if (lr == NULL) {
        rte_exit(EXIT_FAILURE, "rte_malloc loop_routine failed\n");
    }
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

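/* Software Toeplitz hash; mirrors the hash the NIC computes with the same RSS key. */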
static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1<<(7-b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i+4] & (1<<(7-b))))
                v |= 1;
        }
    }
    return (hash);
}

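/*
 * Return nonzero if RSS would steer the given 4-tuple to this lcore's own
 * queue, using the default Toeplitz key and the power-of-two redirection table.
 */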
int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;
    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];

    if (nb_queues <= 1) {
        return 1;
    }

    uint16_t reta_size = rss_reta_size[ctx->port_id];
    uint16_t queueid = qconf->tx_queue_id[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
        default_rsskey_40bytes, datalen, data);

    return ((hash & (reta_size - 1)) % nb_queues) == queueid;
}

void
ff_regist_packet_dispatcher(dispatch_func_t func)
{
    packet_dispatcher = func;
}

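/* Convert the current TSC reading to nanoseconds. */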
uint64_t
ff_get_tsc_ns(void)
{
    uint64_t cur_tsc = rte_rdtsc();
    uint64_t hz = rte_get_tsc_hz();
    return ((double)cur_tsc / (double)hz) * NS_PER_S;
}
1640