xref: /f-stack/lib/ff_dpdk_if.c (revision 490ee526)
/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <unistd.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"

#define MEMPOOL_CACHE_SIZE 256

#define DISPATCH_RING_SIZE 2048

#define MSG_RING_SIZE 32

/*
 * Configurable number of RX/TX ring descriptors
 */
#define RX_QUEUE_SIZE 512
#define TX_QUEUE_SIZE 512

#define MAX_PKT_BURST 32
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

/*
 * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send.
 */
#define MAX_TX_BURST    (MAX_PKT_BURST / 2)

#define NB_SOCKETS 8

/* How many packets ahead to prefetch when reading packets */
#define PREFETCH_OFFSET    3

#define MAX_RX_QUEUE_PER_LCORE 16
#define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
#define MAX_RX_QUEUE_PER_PORT 128

#ifdef FF_KNI
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

static int enable_kni;
static int kni_accept;
#endif

static int numa_on;

static unsigned idle_sleep;

static struct rte_timer freebsd_clock;

/* Default RSS hash key: the key used by Mellanox's Linux driver */
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

struct mbuf_table {
    uint16_t len;
    struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct lcore_rx_queue {
    uint16_t port_id;
    uint16_t queue_id;
} __rte_cache_aligned;

struct lcore_conf {
    uint16_t proc_id;
    uint16_t socket_id;
    uint16_t nb_queue_list[RTE_MAX_ETHPORTS];
    struct ff_port_cfg *port_cfgs;

    uint16_t nb_rx_queue;
    struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
    uint16_t nb_tx_port;
    uint16_t tx_port_id[RTE_MAX_ETHPORTS];
    uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
    struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
    char *pcap[RTE_MAX_ETHPORTS];
} __rte_cache_aligned;

static struct lcore_conf lcore_conf;

static struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

struct ff_msg_ring {
    char ring_name[2][RTE_RING_NAMESIZE];
    /* ring[0]: the lcore receives msgs on it, other processes send */
    /* ring[1]: the lcore sends msgs on it, other processes read */
    struct rte_ring *ring[2];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;

struct ff_dpdk_if_context {
    void *sc;
    void *ifp;
    uint16_t port_id;
    struct ff_hw_features hw_features;
} __rte_cache_aligned;

static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

static struct ff_top_args ff_top_status;
static struct ff_traffic_args ff_traffic;

extern void ff_hardclock(void);

static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
    ff_update_current_ts();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
    #define CHECK_INTERVAL 100 /* 100ms */
    #define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u "
                        "Mbps - %s\n", (int)portid,
                        (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

static int
init_lcore_conf(void)
{
    uint8_t nb_dev_ports = rte_eth_dev_count_avail();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
                 ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    uint16_t proc_id;
    for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) {
        uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id];
        if (!lcore_config[lcore_id].detected) {
            rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
        }
    }

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %d\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        lcore_conf.pcap[port_id] = pconf->pcap;
        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}

static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;

    unsigned nb_mbuf = RTE_MAX(
        (nb_rx_queue*RX_QUEUE_SIZE          +
        nb_ports*nb_lcores*MAX_PKT_BURST    +
        nb_ports*nb_tx_queue*TX_QUEUE_SIZE  +
        nb_lcores*MEMPOOL_CACHE_SIZE +
#ifdef FF_KNI
        nb_ports*KNI_MBUF_MAX +
        nb_ports*KNI_QUEUE_SIZE +
#endif
        nb_lcores*nb_ports*DISPATCH_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %u of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%u", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%u", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %u\n", socketid);
        } else {
            printf("create mbuf pool on socket %u\n", socketid);
        }
    }

    return 0;
}
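/*
 * Sizing example for nb_mbuf above (illustrative numbers, not taken from
 * any config): with nb_ports = 1, nb_lcores = nb_procs = 2 and one RX
 * queue per lcore (so nb_rx_queue = 2), and without FF_KNI:
 *   2*512 (RX descs) + 1*2*32 (bursts) + 1*2*512 (TX descs)
 *   + 2*256 (mempool cache) + 2*1*2048 (dispatch rings) = 6720,
 * which is below the RTE_MAX floor, so the pool is created with 8192 mbufs.
 */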

static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL) {
        rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ring = rte_ring_create(name, count, socket_id, flags);
    } else {
        ring = rte_ring_lookup(name);
    }

    if (ring == NULL) {
        rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
    }

    return ring;
}

static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create rings according to the ports actually being used. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
}
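/*
 * Each element of the message pool is a struct ff_msg header immediately
 * followed by its payload buffer:
 *
 *   |<- sizeof(struct ff_msg) ->|<- elt_size - sizeof(struct ff_msg) ->|
 *   |      ff_msg header        |      buffer at msg->buf_addr         |
 *
 * so one mempool get/put moves the descriptor and its data together.
 */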

static int
init_msg_ring(void)
{
    uint16_t i;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
           MSG_RING_SIZE * 2 * nb_procs,
           MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
           NULL, NULL, ff_msg_init, NULL,
           socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        snprintf(msg_ring[i].ring_name[1], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_OUT, i);

        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[0]);

        msg_ring[i].ring[1] = create_ring(msg_ring[i].ring_name[1],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[1] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[1]);
    }

    return 0;
}
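/*
 * How a control process talks to F-Stack process i over these rings, as a
 * minimal sketch using only generic DPDK calls (ring names are built from
 * FF_MSG_RING_IN/FF_MSG_RING_OUT plus the proc id; error handling and the
 * exact request fields are omitted):
 *
 *   struct rte_mempool *mp = rte_mempool_lookup(FF_MSG_POOL);
 *   struct rte_ring *in, *out;        // looked up by the names built above
 *   struct ff_msg *msg;
 *   if (rte_mempool_get(mp, (void **)&msg) == 0) {
 *       msg->msg_type = FF_TOP;       // fill in a request
 *       rte_ring_enqueue(in, msg);    // handle_msg() consumes it ...
 *       void *reply;
 *       while (rte_ring_dequeue(out, &reply) != 0)
 *           ;                         // ... and replies on ring[1]
 *       rte_mempool_put(mp, reply);   // return the buffer to the pool
 *   }
 */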

#ifdef FF_KNI
static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count_avail();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
#endif

static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}
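/*
 * Example (illustrative numbers): with reta_size = 128 and nb_queues = 4,
 * the loop above fills the indirection table as 0,1,2,3,0,1,2,3,... so the
 * NIC spreads hash values round-robin over the four RX queues.
 * ff_rss_check() below assumes exactly this layout.
 */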

static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        struct rte_eth_dev_info dev_info;
        rte_eth_dev_info_get(port_id, &dev_info);

        if (nb_queues > dev_info.max_rx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                nb_queues,
                dev_info.max_rx_queues);
        }

        if (nb_queues > dev_info.max_tx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                nb_queues,
                dev_info.max_tx_queues);
        }

        struct ether_addr addr;
        rte_eth_macaddr_get(port_id, &addr);
        printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                   " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                (unsigned)port_id,
                addr.addr_bytes[0], addr.addr_bytes[1],
                addr.addr_bytes[2], addr.addr_bytes[3],
                addr.addr_bytes[4], addr.addr_bytes[5]);

        rte_memcpy(pconf->mac,
            addr.addr_bytes, ETHER_ADDR_LEN);

        struct rte_eth_conf port_conf = {0};

        /* Set RSS mode */
        uint64_t default_rss_hf = ETH_RSS_PROTO_MASK;
        port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
        port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf;
        port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
        port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;
        port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads;
        if (port_conf.rx_adv_conf.rss_conf.rss_hf !=
                ETH_RSS_PROTO_MASK) {
            printf("Port %u modified RSS hash function based on hardware support, "
                    "requested:%#"PRIx64" configured:%#"PRIx64"\n",
                    port_id, default_rss_hf,
                    port_conf.rx_adv_conf.rss_conf.rss_hf);
        }

        if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
            port_conf.txmode.offloads |=
                DEV_TX_OFFLOAD_MBUF_FAST_FREE;

        /* Set Rx VLAN stripping */
        if (ff_global_cfg.dpdk.vlan_strip) {
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
            }
        }

        /* Enable HW CRC stripping */
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_KEEP_CRC) {
            port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_KEEP_CRC;
        }

        /* FIXME: Enable TCP LRO? */
        #if 0
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
            printf("LRO is supported\n");
            port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
            pconf->hw_features.rx_lro = 1;
        }
        #endif

        /* Set Rx checksum checking */
        if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
            printf("RX checksum offload supported\n");
            port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
            pconf->hw_features.rx_csum = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
            printf("TX ip checksum offload supported\n");
            pconf->hw_features.tx_csum_ip = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            printf("TX TCP&UDP checksum offload supported\n");
            pconf->hw_features.tx_csum_l4 = 1;
        }

        if (ff_global_cfg.dpdk.tso) {
            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                printf("TSO is supported\n");
                pconf->hw_features.tx_tso = 1;
            }
        } else {
            printf("TSO is disabled\n");
        }

        if (dev_info.reta_size) {
            /* reta size must be power of 2 */
            assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

            rss_reta_size[port_id] = dev_info.reta_size;
            printf("port[%d]: rss table size: %d\n", port_id,
                dev_info.reta_size);
        }

        if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
            continue;
        }

        int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
        if (ret != 0) {
            return ret;
        }
        uint16_t q;
        for (q = 0; q < nb_queues; q++) {
            if (numa_on) {
                uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                socketid = rte_lcore_to_socket_id(lcore_id);
            }
            mbuf_pool = pktmbuf_pool[socketid];

            ret = rte_eth_tx_queue_setup(port_id, q, TX_QUEUE_SIZE,
                socketid, &dev_info.default_txconf);
            if (ret < 0) {
                return ret;
            }

            ret = rte_eth_rx_queue_setup(port_id, q, RX_QUEUE_SIZE,
                socketid, &dev_info.default_rxconf, mbuf_pool);
            if (ret < 0) {
                return ret;
            }
        }

        ret = rte_eth_dev_start(port_id);
        if (ret < 0) {
            return ret;
        }

        if (nb_queues > 1) {
            /* set HW rss hash function to Toeplitz. */
            if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                struct rte_eth_hash_filter_info info = {0};
                info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                    RTE_ETH_FILTER_SET, &info) < 0) {
                    rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                        port_id);
                }
            }

            set_rss_table(port_id, dev_info.reta_size, nb_queues);
        }

        /* Enable RX in promiscuous mode for the Ethernet device. */
        if (ff_global_cfg.dpdk.promiscuous) {
            rte_eth_promiscuous_enable(port_id);
            ret = rte_eth_promiscuous_get(port_id);
            if (ret == 1) {
                printf("set port %u to promiscuous mode ok\n", port_id);
            } else {
                printf("set port %u to promiscuous mode error\n", port_id);
            }
        }

        /* Enable pcap dump */
        if (pconf->pcap) {
            ff_enable_pcap(pconf->pcap);
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}
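/*
 * Worked example (illustrative numbers): with freebsd.hz = 100, the tick
 * interval is intrs = 1000 / 100 = 10 ms; on a 2.5 GHz TSC this gives
 * tsc = (2500000000 + 999) / 1000 * 10 = 25,000,000 cycles per period,
 * i.e. ff_hardclock_job() fires every 10 ms, matching the FreeBSD hz.
 */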

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    idle_sleep = ff_global_cfg.dpdk.idle_sleep;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

#ifdef FF_KNI
    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }
#endif

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    /*
     * FIXME: should we save pkt->vlan_tci
     * if (pkt->ol_flags & PKT_RX_VLAN_PKT)
     */

    void *data = rte_pktmbuf_mtod(pkt, void*);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void*);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct ether_hdr *hdr;
    hdr = (const struct ether_hdr *)data;

    if (ntohs(hdr->ether_type) == ETHER_TYPE_ARP)
        return FILTER_ARP;

#ifndef FF_KNI
    return FILTER_UNKNOWN;
#else
    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ntohs(hdr->ether_type) != ETHER_TYPE_IPv4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data + ETHER_HDR_LEN,
        len - ETHER_HDR_LEN);
#endif
}

static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}

/* modeled on rte_pktmbuf_clone(), but deep-copies every segment */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of a new segment failed */
    if (unlikely(mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}
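/*
 * A deep copy is used here rather than rte_pktmbuf_clone(), which creates
 * indirect mbufs that share the original data: the clones are pushed to
 * other lcores' dispatch rings and to KNI, and private copies keep each
 * consumer independent of when the original RX mbuf is freed.
 */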

static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(qconf->pcap[port_id] != NULL)) {
            if (!pkts_from_ring) {
                ff_dump_packets(qconf->pcap[port_id], rtem);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void*);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring) {
            ff_traffic.rx_packets++;
            ff_traffic.rx_bytes += len;
        }

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, len, queue_id, nb_queues);
            if (ret < 0 || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
        if (filter == FILTER_ARP) {
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni &&
            ((filter == FILTER_KNI && kni_accept) ||
            (filter == FILTER_UNKNOWN && !kni_accept))) {
            ff_kni_enqueue(port_id, rtem);
#endif
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* Read packets from the dispatch ring and process them. */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_DGRAM, 0);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_top_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_traffic_msg(struct ff_msg *msg)
{
    msg->traffic = ff_traffic;
    msg->result = 0;
}

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        case FF_TRAFFIC:
            handle_traffic_msg(msg);
            break;
        default:
            handle_default_msg(msg);
            break;
    }
    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(qconf->pcap[port] != NULL)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(qconf->pcap[port], m_table[i]);
        }
    }

    ff_traffic.tx_packets += n;
    uint16_t i;
    for (i = 0; i < n; i++) {
        ff_traffic.tx_bytes += rte_pktmbuf_data_len(m_table[i]);
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
        } while (++ret < n);
    }

    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}

int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        head->nb_segs++;

        prev = cur;
        void *data = rte_pktmbuf_mtod(cur, void*);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        cur->data_len = len;
        off += len;
        total -= len;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void*);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         *  TCP segmentation offload.
         *
         *  - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *    implies PKT_TX_TCP_CKSUM)
         *  - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *    write the IP checksum to 0 in the packet
         *  - fill the mbuf offload information: l2_len,
         *    l3_len, l4_len, tso_segsz
         *  - calculate the pseudo header checksum without taking ip_len
         *    in account, and set it in the TCP header. Refer to
         *    rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *    used as helpers.
         */
        if (offload.tso_seg_size) {
            struct tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc;
    int i, j, nb_rx, idle;
    uint16_t port_id, queue_id;
    struct lcore_conf *qconf;
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
        US_PER_S * BURST_TX_DRAIN_US;
    struct ff_dpdk_if_context *ctx;

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc > drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packet from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }
#endif

            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                        pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                        j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc > drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        idle_sleep_tsc = rte_rdtsc();
        if (likely(idle && idle_sleep)) {
            usleep(idle_sleep);
            end_tsc = rte_rdtsc();
        } else {
            end_tsc = idle_sleep_tsc;
        }

        if (usch_tsc == cur_tsc) {
            usr_tsc = idle_sleep_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_top_status.sys_tsc += sys_tsc;
        }

        ff_top_status.usr_tsc += usr_tsc;
        ff_top_status.work_tsc += end_tsc - cur_tsc;
        ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_top_status.loops++;
    }

    return 0;
}

int
ff_dpdk_if_up(void) {
    int i;
    struct lcore_conf *qconf = &lcore_conf;
    for (i = 0; i < qconf->nb_tx_port; i++) {
        uint16_t port_id = qconf->tx_port_id[i];

        struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
        veth_ctx[port_id] = ff_veth_attach(pconf);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed\n");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg) {
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    if (lr == NULL) {
        rte_exit(EXIT_FAILURE, "rte_malloc for loop_routine failed\n");
    }
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}
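/*
 * Typical usage, as an illustrative sketch only (my_loop is a hypothetical
 * application callback; real applications normally go through the top-level
 * wrappers declared in ff_api.h rather than calling these directly):
 *
 *   int my_loop(void *arg) {
 *       // poll application events; invoked once per main_loop iteration
 *       return 0;
 *   }
 *
 *   ff_dpdk_init(dpdk_argc, dpdk_argv);
 *   ff_dpdk_if_up();
 *   ff_dpdk_run(my_loop, NULL);   // does not return in normal operation
 */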

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1<<(7-b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i+4] & (1<<(7-b))))
                v |= 1;
        }
    }
    return (hash);
}

int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;
    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];

    if (nb_queues <= 1) {
        return 1;
    }

    uint16_t reta_size = rss_reta_size[ctx->port_id];
    uint16_t queueid = qconf->tx_queue_id[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
        default_rsskey_40bytes, datalen, data);

    return ((hash & (reta_size - 1)) % nb_queues) == queueid;
}
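/*
 * Example (illustrative numbers): with reta_size = 128, "hash & 127"
 * selects a RETA slot; since set_rss_table() filled the table round-robin,
 * "slot % nb_queues" is the RX queue the NIC will steer this 4-tuple to.
 * The check succeeds only when that queue is the one owned by this lcore,
 * so a locally chosen source port is kept only if the resulting flow would
 * hash back to this process.
 */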

void
ff_regist_packet_dispatcher(dispatch_func_t func)
{
    packet_dispatcher = func;
}

uint64_t
ff_get_tsc_ns(void)
{
    uint64_t cur_tsc = rte_rdtsc();
    uint64_t hz = rte_get_tsc_hz();
    return ((double)cur_tsc / (double)hz) * NS_PER_S;
}
1637