xref: /f-stack/lib/ff_dpdk_if.c (revision 61467f3e)
/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"

#define MEMPOOL_CACHE_SIZE 256

#define DISPATCH_RING_SIZE 2048

#define MSG_RING_SIZE 32

/*
 * Configurable number of RX/TX ring descriptors
 */
#define RX_QUEUE_SIZE 512
#define TX_QUEUE_SIZE 512

#define MAX_PKT_BURST 32
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

/*
 * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send.
 */
#define MAX_TX_BURST    (MAX_PKT_BURST / 2)

#define NB_SOCKETS 8

/* Configure how many packets ahead to prefetch when reading packets */
#define PREFETCH_OFFSET    3

#define MAX_RX_QUEUE_PER_LCORE 16
#define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
#define MAX_RX_QUEUE_PER_PORT 128

#ifdef FF_KNI
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

static int enable_kni;
static int kni_accept;
#endif

static int numa_on;

static struct rte_timer freebsd_clock;

/* Default RSS key, as used by the Mellanox Linux driver */
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};
111 
112 static struct rte_eth_conf default_port_conf = {
113     .rxmode = {
114         .mq_mode = ETH_MQ_RX_RSS,
115         .max_rx_pkt_len = ETHER_MAX_LEN,
116         .split_hdr_size = 0, /**< hdr buf size */
117         .header_split   = 0, /**< Header Split disabled */
118         .hw_ip_checksum = 0, /**< IP checksum offload disabled */
119         .hw_vlan_filter = 0, /**< VLAN filtering disabled */
120         .hw_vlan_strip  = 0, /**< VLAN strip disabled. */
121         .hw_vlan_extend = 0, /**< Extended VLAN disabled. */
122         .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
123         .hw_strip_crc   = 0, /**< CRC stripped by hardware */
124         .enable_lro     = 0, /**< LRO disabled */
125     },
126     .rx_adv_conf = {
127         .rss_conf = {
128             .rss_key = default_rsskey_40bytes,
129             .rss_key_len = 40,
130             .rss_hf = ETH_RSS_PROTO_MASK,
131         },
132     },
133     .txmode = {
134         .mq_mode = ETH_MQ_TX_NONE,
135     },
136 };
137 
138 struct mbuf_table {
139     uint16_t len;
140     struct rte_mbuf *m_table[MAX_PKT_BURST];
141 };
142 
143 struct lcore_rx_queue {
144     uint16_t port_id;
145     uint16_t queue_id;
146 } __rte_cache_aligned;
147 
148 struct lcore_conf {
149     uint16_t proc_id;
150     uint16_t socket_id;
151     uint16_t nb_queue_list[RTE_MAX_ETHPORTS];
152     struct ff_port_cfg *port_cfgs;
153 
154     uint16_t nb_rx_queue;
155     struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
156     uint16_t nb_tx_port;
157     uint16_t tx_port_id[RTE_MAX_ETHPORTS];
158     uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
159     struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
160     char *pcap[RTE_MAX_ETHPORTS];
161 } __rte_cache_aligned;
162 
163 static struct lcore_conf lcore_conf;
164 
165 static struct rte_mempool *pktmbuf_pool[NB_SOCKETS];
166 
167 static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
168 static dispatch_func_t packet_dispatcher;
169 
170 static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];
171 
172 struct ff_msg_ring {
173     char ring_name[2][RTE_RING_NAMESIZE];
174     /* ring[0] for lcore recv msg, other send */
175     /* ring[1] for lcore send msg, other read */
176     struct rte_ring *ring[2];
177 } __rte_cache_aligned;
178 
179 static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
180 static struct rte_mempool *message_pool;
181 
182 struct ff_dpdk_if_context {
183     void *sc;
184     void *ifp;
185     uint16_t port_id;
186     struct ff_hw_features hw_features;
187 } __rte_cache_aligned;
188 
189 static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];
190 
191 extern void ff_hardclock(void);
192 
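/*
 * Periodic rte_timer callback driving the FreeBSD stack's clock tick:
 * init_clock() below arms it to fire every 1000/freebsd.hz milliseconds.
 */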
static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
    ff_update_current_ts();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
    #define CHECK_INTERVAL 100 /* 100ms */
    #define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u "
                        "Mbps - %s\n", (int)portid,
                        (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

static int
init_lcore_conf(void)
{
    uint8_t nb_dev_ports = rte_eth_dev_count();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
                 ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    uint16_t proc_id;
    for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) {
        uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id];
        if (!lcore_config[lcore_id].detected) {
            rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
        }
    }

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %d\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        lcore_conf.pcap[port_id] = pconf->pcap;
        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}

static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;

    unsigned nb_mbuf = RTE_MAX(
        (nb_rx_queue * RX_QUEUE_SIZE              +
        nb_ports * nb_lcores * MAX_PKT_BURST      +
        nb_ports * nb_tx_queue * TX_QUEUE_SIZE    +
        nb_lcores * MEMPOOL_CACHE_SIZE            +
#ifdef FF_KNI
        nb_ports * KNI_MBUF_MAX                   +
        nb_ports * KNI_QUEUE_SIZE                 +
#endif
        nb_lcores * nb_ports * DISPATCH_RING_SIZE),
        (unsigned)8192);
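
    /*
     * Rough worked example of the sizing above (illustrative numbers, not
     * from the original source): with 1 port, 1 lcore, and FF_KNI disabled,
     * the sum is 1*512 + 1*1*32 + 1*1*512 + 1*256 + 1*1*2048 = 3360 mbufs,
     * so the 8192 floor from RTE_MAX dominates until more ports or queues
     * are configured.
     */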

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }
    }

    return 0;
}

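/*
 * Look up a named ring and attach to it if it already exists; otherwise
 * the primary process creates it and secondary processes only look it up.
 */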
static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL)
        return NULL;

    /* If it has already been created, just attach to it */
    if (likely((ring = rte_ring_lookup(name)) != NULL))
        return ring;

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        return rte_ring_create(name, count, socket_id, flags);
    } else {
        return rte_ring_lookup(name);
    }
}

static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create rings for the ports actually in use. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring %s failed!\n", name_buf);

            printf("created ring %s, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
}

static int
init_msg_ring(void)
{
    uint16_t i;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
           MSG_RING_SIZE * 2 * nb_procs,
           MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
           NULL, NULL, ff_msg_init, NULL,
           socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        snprintf(msg_ring[i].ring_name[1], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_OUT, i);

        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring %s failed!\n", msg_ring[i].ring_name[0]);

        msg_ring[i].ring[1] = create_ring(msg_ring[i].ring_name[1],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[1] == NULL)
            rte_panic("create ring %s failed!\n", msg_ring[i].ring_name[1]);
    }

    return 0;
}

#ifdef FF_KNI
static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
#endif

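/*
 * Fill the NIC's RSS redirection table round-robin over the configured
 * queues. Illustrative example (numbers not from the original source):
 * with reta_size = 128 and nb_queues = 4, the entries become
 * 0,1,2,3,0,1,2,3,... so each queue owns every 4th RETA slot.
 */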
static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}

static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        struct rte_eth_dev_info dev_info;
        rte_eth_dev_info_get(port_id, &dev_info);

        if (nb_queues > dev_info.max_rx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] is greater than max_rx_queues[%d]\n",
                nb_queues,
                dev_info.max_rx_queues);
        }

        if (nb_queues > dev_info.max_tx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] is greater than max_tx_queues[%d]\n",
                nb_queues,
                dev_info.max_tx_queues);
        }

        struct ether_addr addr;
        rte_eth_macaddr_get(port_id, &addr);
        printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                   " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                (unsigned)port_id,
                addr.addr_bytes[0], addr.addr_bytes[1],
                addr.addr_bytes[2], addr.addr_bytes[3],
                addr.addr_bytes[4], addr.addr_bytes[5]);

        rte_memcpy(pconf->mac,
            addr.addr_bytes, ETHER_ADDR_LEN);

        /* Set txq_flags: we need neither multi-mempool nor refcnt */
        dev_info.default_txconf.txq_flags = ETH_TXQ_FLAGS_NOMULTMEMP |
            ETH_TXQ_FLAGS_NOREFCOUNT;

        /* Disable features that are not supported by port's HW */
        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMUDP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMTCP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMSCTP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) &&
            !(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_TSO)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
        }

        struct rte_eth_conf port_conf = {0};

        /* Set RSS mode */
        port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
        port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK;
        port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
        port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;

        /* Set Rx VLAN stripping */
        if (ff_global_cfg.dpdk.vlan_strip) {
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                port_conf.rxmode.hw_vlan_strip = 1;
            }
        }

        /* Enable HW CRC stripping */
        port_conf.rxmode.hw_strip_crc = 1;

        /* FIXME: Enable TCP LRO? */
        #if 0
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
            printf("LRO is supported\n");
            port_conf.rxmode.enable_lro = 1;
            pconf->hw_features.rx_lro = 1;
        }
        #endif

        /* Set Rx checksum checking */
        if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
            printf("RX checksum offload supported\n");
            port_conf.rxmode.hw_ip_checksum = 1;
            pconf->hw_features.rx_csum = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
            printf("TX ip checksum offload supported\n");
            pconf->hw_features.tx_csum_ip = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            printf("TX TCP&UDP checksum offload supported\n");
            pconf->hw_features.tx_csum_l4 = 1;
        }

        if (ff_global_cfg.dpdk.tso) {
            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                printf("TSO is supported\n");
                pconf->hw_features.tx_tso = 1;
            }
        } else {
            printf("TSO is disabled\n");
        }

        if (dev_info.reta_size) {
            /* RETA size must be a power of 2 */
            assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

            rss_reta_size[port_id] = dev_info.reta_size;
            printf("port[%d]: rss table size: %d\n", port_id,
                dev_info.reta_size);
        }

        if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
            continue;
        }

        int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
        if (ret != 0) {
            return ret;
        }
        uint16_t q;
        for (q = 0; q < nb_queues; q++) {
            if (numa_on) {
                uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                socketid = rte_lcore_to_socket_id(lcore_id);
            }
            mbuf_pool = pktmbuf_pool[socketid];

            ret = rte_eth_tx_queue_setup(port_id, q, TX_QUEUE_SIZE,
                socketid, &dev_info.default_txconf);
            if (ret < 0) {
                return ret;
            }

            ret = rte_eth_rx_queue_setup(port_id, q, RX_QUEUE_SIZE,
                socketid, &dev_info.default_rxconf, mbuf_pool);
            if (ret < 0) {
                return ret;
            }
        }

        ret = rte_eth_dev_start(port_id);
        if (ret < 0) {
            return ret;
        }

        if (nb_queues > 1) {
            /* Set the HW RSS hash function to Toeplitz. */
            if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                struct rte_eth_hash_filter_info info = {0};
                info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                    RTE_ETH_FILTER_SET, &info) < 0) {
                    rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                        port_id);
                }
            }

            set_rss_table(port_id, dev_info.reta_size, nb_queues);
        }

        /* Enable RX in promiscuous mode for the Ethernet device. */
        if (ff_global_cfg.dpdk.promiscuous) {
            rte_eth_promiscuous_enable(port_id);
            ret = rte_eth_promiscuous_get(port_id);
            if (ret == 1) {
                printf("set port %u to promiscuous mode ok\n", port_id);
            } else {
                printf("set port %u to promiscuous mode error\n", port_id);
            }
        }

        /* Enable pcap dump */
        if (pconf->pcap) {
            ff_enable_pcap(pconf->pcap);
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;
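
    /*
     * intrs is the FreeBSD tick period in ms, and tsc is that same period
     * in TSC cycles. Illustrative numbers (not from the original source):
     * with freebsd.hz = 100, intrs = 10 ms; on a 2.5 GHz TSC that gives
     * tsc = 2,500,000 * 10 = 25,000,000 cycles per hardclock tick.
     */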

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

#ifdef FF_KNI
    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }
#endif

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    /*
     * FIXME: should we save pkt->vlan_tci
     * if (pkt->ol_flags & PKT_RX_VLAN_PKT)
     */

    void *data = rte_pktmbuf_mtod(pkt, void *);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void *);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct ether_hdr *hdr;
    hdr = (const struct ether_hdr *)data;

    if (ntohs(hdr->ether_type) == ETHER_TYPE_ARP)
        return FILTER_ARP;

#ifndef FF_KNI
    return FILTER_UNKNOWN;
#else
    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ntohs(hdr->ether_type) != ETHER_TYPE_IPv4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data + ETHER_HDR_LEN,
        len - ETHER_HDR_LEN);
#endif
}

static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}

/* Adapted from rte_pktmbuf_clone(), but each segment is deep-copied. */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of a new segment failed */
    if (unlikely(mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}

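/*
 * Per-burst RX path: optionally dump packets to pcap, let a user-registered
 * dispatcher re-steer them to other queues, broadcast ARP to every other
 * queue (and to KNI), and hand everything else to the FreeBSD stack via
 * ff_veth_input().
 */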
static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(qconf->pcap[port_id] != NULL)) {
            if (!pkts_from_ring) {
                ff_dump_packets(qconf->pcap[port_id], rtem);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void *);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, len, queue_id, nb_queues);
            if (ret < 0 || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
        if (filter == FILTER_ARP) {
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni &&
            ((filter == FILTER_KNI && kni_accept) ||
            (filter == FILTER_UNKNOWN && !kni_accept))) {
            ff_kni_enqueue(port_id, rtem);
#endif
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* Read packets from the dispatch ring and process them */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_DGRAM, 0);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static struct ff_top_args ff_status;
static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        default:
            handle_default_msg(msg);
            break;
    }
    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(qconf->pcap[port] != NULL)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(qconf->pcap[port], m_table[i]);
        }
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
        } while (++ret < n);
    }

    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}

int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        void *data = rte_pktmbuf_mtod(cur, void *);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        prev = cur;

        cur->data_len = len;
        off += len;
        total -= len;
        head->nb_segs++;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void *);

    if (offload.ip_csum) {
        /* IPv6 is not supported yet */
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         *  TCP segmentation offload.
         *
         *  - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *    implies PKT_TX_TCP_CKSUM)
         *  - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *    write the IP checksum to 0 in the packet
         *  - fill the mbuf offload information: l2_len,
         *    l3_len, l4_len, tso_segsz
         *  - calculate the pseudo header checksum without taking ip_len
         *    in account, and set it in the TCP header. Refer to
         *    rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *    used as helpers.
         */
        if (offload.tso_seg_size) {
            struct tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc;
    int i, j, nb_rx, idle;
    uint16_t port_id, queue_id;
    struct lcore_conf *qconf;
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
        US_PER_S * BURST_TX_DRAIN_US;
    struct ff_dpdk_if_context *ctx;

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc > drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packets from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }
#endif

            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                        pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                        j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc > drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        end_tsc = rte_rdtsc();

        if (usch_tsc == cur_tsc) {
            usr_tsc = end_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_status.sys_tsc += sys_tsc;
        }

        ff_status.usr_tsc += usr_tsc;
        ff_status.work_tsc += end_tsc - cur_tsc;
        ff_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_status.loops++;
    }

    return 0;
}

int
ff_dpdk_if_up(void) {
    int i;
    struct lcore_conf *qconf = &lcore_conf;
    for (i = 0; i < qconf->nb_tx_port; i++) {
        uint16_t port_id = qconf->tx_port_id[i];

        struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
        veth_ctx[port_id] = ff_veth_attach(pconf);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed\n");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg) {
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0] << 24) + (key[1] << 16) + (key[2] << 8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1 << (7 - b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i + 4] & (1 << (7 - b))))
                v |= 1;
        }
    }
    return (hash);
}

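/*
 * Check whether a 4-tuple would be RSS-steered to this lcore's own queue.
 * The software Toeplitz hash over the same 40-byte key, reduced with
 * (hash & (reta_size - 1)) % nb_queues, mirrors the round-robin RETA fill
 * programmed by set_rss_table(), so a local port binding is accepted only
 * if the NIC would deliver that flow back to the calling queue.
 */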
int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;
    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];

    if (nb_queues <= 1) {
        return 1;
    }

    uint16_t reta_size = rss_reta_size[ctx->port_id];
    uint16_t queueid = qconf->tx_queue_id[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
        default_rsskey_40bytes, datalen, data);

    return ((hash & (reta_size - 1)) % nb_queues) == queueid;
}

void
ff_regist_packet_dispatcher(dispatch_func_t func)
{
    packet_dispatcher = func;
}

uint64_t
ff_get_tsc_ns(void)
{
    uint64_t cur_tsc = rte_rdtsc();
    uint64_t hz = rte_get_tsc_hz();
    return ((double)cur_tsc / (double)hz) * NS_PER_S;
}

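/*
 * Usage sketch (illustrative only, not part of this file): an F-Stack
 * application normally reaches this layer through the exported entry
 * points above, roughly in this order:
 *
 *     ff_dpdk_init(dpdk_argc, dpdk_argv);   // EAL + pools + rings + ports
 *     ...                                   // stack init attaches veth devices
 *     ff_dpdk_if_up();                      // bind veth contexts to ports
 *     ff_dpdk_run(my_loop, my_arg);         // launch main_loop on every lcore
 *
 * my_loop and my_arg are hypothetical names for the user-supplied
 * loop_func_t callback and its argument passed to ff_dpdk_run().
 */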
1634