xref: /f-stack/lib/ff_dpdk_if.c (revision 2bfe3f2e)
/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"

#define MEMPOOL_CACHE_SIZE 256

#define DISPATCH_RING_SIZE 2048

#define MSG_RING_SIZE 32

/*
 * Configurable number of RX/TX ring descriptors
 */
#define RX_QUEUE_SIZE 512
#define TX_QUEUE_SIZE 512

#define MAX_PKT_BURST 32
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

/*
 * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send.
 */
#define MAX_TX_BURST    (MAX_PKT_BURST / 2)

#define NB_SOCKETS 8

/* Configure how many packets ahead to prefetch, when reading packets */
#define PREFETCH_OFFSET    3

#define MAX_RX_QUEUE_PER_LCORE 16
#define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
#define MAX_RX_QUEUE_PER_PORT 128

#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

static int enable_kni;
static int kni_accept;

static int numa_on;

static struct rte_timer freebsd_clock;

/* Default RSS key, taken from the Mellanox Linux driver */
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

static struct rte_eth_conf default_port_conf = {
    .rxmode = {
        .mq_mode = ETH_MQ_RX_RSS,
        .max_rx_pkt_len = ETHER_MAX_LEN,
        .split_hdr_size = 0, /**< Header split buffer size */
        .header_split   = 0, /**< Header Split disabled */
        .hw_ip_checksum = 0, /**< IP checksum offload disabled */
        .hw_vlan_filter = 0, /**< VLAN filtering disabled */
        .hw_vlan_strip  = 0, /**< VLAN strip disabled. */
        .hw_vlan_extend = 0, /**< Extended VLAN disabled. */
        .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
        .hw_strip_crc   = 0, /**< CRC stripping by hardware disabled */
        .enable_lro     = 0, /**< LRO disabled */
    },
    .rx_adv_conf = {
        .rss_conf = {
            .rss_key = default_rsskey_40bytes,
            .rss_key_len = 40,
            .rss_hf = ETH_RSS_PROTO_MASK,
        },
    },
    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
};

struct mbuf_table {
    uint16_t len;
    struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct lcore_rx_queue {
    uint16_t port_id;
    uint16_t queue_id;
} __rte_cache_aligned;

struct lcore_conf {
    uint16_t proc_id;
    uint16_t socket_id;
    uint16_t nb_queue_list[RTE_MAX_ETHPORTS];
    struct ff_port_cfg *port_cfgs;

    uint16_t nb_rx_queue;
    struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
    uint16_t nb_tx_port;
    uint16_t tx_port_id[RTE_MAX_ETHPORTS];
    uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
    struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
    char *pcap[RTE_MAX_ETHPORTS];
} __rte_cache_aligned;

static struct lcore_conf lcore_conf;

static struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

struct ff_msg_ring {
    char ring_name[2][RTE_RING_NAMESIZE];
    /* ring[0]: the lcore dequeues requests that other processes enqueue */
    /* ring[1]: the lcore enqueues replies that other processes dequeue */
    struct rte_ring *ring[2];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;
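
/*
 * The pool/ring pair above carries a simple request/response protocol:
 * a management process allocates a struct ff_msg from message_pool,
 * enqueues it on ring[0] of the target proc, and waits for the same
 * object to come back on ring[1] with msg->result filled in (see
 * handle_msg() below). A minimal sketch of the client side, assuming
 * proc_id 0, busy polling and no error recovery; the real consumers
 * are the f-stack tools:
 */
#if 0
static int
ff_msg_round_trip_example(void)
{
    char name[RTE_RING_NAMESIZE];
    struct rte_mempool *pool = rte_mempool_lookup(FF_MSG_POOL);

    snprintf(name, RTE_RING_NAMESIZE, "%s%u", FF_MSG_RING_IN, 0);
    struct rte_ring *in = rte_ring_lookup(name);
    snprintf(name, RTE_RING_NAMESIZE, "%s%u", FF_MSG_RING_OUT, 0);
    struct rte_ring *out = rte_ring_lookup(name);

    struct ff_msg *msg;
    void *reply;
    if (pool == NULL || in == NULL || out == NULL ||
        rte_mempool_get(pool, (void **)&msg) < 0) {
        return -1;
    }

    msg->msg_type = FF_TOP;        /* request the top statistics */
    rte_ring_enqueue(in, msg);     /* hand the request to the lcore */
    while (rte_ring_dequeue(out, &reply) != 0)
        ;                          /* busy-wait for the reply */

    msg = (struct ff_msg *)reply;
    int result = msg->result;
    rte_mempool_put(pool, msg);    /* return the buffer to the pool */
    return result;
}
#endif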

struct ff_dpdk_if_context {
    void *sc;
    void *ifp;
    uint16_t port_id;
    struct ff_hw_features hw_features;
} __rte_cache_aligned;

static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

extern void ff_hardclock(void);

static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
    ff_update_current_ts();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
    #define CHECK_INTERVAL 100 /* 100ms */
    #define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u "
                        "Mbps - %s\n", (int)portid,
                        (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

static int
init_lcore_conf(void)
{
    uint8_t nb_dev_ports = rte_eth_dev_count();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
                 ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    uint16_t proc_id;
    for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) {
        uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id];
        if (!lcore_config[lcore_id].detected) {
            rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
        }
    }

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %u\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        lcore_conf.pcap[port_id] = pconf->pcap;
        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}

static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;

    unsigned nb_mbuf = RTE_MAX(
        (nb_rx_queue*RX_QUEUE_SIZE          +
        nb_ports*nb_lcores*MAX_PKT_BURST    +
        nb_ports*nb_tx_queue*TX_QUEUE_SIZE  +
        nb_lcores*MEMPOOL_CACHE_SIZE +
        nb_ports*KNI_MBUF_MAX +
        nb_ports*KNI_QUEUE_SIZE +
        nb_lcores*nb_ports*DISPATCH_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }
    }

    return 0;
}

static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL)
        return NULL;

    /* If the ring already exists, just attach to it */
    if (likely((ring = rte_ring_lookup(name)) != NULL))
        return ring;

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        return rte_ring_create(name, count, socket_id, flags);
    } else {
        return rte_ring_lookup(name);
    }
}

static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create ring according to ports actually being used. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
}

static int
init_msg_ring(void)
{
    uint16_t i;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
           MSG_RING_SIZE * 2 * nb_procs,
           MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
           NULL, NULL, ff_msg_init, NULL,
           socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        snprintf(msg_ring[i].ring_name[1], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_OUT, i);

        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[0]);

        msg_ring[i].ring[1] = create_ring(msg_ring[i].ring_name[1],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[1] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[1]);
    }

    return 0;
}

static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}

static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}

static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = rte_lcore_to_socket_id(rte_lcore_id());
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socketid];
    uint16_t i;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        struct rte_eth_dev_info dev_info;
        rte_eth_dev_info_get(port_id, &dev_info);

        if (nb_queues > dev_info.max_rx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                nb_queues,
                dev_info.max_rx_queues);
        }

        if (nb_queues > dev_info.max_tx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                nb_queues,
                dev_info.max_tx_queues);
        }

        struct ether_addr addr;
        rte_eth_macaddr_get(port_id, &addr);
        printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                   " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                (unsigned)port_id,
                addr.addr_bytes[0], addr.addr_bytes[1],
                addr.addr_bytes[2], addr.addr_bytes[3],
                addr.addr_bytes[4], addr.addr_bytes[5]);

        rte_memcpy(pconf->mac,
            addr.addr_bytes, ETHER_ADDR_LEN);

        /* Set txq_flags: we need neither multi-mempool nor refcnt */
        dev_info.default_txconf.txq_flags = ETH_TXQ_FLAGS_NOMULTMEMP |
            ETH_TXQ_FLAGS_NOREFCOUNT;

        /* Disable features that are not supported by port's HW */
        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMUDP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMTCP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMSCTP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) &&
            !(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_TSO)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
        }

        struct rte_eth_conf port_conf = {0};

        /* Set RSS mode */
        port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
        port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK;
        port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
        port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;

        /* Set Rx VLAN stripping */
        if (ff_global_cfg.dpdk.vlan_strip) {
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                port_conf.rxmode.hw_vlan_strip = 1;
            }
        }

        /* Enable HW CRC stripping */
        port_conf.rxmode.hw_strip_crc = 1;

        /* FIXME: enable TCP LRO? */
        #if 0
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
            printf("LRO is supported\n");
            port_conf.rxmode.enable_lro = 1;
            pconf->hw_features.rx_lro = 1;
        }
        #endif

        /* Set Rx checksum checking */
        if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
            printf("RX checksum offload supported\n");
            port_conf.rxmode.hw_ip_checksum = 1;
            pconf->hw_features.rx_csum = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
            printf("TX ip checksum offload supported\n");
            pconf->hw_features.tx_csum_ip = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            printf("TX TCP&UDP checksum offload supported\n");
            pconf->hw_features.tx_csum_l4 = 1;
        }

        if (ff_global_cfg.dpdk.tso) {
            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                printf("TSO is supported\n");
                pconf->hw_features.tx_tso = 1;
            }
        } else {
            printf("TSO is disabled\n");
        }

        if (dev_info.reta_size) {
            /* reta size must be power of 2 */
            assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

            rss_reta_size[port_id] = dev_info.reta_size;
            printf("port[%d]: rss table size: %d\n", port_id,
                dev_info.reta_size);
        }

        if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
            continue;
        }

        int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
        if (ret != 0) {
            return ret;
        }
        uint16_t q;
        for (q = 0; q < nb_queues; q++) {
            ret = rte_eth_tx_queue_setup(port_id, q, TX_QUEUE_SIZE,
                socketid, &dev_info.default_txconf);
            if (ret < 0) {
                return ret;
            }

            ret = rte_eth_rx_queue_setup(port_id, q, RX_QUEUE_SIZE,
                socketid, &dev_info.default_rxconf, mbuf_pool);
            if (ret < 0) {
                return ret;
            }
        }

        ret = rte_eth_dev_start(port_id);
        if (ret < 0) {
            return ret;
        }

        if (nb_queues > 1) {
            /* set HW rss hash function to Toeplitz. */
            if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                struct rte_eth_hash_filter_info info = {0};
                info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                    RTE_ETH_FILTER_SET, &info) < 0) {
                    rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                        port_id);
                }
            }

            set_rss_table(port_id, dev_info.reta_size, nb_queues);
        }

        /* Enable RX in promiscuous mode for the Ethernet device. */
        if (ff_global_cfg.dpdk.promiscuous) {
            rte_eth_promiscuous_enable(port_id);
            ret = rte_eth_promiscuous_get(port_id);
            if (ret == 1) {
                printf("set port %u to promiscuous mode ok\n", port_id);
            } else {
                printf("set port %u to promiscuous mode error\n", port_id);
            }
        }

        /* Enable pcap dump */
        if (pconf->pcap) {
            ff_enable_pcap(pconf->pcap);
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;
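    /*
     * Example: with freebsd.hz = 100, intrs = 1000 / 100 = 10 ms per tick;
     * on a 2 GHz TSC, tsc = (2e9 + 999) / 1000 * 10, i.e. roughly
     * 20,000,000 cycles between two ff_hardclock_job() runs.
     */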

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            /* drop packets with a bad hardware-verified checksum */
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    /*
     * FIXME: should we save pkt->vlan_tci
     * if (pkt->ol_flags & PKT_RX_VLAN_PKT)
     */

    void *data = rte_pktmbuf_mtod(pkt, void*);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void*);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct ether_hdr *hdr;
    hdr = (const struct ether_hdr *)data;

    if (ntohs(hdr->ether_type) == ETHER_TYPE_ARP)
        return FILTER_ARP;

    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ntohs(hdr->ether_type) != ETHER_TYPE_IPv4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data + ETHER_HDR_LEN,
        len - ETHER_HDR_LEN);
}

static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}

/* copied from rte_pktmbuf_clone, but copies the data instead of attaching */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of a new segment failed */
    if (unlikely(mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}

static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(qconf->pcap[port_id] != NULL)) {
            if (!pkts_from_ring) {
                ff_dump_packets(qconf->pcap[port_id], rtem);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void*);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, len, queue_id, nb_queues);
            if (ret < 0 || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
        if (filter == FILTER_ARP) {
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }

            ff_veth_input(ctx, rtem);
        } else if (enable_kni &&
            ((filter == FILTER_KNI && kni_accept) ||
            (filter == FILTER_UNKNOWN && !kni_accept)) ) {
            ff_kni_enqueue(port_id, rtem);
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* Read packets from the dispatch ring and process them */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_DGRAM, 0);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static struct ff_top_args ff_status;
static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        default:
            handle_default_msg(msg);
            break;
    }
    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(qconf->pcap[port] != NULL)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(qconf->pcap[port], m_table[i]);
        }
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
        } while (++ret < n);
    }

    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}

int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        void *data = rte_pktmbuf_mtod(cur, void*);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        prev = cur;

        cur->data_len = len;
        off += len;
        total -= len;
        head->nb_segs++;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void*);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         *  TCP segmentation offload.
         *
         *  - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *    implies PKT_TX_TCP_CKSUM)
         *  - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *    write the IP checksum to 0 in the packet
         *  - fill the mbuf offload information: l2_len,
         *    l3_len, l4_len, tso_segsz
         *  - calculate the pseudo header checksum without taking ip_len
         *    in account, and set it in the TCP header. Refer to
         *    rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *    used as helpers.
         */
        if (offload.tso_seg_size) {
            struct tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc;
    int i, j, nb_rx, idle;
    uint16_t port_id, queue_id;
    struct lcore_conf *qconf;
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
        US_PER_S * BURST_TX_DRAIN_US;
    struct ff_dpdk_if_context *ctx;

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc > drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packet from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }

            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                        pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                        j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc > drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        end_tsc = rte_rdtsc();

        if (usch_tsc == cur_tsc) {
            usr_tsc = end_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_status.sys_tsc += sys_tsc;
        }

        ff_status.usr_tsc += usr_tsc;
        ff_status.work_tsc += end_tsc - cur_tsc;
        ff_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_status.loops++;
    }

    return 0;
}

int
ff_dpdk_if_up(void) {
    int i;
    struct lcore_conf *qconf = &lcore_conf;
    for (i = 0; i < qconf->nb_tx_port; i++) {
        uint16_t port_id = qconf->tx_port_id[i];

        struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
        veth_ctx[port_id] = ff_veth_attach(pconf);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg) {
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0] << 24) + (key[1] << 16) + (key[2] << 8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1 << (7 - b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i + 4] & (1 << (7 - b))))
                v |= 1;
        }
    }
    return (hash);
}

int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;
    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];

    if (nb_queues <= 1) {
        return 1;
    }

    uint16_t reta_size = rss_reta_size[ctx->port_id];
    uint16_t queueid = qconf->tx_queue_id[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

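    /*
     * Reduce the hash exactly the way the NIC does with the RETA that
     * set_rss_table() programmed: the low bits index the redirection
     * table, whose entries were filled round-robin over the queues.
     * E.g. with reta_size = 128 and nb_queues = 4, hash = 0x2a6 gives
     * (0x2a6 & 127) % 4 = 38 % 4 = 2, so the tuple belongs to queue 2.
     */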
    uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
        default_rsskey_40bytes, datalen, data);

    return ((hash & (reta_size - 1)) % nb_queues) == queueid;
}

void
ff_regist_packet_dispatcher(dispatch_func_t func)
{
    packet_dispatcher = func;
}
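
/*
 * A user-supplied dispatcher sees the raw frame and returns the RX queue
 * that should handle it; process_packets() drops the packet for returns
 * < 0 or >= nb_queues, and forwards it through the dispatch ring when the
 * chosen queue is not the local one. A minimal sketch that pins flows by
 * IPv4 source address (the callback signature is inferred from the call
 * in process_packets(); dispatch_by_src_ip is only an example name):
 */
#if 0
static int
dispatch_by_src_ip(void *data, uint16_t len, uint16_t queue_id,
    uint16_t nb_queues)
{
    if (len < ETHER_HDR_LEN + sizeof(struct ipv4_hdr))
        return queue_id;    /* too short, keep it on the local queue */

    struct ether_hdr *eth = (struct ether_hdr *)data;
    if (ntohs(eth->ether_type) != ETHER_TYPE_IPv4)
        return queue_id;    /* non-IPv4, keep it on the local queue */

    struct ipv4_hdr *ip = (struct ipv4_hdr *)(eth + 1);
    return ntohl(ip->src_addr) % nb_queues;
}

/* registered once before ff_dpdk_run(): */
/* ff_regist_packet_dispatcher(dispatch_by_src_ip); */
#endif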

uint64_t
ff_get_tsc_ns(void)
{
    uint64_t cur_tsc = rte_rdtsc();
    uint64_t hz = rte_get_tsc_hz();
    return ((double)cur_tsc / (double)hz) * NS_PER_S;
}