xref: /f-stack/lib/ff_dpdk_if.c (revision 15eec4e8)
1 /*
2  * Copyright (C) 2017 THL A29 Limited, a Tencent company.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice, this
9  *   list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright notice,
11  *   this list of conditions and the following disclaimer in the documentation
12  *   and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
18  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
21  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
23  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  *
25  */
26 #include <assert.h>
27 #include <unistd.h>
28 
29 #include <rte_common.h>
30 #include <rte_byteorder.h>
31 #include <rte_log.h>
32 #include <rte_memory.h>
33 #include <rte_memcpy.h>
34 #include <rte_memzone.h>
35 #include <rte_config.h>
36 #include <rte_eal.h>
37 #include <rte_pci.h>
38 #include <rte_mbuf.h>
39 #include <rte_memory.h>
40 #include <rte_lcore.h>
41 #include <rte_launch.h>
42 #include <rte_ethdev.h>
43 #include <rte_debug.h>
44 #include <rte_common.h>
45 #include <rte_ether.h>
46 #include <rte_malloc.h>
47 #include <rte_cycles.h>
48 #include <rte_timer.h>
49 #include <rte_thash.h>
50 #include <rte_ip.h>
51 #include <rte_tcp.h>
52 #include <rte_udp.h>
53 
54 #include "ff_dpdk_if.h"
55 #include "ff_dpdk_pcap.h"
56 #include "ff_dpdk_kni.h"
57 #include "ff_config.h"
58 #include "ff_veth.h"
59 #include "ff_host_interface.h"
60 #include "ff_msg.h"
61 #include "ff_api.h"
62 
63 #define MEMPOOL_CACHE_SIZE 256
64 
65 #define DISPATCH_RING_SIZE 2048
66 
67 #define MSG_RING_SIZE 32
68 
69 /*
70  * Configurable number of RX/TX ring descriptors
71  */
72 #define RX_QUEUE_SIZE 512
73 #define TX_QUEUE_SIZE 512
74 
75 #define MAX_PKT_BURST 32
76 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
77 
78 /*
79  * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send.
80  */
81 #define MAX_TX_BURST    (MAX_PKT_BURST / 2)
82 
83 #define NB_SOCKETS 8
84 
85 /* How many packets ahead to prefetch when reading packets */
86 #define PREFETCH_OFFSET    3
87 
88 #define MAX_RX_QUEUE_PER_LCORE 16
89 #define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
90 #define MAX_RX_QUEUE_PER_PORT 128
91 
92 #ifdef FF_KNI
93 #define KNI_MBUF_MAX 2048
94 #define KNI_QUEUE_SIZE 2048
95 
96 static int enable_kni;
97 static int kni_accept;
98 #endif
99 
100 static int numa_on;
101 
102 static unsigned idle_sleep;
103 
104 static struct rte_timer freebsd_clock;
105 
106 /* RSS hash key used by the Mellanox Linux driver */
107 static uint8_t default_rsskey_40bytes[40] = {
108     0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
109     0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
110     0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
111     0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
112     0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
113 };
114 
115 static struct rte_eth_conf default_port_conf = {
116     .rxmode = {
117         .mq_mode = ETH_MQ_RX_RSS,
118         .max_rx_pkt_len = ETHER_MAX_LEN,
119         .split_hdr_size = 0, /**< hdr buf size */
120         .header_split   = 0, /**< Header Split disabled */
121         .hw_ip_checksum = 0, /**< IP checksum offload disabled */
122         .hw_vlan_filter = 0, /**< VLAN filtering disabled */
123         .hw_vlan_strip  = 0, /**< VLAN strip disabled. */
124         .hw_vlan_extend = 0, /**< Extended VLAN disabled. */
125         .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
126         .hw_strip_crc   = 0, /**< CRC stripped by hardware */
127         .enable_lro     = 0, /**< LRO disabled */
128     },
129     .rx_adv_conf = {
130         .rss_conf = {
131             .rss_key = default_rsskey_40bytes,
132             .rss_key_len = 40,
133             .rss_hf = ETH_RSS_PROTO_MASK,
134         },
135     },
136     .txmode = {
137         .mq_mode = ETH_MQ_TX_NONE,
138     },
139 };
140 
141 struct mbuf_table {
142     uint16_t len;
143     struct rte_mbuf *m_table[MAX_PKT_BURST];
144 };
145 
146 struct lcore_rx_queue {
147     uint16_t port_id;
148     uint16_t queue_id;
149 } __rte_cache_aligned;
150 
151 struct lcore_conf {
152     uint16_t proc_id;
153     uint16_t socket_id;
154     uint16_t nb_queue_list[RTE_MAX_ETHPORTS];
155     struct ff_port_cfg *port_cfgs;
156 
157     uint16_t nb_rx_queue;
158     struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
159     uint16_t nb_tx_port;
160     uint16_t tx_port_id[RTE_MAX_ETHPORTS];
161     uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
162     struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
163     char *pcap[RTE_MAX_ETHPORTS];
164 } __rte_cache_aligned;
165 
166 static struct lcore_conf lcore_conf;
167 
168 static struct rte_mempool *pktmbuf_pool[NB_SOCKETS];
169 
170 static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
171 static dispatch_func_t packet_dispatcher;
172 
173 static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];
174 
175 struct ff_msg_ring {
176     char ring_name[2][RTE_RING_NAMESIZE];
177     /* ring[0]: lcore receives messages, other processes send */
178     /* ring[1]: lcore sends messages, other processes read */
179     struct rte_ring *ring[2];
180 } __rte_cache_aligned;
181 
182 static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
183 static struct rte_mempool *message_pool;
184 
185 struct ff_dpdk_if_context {
186     void *sc;
187     void *ifp;
188     uint16_t port_id;
189     struct ff_hw_features hw_features;
190 } __rte_cache_aligned;
191 
192 static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];
193 
194 static struct ff_top_args ff_top_status;
195 static struct ff_traffic_args ff_traffic;
196 
197 extern void ff_hardclock(void);
198 
199 static void
200 ff_hardclock_job(__rte_unused struct rte_timer *timer,
201     __rte_unused void *arg) {
202     ff_hardclock();
203     ff_update_current_ts();
204 }
205 
206 struct ff_dpdk_if_context *
207 ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
208 {
209     struct ff_dpdk_if_context *ctx;
210 
211     ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
212     if (ctx == NULL)
213         return NULL;
214 
215     ctx->sc = sc;
216     ctx->ifp = ifp;
217     ctx->port_id = cfg->port_id;
218     ctx->hw_features = cfg->hw_features;
219 
220     return ctx;
221 }
222 
223 void
224 ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
225 {
226     free(ctx);
227 }
228 
229 static void
230 check_all_ports_link_status(void)
231 {
232     #define CHECK_INTERVAL 100 /* 100ms */
233     #define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */
234 
236     uint8_t count, all_ports_up, print_flag = 0;
237     struct rte_eth_link link;
238 
239     printf("\nChecking link status");
240     fflush(stdout);
241 
242     int i, nb_ports;
243     nb_ports = ff_global_cfg.dpdk.nb_ports;
244     for (count = 0; count <= MAX_CHECK_TIME; count++) {
245         all_ports_up = 1;
246         for (i = 0; i < nb_ports; i++) {
247             uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
248             memset(&link, 0, sizeof(link));
249             rte_eth_link_get_nowait(portid, &link);
250 
251             /* print link status if flag set */
252             if (print_flag == 1) {
253                 if (link.link_status) {
254                     printf("Port %d Link Up - speed %u "
255                         "Mbps - %s\n", (int)portid,
256                         (unsigned)link.link_speed,
257                         (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
258                         ("full-duplex") : ("half-duplex"));
259                 } else {
260                     printf("Port %d Link Down\n", (int)portid);
261                 }
262                 continue;
263             }
264             /* clear all_ports_up flag if any link down */
265             if (link.link_status == 0) {
266                 all_ports_up = 0;
267                 break;
268             }
269         }
270 
271         /* after finally printing all link status, get out */
272         if (print_flag == 1)
273             break;
274 
275         if (all_ports_up == 0) {
276             printf(".");
277             fflush(stdout);
278             rte_delay_ms(CHECK_INTERVAL);
279         }
280 
281         /* set the print_flag if all ports up or timeout */
282         if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
283             print_flag = 1;
284             printf("done\n");
285         }
286     }
287 }
288 
289 static int
290 init_lcore_conf(void)
291 {
292     uint8_t nb_dev_ports = rte_eth_dev_count();
293     if (nb_dev_ports == 0) {
294         rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
295     }
296 
297     if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
298         rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
299                  ff_global_cfg.dpdk.max_portid);
300     }
301 
302     lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
303     lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;
304 
305     uint16_t proc_id;
306     for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) {
307         uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id];
308         if (!lcore_config[lcore_id].detected) {
309             rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
310         }
311     }
312 
313     uint16_t socket_id = 0;
314     if (numa_on) {
315         socket_id = rte_lcore_to_socket_id(rte_lcore_id());
316     }
317 
318     lcore_conf.socket_id = socket_id;
319 
320     uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
321     int j;
322     for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
323         uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
324         struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
325 
326         int queueid = -1;
327         int i;
328         for (i = 0; i < pconf->nb_lcores; i++) {
329             if (pconf->lcore_list[i] == lcore_id) {
330                 queueid = i;
331             }
332         }
333         if (queueid < 0) {
334             continue;
335         }
336         printf("lcore: %u, port: %u, queue: %d\n", lcore_id, port_id, queueid);
337         uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
338         lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
339         lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
340         lcore_conf.nb_rx_queue++;
341 
342         lcore_conf.tx_queue_id[port_id] = queueid;
343         lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
344         lcore_conf.nb_tx_port++;
345 
346         lcore_conf.pcap[port_id] = pconf->pcap;
347         lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
348     }
349 
350     if (lcore_conf.nb_rx_queue == 0) {
351         rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
352     }
353 
354     return 0;
355 }
356 
357 static int
358 init_mem_pool(void)
359 {
360     uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
361     uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
362     uint32_t nb_tx_queue = nb_lcores;
363     uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;
364 
365     unsigned nb_mbuf = RTE_MAX (
366         (nb_rx_queue*RX_QUEUE_SIZE          +
367         nb_ports*nb_lcores*MAX_PKT_BURST    +
368         nb_ports*nb_tx_queue*TX_QUEUE_SIZE  +
369         nb_lcores*MEMPOOL_CACHE_SIZE +
370 #ifdef FF_KNI
371         nb_ports*KNI_MBUF_MAX +
372         nb_ports*KNI_QUEUE_SIZE +
373 #endif
374         nb_lcores*nb_ports*DISPATCH_RING_SIZE),
375         (unsigned)8192);
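    /*
     * Illustrative sizing (not part of the original source): with one port,
     * one lcore and lcore_conf.nb_rx_queue == 1, the sum above is
     * 512 + 32 + 512 + 256 + 2048 = 3360 (7456 with FF_KNI enabled), so the
     * RTE_MAX() clamp raises the pool to the 8192-mbuf floor.
     */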
376 
377     unsigned socketid = 0;
378     uint16_t i, lcore_id;
379     char s[64];
380 
381     for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
382         lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
383         if (numa_on) {
384             socketid = rte_lcore_to_socket_id(lcore_id);
385         }
386 
387         if (socketid >= NB_SOCKETS) {
388             rte_exit(EXIT_FAILURE, "Socket %u of lcore %u is out of range %d\n",
389                 socketid, lcore_id, NB_SOCKETS);
390         }
391 
392         if (pktmbuf_pool[socketid] != NULL) {
393             continue;
394         }
395 
396         if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
397             snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
398             pktmbuf_pool[socketid] =
399                 rte_pktmbuf_pool_create(s, nb_mbuf,
400                     MEMPOOL_CACHE_SIZE, 0,
401                     RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
402         } else {
403             snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
404             pktmbuf_pool[socketid] = rte_mempool_lookup(s);
405         }
406 
407         if (pktmbuf_pool[socketid] == NULL) {
408             rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
409         } else {
410             printf("create mbuf pool on socket %d\n", socketid);
411         }
412     }
413 
414     return 0;
415 }
416 
417 static struct rte_ring *
418 create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
419 {
420     struct rte_ring *ring;
421 
422     if (name == NULL) {
423         rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
424     }
425 
426     if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
427         ring = rte_ring_create(name, count, socket_id, flags);
428     } else {
429         ring = rte_ring_lookup(name);
430     }
431 
432     if (ring == NULL) {
433         rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
434     }
435 
436     return ring;
437 }
438 
439 static int
440 init_dispatch_ring(void)
441 {
442     int j;
443     char name_buf[RTE_RING_NAMESIZE];
444     int queueid;
445 
446     unsigned socketid = lcore_conf.socket_id;
447 
448     /* Create ring according to ports actually being used. */
449     int nb_ports = ff_global_cfg.dpdk.nb_ports;
450     for (j = 0; j < nb_ports; j++) {
451         uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
452         struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
453         int nb_queues = pconf->nb_lcores;
454         if (dispatch_ring[portid] == NULL) {
455             snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);
456 
457             dispatch_ring[portid] = rte_zmalloc(name_buf,
458                 sizeof(struct rte_ring *) * nb_queues,
459                 RTE_CACHE_LINE_SIZE);
460             if (dispatch_ring[portid] == NULL) {
461                 rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
462                     "failed\n", name_buf);
463             }
464         }
465 
466         for(queueid = 0; queueid < nb_queues; ++queueid) {
467             snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
468                 portid, queueid);
469             dispatch_ring[portid][queueid] = create_ring(name_buf,
470                 DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);
471 
472             if (dispatch_ring[portid][queueid] == NULL)
473                 rte_panic("create ring:%s failed!\n", name_buf);
474 
475             printf("created ring %s, %u entries are now free\n",
476                 name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
477         }
478     }
479 
480     return 0;
481 }
482 
483 static void
484 ff_msg_init(struct rte_mempool *mp,
485     __attribute__((unused)) void *opaque_arg,
486     void *obj, __attribute__((unused)) unsigned i)
487 {
488     struct ff_msg *msg = (struct ff_msg *)obj;
489     msg->msg_type = FF_UNKNOWN;
490     msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
491     msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
492 }
493 
494 static int
495 init_msg_ring(void)
496 {
497     uint16_t i;
498     uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
499     unsigned socketid = lcore_conf.socket_id;
500 
501     /* Create message buffer pool */
502     if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
503         message_pool = rte_mempool_create(FF_MSG_POOL,
504            MSG_RING_SIZE * 2 * nb_procs,
505            MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
506            NULL, NULL, ff_msg_init, NULL,
507            socketid, 0);
508     } else {
509         message_pool = rte_mempool_lookup(FF_MSG_POOL);
510     }
511 
512     if (message_pool == NULL) {
513         rte_panic("Create msg mempool failed\n");
514     }
515 
516     for(i = 0; i < nb_procs; ++i) {
517         snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
518             "%s%u", FF_MSG_RING_IN, i);
519         snprintf(msg_ring[i].ring_name[1], RTE_RING_NAMESIZE,
520             "%s%u", FF_MSG_RING_OUT, i);
521 
522         msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
523             MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
524         if (msg_ring[i].ring[0] == NULL)
525             rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[0]);
526 
527         msg_ring[i].ring[1] = create_ring(msg_ring[i].ring_name[1],
528             MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
529         if (msg_ring[i].ring[1] == NULL)
530             rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[1]);
531     }
532 
533     return 0;
534 }
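
/*
 * Illustrative only (not part of the original source): a minimal sketch of
 * the message round-trip a control process performs against the rings built
 * in init_msg_ring(). It assumes the ff_msg layout initialized by
 * ff_msg_init() and uses only standard rte_mempool/rte_ring calls; error
 * handling is elided. The guard macro is hypothetical and never defined.
 */
#ifdef FF_MSG_RING_EXAMPLE /* hypothetical guard, never defined */
static int
example_send_top_msg(uint16_t proc_id, struct ff_top_args *top)
{
    void *obj;
    struct ff_msg *msg;
    int result;

    /* Take a preinitialized message from the shared pool. */
    if (rte_mempool_get(message_pool, &obj) < 0)
        return -1;
    msg = (struct ff_msg *)obj;
    msg->msg_type = FF_TOP;

    /* Hand it to the f-stack lcore... */
    if (rte_ring_enqueue(msg_ring[proc_id].ring[0], msg) < 0) {
        rte_mempool_put(message_pool, obj);
        return -1;
    }

    /* ...and busy-poll the reply ring until handle_msg() answers. */
    while (rte_ring_dequeue(msg_ring[proc_id].ring[1], &obj) != 0)
        ;
    msg = (struct ff_msg *)obj;
    result = msg->result;
    if (result == 0)
        *top = msg->top;

    rte_mempool_put(message_pool, obj);
    return result == 0 ? 0 : -1;
}
#endif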
535 
536 #ifdef FF_KNI
537 static int
538 init_kni(void)
539 {
540     int nb_ports = rte_eth_dev_count();
541     kni_accept = 0;
542     if(strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
543         kni_accept = 1;
544 
545     ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
546         ff_global_cfg.kni.udp_port);
547 
548     unsigned socket_id = lcore_conf.socket_id;
549     struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];
550 
551     nb_ports = ff_global_cfg.dpdk.nb_ports;
552     int i, ret;
553     for (i = 0; i < nb_ports; i++) {
554         uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
555         ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
556     }
557 
558     return 0;
559 }
560 #endif
561 
562 static void
563 set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
564 {
565     if (reta_size == 0) {
566         return;
567     }
568 
569     int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
570     struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];
571 
572     /* config HW indirection table */
573     unsigned i, j, hash=0;
574     for (i = 0; i < reta_conf_size; i++) {
575         reta_conf[i].mask = ~0ULL;
576         for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
577             reta_conf[i].reta[j] = hash++ % nb_queues;
578         }
579     }
580 
581     if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
582         rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
583             port_id);
584     }
585 }
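
/*
 * Illustrative only (not part of the original source): reading the
 * indirection table back with rte_eth_dev_rss_reta_query() to verify what
 * set_rss_table() programmed. A debug aid assuming the same reta_conf
 * layout as above; the guard macro is hypothetical and never defined.
 */
#ifdef FF_RSS_RETA_DEBUG /* hypothetical guard, never defined */
static void
dump_rss_table(uint16_t port_id, uint16_t reta_size)
{
    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];
    unsigned i, j;

    memset(reta_conf, 0, sizeof(reta_conf));
    for (i = 0; i < (unsigned)reta_conf_size; i++)
        reta_conf[i].mask = ~0ULL;

    if (rte_eth_dev_rss_reta_query(port_id, reta_conf, reta_size) != 0)
        return;

    for (i = 0; i < (unsigned)reta_conf_size; i++)
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++)
            printf("port[%d] reta[%u] -> queue %u\n", port_id,
                (unsigned)(i * RTE_RETA_GROUP_SIZE + j),
                reta_conf[i].reta[j]);
}
#endif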
586 
587 static int
588 init_port_start(void)
589 {
590     int nb_ports = ff_global_cfg.dpdk.nb_ports;
591     unsigned socketid = 0;
592     struct rte_mempool *mbuf_pool;
593     uint16_t i;
594 
595     for (i = 0; i < nb_ports; i++) {
596         uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
597         struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
598         uint16_t nb_queues = pconf->nb_lcores;
599 
600         struct rte_eth_dev_info dev_info;
601         rte_eth_dev_info_get(port_id, &dev_info);
602 
603         if (nb_queues > dev_info.max_rx_queues) {
604             rte_exit(EXIT_FAILURE, "nb_queues[%u] is greater than max_rx_queues[%u]\n",
605                 nb_queues,
606                 dev_info.max_rx_queues);
607         }
608 
609         if (nb_queues > dev_info.max_tx_queues) {
610             rte_exit(EXIT_FAILURE, "nb_queues[%u] is greater than max_tx_queues[%u]\n",
611                 nb_queues,
612                 dev_info.max_tx_queues);
613         }
614 
615         struct ether_addr addr;
616         rte_eth_macaddr_get(port_id, &addr);
617         printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
618                    " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
619                 (unsigned)port_id,
620                 addr.addr_bytes[0], addr.addr_bytes[1],
621                 addr.addr_bytes[2], addr.addr_bytes[3],
622                 addr.addr_bytes[4], addr.addr_bytes[5]);
623 
624         rte_memcpy(pconf->mac,
625             addr.addr_bytes, ETHER_ADDR_LEN);
626 
627         /* Clear txq_flags - we do not need multi-mempool and refcnt */
628         dev_info.default_txconf.txq_flags = ETH_TXQ_FLAGS_NOMULTMEMP |
629             ETH_TXQ_FLAGS_NOREFCOUNT;
630 
631         /* Disable features that are not supported by port's HW */
632         if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM)) {
633             dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMUDP;
634         }
635 
636         if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
637             dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMTCP;
638         }
639 
640         if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM)) {
641             dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMSCTP;
642         }
643 
644         if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) {
645             dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL;
646         }
647 
648         if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) &&
649             !(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_TSO)) {
650             dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
651         }
652 
653         struct rte_eth_conf port_conf = {0};
654 
655         /* Set RSS mode */
656         port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
657         port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK;
658         port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
659         port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;
660 
661         /* Set Rx VLAN stripping */
662         if (ff_global_cfg.dpdk.vlan_strip) {
663             if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
664                 port_conf.rxmode.hw_vlan_strip = 1;
665             }
666         }
667 
668         /* Enable HW CRC stripping */
669         port_conf.rxmode.hw_strip_crc = 1;
670 
671         /* FIXME: enable TCP LRO? */
672         #if 0
673         if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
674             printf("LRO is supported\n");
675             port_conf.rxmode.enable_lro = 1;
676             pconf->hw_features.rx_lro = 1;
677         }
678         #endif
679 
680         /* Set Rx checksum checking */
681         if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
682             (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
683             (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
684             printf("RX checksum offload supported\n");
685             port_conf.rxmode.hw_ip_checksum = 1;
686             pconf->hw_features.rx_csum = 1;
687         }
688 
689         if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
690             printf("TX ip checksum offload supported\n");
691             pconf->hw_features.tx_csum_ip = 1;
692         }
693 
694         if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
695             (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
696             printf("TX TCP&UDP checksum offload supported\n");
697             pconf->hw_features.tx_csum_l4 = 1;
698         }
699 
700         if (ff_global_cfg.dpdk.tso) {
701             if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
702                 printf("TSO is supported\n");
703                 pconf->hw_features.tx_tso = 1;
704             }
705         } else {
706             printf("TSO is disabled\n");
707         }
708 
709         if (dev_info.reta_size) {
710             /* reta size must be power of 2 */
711             assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);
712 
713             rss_reta_size[port_id] = dev_info.reta_size;
714             printf("port[%d]: rss table size: %d\n", port_id,
715                 dev_info.reta_size);
716         }
717 
718         if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
719             continue;
720         }
721 
722         int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
723         if (ret != 0) {
724             return ret;
725         }
726         uint16_t q;
727         for (q = 0; q < nb_queues; q++) {
728             if (numa_on) {
729                 uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
730                 socketid = rte_lcore_to_socket_id(lcore_id);
731             }
732             mbuf_pool = pktmbuf_pool[socketid];
733 
734             ret = rte_eth_tx_queue_setup(port_id, q, TX_QUEUE_SIZE,
735                 socketid, &dev_info.default_txconf);
736             if (ret < 0) {
737                 return ret;
738             }
739 
740             ret = rte_eth_rx_queue_setup(port_id, q, RX_QUEUE_SIZE,
741                 socketid, &dev_info.default_rxconf, mbuf_pool);
742             if (ret < 0) {
743                 return ret;
744             }
745         }
746 
747         ret = rte_eth_dev_start(port_id);
748         if (ret < 0) {
749             return ret;
750         }
751 
752         if (nb_queues > 1) {
753             /* set HW rss hash function to Toeplitz. */
754             if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
755                 struct rte_eth_hash_filter_info info = {0};
756                 info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
757                 info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;
758 
759                 if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
760                     RTE_ETH_FILTER_SET, &info) < 0) {
761                     rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
762                         port_id);
763                 }
764             }
765 
766             set_rss_table(port_id, dev_info.reta_size, nb_queues);
767         }
768 
769         /* Enable RX in promiscuous mode for the Ethernet device. */
770         if (ff_global_cfg.dpdk.promiscuous) {
771             rte_eth_promiscuous_enable(port_id);
772             ret = rte_eth_promiscuous_get(port_id);
773             if (ret == 1) {
774                 printf("set port %u to promiscuous mode ok\n", port_id);
775             } else {
776                 printf("set port %u to promiscuous mode error\n", port_id);
777             }
778         }
779 
780         /* Enable pcap dump */
781         if (pconf->pcap) {
782             ff_enable_pcap(pconf->pcap);
783         }
784     }
785 
786     if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
787         check_all_ports_link_status();
788     }
789 
790     return 0;
791 }
792 
793 static int
794 init_clock(void)
795 {
796     rte_timer_subsystem_init();
797     uint64_t hz = rte_get_timer_hz();
798     uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
799     uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;
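    /*
     * Illustrative arithmetic (not from the original source): with a 2.5 GHz
     * TSC (hz) and freebsd.hz = 100, intrs = 1000/100 = 10 ms per tick, and
     * tsc rounds up to 2,500,000 cycles/ms * 10 = 25,000,000 cycles between
     * ff_hardclock_job() invocations.
     */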
800 
801     rte_timer_init(&freebsd_clock);
802     rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
803         rte_lcore_id(), &ff_hardclock_job, NULL);
804 
805     ff_update_current_ts();
806 
807     return 0;
808 }
809 
810 int
811 ff_dpdk_init(int argc, char **argv)
812 {
813     if (ff_global_cfg.dpdk.nb_procs < 1 ||
814         ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
815         ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
816         ff_global_cfg.dpdk.proc_id < 0) {
817         printf("invalid config: num_procs[%d] or proc_id[%d]\n",
818             ff_global_cfg.dpdk.nb_procs,
819             ff_global_cfg.dpdk.proc_id);
820         exit(1);
821     }
822 
823     int ret = rte_eal_init(argc, argv);
824     if (ret < 0) {
825         rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
826     }
827 
828     numa_on = ff_global_cfg.dpdk.numa_on;
829 
830     idle_sleep = ff_global_cfg.dpdk.idle_sleep;
831 
832     init_lcore_conf();
833 
834     init_mem_pool();
835 
836     init_dispatch_ring();
837 
838     init_msg_ring();
839 
840 #ifdef FF_KNI
841     enable_kni = ff_global_cfg.kni.enable;
842     if (enable_kni) {
843         init_kni();
844     }
845 #endif
846 
847     ret = init_port_start();
848     if (ret < 0) {
849         rte_exit(EXIT_FAILURE, "init_port_start failed\n");
850     }
851 
852     init_clock();
853 
854     return 0;
855 }
856 
857 static void
858 ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
859 {
860     uint8_t rx_csum = ctx->hw_features.rx_csum;
861     if (rx_csum) {
862         if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
863             rte_pktmbuf_free(pkt);
864             return;
865         }
866     }
867 
868     /*
869      * FIXME: should we save pkt->vlan_tci
870      * if (pkt->ol_flags & PKT_RX_VLAN_PKT)
871      */
872 
873     void *data = rte_pktmbuf_mtod(pkt, void*);
874     uint16_t len = rte_pktmbuf_data_len(pkt);
875 
876     void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
877     if (hdr == NULL) {
878         rte_pktmbuf_free(pkt);
879         return;
880     }
881 
882     struct rte_mbuf *pn = pkt->next;
883     void *prev = hdr;
884     while(pn != NULL) {
885         data = rte_pktmbuf_mtod(pn, void*);
886         len = rte_pktmbuf_data_len(pn);
887 
888         void *mb = ff_mbuf_get(prev, data, len);
889         if (mb == NULL) {
890             ff_mbuf_free(hdr);
891             rte_pktmbuf_free(pkt);
892             return;
893         }
894         pn = pn->next;
895         prev = mb;
896     }
897 
898     ff_veth_process_packet(ctx->ifp, hdr);
899 }
900 
901 static enum FilterReturn
902 protocol_filter(const void *data, uint16_t len)
903 {
904     if(len < ETHER_HDR_LEN)
905         return FILTER_UNKNOWN;
906 
907     const struct ether_hdr *hdr;
908     hdr = (const struct ether_hdr *)data;
909 
910     if(ntohs(hdr->ether_type) == ETHER_TYPE_ARP)
911         return FILTER_ARP;
912 
913 #ifndef FF_KNI
914     return FILTER_UNKNOWN;
915 #else
916     if (!enable_kni) {
917         return FILTER_UNKNOWN;
918     }
919 
920     if(ntohs(hdr->ether_type) != ETHER_TYPE_IPv4)
921         return FILTER_UNKNOWN;
922 
923     return ff_kni_proto_filter(data + ETHER_HDR_LEN,
924         len - ETHER_HDR_LEN);
925 #endif
926 }
927 
928 static inline void
929 pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
930 {
932     void *src, *dst;
933 
934     dst = rte_pktmbuf_mtod(mi, void *);
935     src = rte_pktmbuf_mtod(m, void *);
936 
937     mi->data_len = m->data_len;
938     rte_memcpy(dst, src, m->data_len);
939 
940     mi->port = m->port;
941     mi->vlan_tci = m->vlan_tci;
942     mi->vlan_tci_outer = m->vlan_tci_outer;
943     mi->tx_offload = m->tx_offload;
944     mi->hash = m->hash;
945     mi->ol_flags = m->ol_flags;
946     mi->packet_type = m->packet_type;
947 }
948 
949 /* copied from rte_pktmbuf_clone */
950 static inline struct rte_mbuf *
951 pktmbuf_deep_clone(const struct rte_mbuf *md,
952     struct rte_mempool *mp)
953 {
954     struct rte_mbuf *mc, *mi, **prev;
955     uint32_t pktlen;
956     uint8_t nseg;
957 
958     if (unlikely ((mc = rte_pktmbuf_alloc(mp)) == NULL))
959         return NULL;
960 
961     mi = mc;
962     prev = &mi->next;
963     pktlen = md->pkt_len;
964     nseg = 0;
965 
966     do {
967         nseg++;
968         pktmbuf_deep_attach(mi, md);
969         *prev = mi;
970         prev = &mi->next;
971     } while ((md = md->next) != NULL &&
972         (mi = rte_pktmbuf_alloc(mp)) != NULL);
973 
974     *prev = NULL;
975     mc->nb_segs = nseg;
976     mc->pkt_len = pktlen;
977 
978     /* Allocation of a new segment failed mid-chain */
979     if (unlikely (mi == NULL)) {
980         rte_pktmbuf_free(mc);
981         return NULL;
982     }
983 
984     __rte_mbuf_sanity_check(mc, 1);
985     return mc;
986 }
987 
988 static inline void
989 process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
990     uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
991 {
992     struct lcore_conf *qconf = &lcore_conf;
993     uint16_t nb_queues = qconf->nb_queue_list[port_id];
994 
995     uint16_t i;
996     for (i = 0; i < count; i++) {
997         struct rte_mbuf *rtem = bufs[i];
998 
999         if (unlikely(qconf->pcap[port_id] != NULL)) {
1000             if (!pkts_from_ring) {
1001                 ff_dump_packets(qconf->pcap[port_id], rtem);
1002             }
1003         }
1004 
1005         void *data = rte_pktmbuf_mtod(rtem, void*);
1006         uint16_t len = rte_pktmbuf_data_len(rtem);
1007 
1008         if (!pkts_from_ring) {
1009             ff_traffic.rx_packets++;
1010             ff_traffic.rx_bytes += len;
1011         }
1012 
1013         if (!pkts_from_ring && packet_dispatcher) {
1014             int ret = (*packet_dispatcher)(data, len, queue_id, nb_queues);
1015             if (ret < 0 || ret >= nb_queues) {
1016                 rte_pktmbuf_free(rtem);
1017                 continue;
1018             }
1019 
1020             if (ret != queue_id) {
1021                 ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
1022                 if (ret < 0)
1023                     rte_pktmbuf_free(rtem);
1024 
1025                 continue;
1026             }
1027         }
1028 
1029         enum FilterReturn filter = protocol_filter(data, len);
1030         if (filter == FILTER_ARP) {
1031             struct rte_mempool *mbuf_pool;
1032             struct rte_mbuf *mbuf_clone;
1033             if (!pkts_from_ring) {
1034                 uint16_t j;
1035                 for(j = 0; j < nb_queues; ++j) {
1036                     if(j == queue_id)
1037                         continue;
1038 
1039                     unsigned socket_id = 0;
1040                     if (numa_on) {
1041                         uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
1042                         socket_id = rte_lcore_to_socket_id(lcore_id);
1043                     }
1044                     mbuf_pool = pktmbuf_pool[socket_id];
1045                     mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
1046                     if(mbuf_clone) {
1047                         int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
1048                             mbuf_clone);
1049                         if (ret < 0)
1050                             rte_pktmbuf_free(mbuf_clone);
1051                     }
1052                 }
1053             }
1054 
1055 #ifdef FF_KNI
1056             if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
1057                 mbuf_pool = pktmbuf_pool[qconf->socket_id];
1058                 mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
1059                 if(mbuf_clone) {
1060                     ff_kni_enqueue(port_id, mbuf_clone);
1061                 }
1062             }
1063 #endif
1064             ff_veth_input(ctx, rtem);
1065 #ifdef FF_KNI
1066         } else if (enable_kni &&
1067             ((filter == FILTER_KNI && kni_accept) ||
1068             (filter == FILTER_UNKNOWN && !kni_accept)) ) {
1069             ff_kni_enqueue(port_id, rtem);
1070 #endif
1071         } else {
1072             ff_veth_input(ctx, rtem);
1073         }
1074     }
1075 }
1076 
1077 static inline int
1078 process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
1079     struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
1080 {
1081     /* Read packets from the dispatch ring and process them */
1082     uint16_t nb_rb;
1083     nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
1084         (void **)pkts_burst, MAX_PKT_BURST, NULL);
1085 
1086     if(nb_rb > 0) {
1087         process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
1088     }
1089 
1090     return 0;
1091 }
1092 
1093 static inline void
1094 handle_sysctl_msg(struct ff_msg *msg)
1095 {
1096     int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
1097         msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
1098         msg->sysctl.newlen);
1099 
1100     if (ret < 0) {
1101         msg->result = errno;
1102     } else {
1103         msg->result = 0;
1104     }
1105 }
1106 
1107 static inline void
1108 handle_ioctl_msg(struct ff_msg *msg)
1109 {
1110     int fd, ret;
1111     fd = ff_socket(AF_INET, SOCK_DGRAM, 0);
1112     if (fd < 0) {
1113         ret = -1;
1114         goto done;
1115     }
1116 
1117     ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);
1118 
1119     ff_close(fd);
1120 
1121 done:
1122     if (ret < 0) {
1123         msg->result = errno;
1124     } else {
1125         msg->result = 0;
1126     }
1127 }
1128 
1129 static inline void
1130 handle_route_msg(struct ff_msg *msg)
1131 {
1132     int ret = ff_rtioctl(msg->route.fib, msg->route.data,
1133         &msg->route.len, msg->route.maxlen);
1134     if (ret < 0) {
1135         msg->result = errno;
1136     } else {
1137         msg->result = 0;
1138     }
1139 }
1140 
1141 static inline void
1142 handle_top_msg(struct ff_msg *msg)
1143 {
1144     msg->top = ff_top_status;
1145     msg->result = 0;
1146 }
1147 
1148 #ifdef FF_NETGRAPH
1149 static inline void
1150 handle_ngctl_msg(struct ff_msg *msg)
1151 {
1152     int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
1153     if (ret < 0) {
1154         msg->result = errno;
1155     } else {
1156         msg->result = 0;
1157         msg->ngctl.ret = ret;
1158     }
1159 }
1160 #endif
1161 
1162 #ifdef FF_IPFW
1163 static inline void
1164 handle_ipfw_msg(struct ff_msg *msg)
1165 {
1166     int fd, ret;
1167     fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
1168     if (fd < 0) {
1169         ret = -1;
1170         goto done;
1171     }
1172 
1173     switch (msg->ipfw.cmd) {
1174         case FF_IPFW_GET:
1175             ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
1176                 msg->ipfw.optname, msg->ipfw.optval,
1177                 msg->ipfw.optlen);
1178             break;
1179         case FF_IPFW_SET:
1180             ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
1181                 msg->ipfw.optname, msg->ipfw.optval,
1182                 *(msg->ipfw.optlen));
1183             break;
1184         default:
1185             ret = -1;
1186             errno = ENOTSUP;
1187             break;
1188     }
1189 
1190     ff_close(fd);
1191 
1192 done:
1193     if (ret < 0) {
1194         msg->result = errno;
1195     } else {
1196         msg->result = 0;
1197     }
1198 }
1199 #endif
1200 
1201 static inline void
1202 handle_traffic_msg(struct ff_msg *msg)
1203 {
1204     msg->traffic = ff_traffic;
1205     msg->result = 0;
1206 }
1207 
1208 static inline void
1209 handle_default_msg(struct ff_msg *msg)
1210 {
1211     msg->result = ENOTSUP;
1212 }
1213 
1214 static inline void
1215 handle_msg(struct ff_msg *msg, uint16_t proc_id)
1216 {
1217     switch (msg->msg_type) {
1218         case FF_SYSCTL:
1219             handle_sysctl_msg(msg);
1220             break;
1221         case FF_IOCTL:
1222             handle_ioctl_msg(msg);
1223             break;
1224         case FF_ROUTE:
1225             handle_route_msg(msg);
1226             break;
1227         case FF_TOP:
1228             handle_top_msg(msg);
1229             break;
1230 #ifdef FF_NETGRAPH
1231         case FF_NGCTL:
1232             handle_ngctl_msg(msg);
1233             break;
1234 #endif
1235 #ifdef FF_IPFW
1236         case FF_IPFW_CTL:
1237             handle_ipfw_msg(msg);
1238             break;
1239 #endif
1240         case FF_TRAFFIC:
1241             handle_traffic_msg(msg);
1242             break;
1243         default:
1244             handle_default_msg(msg);
1245             break;
1246     }
1247     rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
1248 }
1249 
1250 static inline int
1251 process_msg_ring(uint16_t proc_id)
1252 {
1253     void *msg;
1254     int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);
1255 
1256     if (unlikely(ret == 0)) {
1257         handle_msg((struct ff_msg *)msg, proc_id);
1258     }
1259 
1260     return 0;
1261 }
1262 
1263 /* Send burst of packets on an output interface */
1264 static inline int
1265 send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
1266 {
1267     struct rte_mbuf **m_table;
1268     int ret;
1269     uint16_t queueid;
1270 
1271     queueid = qconf->tx_queue_id[port];
1272     m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;
1273 
1274     if (unlikely(qconf->pcap[port] != NULL)) {
1275         uint16_t i;
1276         for (i = 0; i < n; i++) {
1277             ff_dump_packets(qconf->pcap[port], m_table[i]);
1278         }
1279     }
1280 
1281     ff_traffic.tx_packets += n;
1282     uint16_t i;
1283     for (i = 0; i < n; i++) {
1284         ff_traffic.tx_bytes += rte_pktmbuf_data_len(m_table[i]);
1285     }
1286 
1287     ret = rte_eth_tx_burst(port, queueid, m_table, n);
1288     if (unlikely(ret < n)) {
1289         do {
1290             rte_pktmbuf_free(m_table[ret]);
1291         } while (++ret < n);
1292     }
1293 
1294     return 0;
1295 }
1296 
1297 /* Enqueue a single packet, and send burst if queue is filled */
1298 static inline int
1299 send_single_packet(struct rte_mbuf *m, uint8_t port)
1300 {
1301     uint16_t len;
1302     struct lcore_conf *qconf;
1303 
1304     qconf = &lcore_conf;
1305     len = qconf->tx_mbufs[port].len;
1306     qconf->tx_mbufs[port].m_table[len] = m;
1307     len++;
1308 
1309     /* enough pkts to be sent */
1310     if (unlikely(len == MAX_PKT_BURST)) {
1311         send_burst(qconf, MAX_PKT_BURST, port);
1312         len = 0;
1313     }
1314 
1315     qconf->tx_mbufs[port].len = len;
1316     return 0;
1317 }
1318 
1319 int
1320 ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
1321     int total)
1322 {
1323     struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
1324     struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
1325     if (head == NULL) {
1326         ff_mbuf_free(m);
1327         return -1;
1328     }
1329 
1330     head->pkt_len = total;
1331     head->nb_segs = 0;
1332 
1333     int off = 0;
1334     struct rte_mbuf *cur = head, *prev = NULL;
1335     while(total > 0) {
1336         if (cur == NULL) {
1337             cur = rte_pktmbuf_alloc(mbuf_pool);
1338             if (cur == NULL) {
1339                 rte_pktmbuf_free(head);
1340                 ff_mbuf_free(m);
1341                 return -1;
1342             }
1343         }
1344 
1345         if (prev != NULL) {
1346             prev->next = cur;
1347         }
1348         head->nb_segs++;
1349 
1350         prev = cur;
1351         void *data = rte_pktmbuf_mtod(cur, void*);
1352         int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
1353         int ret = ff_mbuf_copydata(m, data, off, len);
1354         if (ret < 0) {
1355             rte_pktmbuf_free(head);
1356             ff_mbuf_free(m);
1357             return -1;
1358         }
1359 
1361         cur->data_len = len;
1362         off += len;
1363         total -= len;
1364         cur = NULL;
1365     }
1366 
1367     struct ff_tx_offload offload = {0};
1368     ff_mbuf_tx_offload(m, &offload);
1369 
1370     void *data = rte_pktmbuf_mtod(head, void*);
1371 
1372     if (offload.ip_csum) {
1373         /* ipv6 not supported yet */
1374         struct ipv4_hdr *iph;
1375         int iph_len;
1376         iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
1377         iph_len = (iph->version_ihl & 0x0f) << 2;
1378 
1379         head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
1380         head->l2_len = ETHER_HDR_LEN;
1381         head->l3_len = iph_len;
1382     }
1383 
1384     if (ctx->hw_features.tx_csum_l4) {
1385         struct ipv4_hdr *iph;
1386         int iph_len;
1387         iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
1388         iph_len = (iph->version_ihl & 0x0f) << 2;
1389 
1390         if (offload.tcp_csum) {
1391             head->ol_flags |= PKT_TX_TCP_CKSUM;
1392             head->l2_len = ETHER_HDR_LEN;
1393             head->l3_len = iph_len;
1394         }
1395 
1396         /*
1397          *  TCP segmentation offload.
1398          *
1399          *  - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
1400          *    implies PKT_TX_TCP_CKSUM)
1401          *  - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
1402          *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
1403          *    write the IP checksum to 0 in the packet
1404          *  - fill the mbuf offload information: l2_len,
1405          *    l3_len, l4_len, tso_segsz
1406          *  - calculate the pseudo header checksum without taking ip_len
1407          *    in account, and set it in the TCP header. Refer to
1408          *    rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
1409          *    used as helpers.
1410          */
1411         if (offload.tso_seg_size) {
1412             struct tcp_hdr *tcph;
1413             int tcph_len;
1414             tcph = (struct tcp_hdr *)((char *)iph + iph_len);
1415             tcph_len = (tcph->data_off & 0xf0) >> 2;
1416             tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);
1417 
1418             head->ol_flags |= PKT_TX_TCP_SEG;
1419             head->l4_len = tcph_len;
1420             head->tso_segsz = offload.tso_seg_size;
1421         }
1422 
1423         if (offload.udp_csum) {
1424             head->ol_flags |= PKT_TX_UDP_CKSUM;
1425             head->l2_len = ETHER_HDR_LEN;
1426             head->l3_len = iph_len;
1427         }
1428     }
1429 
1430     ff_mbuf_free(m);
1431 
1432     return send_single_packet(head, ctx->port_id);
1433 }
1434 
1435 static int
1436 main_loop(void *arg)
1437 {
1438     struct loop_routine *lr = (struct loop_routine *)arg;
1439 
1440     struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1441     uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc;
1442     int i, j, nb_rx, idle;
1443     uint16_t port_id, queue_id;
1444     struct lcore_conf *qconf;
1445     const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
1446         US_PER_S * BURST_TX_DRAIN_US;
1447     struct ff_dpdk_if_context *ctx;
1448 
1449     prev_tsc = 0;
1450     usch_tsc = 0;
1451 
1452     qconf = &lcore_conf;
1453 
1454     while (1) {
1455         cur_tsc = rte_rdtsc();
1456         if (unlikely(freebsd_clock.expire < cur_tsc)) {
1457             rte_timer_manage();
1458         }
1459 
1460         idle = 1;
1461         sys_tsc = 0;
1462         usr_tsc = 0;
1463 
1464         /*
1465          * TX burst queue drain
1466          */
1467         diff_tsc = cur_tsc - prev_tsc;
1468         if (unlikely(diff_tsc > drain_tsc)) {
1469             for (i = 0; i < qconf->nb_tx_port; i++) {
1470                 port_id = qconf->tx_port_id[i];
1471                 if (qconf->tx_mbufs[port_id].len == 0)
1472                     continue;
1473 
1474                 idle = 0;
1475 
1476                 send_burst(qconf,
1477                     qconf->tx_mbufs[port_id].len,
1478                     port_id);
1479                 qconf->tx_mbufs[port_id].len = 0;
1480             }
1481 
1482             prev_tsc = cur_tsc;
1483         }
1484 
1485         /*
1486          * Read packet from RX queues
1487          */
1488         for (i = 0; i < qconf->nb_rx_queue; ++i) {
1489             port_id = qconf->rx_queue_list[i].port_id;
1490             queue_id = qconf->rx_queue_list[i].queue_id;
1491             ctx = veth_ctx[port_id];
1492 
1493 #ifdef FF_KNI
1494             if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
1495                 ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
1496             }
1497 #endif
1498 
1499             process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);
1500 
1501             nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
1502                 MAX_PKT_BURST);
1503             if (nb_rx == 0)
1504                 continue;
1505 
1506             idle = 0;
1507 
1508             /* Prefetch first packets */
1509             for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
1510                 rte_prefetch0(rte_pktmbuf_mtod(
1511                         pkts_burst[j], void *));
1512             }
1513 
1514             /* Prefetch and handle already prefetched packets */
1515             for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
1516                 rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
1517                         j + PREFETCH_OFFSET], void *));
1518                 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
1519             }
1520 
1521             /* Handle remaining prefetched packets */
1522             for (; j < nb_rx; j++) {
1523                 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
1524             }
1525         }
1526 
1527         process_msg_ring(qconf->proc_id);
1528 
1529         div_tsc = rte_rdtsc();
1530 
1531         if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc > drain_tsc))) {
1532             usch_tsc = cur_tsc;
1533             lr->loop(lr->arg);
1534         }
1535 
1536         idle_sleep_tsc = rte_rdtsc();
1537         if (likely(idle && idle_sleep)) {
1538             usleep(idle_sleep);
1539             end_tsc = rte_rdtsc();
1540         } else {
1541             end_tsc = idle_sleep_tsc;
1542         }
1543 
1546         if (usch_tsc == cur_tsc) {
1547             usr_tsc = idle_sleep_tsc - div_tsc;
1548         }
1549 
1550         if (!idle) {
1551             sys_tsc = div_tsc - cur_tsc;
1552             ff_top_status.sys_tsc += sys_tsc;
1553         }
1554 
1555         ff_top_status.usr_tsc += usr_tsc;
1556         ff_top_status.work_tsc += end_tsc - cur_tsc;
1557         ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;
1558 
1559         ff_top_status.loops++;
1560     }
1561 
1562     return 0;
1563 }
1564 
1565 int
1566 ff_dpdk_if_up(void) {
1567     int i;
1568     struct lcore_conf *qconf = &lcore_conf;
1569     for (i = 0; i < qconf->nb_tx_port; i++) {
1570         uint16_t port_id = qconf->tx_port_id[i];
1571 
1572         struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
1573         veth_ctx[port_id] = ff_veth_attach(pconf);
1574         if (veth_ctx[port_id] == NULL) {
1575             rte_exit(EXIT_FAILURE, "ff_veth_attach failed\n");
1576         }
1577     }
1578 
1579     return 0;
1580 }
1581 
1582 void
1583 ff_dpdk_run(loop_func_t loop, void *arg) {
1584     struct loop_routine *lr = rte_malloc(NULL,
1585         sizeof(struct loop_routine), 0);
1586     lr->loop = loop;
1587     lr->arg = arg;
1588     rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
1589     rte_eal_mp_wait_lcore();
1590     rte_free(lr);
1591 }
1592 
1593 void
1594 ff_dpdk_pktmbuf_free(void *m)
1595 {
1596     rte_pktmbuf_free((struct rte_mbuf *)m);
1597 }
1598 
1599 static uint32_t
1600 toeplitz_hash(unsigned keylen, const uint8_t *key,
1601     unsigned datalen, const uint8_t *data)
1602 {
1603     uint32_t hash = 0, v;
1604     u_int i, b;
1605 
1606     /* XXXRW: Perhaps an assertion about key length vs. data length? */
1607 
1608     v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
1609     for (i = 0; i < datalen; i++) {
1610         for (b = 0; b < 8; b++) {
1611             if (data[i] & (1<<(7-b)))
1612                 hash ^= v;
1613             v <<= 1;
1614             if ((i + 4) < keylen &&
1615                 (key[i+4] & (1<<(7-b))))
1616                 v |= 1;
1617         }
1618     }
1619     return (hash);
1620 }
1621 
1622 int
1623 ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
1624     uint16_t sport, uint16_t dport)
1625 {
1626     struct lcore_conf *qconf = &lcore_conf;
1627     struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
1628     uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];
1629 
1630     if (nb_queues <= 1) {
1631         return 1;
1632     }
1633 
1634     uint16_t reta_size = rss_reta_size[ctx->port_id];
1635     uint16_t queueid = qconf->tx_queue_id[ctx->port_id];
1636 
1637     uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
1638         sizeof(dport)];
1639 
1640     unsigned datalen = 0;
1641 
1642     bcopy(&saddr, &data[datalen], sizeof(saddr));
1643     datalen += sizeof(saddr);
1644 
1645     bcopy(&daddr, &data[datalen], sizeof(daddr));
1646     datalen += sizeof(daddr);
1647 
1648     bcopy(&sport, &data[datalen], sizeof(sport));
1649     datalen += sizeof(sport);
1650 
1651     bcopy(&dport, &data[datalen], sizeof(dport));
1652     datalen += sizeof(dport);
1653 
1654     uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
1655         default_rsskey_40bytes, datalen, data);
1656 
1657     return ((hash & (reta_size - 1)) % nb_queues) == queueid;
1658 }
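
/*
 * Illustrative arithmetic (not from the original source): with
 * reta_size = 128 and nb_queues = 4, a Toeplitz hash of 0x0000002a selects
 * reta entry 0x2a & 127 = 42, which set_rss_table() filled with
 * 42 % 4 = 2, so only the lcore serving queue 2 accepts that flow.
 */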
1659 
1660 void
1661 ff_regist_packet_dispatcher(dispatch_func_t func)
1662 {
1663     packet_dispatcher = func;
1664 }
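
/*
 * Illustrative only (not part of the original source): the kind of
 * dispatcher an application might register via
 * ff_regist_packet_dispatcher(). The signature mirrors the call site in
 * process_packets(); the return value picks the target queue, and anything
 * outside [0, nb_queues) drops the packet. Steering by IPv4 source address
 * is an arbitrary example policy; the guard macro is hypothetical and never
 * defined.
 */
#ifdef FF_DISPATCHER_EXAMPLE /* hypothetical guard, never defined */
static int
example_dispatcher(void *data, uint16_t len, uint16_t queue_id,
    uint16_t nb_queues)
{
    /* Too short to parse: keep the packet on the receiving queue. */
    if (len < ETHER_HDR_LEN + sizeof(struct ipv4_hdr))
        return queue_id;

    struct ether_hdr *eth = (struct ether_hdr *)data;
    if (ntohs(eth->ether_type) != ETHER_TYPE_IPv4)
        return queue_id;

    /* Spread flows across queues by source address. */
    struct ipv4_hdr *ip = (struct ipv4_hdr *)(eth + 1);
    return rte_be_to_cpu_32(ip->src_addr) % nb_queues;
}
#endif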
1665 
1666 uint64_t
1667 ff_get_tsc_ns(void)
1668 {
1669     uint64_t cur_tsc = rte_rdtsc();
1670     uint64_t hz = rte_get_tsc_hz();
1671     return ((double)cur_tsc/(double)hz) * NS_PER_S;
1672 }
1673 
1674