/* f-stack/lib/ff_dpdk_if.c (revision d30ea906) */
/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"
#include "ff_memory.h"
#ifdef FF_KNI
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

static int enable_kni;
static int kni_accept;
#endif

static int numa_on;

static unsigned idle_sleep;

static struct rte_timer freebsd_clock;

/* Default RSS hash key, as used by the Mellanox Linux driver. */
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

struct lcore_conf lcore_conf;

struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

static inline int send_single_packet(struct rte_mbuf *m, uint8_t port);

struct ff_msg_ring {
    char ring_name[2][RTE_RING_NAMESIZE];
    /* ring[0]: the lcore reads requests that other processes send */
    /* ring[1]: the lcore writes replies that other processes read */
    struct rte_ring *ring[2];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;
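
/*
 * Usage sketch (illustrative only, not called from this file): this is
 * roughly how a control process such as the ff_top tool talks to a
 * worker. Requests go in on ring[0], replies come back on ring[1], and
 * the message buffers live in message_pool:
 *
 *   struct ff_msg *msg;
 *   if (rte_mempool_get(message_pool, (void **)&msg) == 0) {
 *       msg->msg_type = FF_TOP;
 *       rte_ring_enqueue(msg_ring[proc_id].ring[0], msg);
 *       while (rte_ring_dequeue(msg_ring[proc_id].ring[1],
 *           (void **)&msg) != 0)
 *           ;
 *       ... inspect msg->top, then rte_mempool_put(message_pool, msg);
 *   }
 */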
static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

static struct ff_top_args ff_top_status;
static struct ff_traffic_args ff_traffic;
extern void ff_hardclock(void);

static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
    ff_update_current_ts();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
    #define CHECK_INTERVAL 100 /* 100ms */
    #define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u "
                        "Mbps - %s\n", (int)portid,
                        (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

static int
init_lcore_conf(void)
{
    uint8_t nb_dev_ports = rte_eth_dev_count_avail();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
                 ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    uint16_t proc_id;
    for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) {
        uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id];
        if (!lcore_config[lcore_id].detected) {
            rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
        }
    }

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %d\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        lcore_conf.pcap[port_id] = pconf->pcap;
        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}

static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;

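    /*
     * Rough per-socket mbuf budget: every RX descriptor, per-lcore burst
     * buffers, every TX descriptor, the per-lcore mempool caches and the
     * dispatch rings (plus KNI queues when enabled), with 8192 as a floor.
     */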
    unsigned nb_mbuf = RTE_MAX(
        (nb_rx_queue*RX_QUEUE_SIZE          +
        nb_ports*nb_lcores*MAX_PKT_BURST    +
        nb_ports*nb_tx_queue*TX_QUEUE_SIZE  +
        nb_lcores*MEMPOOL_CACHE_SIZE +
#ifdef FF_KNI
        nb_ports*KNI_MBUF_MAX +
        nb_ports*KNI_QUEUE_SIZE +
#endif
        nb_lcores*nb_ports*DISPATCH_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %u of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }

#ifdef FF_USE_PAGE_ARRAY
        nb_mbuf = RTE_MAX(
            nb_ports*nb_lcores*MAX_PKT_BURST    +
            nb_ports*nb_tx_queue*TX_QUEUE_SIZE  +
            nb_lcores*MEMPOOL_CACHE_SIZE,
            (unsigned)4096);
        ff_init_ref_pool(nb_mbuf, socketid);
#endif
    }

    return 0;
}

static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL) {
        rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ring = rte_ring_create(name, count, socket_id, flags);
    } else {
        ring = rte_ring_lookup(name);
    }

    if (ring == NULL) {
        rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
    }

    return ring;
}

static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create rings according to the ports actually in use. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

static void
ff_msg_init(struct rte_mempool *mp,
    __rte_unused void *opaque_arg,
    void *obj, __rte_unused unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
}

static int
init_msg_ring(void)
{
    uint16_t i;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
           MSG_RING_SIZE * 2 * nb_procs,
           MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
           NULL, NULL, ff_msg_init, NULL,
           socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        snprintf(msg_ring[i].ring_name[1], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_OUT, i);

        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[0]);

        msg_ring[i].ring[1] = create_ring(msg_ring[i].ring_name[1],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[1] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[1]);
    }

    return 0;
}

#ifdef FF_KNI
static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count_avail();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
#endif

static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /*
     * Configure the HW indirection table: spread the queues round-robin
     * across the whole table, e.g. 0,1,2,3,0,1,2,3,... for 4 queues.
     */
    int i, j;
    unsigned hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}

static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        struct rte_eth_dev_info dev_info;
        struct rte_eth_conf port_conf = {0};
        struct rte_eth_rxconf rxq_conf;
        struct rte_eth_txconf txq_conf;

        rte_eth_dev_info_get(port_id, &dev_info);

        if (nb_queues > dev_info.max_rx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                nb_queues,
                dev_info.max_rx_queues);
        }

        if (nb_queues > dev_info.max_tx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                nb_queues,
                dev_info.max_tx_queues);
        }

        struct ether_addr addr;
        rte_eth_macaddr_get(port_id, &addr);
        printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                   " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                (unsigned)port_id,
                addr.addr_bytes[0], addr.addr_bytes[1],
                addr.addr_bytes[2], addr.addr_bytes[3],
                addr.addr_bytes[4], addr.addr_bytes[5]);

        rte_memcpy(pconf->mac,
            addr.addr_bytes, ETHER_ADDR_LEN);

        /* Set RSS mode */
        uint64_t default_rss_hf = ETH_RSS_PROTO_MASK;
        port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
        port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf;
        port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
        port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;
        port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads;
        if (port_conf.rx_adv_conf.rss_conf.rss_hf !=
                ETH_RSS_PROTO_MASK) {
            printf("Port %u modified RSS hash function based on hardware support, "
                    "requested:%#"PRIx64" configured:%#"PRIx64"\n",
                    port_id, default_rss_hf,
                    port_conf.rx_adv_conf.rss_conf.rss_hf);
        }

        if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) {
            port_conf.txmode.offloads |=
                DEV_TX_OFFLOAD_MBUF_FAST_FREE;
        }

        /* Set Rx VLAN stripping */
        if (ff_global_cfg.dpdk.vlan_strip) {
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
            }
        }

        /* Enable HW CRC stripping */
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_KEEP_CRC) {
            port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_KEEP_CRC;
        }

        /* FIXME: enable TCP LRO? */
        #if 0
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
            printf("LRO is supported\n");
            port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
            pconf->hw_features.rx_lro = 1;
        }
        #endif

        /* Set Rx checksum checking */
        if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
            printf("RX checksum offload supported\n");
            port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
            pconf->hw_features.rx_csum = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
            printf("TX ip checksum offload supported\n");
            port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
            pconf->hw_features.tx_csum_ip = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            printf("TX TCP&UDP checksum offload supported\n");
            port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM;
            pconf->hw_features.tx_csum_l4 = 1;
        }

        if (ff_global_cfg.dpdk.tso) {
            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                printf("TSO is supported\n");
                port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
                pconf->hw_features.tx_tso = 1;
            }
        } else {
            printf("TSO is disabled\n");
        }

        if (dev_info.reta_size) {
            /* reta size must be power of 2 */
            assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

            rss_reta_size[port_id] = dev_info.reta_size;
            printf("port[%d]: rss table size: %d\n", port_id,
                dev_info.reta_size);
        }

        if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
            continue;
        }

        int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
        if (ret != 0) {
            return ret;
        }

        static uint16_t nb_rxd = RX_QUEUE_SIZE;
        static uint16_t nb_txd = TX_QUEUE_SIZE;
        ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
        if (ret < 0)
            printf("Could not adjust number of descriptors "
                    "for port%u (%d)\n", (unsigned)port_id, ret);

        uint16_t q;
        for (q = 0; q < nb_queues; q++) {
            if (numa_on) {
                uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                socketid = rte_lcore_to_socket_id(lcore_id);
            }
            mbuf_pool = pktmbuf_pool[socketid];

            txq_conf = dev_info.default_txconf;
            txq_conf.offloads = port_conf.txmode.offloads;
            ret = rte_eth_tx_queue_setup(port_id, q, nb_txd,
                socketid, &txq_conf);
            if (ret < 0) {
                return ret;
            }

            rxq_conf = dev_info.default_rxconf;
            rxq_conf.offloads = port_conf.rxmode.offloads;
            ret = rte_eth_rx_queue_setup(port_id, q, nb_rxd,
                socketid, &rxq_conf, mbuf_pool);
            if (ret < 0) {
                return ret;
            }
        }

        ret = rte_eth_dev_start(port_id);
        if (ret < 0) {
            return ret;
        }

        if (nb_queues > 1) {
            /* set HW rss hash function to Toeplitz. */
            if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                struct rte_eth_hash_filter_info info = {0};
                info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                    RTE_ETH_FILTER_SET, &info) < 0) {
                    rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                        port_id);
                }
            }

            set_rss_table(port_id, dev_info.reta_size, nb_queues);
        }

        /* Enable RX in promiscuous mode for the Ethernet device. */
        if (ff_global_cfg.dpdk.promiscuous) {
            rte_eth_promiscuous_enable(port_id);
            ret = rte_eth_promiscuous_get(port_id);
            if (ret == 1) {
                printf("set port %u to promiscuous mode ok\n", port_id);
            } else {
                printf("set port %u to promiscuous mode error\n", port_id);
            }
        }

        /* Enable pcap dump */
        if (pconf->pcap) {
            ff_enable_pcap(pconf->pcap);
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
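    /*
     * Convert the FreeBSD tick rate into a timer period: intrs is the
     * tick interval in milliseconds (e.g. hz=100 -> 10ms), and tsc is
     * that interval in TSC cycles, using cycles-per-ms rounded up.
     */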
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    idle_sleep = ff_global_cfg.dpdk.idle_sleep;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

#ifdef FF_KNI
    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }
#endif

#ifdef FF_USE_PAGE_ARRAY
    ff_mmap_init();
#endif

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    /*
     * FIXME: should we save pkt->vlan_tci
     * if (pkt->ol_flags & PKT_RX_VLAN_PKT)
     */

    void *data = rte_pktmbuf_mtod(pkt, void*);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void*);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

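/*
 * Classify an incoming frame. ARP is special-cased so that every queue
 * (and, with KNI, the kernel) receives a copy; when KNI is enabled, other
 * IPv4 packets may be diverted to the kernel according to the tcp/udp
 * port filters handled by ff_kni_proto_filter().
 */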
static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct ether_hdr *hdr;
    hdr = (const struct ether_hdr *)data;

    if (ntohs(hdr->ether_type) == ETHER_TYPE_ARP)
        return FILTER_ARP;

#ifndef FF_KNI
    return FILTER_UNKNOWN;
#else
    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ntohs(hdr->ether_type) != ETHER_TYPE_IPv4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data + ETHER_HDR_LEN,
        len - ETHER_HDR_LEN);
#endif
}

static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}

/*
 * Deep-copy variant of rte_pktmbuf_clone(): every segment's payload is
 * copied, so the clone owns its data. Needed when the same packet (e.g.
 * an ARP request) is handed to several queues and to KNI at once.
 */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of a new segment failed mid-chain */
    if (unlikely(mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}

static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(qconf->pcap[port_id] != NULL)) {
            if (!pkts_from_ring) {
                ff_dump_packets(qconf->pcap[port_id], rtem);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void*);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring) {
            ff_traffic.rx_packets++;
            ff_traffic.rx_bytes += len;
        }

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues);
            if (ret == FF_DISPATCH_RESPONSE) {
                rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len;
                send_single_packet(rtem, port_id);
                continue;
            }

            if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
        if (filter == FILTER_ARP) {
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni &&
            ((filter == FILTER_KNI && kni_accept) ||
            (filter == FILTER_UNKNOWN && !kni_accept))) {
            ff_kni_enqueue(port_id, rtem);
#endif
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* Dequeue a burst of packets from the dispatch ring and process them. */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_DGRAM, 0);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_top_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_traffic_msg(struct ff_msg *msg)
{
    msg->traffic = ff_traffic;
    msg->result = 0;
}

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        case FF_TRAFFIC:
            handle_traffic_msg(msg);
            break;
        default:
            handle_default_msg(msg);
            break;
    }
    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(qconf->pcap[port] != NULL)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(qconf->pcap[port], m_table[i]);
        }
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    ff_traffic.tx_packets += ret;
    uint16_t i;
    for (i = 0; i < ret; i++) {
        ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]);
#ifdef FF_USE_PAGE_ARRAY
        if (qconf->tx_mbufs[port].bsd_m_table[i])
            ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs);
#endif
    }
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
#ifdef FF_USE_PAGE_ARRAY
            if (qconf->tx_mbufs[port].bsd_m_table[ret])
                ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]);
#endif
        } while (++ret < n);
    }
    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}

int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
#ifdef FF_USE_PAGE_ARRAY
    struct lcore_conf *qconf = &lcore_conf;
    int len = 0;

    len = ff_if_send_onepkt(ctx, m, total);
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, ctx->port_id);
        len = 0;
    }
    qconf->tx_mbufs[ctx->port_id].len = len;
    return 0;
#endif
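    /*
     * Without the page array, copy the BSD mbuf chain into a freshly
     * allocated DPDK mbuf chain, one RTE_MBUF_DEFAULT_DATAROOM-sized
     * segment at a time, then fill in the TX offload fields.
     */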
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        head->nb_segs++;

        prev = cur;
        void *data = rte_pktmbuf_mtod(cur, void*);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        cur->data_len = len;
        off += len;
        total -= len;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void*);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         *  TCP segmentation offload.
         *
         *  - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *    implies PKT_TX_TCP_CKSUM)
         *  - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *    write the IP checksum to 0 in the packet
         *  - fill the mbuf offload information: l2_len,
         *    l3_len, l4_len, tso_segsz
         *  - calculate the pseudo header checksum without taking ip_len
         *    in account, and set it in the TCP header. Refer to
         *    rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *    used as helpers.
         */
        if (offload.tso_seg_size) {
            struct tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc,
        sys_tsc, end_tsc, idle_sleep_tsc;
    int i, j, nb_rx, idle;
    uint16_t port_id, queue_id;
    struct lcore_conf *qconf;
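    /* TX drain period: BURST_TX_DRAIN_US converted to TSC cycles. */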
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
        US_PER_S * BURST_TX_DRAIN_US;
    struct ff_dpdk_if_context *ctx;

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

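    /*
     * Per-iteration work: run expired rte_timers (the FreeBSD clock),
     * drain TX bursts that have waited too long, service the dispatch
     * ring and the NIC RX queues, handle control messages, invoke the
     * user's loop callback, then account usr/sys/idle cycles for ff_top.
     */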
    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc > drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packet from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }
#endif

            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                        pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                        j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc > drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        idle_sleep_tsc = rte_rdtsc();
        if (likely(idle && idle_sleep)) {
            usleep(idle_sleep);
            end_tsc = rte_rdtsc();
        } else {
            end_tsc = idle_sleep_tsc;
        }

        if (usch_tsc == cur_tsc) {
            usr_tsc = idle_sleep_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_top_status.sys_tsc += sys_tsc;
        }

        ff_top_status.usr_tsc += usr_tsc;
        ff_top_status.work_tsc += end_tsc - cur_tsc;
        ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_top_status.loops++;
    }

    return 0;
}

int
ff_dpdk_if_up(void) {
    int i;
    struct lcore_conf *qconf = &lcore_conf;
    for (i = 0; i < qconf->nb_tx_port; i++) {
        uint16_t port_id = qconf->tx_port_id[i];

        struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
        veth_ctx[port_id] = ff_veth_attach(pconf);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed\n");
        }
    }

    return 0;
}

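/*
 * Typical entry path (sketch): applications normally call ff_init() and
 * ff_run() from ff_api.h, which lead here; loop() is then invoked once
 * per main_loop iteration on the launched lcore.
 */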
void
ff_dpdk_run(loop_func_t loop, void *arg) {
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    if (lr == NULL) {
        rte_exit(EXIT_FAILURE, "rte_malloc loop_routine failed\n");
    }
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

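/*
 * Software Toeplitz hash, the same function NIC RSS hardware uses: a
 * 32-bit window of the key slides one bit per input bit and is XORed
 * into the hash for every input bit that is set. ff_rss_check() below
 * uses it to predict which RX queue the NIC will pick for a 4-tuple.
 */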
static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1<<(7-b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i+4] & (1<<(7-b))))
                v |= 1;
        }
    }
    return (hash);
}

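/*
 * Return non-zero if the NIC's RSS indirection table would steer the
 * given 4-tuple to this lcore's own queue. The FreeBSD glue calls this
 * when choosing local ports, so that a connection's traffic lands on
 * the core that owns it.
 */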
int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;
    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];

    if (nb_queues <= 1) {
        return 1;
    }

    uint16_t reta_size = rss_reta_size[ctx->port_id];
    uint16_t queueid = qconf->tx_queue_id[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
        default_rsskey_40bytes, datalen, data);

    return ((hash & (reta_size - 1)) % nb_queues) == queueid;
}

void
ff_regist_packet_dispatcher(dispatch_func_t func)
{
    packet_dispatcher = func;
}
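
/*
 * A minimal dispatcher sketch (illustrative only, assuming the
 * dispatch_func_t signature matches the call in process_packets() above):
 * return a queue id in [0, nb_queues) to steer the packet, FF_DISPATCH_ERROR
 * to drop it, or FF_DISPATCH_RESPONSE to transmit the (possibly rewritten)
 * packet straight back out of the port.
 */
#if 0
static int
keep_local_dispatcher(void *data, uint16_t *len,
    uint16_t queue_id, uint16_t nb_queues)
{
    /* Keep every packet on the queue that received it. */
    return queue_id;
}
/* ff_regist_packet_dispatcher(keep_local_dispatcher); */
#endif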

uint64_t
ff_get_tsc_ns(void)
{
    uint64_t cur_tsc = rte_rdtsc();
    uint64_t hz = rte_get_tsc_hz();
    return ((double)cur_tsc / (double)hz) * NS_PER_S;
}
1634