/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <unistd.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"

#define MEMPOOL_CACHE_SIZE 256

#define DISPATCH_RING_SIZE 2048

#define MSG_RING_SIZE 32

/*
 * Configurable number of RX/TX ring descriptors
 */
#define RX_QUEUE_SIZE 512
#define TX_QUEUE_SIZE 512

#define MAX_PKT_BURST 32
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

/*
 * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send.
 */
#define MAX_TX_BURST    (MAX_PKT_BURST / 2)

#define NB_SOCKETS 8

/* Configure how many packets ahead to prefetch, when reading packets */
#define PREFETCH_OFFSET    3

#define MAX_RX_QUEUE_PER_LCORE 16
#define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
#define MAX_RX_QUEUE_PER_PORT 128

#ifdef FF_KNI
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

static int enable_kni;
static int kni_accept;
#endif

static int numa_on;

static unsigned idle_sleep;

static struct rte_timer freebsd_clock;

/* Default RSS key: the 40-byte Toeplitz key used by Mellanox's Linux driver. */
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

static struct rte_eth_conf default_port_conf = {
    .rxmode = {
        .mq_mode = ETH_MQ_RX_RSS,
        .max_rx_pkt_len = ETHER_MAX_LEN,
        .split_hdr_size = 0, /**< hdr buf size */
        .header_split   = 0, /**< Header Split disabled */
        .hw_ip_checksum = 0, /**< IP checksum offload disabled */
        .hw_vlan_filter = 0, /**< VLAN filtering disabled */
        .hw_vlan_strip  = 0, /**< VLAN strip disabled. */
        .hw_vlan_extend = 0, /**< Extended VLAN disabled. */
        .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
        .hw_strip_crc   = 0, /**< CRC stripping by hardware disabled */
        .enable_lro     = 0, /**< LRO disabled */
    },
    .rx_adv_conf = {
        .rss_conf = {
            .rss_key = default_rsskey_40bytes,
            .rss_key_len = 40,
            .rss_hf = ETH_RSS_PROTO_MASK,
        },
    },
    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
};

struct mbuf_table {
    uint16_t len;
    struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct lcore_rx_queue {
    uint16_t port_id;
    uint16_t queue_id;
} __rte_cache_aligned;

struct lcore_conf {
    uint16_t proc_id;
    uint16_t socket_id;
    uint16_t nb_queue_list[RTE_MAX_ETHPORTS];
    struct ff_port_cfg *port_cfgs;

    uint16_t nb_rx_queue;
    struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
    uint16_t nb_tx_port;
    uint16_t tx_port_id[RTE_MAX_ETHPORTS];
    uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
    struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
    char *pcap[RTE_MAX_ETHPORTS];
} __rte_cache_aligned;

static struct lcore_conf lcore_conf;

static struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

static inline int send_single_packet(struct rte_mbuf *m, uint8_t port);

struct ff_msg_ring {
    char ring_name[2][RTE_RING_NAMESIZE];
    /* ring[0]: the lcore receives messages that other processes send */
    /* ring[1]: the lcore sends messages that other processes read */
    struct rte_ring *ring[2];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;

struct ff_dpdk_if_context {
    void *sc;
    void *ifp;
    uint16_t port_id;
    struct ff_hw_features hw_features;
} __rte_cache_aligned;

static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

static struct ff_top_args ff_top_status;
static struct ff_traffic_args ff_traffic;

extern void ff_hardclock(void);

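/*
 * Timer callback that emulates FreeBSD's hardclock() tick and refreshes the
 * stack's cached timestamp; armed as a periodic rte_timer in init_clock().
 */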
static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
    ff_update_current_ts();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
    #define CHECK_INTERVAL 100 /* 100ms */
    #define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u "
                        "Mbps - %s\n", (int)portid,
                        (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

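/*
 * Build this process's lcore_conf from ff_global_cfg: for every configured
 * port, queue i belongs to the i-th lcore in the port's lcore_list, so this
 * lcore owns the (port, queue) pairs whose list slot matches its lcore id.
 */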
static int
init_lcore_conf(void)
{
    uint8_t nb_dev_ports = rte_eth_dev_count();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
                 ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    uint16_t proc_id;
    for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) {
        uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id];
        if (!lcore_config[lcore_id].detected) {
            rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
        }
    }

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %d\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        lcore_conf.pcap[port_id] = pconf->pcap;
        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}

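/*
 * Size the per-socket mbuf pool as an upper bound on mbufs held at once:
 * RX descriptors, per-lcore TX burst tables, TX descriptors, mempool caches,
 * KNI queues (if enabled) and the dispatch rings, with a floor of 8192.
 */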
static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;

    unsigned nb_mbuf = RTE_MAX(
        (nb_rx_queue*RX_QUEUE_SIZE          +
        nb_ports*nb_lcores*MAX_PKT_BURST    +
        nb_ports*nb_tx_queue*TX_QUEUE_SIZE  +
        nb_lcores*MEMPOOL_CACHE_SIZE +
#ifdef FF_KNI
        nb_ports*KNI_MBUF_MAX +
        nb_ports*KNI_QUEUE_SIZE +
#endif
        nb_lcores*nb_ports*DISPATCH_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %u of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }
    }

    return 0;
}

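/*
 * In the primary process, create the ring; in a secondary process, look up
 * the ring that the primary created under the same name.
 */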
static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL) {
        rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ring = rte_ring_create(name, count, socket_id, flags);
    } else {
        ring = rte_ring_lookup(name);
    }

    if (ring == NULL) {
        rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
    }

    return ring;
}

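/*
 * One dispatch ring per (port, queue): packets that belong to another
 * queue's lcore (ARP clones, dispatcher redirects) are handed over here.
 */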
static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create ring according to ports actually being used. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

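/*
 * Mempool object constructor: each element starts with the ff_msg header and
 * the rest of the element serves as the message's data buffer.
 */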
static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
}

static int
init_msg_ring(void)
{
    uint16_t i;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
           MSG_RING_SIZE * 2 * nb_procs,
           MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
           NULL, NULL, ff_msg_init, NULL,
           socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        snprintf(msg_ring[i].ring_name[1], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_OUT, i);

        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[0]);

        msg_ring[i].ring[1] = create_ring(msg_ring[i].ring_name[1],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[1] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[1]);
    }

    return 0;
}

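/*
 * KNI setup: allocate one KNI interface per port so that packets the f-stack
 * process does not consume can be passed through to the kernel.
 */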
#ifdef FF_KNI
static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
#endif

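/*
 * Fill the port's RSS redirection table by striping its entries across the
 * nb_queues RX queues round-robin, spreading flows evenly over the queues.
 */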
static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}

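/*
 * Per-port bring-up: validate the queue count against device limits, enable
 * only the TX/RX offloads the hardware reports, configure RSS (Toeplitz with
 * the key above), set up one RX/TX queue pair per lcore, then start the
 * port. Only the primary process programs the device; secondaries skip it.
 */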
static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        struct rte_eth_dev_info dev_info;
        rte_eth_dev_info_get(port_id, &dev_info);

        if (nb_queues > dev_info.max_rx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                nb_queues,
                dev_info.max_rx_queues);
        }

        if (nb_queues > dev_info.max_tx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                nb_queues,
                dev_info.max_tx_queues);
        }

        struct ether_addr addr;
        rte_eth_macaddr_get(port_id, &addr);
        printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                   " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                (unsigned)port_id,
                addr.addr_bytes[0], addr.addr_bytes[1],
                addr.addr_bytes[2], addr.addr_bytes[3],
                addr.addr_bytes[4], addr.addr_bytes[5]);

        rte_memcpy(pconf->mac,
            addr.addr_bytes, ETHER_ADDR_LEN);

        /* Clear txq_flags - we do not need multi-mempool and refcnt */
        dev_info.default_txconf.txq_flags = ETH_TXQ_FLAGS_NOMULTMEMP |
            ETH_TXQ_FLAGS_NOREFCOUNT;

        /* Disable features that are not supported by port's HW */
        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMUDP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMTCP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMSCTP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) &&
            !(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_TSO)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
        }

        struct rte_eth_conf port_conf = {0};

        /* Set RSS mode */
        port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
        port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK;
        port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
        port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;

        /* Set Rx VLAN stripping */
        if (ff_global_cfg.dpdk.vlan_strip) {
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                port_conf.rxmode.hw_vlan_strip = 1;
            }
        }

        /* Enable HW CRC stripping */
        port_conf.rxmode.hw_strip_crc = 1;

        /* FIXME: Enable TCP LRO ? */
        #if 0
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
            printf("LRO is supported\n");
            port_conf.rxmode.enable_lro = 1;
            pconf->hw_features.rx_lro = 1;
        }
        #endif

        /* Set Rx checksum checking */
        if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
            printf("RX checksum offload supported\n");
            port_conf.rxmode.hw_ip_checksum = 1;
            pconf->hw_features.rx_csum = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
            printf("TX ip checksum offload supported\n");
            pconf->hw_features.tx_csum_ip = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            printf("TX TCP&UDP checksum offload supported\n");
            pconf->hw_features.tx_csum_l4 = 1;
        }

        if (ff_global_cfg.dpdk.tso) {
            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                printf("TSO is supported\n");
                pconf->hw_features.tx_tso = 1;
            }
        } else {
            printf("TSO is disabled\n");
        }

        if (dev_info.reta_size) {
            /* reta size must be power of 2 */
            assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

            rss_reta_size[port_id] = dev_info.reta_size;
            printf("port[%d]: rss table size: %d\n", port_id,
                dev_info.reta_size);
        }

        if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
            continue;
        }

        int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
        if (ret != 0) {
            return ret;
        }
        uint16_t q;
        for (q = 0; q < nb_queues; q++) {
            if (numa_on) {
                uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                socketid = rte_lcore_to_socket_id(lcore_id);
            }
            mbuf_pool = pktmbuf_pool[socketid];

            ret = rte_eth_tx_queue_setup(port_id, q, TX_QUEUE_SIZE,
                socketid, &dev_info.default_txconf);
            if (ret < 0) {
                return ret;
            }

            ret = rte_eth_rx_queue_setup(port_id, q, RX_QUEUE_SIZE,
                socketid, &dev_info.default_rxconf, mbuf_pool);
            if (ret < 0) {
                return ret;
            }
        }

        ret = rte_eth_dev_start(port_id);
        if (ret < 0) {
            return ret;
        }

        if (nb_queues > 1) {
            /* set HW rss hash function to Toeplitz. */
            if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                struct rte_eth_hash_filter_info info = {0};
                info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                    RTE_ETH_FILTER_SET, &info) < 0) {
                    rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                        port_id);
                }
            }

            set_rss_table(port_id, dev_info.reta_size, nb_queues);
        }

        /* Enable RX in promiscuous mode for the Ethernet device. */
        if (ff_global_cfg.dpdk.promiscuous) {
            rte_eth_promiscuous_enable(port_id);
            ret = rte_eth_promiscuous_get(port_id);
            if (ret == 1) {
                printf("set port %u to promiscuous mode ok\n", port_id);
            } else {
                printf("set port %u to promiscuous mode error\n", port_id);
            }
        }

        /* Enable pcap dump */
        if (pconf->pcap) {
            ff_enable_pcap(pconf->pcap);
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

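/*
 * Arm the FreeBSD clock: the timer period is MS_PER_S / freebsd.hz
 * milliseconds converted to TSC cycles. For example, with freebsd.hz = 100
 * the period is 10 ms, i.e. roughly rte_get_timer_hz() / 100 cycles.
 */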
static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    idle_sleep = ff_global_cfg.dpdk.idle_sleep;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

#ifdef FF_KNI
    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }
#endif

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    /*
     * FIXME: should we save pkt->vlan_tci
     * if (pkt->ol_flags & PKT_RX_VLAN_PKT)
     */

    void *data = rte_pktmbuf_mtod(pkt, void *);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void *);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

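/*
 * Classify a frame: ARP gets special handling (broadcast to all queues and
 * to KNI), and with KNI enabled, IPv4 frames are further inspected by
 * ff_kni_proto_filter() to decide whether the kernel should receive them.
 */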
static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct ether_hdr *hdr;
    hdr = (const struct ether_hdr *)data;

    if (ntohs(hdr->ether_type) == ETHER_TYPE_ARP)
        return FILTER_ARP;

#ifndef FF_KNI
    return FILTER_UNKNOWN;
#else
    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ntohs(hdr->ether_type) != ETHER_TYPE_IPv4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data + ETHER_HDR_LEN,
        len - ETHER_HDR_LEN);
#endif
}

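/*
 * Unlike rte_pktmbuf_attach(), this copies the segment's payload and the
 * relevant metadata into mi, so the clone is independent of the original.
 */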
static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}

/* copied from rte_pktmbuf_clone */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of a new segment failed */
    if (unlikely(mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}

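/*
 * Per-packet RX path: optionally dump to pcap, account traffic, let a
 * user-registered dispatcher steer the packet (respond, drop, or re-queue to
 * another lcore), then filter: ARP is cloned to every other queue (and KNI),
 * KNI-bound packets go to the kernel, everything else enters the FreeBSD
 * stack through ff_veth_input().
 */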
static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(qconf->pcap[port_id] != NULL)) {
            if (!pkts_from_ring) {
                ff_dump_packets(qconf->pcap[port_id], rtem);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void *);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring) {
            ff_traffic.rx_packets++;
            ff_traffic.rx_bytes += len;
        }

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues);
            if (ret == FF_DISPATCH_RESPONSE) {
                rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len;
                send_single_packet(rtem, port_id);
                continue;
            }

            if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
        if (filter == FILTER_ARP) {
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni &&
            ((filter == FILTER_KNI && kni_accept) ||
            (filter == FILTER_UNKNOWN && !kni_accept))) {
            ff_kni_enqueue(port_id, rtem);
#endif
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* dequeue packets from this queue's dispatch ring and process them */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_DGRAM, 0);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_top_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_traffic_msg(struct ff_msg *msg)
{
    msg->traffic = ff_traffic;
    msg->result = 0;
}

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        case FF_TRAFFIC:
            handle_traffic_msg(msg);
            break;
        default:
            handle_default_msg(msg);
            break;
    }
    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(qconf->pcap[port] != NULL)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(qconf->pcap[port], m_table[i]);
        }
    }

    ff_traffic.tx_packets += n;
    uint16_t i;
    for (i = 0; i < n; i++) {
        ff_traffic.tx_bytes += rte_pktmbuf_data_len(m_table[i]);
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
        } while (++ret < n);
    }

    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}

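/*
 * Transmit path from the FreeBSD stack: copy the BSD mbuf chain 'm' into a
 * freshly allocated DPDK mbuf chain, translate the requested checksum/TSO
 * offloads into ol_flags, then queue the packet via send_single_packet().
 */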
int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        head->nb_segs++;

        prev = cur;
        void *data = rte_pktmbuf_mtod(cur, void *);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        cur->data_len = len;
        off += len;
        total -= len;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    void *data = rte_pktmbuf_mtod(head, void *);

    if (offload.ip_csum) {
        /* ipv6 not supported yet */
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
        head->l2_len = ETHER_HDR_LEN;
        head->l3_len = iph_len;
    }

    if (ctx->hw_features.tx_csum_l4) {
        struct ipv4_hdr *iph;
        int iph_len;
        iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
        iph_len = (iph->version_ihl & 0x0f) << 2;

        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }

        /*
         *  TCP segmentation offload.
         *
         *  - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
         *    implies PKT_TX_TCP_CKSUM)
         *  - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
         *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
         *    write the IP checksum to 0 in the packet
         *  - fill the mbuf offload information: l2_len,
         *    l3_len, l4_len, tso_segsz
         *  - calculate the pseudo header checksum without taking ip_len
         *    in account, and set it in the TCP header. Refer to
         *    rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
         *    used as helpers.
         */
        if (offload.tso_seg_size) {
            struct tcp_hdr *tcph;
            int tcph_len;
            tcph = (struct tcp_hdr *)((char *)iph + iph_len);
            tcph_len = (tcph->data_off & 0xf0) >> 2;
            tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);

            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = tcph_len;
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = ETHER_HDR_LEN;
            head->l3_len = iph_len;
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

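/*
 * Per-lcore event loop: manage the FreeBSD clock timer, drain buffered TX
 * bursts every BURST_TX_DRAIN_US, poll KNI, the dispatch ring and the NIC RX
 * queues, service the message ring, run the user loop callback, and account
 * usr/sys/idle TSC cycles for ff_top.
 */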
static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc;
    int i, j, nb_rx, idle;
    uint16_t port_id, queue_id;
    struct lcore_conf *qconf;
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
        US_PER_S * BURST_TX_DRAIN_US;
    struct ff_dpdk_if_context *ctx;

    prev_tsc = 0;
    usch_tsc = 0;

    qconf = &lcore_conf;

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc > drain_tsc)) {
            for (i = 0; i < qconf->nb_tx_port; i++) {
                port_id = qconf->tx_port_id[i];
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;

                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packet from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }
#endif

            process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                        pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                        j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc > drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        idle_sleep_tsc = rte_rdtsc();
        if (likely(idle && idle_sleep)) {
            usleep(idle_sleep);
            end_tsc = rte_rdtsc();
        } else {
            end_tsc = idle_sleep_tsc;
        }

        if (usch_tsc == cur_tsc) {
            usr_tsc = idle_sleep_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_top_status.sys_tsc += sys_tsc;
        }

        ff_top_status.usr_tsc += usr_tsc;
        ff_top_status.work_tsc += end_tsc - cur_tsc;
        ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_top_status.loops++;
    }

    return 0;
}

int
ff_dpdk_if_up(void) {
    int i;
    struct lcore_conf *qconf = &lcore_conf;
    for (i = 0; i < qconf->nb_tx_port; i++) {
        uint16_t port_id = qconf->tx_port_id[i];

        struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
        veth_ctx[port_id] = ff_veth_attach(pconf);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed\n");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg) {
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    if (lr == NULL) {
        rte_exit(EXIT_FAILURE, "rte_malloc loop_routine failed\n");
    }
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

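/*
 * Software Toeplitz hash (the XXXRW comment below suggests it was lifted
 * from FreeBSD); evaluated with the same 40-byte key the NIC is programmed
 * with, so the result matches the hardware RSS hash.
 */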
static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1<<(7-b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i+4] & (1<<(7-b))))
                v |= 1;
        }
    }
    return (hash);
}

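/*
 * Predict whether hardware RSS would steer the 4-tuple to this lcore's own
 * queue: hash it with the shared Toeplitz key, index into the redirection
 * table (reta_size entries, filled round-robin by set_rss_table()) and
 * compare the resulting queue with ours.
 */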
int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;
    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];

    if (nb_queues <= 1) {
        return 1;
    }

    uint16_t reta_size = rss_reta_size[ctx->port_id];
    uint16_t queueid = qconf->tx_queue_id[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
        default_rsskey_40bytes, datalen, data);

    return ((hash & (reta_size - 1)) % nb_queues) == queueid;
}

void
ff_regist_packet_dispatcher(dispatch_func_t func)
{
    packet_dispatcher = func;
}

uint64_t
ff_get_tsc_ns(void)
{
    uint64_t cur_tsc = rte_rdtsc();
    uint64_t hz = rte_get_tsc_hz();
    return ((double)cur_tsc / (double)hz) * NS_PER_S;
}