xref: /f-stack/lib/ff_dpdk_if.c (revision 40c3a4b5)
1 /*
2  * Copyright (C) 2017 THL A29 Limited, a Tencent company.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice, this
9  *   list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright notice,
11  *   this list of conditions and the following disclaimer in the documentation
12  *   and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
18  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
21  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
23  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  *
25  */
26 #include <assert.h>
27 #include <unistd.h>
28 
29 #include <rte_common.h>
30 #include <rte_byteorder.h>
31 #include <rte_log.h>
32 #include <rte_memory.h>
33 #include <rte_memcpy.h>
34 #include <rte_memzone.h>
35 #include <rte_config.h>
36 #include <rte_eal.h>
37 #include <rte_pci.h>
38 #include <rte_mbuf.h>
40 #include <rte_lcore.h>
41 #include <rte_launch.h>
42 #include <rte_ethdev.h>
43 #include <rte_debug.h>
45 #include <rte_ether.h>
46 #include <rte_malloc.h>
47 #include <rte_cycles.h>
48 #include <rte_timer.h>
49 #include <rte_thash.h>
50 #include <rte_ip.h>
51 #include <rte_tcp.h>
52 #include <rte_udp.h>
53 
54 #include "ff_dpdk_if.h"
55 #include "ff_dpdk_pcap.h"
56 #include "ff_dpdk_kni.h"
57 #include "ff_config.h"
58 #include "ff_veth.h"
59 #include "ff_host_interface.h"
60 #include "ff_msg.h"
61 #include "ff_api.h"
62 
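/* Per-lcore cache size for the pktmbuf mempool; rte_mempool requires this
 * to be no larger than RTE_MEMPOOL_CACHE_MAX_SIZE. */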
63 #define MEMPOOL_CACHE_SIZE 256
64 
65 #define DISPATCH_RING_SIZE 2048
66 
67 #define MSG_RING_SIZE 32
68 
69 /*
70  * Configurable number of RX/TX ring descriptors
71  */
72 #define RX_QUEUE_SIZE 512
73 #define TX_QUEUE_SIZE 512
74 
75 #define MAX_PKT_BURST 32
76 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
77 
78 /*
79  * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send.
80  */
81 #define MAX_TX_BURST    (MAX_PKT_BURST / 2)
82 
83 #define NB_SOCKETS 8
84 
/* How many packets ahead to prefetch when reading packets */
86 #define PREFETCH_OFFSET    3
87 
88 #define MAX_RX_QUEUE_PER_LCORE 16
89 #define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
90 #define MAX_RX_QUEUE_PER_PORT 128
91 
92 #ifdef FF_KNI
93 #define KNI_MBUF_MAX 2048
94 #define KNI_QUEUE_SIZE 2048
95 
96 static int enable_kni;
97 static int kni_accept;
98 #endif
99 
100 static int numa_on;
101 
102 static unsigned idle_sleep;
103 
104 static struct rte_timer freebsd_clock;
105 
/* Default 40-byte RSS hash key, taken from the Mellanox Linux driver. */
107 static uint8_t default_rsskey_40bytes[40] = {
108     0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
109     0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
110     0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
111     0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
112     0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
113 };
114 
115 static struct rte_eth_conf default_port_conf = {
116     .rxmode = {
117         .mq_mode = ETH_MQ_RX_RSS,
118         .max_rx_pkt_len = ETHER_MAX_LEN,
119         .split_hdr_size = 0, /**< hdr buf size */
120         .header_split   = 0, /**< Header Split disabled */
121         .hw_ip_checksum = 0, /**< IP checksum offload disabled */
122         .hw_vlan_filter = 0, /**< VLAN filtering disabled */
123         .hw_vlan_strip  = 0, /**< VLAN strip disabled. */
124         .hw_vlan_extend = 0, /**< Extended VLAN disabled. */
125         .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
126         .hw_strip_crc   = 0, /**< CRC stripped by hardware */
127         .enable_lro     = 0, /**< LRO disabled */
128     },
129     .rx_adv_conf = {
130         .rss_conf = {
131             .rss_key = default_rsskey_40bytes,
132             .rss_key_len = 40,
133             .rss_hf = ETH_RSS_PROTO_MASK,
134         },
135     },
136     .txmode = {
137         .mq_mode = ETH_MQ_TX_NONE,
138     },
139 };
140 
141 struct mbuf_table {
142     uint16_t len;
143     struct rte_mbuf *m_table[MAX_PKT_BURST];
144 };
145 
146 struct lcore_rx_queue {
147     uint16_t port_id;
148     uint16_t queue_id;
149 } __rte_cache_aligned;
150 
151 struct lcore_conf {
152     uint16_t proc_id;
153     uint16_t socket_id;
154     uint16_t nb_queue_list[RTE_MAX_ETHPORTS];
155     struct ff_port_cfg *port_cfgs;
156 
157     uint16_t nb_rx_queue;
158     struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
159     uint16_t nb_tx_port;
160     uint16_t tx_port_id[RTE_MAX_ETHPORTS];
161     uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
162     struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
163     char *pcap[RTE_MAX_ETHPORTS];
164 } __rte_cache_aligned;
165 
166 static struct lcore_conf lcore_conf;
167 
168 static struct rte_mempool *pktmbuf_pool[NB_SOCKETS];
169 
170 static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
171 static dispatch_func_t packet_dispatcher;
172 
173 static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];
174 
175 static inline int send_single_packet(struct rte_mbuf *m, uint8_t port);
176 
177 struct ff_msg_ring {
178     char ring_name[2][RTE_RING_NAMESIZE];
    /* ring[0]: messages sent by other processes, received by this lcore */
    /* ring[1]: messages sent by this lcore, read by other processes */
181     struct rte_ring *ring[2];
182 } __rte_cache_aligned;
183 
184 static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
185 static struct rte_mempool *message_pool;
186 
187 struct ff_dpdk_if_context {
188     void *sc;
189     void *ifp;
190     uint16_t port_id;
191     struct ff_hw_features hw_features;
192 } __rte_cache_aligned;
193 
194 static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];
195 
196 static struct ff_top_args ff_top_status;
197 static struct ff_traffic_args ff_traffic;
198 
199 extern void ff_hardclock(void);
200 
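/*
 * rte_timer callback registered in init_clock(): drives the FreeBSD
 * hardclock tick and refreshes f-stack's cached timestamp. It fires
 * from rte_timer_manage() in the main loop.
 */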
201 static void
202 ff_hardclock_job(__rte_unused struct rte_timer *timer,
203     __rte_unused void *arg) {
204     ff_hardclock();
205     ff_update_current_ts();
206 }
207 
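/*
 * Bind a FreeBSD softc/ifnet pair to a DPDK port. The returned context
 * carries the port id and the negotiated hardware offload features and
 * is used on both the RX (ff_veth_input) and TX (ff_dpdk_if_send) paths.
 */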
208 struct ff_dpdk_if_context *
209 ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
210 {
211     struct ff_dpdk_if_context *ctx;
212 
213     ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
214     if (ctx == NULL)
215         return NULL;
216 
217     ctx->sc = sc;
218     ctx->ifp = ifp;
219     ctx->port_id = cfg->port_id;
220     ctx->hw_features = cfg->hw_features;
221 
222     return ctx;
223 }
224 
225 void
226 ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
227 {
228     free(ctx);
229 }
230 
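/*
 * Poll the link status of all configured ports, printing a dot every
 * 100 ms until every link is up or the ~9 s timeout expires.
 */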
231 static void
232 check_all_ports_link_status(void)
233 {
234     #define CHECK_INTERVAL 100 /* 100ms */
235     #define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */
236 
    uint8_t count, all_ports_up, print_flag = 0;
239     struct rte_eth_link link;
240 
241     printf("\nChecking link status");
242     fflush(stdout);
243 
244     int i, nb_ports;
245     nb_ports = ff_global_cfg.dpdk.nb_ports;
246     for (count = 0; count <= MAX_CHECK_TIME; count++) {
247         all_ports_up = 1;
248         for (i = 0; i < nb_ports; i++) {
249             uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
250             memset(&link, 0, sizeof(link));
251             rte_eth_link_get_nowait(portid, &link);
252 
253             /* print link status if flag set */
254             if (print_flag == 1) {
255                 if (link.link_status) {
256                     printf("Port %d Link Up - speed %u "
257                         "Mbps - %s\n", (int)portid,
258                         (unsigned)link.link_speed,
259                         (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
260                         ("full-duplex") : ("half-duplex\n"));
261                 } else {
262                     printf("Port %d Link Down\n", (int)portid);
263                 }
264                 continue;
265             }
266             /* clear all_ports_up flag if any link down */
267             if (link.link_status == 0) {
268                 all_ports_up = 0;
269                 break;
270             }
271         }
272 
273         /* after finally printing all link status, get out */
274         if (print_flag == 1)
275             break;
276 
277         if (all_ports_up == 0) {
278             printf(".");
279             fflush(stdout);
280             rte_delay_ms(CHECK_INTERVAL);
281         }
282 
283         /* set the print_flag if all ports up or timeout */
284         if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
285             print_flag = 1;
286             printf("done\n");
287         }
288     }
289 }
290 
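/*
 * Build this process's lcore_conf from the global config: for each port,
 * the RX queue this lcore owns (the queue index is the lcore's position
 * in the port's lcore_list) and the matching per-port TX queue.
 */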
291 static int
292 init_lcore_conf(void)
293 {
294     uint8_t nb_dev_ports = rte_eth_dev_count();
295     if (nb_dev_ports == 0) {
296         rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
297     }
298 
299     if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
300         rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
301                  ff_global_cfg.dpdk.max_portid);
302     }
303 
304     lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
305     lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;
306 
307     uint16_t proc_id;
308     for (proc_id = 0; proc_id < ff_global_cfg.dpdk.nb_procs; proc_id++) {
309         uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[proc_id];
310         if (!lcore_config[lcore_id].detected) {
311             rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
312         }
313     }
314 
315     uint16_t socket_id = 0;
316     if (numa_on) {
317         socket_id = rte_lcore_to_socket_id(rte_lcore_id());
318     }
319 
320     lcore_conf.socket_id = socket_id;
321 
322     uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
323     int j;
324     for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
325         uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
326         struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
327 
328         int queueid = -1;
329         int i;
330         for (i = 0; i < pconf->nb_lcores; i++) {
331             if (pconf->lcore_list[i] == lcore_id) {
332                 queueid = i;
333             }
334         }
335         if (queueid < 0) {
336             continue;
337         }
338         printf("lcore: %u, port: %u, queue: %u\n", lcore_id, port_id, queueid);
339         uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
340         lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
341         lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
342         lcore_conf.nb_rx_queue++;
343 
344         lcore_conf.tx_queue_id[port_id] = queueid;
345         lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
346         lcore_conf.nb_tx_port++;
347 
348         lcore_conf.pcap[port_id] = pconf->pcap;
349         lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
350     }
351 
352     if (lcore_conf.nb_rx_queue == 0) {
353         rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
354     }
355 
356     return 0;
357 }
358 
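/*
 * Create (primary process) or look up (secondary) the per-socket mbuf
 * pool. The pool is sized for the worst case: the descriptors of every
 * RX/TX queue, one burst in flight per port and lcore, the per-lcore
 * mempool caches, the dispatch rings, and (optionally) the KNI queues.
 */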
359 static int
360 init_mem_pool(void)
361 {
362     uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
363     uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
364     uint32_t nb_tx_queue = nb_lcores;
365     uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;
366 
    unsigned nb_mbuf = RTE_MAX(
368         (nb_rx_queue*RX_QUEUE_SIZE          +
369         nb_ports*nb_lcores*MAX_PKT_BURST    +
370         nb_ports*nb_tx_queue*TX_QUEUE_SIZE  +
371         nb_lcores*MEMPOOL_CACHE_SIZE +
372 #ifdef FF_KNI
373         nb_ports*KNI_MBUF_MAX +
374         nb_ports*KNI_QUEUE_SIZE +
375 #endif
376         nb_lcores*nb_ports*DISPATCH_RING_SIZE),
377         (unsigned)8192);
378 
379     unsigned socketid = 0;
380     uint16_t i, lcore_id;
381     char s[64];
382 
383     for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
384         lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
385         if (numa_on) {
386             socketid = rte_lcore_to_socket_id(lcore_id);
387         }
388 
389         if (socketid >= NB_SOCKETS) {
390             rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
391                 socketid, i, NB_SOCKETS);
392         }
393 
394         if (pktmbuf_pool[socketid] != NULL) {
395             continue;
396         }
397 
398         if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
399             snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
400             pktmbuf_pool[socketid] =
401                 rte_pktmbuf_pool_create(s, nb_mbuf,
402                     MEMPOOL_CACHE_SIZE, 0,
403                     RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
404         } else {
405             snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
406             pktmbuf_pool[socketid] = rte_mempool_lookup(s);
407         }
408 
409         if (pktmbuf_pool[socketid] == NULL) {
410             rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
411         } else {
412             printf("create mbuf pool on socket %d\n", socketid);
413         }
414     }
415 
416     return 0;
417 }
418 
419 static struct rte_ring *
420 create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
421 {
422     struct rte_ring *ring;
423 
424     if (name == NULL) {
425         rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
426     }
427 
428     if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
429         ring = rte_ring_create(name, count, socket_id, flags);
430     } else {
431         ring = rte_ring_lookup(name);
432     }
433 
434     if (ring == NULL) {
435         rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
436     }
437 
438     return ring;
439 }
440 
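/*
 * Per-port, per-queue rings used to hand packets over to another lcore
 * when the dispatch callback (or the ARP broadcast below) decides a
 * packet belongs to a different queue than the one it arrived on.
 */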
441 static int
442 init_dispatch_ring(void)
443 {
444     int j;
445     char name_buf[RTE_RING_NAMESIZE];
446     int queueid;
447 
448     unsigned socketid = lcore_conf.socket_id;
449 
    /* Create rings only for the ports actually in use. */
451     int nb_ports = ff_global_cfg.dpdk.nb_ports;
452     for (j = 0; j < nb_ports; j++) {
453         uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
454         struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
455         int nb_queues = pconf->nb_lcores;
456         if (dispatch_ring[portid] == NULL) {
457             snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);
458 
459             dispatch_ring[portid] = rte_zmalloc(name_buf,
460                 sizeof(struct rte_ring *) * nb_queues,
461                 RTE_CACHE_LINE_SIZE);
462             if (dispatch_ring[portid] == NULL) {
463                 rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
464                     "failed\n", name_buf);
465             }
466         }
467 
468         for(queueid = 0; queueid < nb_queues; ++queueid) {
469             snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
470                 portid, queueid);
471             dispatch_ring[portid][queueid] = create_ring(name_buf,
472                 DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);
473 
474             if (dispatch_ring[portid][queueid] == NULL)
475                 rte_panic("create ring:%s failed!\n", name_buf);
476 
477             printf("create ring:%s success, %u ring entries are now free!\n",
478                 name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
479         }
480     }
481 
482     return 0;
483 }
484 
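/* Mempool object constructor: each message is a ff_msg header followed
 * by its payload buffer within the same mempool element. */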
485 static void
486 ff_msg_init(struct rte_mempool *mp,
487     __attribute__((unused)) void *opaque_arg,
488     void *obj, __attribute__((unused)) unsigned i)
489 {
490     struct ff_msg *msg = (struct ff_msg *)obj;
491     msg->msg_type = FF_UNKNOWN;
492     msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
493     msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
494 }
495 
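/*
 * Control-plane plumbing: one pool of message buffers shared by all
 * processes, plus a pair of single-producer/single-consumer rings per
 * process (ring[0] for requests in, ring[1] for replies out) used by
 * external control tools.
 */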
496 static int
497 init_msg_ring(void)
498 {
499     uint16_t i;
500     uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
501     unsigned socketid = lcore_conf.socket_id;
502 
503     /* Create message buffer pool */
504     if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
505         message_pool = rte_mempool_create(FF_MSG_POOL,
506            MSG_RING_SIZE * 2 * nb_procs,
507            MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
508            NULL, NULL, ff_msg_init, NULL,
509            socketid, 0);
510     } else {
511         message_pool = rte_mempool_lookup(FF_MSG_POOL);
512     }
513 
514     if (message_pool == NULL) {
515         rte_panic("Create msg mempool failed\n");
516     }
517 
518     for(i = 0; i < nb_procs; ++i) {
519         snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
520             "%s%u", FF_MSG_RING_IN, i);
521         snprintf(msg_ring[i].ring_name[1], RTE_RING_NAMESIZE,
522             "%s%u", FF_MSG_RING_OUT, i);
523 
524         msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
525             MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
526         if (msg_ring[i].ring[0] == NULL)
527             rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[0]);
528 
529         msg_ring[i].ring[1] = create_ring(msg_ring[i].ring_name[1],
530             MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
531         if (msg_ring[i].ring[1] == NULL)
532             rte_panic("create ring::%s failed!\n", msg_ring[i].ring_name[0]);
533     }
534 
535     return 0;
536 }
537 
538 #ifdef FF_KNI
539 static int
540 init_kni(void)
541 {
542     int nb_ports = rte_eth_dev_count();
543     kni_accept = 0;
544     if(strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
545         kni_accept = 1;
546 
547     ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
548         ff_global_cfg.kni.udp_port);
549 
550     unsigned socket_id = lcore_conf.socket_id;
551     struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];
552 
553     nb_ports = ff_global_cfg.dpdk.nb_ports;
554     int i, ret;
555     for (i = 0; i < nb_ports; i++) {
556         uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
557         ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
558     }
559 
560     return 0;
561 }
562 #endif
563 
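/*
 * Fill the NIC RSS redirection table round-robin across the configured
 * queues, so hash buckets map evenly onto 0..nb_queues-1. ff_rss_check()
 * below relies on this layout when it recomputes the mapping in software.
 */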
564 static void
565 set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
566 {
567     if (reta_size == 0) {
568         return;
569     }
570 
571     int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
572     struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];
573 
574     /* config HW indirection table */
575     unsigned i, j, hash=0;
576     for (i = 0; i < reta_conf_size; i++) {
577         reta_conf[i].mask = ~0ULL;
578         for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
579             reta_conf[i].reta[j] = hash++ % nb_queues;
580         }
581     }
582 
583     if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
584         rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
585             port_id);
586     }
587 }
588 
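/*
 * Per-port bring-up: probe offload capabilities, configure RSS and the
 * per-queue descriptors, and start the device. Only the primary process
 * touches the hardware; secondaries just record the capabilities.
 */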
589 static int
590 init_port_start(void)
591 {
592     int nb_ports = ff_global_cfg.dpdk.nb_ports;
593     unsigned socketid = 0;
594     struct rte_mempool *mbuf_pool;
595     uint16_t i;
596 
597     for (i = 0; i < nb_ports; i++) {
598         uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
599         struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
600         uint16_t nb_queues = pconf->nb_lcores;
601 
602         struct rte_eth_dev_info dev_info;
603         rte_eth_dev_info_get(port_id, &dev_info);
604 
605         if (nb_queues > dev_info.max_rx_queues) {
606             rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
607                 nb_queues,
608                 dev_info.max_rx_queues);
609         }
610 
611         if (nb_queues > dev_info.max_tx_queues) {
612             rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
613                 nb_queues,
614                 dev_info.max_tx_queues);
615         }
616 
617         struct ether_addr addr;
618         rte_eth_macaddr_get(port_id, &addr);
619         printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
620                    " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
621                 (unsigned)port_id,
622                 addr.addr_bytes[0], addr.addr_bytes[1],
623                 addr.addr_bytes[2], addr.addr_bytes[3],
624                 addr.addr_bytes[4], addr.addr_bytes[5]);
625 
626         rte_memcpy(pconf->mac,
627             addr.addr_bytes, ETHER_ADDR_LEN);
628 
629         /* Clear txq_flags - we do not need multi-mempool and refcnt */
630         dev_info.default_txconf.txq_flags = ETH_TXQ_FLAGS_NOMULTMEMP |
631             ETH_TXQ_FLAGS_NOREFCOUNT;
632 
633         /* Disable features that are not supported by port's HW */
634         if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM)) {
635             dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMUDP;
636         }
637 
638         if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
639             dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMTCP;
640         }
641 
642         if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM)) {
643             dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMSCTP;
644         }
645 
646         if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) {
647             dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL;
648         }
649 
650         if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) &&
651             !(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_TSO)) {
652             dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
653         }
654 
655         struct rte_eth_conf port_conf = {0};
656 
657         /* Set RSS mode */
658         port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
659         port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK;
660         port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
661         port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;
662 
663         /* Set Rx VLAN stripping */
664         if (ff_global_cfg.dpdk.vlan_strip) {
665             if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
666                 port_conf.rxmode.hw_vlan_strip = 1;
667             }
668         }
669 
670         /* Enable HW CRC stripping */
671         port_conf.rxmode.hw_strip_crc = 1;
672 
        /* FIXME: Enable TCP LRO? */
674         #if 0
675         if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
676             printf("LRO is supported\n");
677             port_conf.rxmode.enable_lro = 1;
678             pconf->hw_features.rx_lro = 1;
679         }
680         #endif
681 
682         /* Set Rx checksum checking */
683         if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
684             (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
685             (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
686             printf("RX checksum offload supported\n");
687             port_conf.rxmode.hw_ip_checksum = 1;
688             pconf->hw_features.rx_csum = 1;
689         }
690 
691         if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
692             printf("TX ip checksum offload supported\n");
693             pconf->hw_features.tx_csum_ip = 1;
694         }
695 
696         if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
697             (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
698             printf("TX TCP&UDP checksum offload supported\n");
699             pconf->hw_features.tx_csum_l4 = 1;
700         }
701 
702         if (ff_global_cfg.dpdk.tso) {
703             if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
704                 printf("TSO is supported\n");
705                 pconf->hw_features.tx_tso = 1;
706             }
707         } else {
708             printf("TSO is disabled\n");
709         }
710 
711         if (dev_info.reta_size) {
712             /* reta size must be power of 2 */
713             assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);
714 
715             rss_reta_size[port_id] = dev_info.reta_size;
716             printf("port[%d]: rss table size: %d\n", port_id,
717                 dev_info.reta_size);
718         }
719 
720         if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
721             continue;
722         }
723 
724         int ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
725         if (ret != 0) {
726             return ret;
727         }
728         uint16_t q;
729         for (q = 0; q < nb_queues; q++) {
730             if (numa_on) {
731                 uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
732                 socketid = rte_lcore_to_socket_id(lcore_id);
733             }
734             mbuf_pool = pktmbuf_pool[socketid];
735 
736             ret = rte_eth_tx_queue_setup(port_id, q, TX_QUEUE_SIZE,
737                 socketid, &dev_info.default_txconf);
738             if (ret < 0) {
739                 return ret;
740             }
741 
742             ret = rte_eth_rx_queue_setup(port_id, q, RX_QUEUE_SIZE,
743                 socketid, &dev_info.default_rxconf, mbuf_pool);
744             if (ret < 0) {
745                 return ret;
746             }
747         }
748 
749         ret = rte_eth_dev_start(port_id);
750         if (ret < 0) {
751             return ret;
752         }
753 
754         if (nb_queues > 1) {
755             /* set HW rss hash function to Toeplitz. */
756             if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
757                 struct rte_eth_hash_filter_info info = {0};
758                 info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
759                 info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;
760 
761                 if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
762                     RTE_ETH_FILTER_SET, &info) < 0) {
763                     rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
764                         port_id);
765                 }
766             }
767 
768             set_rss_table(port_id, dev_info.reta_size, nb_queues);
769         }
770 
771         /* Enable RX in promiscuous mode for the Ethernet device. */
772         if (ff_global_cfg.dpdk.promiscuous) {
773             rte_eth_promiscuous_enable(port_id);
774             ret = rte_eth_promiscuous_get(port_id);
775             if (ret == 1) {
776                 printf("set port %u to promiscuous mode ok\n", port_id);
777             } else {
778                 printf("set port %u to promiscuous mode error\n", port_id);
779             }
780         }
781 
782         /* Enable pcap dump */
783         if (pconf->pcap) {
784             ff_enable_pcap(pconf->pcap);
785         }
786     }
787 
788     if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
789         check_all_ports_link_status();
790     }
791 
792     return 0;
793 }
794 
795 static int
796 init_clock(void)
797 {
798     rte_timer_subsystem_init();
799     uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;
802 
803     rte_timer_init(&freebsd_clock);
804     rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
805         rte_lcore_id(), &ff_hardclock_job, NULL);
806 
807     ff_update_current_ts();
808 
809     return 0;
810 }
811 
812 int
813 ff_dpdk_init(int argc, char **argv)
814 {
815     if (ff_global_cfg.dpdk.nb_procs < 1 ||
816         ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
817         ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
818         ff_global_cfg.dpdk.proc_id < 0) {
819         printf("param num_procs[%d] or proc_id[%d] error!\n",
820             ff_global_cfg.dpdk.nb_procs,
821             ff_global_cfg.dpdk.proc_id);
822         exit(1);
823     }
824 
825     int ret = rte_eal_init(argc, argv);
826     if (ret < 0) {
827         rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
828     }
829 
830     numa_on = ff_global_cfg.dpdk.numa_on;
831 
832     idle_sleep = ff_global_cfg.dpdk.idle_sleep;
833 
834     init_lcore_conf();
835 
836     init_mem_pool();
837 
838     init_dispatch_ring();
839 
840     init_msg_ring();
841 
842 #ifdef FF_KNI
843     enable_kni = ff_global_cfg.kni.enable;
844     if (enable_kni) {
845         init_kni();
846     }
847 #endif
848 
849     ret = init_port_start();
850     if (ret < 0) {
851         rte_exit(EXIT_FAILURE, "init_port_start failed\n");
852     }
853 
854     init_clock();
855 
856     return 0;
857 }
858 
859 static void
860 ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
861 {
862     uint8_t rx_csum = ctx->hw_features.rx_csum;
863     if (rx_csum) {
864         if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
865             rte_pktmbuf_free(pkt);
866             return;
867         }
868     }
869 
870     void *data = rte_pktmbuf_mtod(pkt, void*);
871     uint16_t len = rte_pktmbuf_data_len(pkt);
872 
873     void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
874     if (hdr == NULL) {
875         rte_pktmbuf_free(pkt);
876         return;
877     }
878 
879     if (pkt->ol_flags & PKT_RX_VLAN_STRIPPED) {
880         ff_mbuf_set_vlan_info(hdr, pkt->vlan_tci);
881     }
882 
883     struct rte_mbuf *pn = pkt->next;
884     void *prev = hdr;
885     while(pn != NULL) {
886         data = rte_pktmbuf_mtod(pn, void*);
887         len = rte_pktmbuf_data_len(pn);
888 
889         void *mb = ff_mbuf_get(prev, data, len);
890         if (mb == NULL) {
891             ff_mbuf_free(hdr);
892             rte_pktmbuf_free(pkt);
893             return;
894         }
895         pn = pn->next;
896         prev = mb;
897     }
898 
899     ff_veth_process_packet(ctx->ifp, hdr);
900 }
901 
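/*
 * Classify a frame by ethertype (skipping one VLAN tag if present).
 * ARP is special-cased so that every queue and the KNI interface see it;
 * when KNI is enabled, other IPv4 traffic is checked against the KNI
 * TCP/UDP port filters.
 */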
902 static enum FilterReturn
903 protocol_filter(const void *data, uint16_t len)
904 {
905     if(len < ETHER_HDR_LEN)
906         return FILTER_UNKNOWN;
907 
908     const struct ether_hdr *hdr;
909     const struct vlan_hdr *vlanhdr;
910     hdr = (const struct ether_hdr *)data;
911     uint16_t ether_type = rte_be_to_cpu_16(hdr->ether_type);
912     data += ETHER_HDR_LEN;
913     len -= ETHER_HDR_LEN;
914 
915     if (ether_type == ETHER_TYPE_VLAN) {
916         vlanhdr = (struct vlan_hdr *)data;
917         ether_type = rte_be_to_cpu_16(vlanhdr->eth_proto);
918         data += sizeof(struct vlan_hdr);
919         len -= sizeof(struct vlan_hdr);
920     }
921 
922     if(ether_type == ETHER_TYPE_ARP)
923         return FILTER_ARP;
924 
925 #ifndef FF_KNI
926     return FILTER_UNKNOWN;
927 #else
928     if (!enable_kni) {
929         return FILTER_UNKNOWN;
930     }
931 
932     if(ether_type != ETHER_TYPE_IPv4)
933         return FILTER_UNKNOWN;
934 
935     return ff_kni_proto_filter(data, len);
936 #endif
937 }
938 
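/* Unlike rte_pktmbuf_attach(), copy the payload so the "clone" owns its
 * own data and can outlive, or be modified independently of, the original. */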
939 static inline void
940 pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
941 {
942     struct rte_mbuf *md;
943     void *src, *dst;
944 
945     dst = rte_pktmbuf_mtod(mi, void *);
946     src = rte_pktmbuf_mtod(m, void *);
947 
948     mi->data_len = m->data_len;
949     rte_memcpy(dst, src, m->data_len);
950 
951     mi->port = m->port;
952     mi->vlan_tci = m->vlan_tci;
953     mi->vlan_tci_outer = m->vlan_tci_outer;
954     mi->tx_offload = m->tx_offload;
955     mi->hash = m->hash;
956     mi->ol_flags = m->ol_flags;
957     mi->packet_type = m->packet_type;
958 }
959 
/* Deep-copy variant of rte_pktmbuf_clone(); see pktmbuf_deep_attach() above. */
961 static inline struct rte_mbuf *
962 pktmbuf_deep_clone(const struct rte_mbuf *md,
963     struct rte_mempool *mp)
964 {
965     struct rte_mbuf *mc, *mi, **prev;
966     uint32_t pktlen;
967     uint8_t nseg;
968 
969     if (unlikely ((mc = rte_pktmbuf_alloc(mp)) == NULL))
970         return NULL;
971 
972     mi = mc;
973     prev = &mi->next;
974     pktlen = md->pkt_len;
975     nseg = 0;
976 
977     do {
978         nseg++;
979         pktmbuf_deep_attach(mi, md);
980         *prev = mi;
981         prev = &mi->next;
982     } while ((md = md->next) != NULL &&
983         (mi = rte_pktmbuf_alloc(mp)) != NULL);
984 
985     *prev = NULL;
986     mc->nb_segs = nseg;
987     mc->pkt_len = pktlen;
988 
    /* Allocation of a new segment failed mid-chain */
990     if (unlikely (mi == NULL)) {
991         rte_pktmbuf_free(mc);
992         return NULL;
993     }
994 
995     __rte_mbuf_sanity_check(mc, 1);
996     return mc;
997 }
998 
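/*
 * Core RX handling for one burst: optionally pcap-dump, run the user
 * dispatch callback (which may re-steer a packet to another queue's
 * dispatch ring or answer it directly), broadcast ARP to all queues and
 * to KNI, and hand everything else to the FreeBSD stack via ff_veth_input().
 */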
999 static inline void
1000 process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
1001     uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
1002 {
1003     struct lcore_conf *qconf = &lcore_conf;
1004     uint16_t nb_queues = qconf->nb_queue_list[port_id];
1005 
1006     uint16_t i;
1007     for (i = 0; i < count; i++) {
1008         struct rte_mbuf *rtem = bufs[i];
1009 
1010         if (unlikely(qconf->pcap[port_id] != NULL)) {
1011             if (!pkts_from_ring) {
1012                 ff_dump_packets(qconf->pcap[port_id], rtem);
1013             }
1014         }
1015 
1016         void *data = rte_pktmbuf_mtod(rtem, void*);
1017         uint16_t len = rte_pktmbuf_data_len(rtem);
1018 
1019         if (!pkts_from_ring) {
1020             ff_traffic.rx_packets++;
1021             ff_traffic.rx_bytes += len;
1022         }
1023 
1024         if (!pkts_from_ring && packet_dispatcher) {
1025             int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues);
1026             if (ret == FF_DISPATCH_RESPONSE) {
1027                 rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len;
1028                 send_single_packet(rtem, port_id);
1029                 continue;
1030             }
1031 
1032             if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) {
1033                 rte_pktmbuf_free(rtem);
1034                 continue;
1035             }
1036 
1037             if (ret != queue_id) {
1038                 ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
1039                 if (ret < 0)
1040                     rte_pktmbuf_free(rtem);
1041 
1042                 continue;
1043             }
1044         }
1045 
1046         enum FilterReturn filter = protocol_filter(data, len);
1047         if (filter == FILTER_ARP) {
1048             struct rte_mempool *mbuf_pool;
1049             struct rte_mbuf *mbuf_clone;
1050             if (!pkts_from_ring) {
1051                 uint16_t j;
1052                 for(j = 0; j < nb_queues; ++j) {
1053                     if(j == queue_id)
1054                         continue;
1055 
1056                     unsigned socket_id = 0;
1057                     if (numa_on) {
1058                         uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
1059                         socket_id = rte_lcore_to_socket_id(lcore_id);
1060                     }
1061                     mbuf_pool = pktmbuf_pool[socket_id];
1062                     mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
1063                     if(mbuf_clone) {
1064                         int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
1065                             mbuf_clone);
1066                         if (ret < 0)
1067                             rte_pktmbuf_free(mbuf_clone);
1068                     }
1069                 }
1070             }
1071 
1072 #ifdef FF_KNI
1073             if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
1074                 mbuf_pool = pktmbuf_pool[qconf->socket_id];
1075                 mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
1076                 if(mbuf_clone) {
1077                     ff_kni_enqueue(port_id, mbuf_clone);
1078                 }
1079             }
1080 #endif
1081             ff_veth_input(ctx, rtem);
1082 #ifdef FF_KNI
1083         } else if (enable_kni &&
1084             ((filter == FILTER_KNI && kni_accept) ||
1085             (filter == FILTER_UNKNOWN && !kni_accept)) ) {
1086             ff_kni_enqueue(port_id, rtem);
1087 #endif
1088         } else {
1089             ff_veth_input(ctx, rtem);
1090         }
1091     }
1092 }
1093 
1094 static inline int
1095 process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
1096     struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
1097 {
    /* Read packets dispatched from other lcores and process them */
1099     uint16_t nb_rb;
1100     nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
1101         (void **)pkts_burst, MAX_PKT_BURST, NULL);
1102 
1103     if(nb_rb > 0) {
1104         process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
1105     }
1106 
1107     return 0;
1108 }
1109 
1110 static inline void
1111 handle_sysctl_msg(struct ff_msg *msg)
1112 {
1113     int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
1114         msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
1115         msg->sysctl.newlen);
1116 
1117     if (ret < 0) {
1118         msg->result = errno;
1119     } else {
1120         msg->result = 0;
1121     }
1122 }
1123 
1124 static inline void
1125 handle_ioctl_msg(struct ff_msg *msg)
1126 {
1127     int fd, ret;
1128     fd = ff_socket(AF_INET, SOCK_DGRAM, 0);
1129     if (fd < 0) {
1130         ret = -1;
1131         goto done;
1132     }
1133 
1134     ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);
1135 
1136     ff_close(fd);
1137 
1138 done:
1139     if (ret < 0) {
1140         msg->result = errno;
1141     } else {
1142         msg->result = 0;
1143     }
1144 }
1145 
1146 static inline void
1147 handle_route_msg(struct ff_msg *msg)
1148 {
1149     int ret = ff_rtioctl(msg->route.fib, msg->route.data,
1150         &msg->route.len, msg->route.maxlen);
1151     if (ret < 0) {
1152         msg->result = errno;
1153     } else {
1154         msg->result = 0;
1155     }
1156 }
1157 
1158 static inline void
1159 handle_top_msg(struct ff_msg *msg)
1160 {
1161     msg->top = ff_top_status;
1162     msg->result = 0;
1163 }
1164 
1165 #ifdef FF_NETGRAPH
1166 static inline void
1167 handle_ngctl_msg(struct ff_msg *msg)
1168 {
1169     int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
1170     if (ret < 0) {
1171         msg->result = errno;
1172     } else {
1173         msg->result = 0;
1174         msg->ngctl.ret = ret;
1175     }
1176 }
1177 #endif
1178 
1179 #ifdef FF_IPFW
1180 static inline void
1181 handle_ipfw_msg(struct ff_msg *msg)
1182 {
1183     int fd, ret;
1184     fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
1185     if (fd < 0) {
1186         ret = -1;
1187         goto done;
1188     }
1189 
1190     switch (msg->ipfw.cmd) {
1191         case FF_IPFW_GET:
1192             ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
1193                 msg->ipfw.optname, msg->ipfw.optval,
1194                 msg->ipfw.optlen);
1195             break;
1196         case FF_IPFW_SET:
1197             ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
1198                 msg->ipfw.optname, msg->ipfw.optval,
1199                 *(msg->ipfw.optlen));
1200             break;
1201         default:
1202             ret = -1;
1203             errno = ENOTSUP;
1204             break;
1205     }
1206 
1207     ff_close(fd);
1208 
1209 done:
1210     if (ret < 0) {
1211         msg->result = errno;
1212     } else {
1213         msg->result = 0;
1214     }
1215 }
1216 #endif
1217 
1218 static inline void
1219 handle_traffic_msg(struct ff_msg *msg)
1220 {
1221     msg->traffic = ff_traffic;
1222     msg->result = 0;
1223 }
1224 
1225 static inline void
1226 handle_default_msg(struct ff_msg *msg)
1227 {
1228     msg->result = ENOTSUP;
1229 }
1230 
1231 static inline void
1232 handle_msg(struct ff_msg *msg, uint16_t proc_id)
1233 {
1234     switch (msg->msg_type) {
1235         case FF_SYSCTL:
1236             handle_sysctl_msg(msg);
1237             break;
1238         case FF_IOCTL:
1239             handle_ioctl_msg(msg);
1240             break;
1241         case FF_ROUTE:
1242             handle_route_msg(msg);
1243             break;
1244         case FF_TOP:
1245             handle_top_msg(msg);
1246             break;
1247 #ifdef FF_NETGRAPH
1248         case FF_NGCTL:
1249             handle_ngctl_msg(msg);
1250             break;
1251 #endif
1252 #ifdef FF_IPFW
1253         case FF_IPFW_CTL:
1254             handle_ipfw_msg(msg);
1255             break;
1256 #endif
1257         case FF_TRAFFIC:
1258             handle_traffic_msg(msg);
1259             break;
1260         default:
1261             handle_default_msg(msg);
1262             break;
1263     }
1264     rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
1265 }
1266 
1267 static inline int
1268 process_msg_ring(uint16_t proc_id)
1269 {
1270     void *msg;
1271     int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);
1272 
1273     if (unlikely(ret == 0)) {
1274         handle_msg((struct ff_msg *)msg, proc_id);
1275     }
1276 
1277     return 0;
1278 }
1279 
1280 /* Send burst of packets on an output interface */
1281 static inline int
1282 send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
1283 {
1284     struct rte_mbuf **m_table;
1285     int ret;
1286     uint16_t queueid;
1287 
1288     queueid = qconf->tx_queue_id[port];
1289     m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;
1290 
1291     if (unlikely(qconf->pcap[port] != NULL)) {
1292         uint16_t i;
1293         for (i = 0; i < n; i++) {
1294             ff_dump_packets(qconf->pcap[port], m_table[i]);
1295         }
1296     }
1297 
1298     ret = rte_eth_tx_burst(port, queueid, m_table, n);
1299     ff_traffic.tx_packets += ret;
1300     uint16_t i;
1301     for (i = 0; i < ret; i++) {
1302         ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]);
1303     }
1304     if (unlikely(ret < n)) {
1305         do {
1306             rte_pktmbuf_free(m_table[ret]);
1307         } while (++ret < n);
1308     }
1309     return 0;
1310 }
1311 
1312 /* Enqueue a single packet, and send burst if queue is filled */
1313 static inline int
1314 send_single_packet(struct rte_mbuf *m, uint8_t port)
1315 {
1316     uint16_t len;
1317     struct lcore_conf *qconf;
1318 
1319     qconf = &lcore_conf;
1320     len = qconf->tx_mbufs[port].len;
1321     qconf->tx_mbufs[port].m_table[len] = m;
1322     len++;
1323 
1324     /* enough pkts to be sent */
1325     if (unlikely(len == MAX_PKT_BURST)) {
1326         send_burst(qconf, MAX_PKT_BURST, port);
1327         len = 0;
1328     }
1329 
1330     qconf->tx_mbufs[port].len = len;
1331     return 0;
1332 }
1333 
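/*
 * Transmit path: copy a FreeBSD mbuf chain into a chain of DPDK mbufs,
 * translate the stack's checksum/TSO requests into PKT_TX_* offload
 * flags, then queue the packet for the port's next TX burst.
 */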
1334 int
1335 ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
1336     int total)
1337 {
1338     struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
1339     struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
1340     if (head == NULL) {
1341         ff_mbuf_free(m);
1342         return -1;
1343     }
1344 
1345     head->pkt_len = total;
1346     head->nb_segs = 0;
1347 
1348     int off = 0;
1349     struct rte_mbuf *cur = head, *prev = NULL;
1350     while(total > 0) {
1351         if (cur == NULL) {
1352             cur = rte_pktmbuf_alloc(mbuf_pool);
1353             if (cur == NULL) {
1354                 rte_pktmbuf_free(head);
1355                 ff_mbuf_free(m);
1356                 return -1;
1357             }
1358         }
1359 
1360         if (prev != NULL) {
1361             prev->next = cur;
1362         }
1363         head->nb_segs++;
1364 
1365         prev = cur;
1366         void *data = rte_pktmbuf_mtod(cur, void*);
1367         int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
1368         int ret = ff_mbuf_copydata(m, data, off, len);
1369         if (ret < 0) {
1370             rte_pktmbuf_free(head);
1371             ff_mbuf_free(m);
1372             return -1;
        }

1376         cur->data_len = len;
1377         off += len;
1378         total -= len;
1379         cur = NULL;
1380     }
1381 
1382     struct ff_tx_offload offload = {0};
1383     ff_mbuf_tx_offload(m, &offload);
1384 
1385     void *data = rte_pktmbuf_mtod(head, void*);
1386 
1387     if (offload.ip_csum) {
1388         /* ipv6 not supported yet */
1389         struct ipv4_hdr *iph;
1390         int iph_len;
1391         iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
1392         iph_len = (iph->version_ihl & 0x0f) << 2;
1393 
1394         head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
1395         head->l2_len = ETHER_HDR_LEN;
1396         head->l3_len = iph_len;
1397     }
1398 
1399     if (ctx->hw_features.tx_csum_l4) {
1400         struct ipv4_hdr *iph;
1401         int iph_len;
1402         iph = (struct ipv4_hdr *)(data + ETHER_HDR_LEN);
1403         iph_len = (iph->version_ihl & 0x0f) << 2;
1404 
1405         if (offload.tcp_csum) {
1406             head->ol_flags |= PKT_TX_TCP_CKSUM;
1407             head->l2_len = ETHER_HDR_LEN;
1408             head->l3_len = iph_len;
1409         }
1410 
1411         /*
1412          *  TCP segmentation offload.
1413          *
1414          *  - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
1415          *    implies PKT_TX_TCP_CKSUM)
1416          *  - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
1417          *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
1418          *    write the IP checksum to 0 in the packet
1419          *  - fill the mbuf offload information: l2_len,
1420          *    l3_len, l4_len, tso_segsz
1421          *  - calculate the pseudo header checksum without taking ip_len
1422          *    in account, and set it in the TCP header. Refer to
1423          *    rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
1424          *    used as helpers.
1425          */
1426         if (offload.tso_seg_size) {
1427             struct tcp_hdr *tcph;
1428             int tcph_len;
1429             tcph = (struct tcp_hdr *)((char *)iph + iph_len);
1430             tcph_len = (tcph->data_off & 0xf0) >> 2;
1431             tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);
1432 
1433             head->ol_flags |= PKT_TX_TCP_SEG;
1434             head->l4_len = tcph_len;
1435             head->tso_segsz = offload.tso_seg_size;
1436         }
1437 
1438         if (offload.udp_csum) {
1439             head->ol_flags |= PKT_TX_UDP_CKSUM;
1440             head->l2_len = ETHER_HDR_LEN;
1441             head->l3_len = iph_len;
1442         }
1443     }
1444 
1445     ff_mbuf_free(m);
1446 
1447     return send_single_packet(head, ctx->port_id);
1448 }
1449 
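/*
 * Per-lcore event loop: expire timers, drain buffered TX bursts every
 * ~100 us, poll the dispatch ring and the NIC RX queues, service control
 * messages, run the user loop callback, then account usr/sys/idle cycles
 * for ff_top (optionally usleep()ing when a pass found no work).
 */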
1450 static int
1451 main_loop(void *arg)
1452 {
1453     struct loop_routine *lr = (struct loop_routine *)arg;
1454 
1455     struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1456     uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc;
1457     int i, j, nb_rx, idle;
1458     uint16_t port_id, queue_id;
1459     struct lcore_conf *qconf;
1460     const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
1461         US_PER_S * BURST_TX_DRAIN_US;
1462     struct ff_dpdk_if_context *ctx;
1463 
1464     prev_tsc = 0;
1465     usch_tsc = 0;
1466 
1467     qconf = &lcore_conf;
1468 
1469     while (1) {
1470         cur_tsc = rte_rdtsc();
1471         if (unlikely(freebsd_clock.expire < cur_tsc)) {
1472             rte_timer_manage();
1473         }
1474 
1475         idle = 1;
1476         sys_tsc = 0;
1477         usr_tsc = 0;
1478 
1479         /*
1480          * TX burst queue drain
1481          */
1482         diff_tsc = cur_tsc - prev_tsc;
1483         if (unlikely(diff_tsc > drain_tsc)) {
1484             for (i = 0; i < qconf->nb_tx_port; i++) {
1485                 port_id = qconf->tx_port_id[i];
1486                 if (qconf->tx_mbufs[port_id].len == 0)
1487                     continue;
1488 
1489                 idle = 0;
1490 
1491                 send_burst(qconf,
1492                     qconf->tx_mbufs[port_id].len,
1493                     port_id);
1494                 qconf->tx_mbufs[port_id].len = 0;
1495             }
1496 
1497             prev_tsc = cur_tsc;
1498         }
1499 
1500         /*
1501          * Read packet from RX queues
1502          */
1503         for (i = 0; i < qconf->nb_rx_queue; ++i) {
1504             port_id = qconf->rx_queue_list[i].port_id;
1505             queue_id = qconf->rx_queue_list[i].queue_id;
1506             ctx = veth_ctx[port_id];
1507 
1508 #ifdef FF_KNI
1509             if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
1510                 ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
1511             }
1512 #endif
1513 
1514             process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);
1515 
1516             nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
1517                 MAX_PKT_BURST);
1518             if (nb_rx == 0)
1519                 continue;
1520 
1521             idle = 0;
1522 
1523             /* Prefetch first packets */
1524             for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
1525                 rte_prefetch0(rte_pktmbuf_mtod(
1526                         pkts_burst[j], void *));
1527             }
1528 
1529             /* Prefetch and handle already prefetched packets */
1530             for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
1531                 rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
1532                         j + PREFETCH_OFFSET], void *));
1533                 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
1534             }
1535 
1536             /* Handle remaining prefetched packets */
1537             for (; j < nb_rx; j++) {
1538                 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
1539             }
1540         }
1541 
1542         process_msg_ring(qconf->proc_id);
1543 
1544         div_tsc = rte_rdtsc();
1545 
1546         if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc > drain_tsc))) {
1547             usch_tsc = cur_tsc;
1548             lr->loop(lr->arg);
1549         }
1550 
1551         idle_sleep_tsc = rte_rdtsc();
1552         if (likely(idle && idle_sleep)) {
1553             usleep(idle_sleep);
1554             end_tsc = rte_rdtsc();
        } else {
            end_tsc = idle_sleep_tsc;
        }

1561         if (usch_tsc == cur_tsc) {
1562             usr_tsc = idle_sleep_tsc - div_tsc;
1563         }
1564 
1565         if (!idle) {
1566             sys_tsc = div_tsc - cur_tsc;
1567             ff_top_status.sys_tsc += sys_tsc;
1568         }
1569 
1570         ff_top_status.usr_tsc += usr_tsc;
1571         ff_top_status.work_tsc += end_tsc - cur_tsc;
1572         ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;
1573 
1574         ff_top_status.loops++;
1575     }
1576 
1577     return 0;
1578 }
1579 
1580 int
1581 ff_dpdk_if_up(void) {
1582     int i;
1583     struct lcore_conf *qconf = &lcore_conf;
1584     for (i = 0; i < qconf->nb_tx_port; i++) {
1585         uint16_t port_id = qconf->tx_port_id[i];
1586 
1587         struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
1588         veth_ctx[port_id] = ff_veth_attach(pconf);
1589         if (veth_ctx[port_id] == NULL) {
1590             rte_exit(EXIT_FAILURE, "ff_veth_attach failed");
1591         }
1592     }
1593 
1594     return 0;
1595 }
1596 
1597 void
1598 ff_dpdk_run(loop_func_t loop, void *arg) {
1599     struct loop_routine *lr = rte_malloc(NULL,
1600         sizeof(struct loop_routine), 0);
1601     lr->loop = loop;
1602     lr->arg = arg;
1603     rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
1604     rte_eal_mp_wait_lcore();
1605     rte_free(lr);
1606 }
1607 
1608 void
1609 ff_dpdk_pktmbuf_free(void *m)
1610 {
1611     rte_pktmbuf_free((struct rte_mbuf *)m);
1612 }
1613 
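/*
 * Software Toeplitz hash, the same algorithm the NIC uses for RSS;
 * apparently adapted from FreeBSD's RSS code (note the XXXRW annotation).
 * Used to predict which RX queue the hardware would pick for a flow.
 */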
1614 static uint32_t
1615 toeplitz_hash(unsigned keylen, const uint8_t *key,
1616     unsigned datalen, const uint8_t *data)
1617 {
1618     uint32_t hash = 0, v;
1619     u_int i, b;
1620 
1621     /* XXXRW: Perhaps an assertion about key length vs. data length? */
1622 
    v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
1624     for (i = 0; i < datalen; i++) {
1625         for (b = 0; b < 8; b++) {
1626             if (data[i] & (1<<(7-b)))
1627                 hash ^= v;
1628             v <<= 1;
1629             if ((i + 4) < keylen &&
1630                 (key[i+4] & (1<<(7-b))))
1631                 v |= 1;
1632         }
1633     }
1634     return (hash);
1635 }
1636 
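/*
 * Return nonzero if NIC RSS would steer a flow with this 4-tuple to the
 * queue owned by this lcore. Mirrors the hardware lookup: Toeplitz hash
 * -> redirection-table bucket (filled round-robin by set_rss_table())
 * -> queue. Assumes reta_size is a power of two, which init_port_start()
 * asserts.
 */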
1637 int
1638 ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
1639     uint16_t sport, uint16_t dport)
1640 {
1641     struct lcore_conf *qconf = &lcore_conf;
1642     struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
1643     uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];
1644 
1645     if (nb_queues <= 1) {
1646         return 1;
1647     }
1648 
1649     uint16_t reta_size = rss_reta_size[ctx->port_id];
1650     uint16_t queueid = qconf->tx_queue_id[ctx->port_id];
1651 
1652     uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
1653         sizeof(dport)];
1654 
1655     unsigned datalen = 0;
1656 
1657     bcopy(&saddr, &data[datalen], sizeof(saddr));
1658     datalen += sizeof(saddr);
1659 
1660     bcopy(&daddr, &data[datalen], sizeof(daddr));
1661     datalen += sizeof(daddr);
1662 
1663     bcopy(&sport, &data[datalen], sizeof(sport));
1664     datalen += sizeof(sport);
1665 
1666     bcopy(&dport, &data[datalen], sizeof(dport));
1667     datalen += sizeof(dport);
1668 
1669     uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
1670         default_rsskey_40bytes, datalen, data);
1671 
1672     return ((hash & (reta_size - 1)) % nb_queues) == queueid;
1673 }
1674 
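/*
 * Register a user dispatch callback, invoked for every received packet
 * before it enters the stack (see process_packets() above for the return
 * value semantics). A minimal sketch; the callback name and steering rule
 * below are illustrative only, not part of f-stack:
 *
 *   static int
 *   my_dispatch(void *data, uint16_t *len, uint16_t queue_id,
 *       uint16_t nb_queues)
 *   {
 *       return 0;   // steer every packet to queue 0
 *       // Return FF_DISPATCH_ERROR to drop the packet, or
 *       // FF_DISPATCH_RESPONSE to send the (possibly rewritten,
 *       // *len-resized) packet straight back out the same port.
 *   }
 *
 *   ff_regist_packet_dispatcher(my_dispatch);
 */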
1675 void
1676 ff_regist_packet_dispatcher(dispatch_func_t func)
1677 {
1678     packet_dispatcher = func;
1679 }
1680 
1681 uint64_t
1682 ff_get_tsc_ns()
1683 {
1684     uint64_t cur_tsc = rte_rdtsc();
1685     uint64_t hz = rte_get_tsc_hz();
1686     return ((double)cur_tsc/(double)hz) * NS_PER_S;
1687 }
1688 
1689