xref: /f-stack/lib/ff_dpdk_if.c (revision a9e7dcf4)
/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"

#define MEMPOOL_CACHE_SIZE 256

#define ARP_RING_SIZE 2048

#define MSG_RING_SIZE 32

/*
 * Configurable number of RX/TX ring descriptors
 */
#define RX_QUEUE_SIZE 512
#define TX_QUEUE_SIZE 512

#define MAX_PKT_BURST 32
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

/*
 * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send.
 */
#define MAX_TX_BURST    (MAX_PKT_BURST / 2)

#define NB_SOCKETS 8

/* Configure how many packets ahead to prefetch when reading packets */
#define PREFETCH_OFFSET    3

#define MAX_RX_QUEUE_PER_LCORE 16
#define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
#define MAX_RX_QUEUE_PER_PORT 128

#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

static int enable_kni;
static int kni_accept;

static struct rte_timer freebsd_clock;

/* Default RSS key, taken from the Mellanox Linux driver. */
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};
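/*
 * This key is both programmed into the NIC for RSS (see init_port_start)
 * and fed to the software toeplitz_hash() below, so ff_rss_check() can
 * compute the same per-flow hash the hardware uses and decide whether a
 * flow lands on this process's queue.
 */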

static struct rte_eth_conf default_port_conf = {
    .rxmode = {
        .mq_mode = ETH_MQ_RX_RSS,
        .max_rx_pkt_len = ETHER_MAX_LEN,
        .split_hdr_size = 0, /**< hdr buf size */
        .header_split   = 0, /**< Header Split disabled */
        .hw_ip_checksum = 0, /**< IP checksum offload disabled */
        .hw_vlan_filter = 0, /**< VLAN filtering disabled */
        .hw_vlan_strip  = 0, /**< VLAN strip disabled. */
        .hw_vlan_extend = 0, /**< Extended VLAN disabled. */
        .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
        .hw_strip_crc   = 0, /**< CRC stripped by hardware */
        .enable_lro     = 0, /**< LRO disabled */
    },
    .rx_adv_conf = {
        .rss_conf = {
            .rss_key = default_rsskey_40bytes,
            .rss_key_len = 40,
            .rss_hf = ETH_RSS_PROTO_MASK,
        },
    },
    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
};
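/*
 * Note: init_port_start() below builds a fresh rte_eth_conf per port;
 * default_port_conf is not referenced elsewhere in this file and is
 * kept as a reference for the intended default rxmode/RSS/txmode
 * settings.
 */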

struct mbuf_table {
    uint16_t len;
    struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct lcore_rx_queue {
    uint8_t port_id;
    uint8_t queue_id;
} __rte_cache_aligned;

struct lcore_conf {
    uint16_t proc_id;
    uint16_t nb_procs;
    uint16_t socket_id;
    uint16_t nb_rx_queue;
    uint16_t *proc_lcore;
    struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
    uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
    struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
    char *pcap[RTE_MAX_ETHPORTS];
} __rte_cache_aligned;

static struct lcore_conf lcore_conf;
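/*
 * Each f-stack process is bound to a single lcore, so one static
 * lcore_conf instance (above) holds all of this process's per-core
 * state; main_loop() and the tx path simply take its address.
 */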

static struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **arp_ring[RTE_MAX_LCORE];

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

struct ff_msg_ring {
    char ring_name[2][RTE_RING_NAMESIZE];
    /* ring[0]: other processes send requests, the lcore receives them */
    /* ring[1]: the lcore sends replies, other processes read them */
    struct rte_ring *ring[2];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;

struct ff_dpdk_if_context {
    void *sc;
    void *ifp;
    uint16_t port_id;
    struct ff_hw_features hw_features;
} __rte_cache_aligned;

static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

extern void ff_hardclock(void);

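/*
 * Periodic rte_timer callback installed by init_clock(); it drives the
 * FreeBSD stack's hardclock tick at ff_global_cfg.freebsd.hz and
 * refreshes the cached timestamp.
 */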
static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg)
{
    ff_hardclock();
    ff_update_current_ts();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
    #define CHECK_INTERVAL 100 /* 100ms */
    #define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint8_t portid = ff_global_cfg.dpdk.port_cfgs[i].port_id;
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u "
                        "Mbps - %s\n", (int)portid,
                        (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

static int
init_lcore_conf(void)
{
    uint8_t nb_ports = rte_eth_dev_count();
    if (nb_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;
    lcore_conf.nb_procs = ff_global_cfg.dpdk.nb_procs;

    lcore_conf.proc_lcore = rte_zmalloc(NULL,
        sizeof(uint16_t) * lcore_conf.nb_procs, 0);
    if (lcore_conf.proc_lcore == NULL) {
        rte_exit(EXIT_FAILURE, "rte_zmalloc proc_lcore failed\n");
    }
    rte_memcpy(lcore_conf.proc_lcore, ff_global_cfg.dpdk.proc_lcore,
        sizeof(uint16_t) * lcore_conf.nb_procs);
    uint16_t proc_id;
    for (proc_id = 0; proc_id < lcore_conf.nb_procs; proc_id++) {
        uint16_t lcore_id = lcore_conf.proc_lcore[proc_id];
        if (!lcore_config[lcore_id].detected) {
            rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
        }
    }

    uint16_t socket_id = 0;
    if (ff_global_cfg.dpdk.numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    /* Currently, proc_id maps 1:1 to the rx/tx queue id on each port. */
    uint8_t port_id, enabled_ports = 0;
    for (port_id = 0; port_id < nb_ports; port_id++) {
        if (ff_global_cfg.dpdk.port_mask &&
            (ff_global_cfg.dpdk.port_mask & (1 << port_id)) == 0) {
            printf("\nSkipping disabled port %d\n", port_id);
            continue;
        }

        if (port_id >= ff_global_cfg.dpdk.nb_ports) {
            printf("\nSkipping non-configured port %d\n", port_id);
            break;
        }

        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = lcore_conf.proc_id;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = lcore_conf.proc_id;
        lcore_conf.pcap[port_id] = ff_global_cfg.dpdk.port_cfgs[enabled_ports].pcap;

        ff_global_cfg.dpdk.port_cfgs[enabled_ports].port_id = port_id;

        enabled_ports++;
    }

    ff_global_cfg.dpdk.nb_ports = enabled_ports;

    return 0;
}
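/*
 * Example of the 1:1 proc/queue mapping set up above: with nb_procs = 4
 * and proc_id = 1, this process owns rx queue 1 and tx queue 1 on every
 * enabled port, and init_port_start() will configure 4 rx/tx queue
 * pairs per port.
 */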

static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;

    unsigned nb_mbuf = RTE_MAX(
        (nb_rx_queue*RX_QUEUE_SIZE          +
        nb_ports*nb_lcores*MAX_PKT_BURST    +
        nb_ports*nb_tx_queue*TX_QUEUE_SIZE  +
        nb_lcores*MEMPOOL_CACHE_SIZE +
        nb_ports*KNI_MBUF_MAX +
        nb_ports*KNI_QUEUE_SIZE +
        nb_lcores*nb_ports*ARP_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];
    int numa_on = ff_global_cfg.dpdk.numa_on;

    for (i = 0; i < lcore_conf.nb_procs; i++) {
        lcore_id = lcore_conf.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("created mbuf pool on socket %d\n", socketid);
        }
    }

    return 0;
}
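/*
 * Sizing example for nb_mbuf above, assuming a minimal setup with one
 * process and one port: 512 (rx) + 32 (burst) + 512 (tx) + 256 (cache)
 * + 2048 (kni mbuf) + 2048 (kni queue) + 2048 (arp ring) = 7456, which
 * RTE_MAX then raises to the 8192 floor.
 */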

static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL)
        return NULL;

    /* If the ring already exists, just attach to it */
    if (likely((ring = rte_ring_lookup(name)) != NULL))
        return ring;

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        return rte_ring_create(name, count, socket_id, flags);
    } else {
        return rte_ring_lookup(name);
    }
}
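/*
 * Usage pattern: the primary process creates every ring; secondary
 * processes only ever attach via rte_ring_lookup(). The lookup-first
 * fast path also covers rings that already exist in shared hugepage
 * memory, e.g. after a process restart.
 */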

static int
init_arp_ring(void)
{
    int i, j;
    char name_buf[RTE_RING_NAMESIZE];
    int nb_procs = ff_global_cfg.dpdk.nb_procs;
    int proc_id = ff_global_cfg.dpdk.proc_id;

    /* Allocate arp ring pointers according to the eth dev count. */
    int nb_ports = rte_eth_dev_count();
    for (i = 0; i < nb_procs; ++i) {
        snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_%d_%d",
            proc_id, i);

        arp_ring[i] = rte_zmalloc(name_buf,
            sizeof(struct rte_ring *) * nb_ports,
            RTE_CACHE_LINE_SIZE);
        if (arp_ring[i] == NULL) {
            rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                "failed\n", name_buf);
        }
    }

    unsigned socketid = lcore_conf.socket_id;

    /* Create rings according to the ports actually in use. */
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[j].port_id;

        for (i = 0; i < nb_procs; ++i) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "arp_ring_%d_%d", i, port_id);
            arp_ring[i][port_id] = create_ring(name_buf, ARP_RING_SIZE,
                socketid, RING_F_SC_DEQ);

            if (arp_ring[i][port_id] == NULL)
                rte_panic("create ring %s failed!\n", name_buf);

            printf("created ring %s, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(arp_ring[i][port_id]));
        }
    }

    return 0;
}
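/*
 * Ring topology: arp_ring[proc][port] is dequeued only by process
 * `proc` (hence RING_F_SC_DEQ), while any process may enqueue clones
 * into it from process_packets(), which uses the default multi-producer
 * enqueue. This is how a single received ARP packet is broadcast to
 * every process sharing a port.
 */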

static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
}

static int
init_msg_ring(void)
{
    uint16_t i;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
           MSG_RING_SIZE * 2 * nb_procs,
           MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
           NULL, NULL, ff_msg_init, NULL,
           socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        snprintf(msg_ring[i].ring_name[1], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_OUT, i);

        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring %s failed!\n", msg_ring[i].ring_name[0]);

        msg_ring[i].ring[1] = create_ring(msg_ring[i].ring_name[1],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[1] == NULL)
            rte_panic("create ring %s failed!\n", msg_ring[i].ring_name[1]);
    }

    return 0;
}

static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[i].port_id;
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}

static void
set_rss_table(uint8_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}
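/*
 * Example: with reta_size = 128 and nb_queues = 4, the redirection
 * table becomes 0,1,2,3,0,1,2,3,... so RSS buckets are spread evenly
 * across the queues. ff_rss_check() below relies on exactly this
 * bucket % nb_queues pattern.
 */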

static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = rte_lcore_to_socket_id(rte_lcore_id());
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socketid];
    uint16_t i;

    for (i = 0; i < nb_ports; i++) {
        uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[i].port_id;

        struct rte_eth_dev_info dev_info;
        rte_eth_dev_info_get(port_id, &dev_info);

        if (nb_procs > dev_info.max_rx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] greater than max_rx_queues[%d]\n",
                nb_procs,
                dev_info.max_rx_queues);
        }

        if (nb_procs > dev_info.max_tx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] greater than max_tx_queues[%d]\n",
                nb_procs,
                dev_info.max_tx_queues);
        }

        struct ether_addr addr;
        rte_eth_macaddr_get(port_id, &addr);
        printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                   " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                (unsigned)port_id,
                addr.addr_bytes[0], addr.addr_bytes[1],
                addr.addr_bytes[2], addr.addr_bytes[3],
                addr.addr_bytes[4], addr.addr_bytes[5]);

        rte_memcpy(ff_global_cfg.dpdk.port_cfgs[i].mac,
            addr.addr_bytes, ETHER_ADDR_LEN);

        /* Set txq_flags: we need neither multi-mempool nor refcnt */
        dev_info.default_txconf.txq_flags = ETH_TXQ_FLAGS_NOMULTMEMP |
            ETH_TXQ_FLAGS_NOREFCOUNT;

        /* Disable features that are not supported by port's HW */
        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMUDP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMTCP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMSCTP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) &&
            !(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_TSO)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
        }

        struct rte_eth_conf port_conf = {0};

        /* Set RSS mode */
        port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
        port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK;
        port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
        port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;

        /* Set Rx VLAN stripping */
        if (ff_global_cfg.dpdk.vlan_strip) {
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                port_conf.rxmode.hw_vlan_strip = 1;
            }
        }

        /* Enable HW CRC stripping */
        port_conf.rxmode.hw_strip_crc = 1;

        /* FIXME: Enable TCP LRO? */
        #if 0
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
            printf("LRO is supported\n");
            port_conf.rxmode.enable_lro = 1;
            ff_global_cfg.dpdk.port_cfgs[i].hw_features.rx_lro = 1;
        }
        #endif

        /* Set Rx checksum checking */
        if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
            printf("RX checksum offload supported\n");
            port_conf.rxmode.hw_ip_checksum = 1;
            ff_global_cfg.dpdk.port_cfgs[i].hw_features.rx_csum = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
            printf("TX ip checksum offload supported\n");
            ff_global_cfg.dpdk.port_cfgs[i].hw_features.tx_csum_ip = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            printf("TX TCP&UDP checksum offload supported\n");
            ff_global_cfg.dpdk.port_cfgs[i].hw_features.tx_csum_l4 = 1;
        }

        if (ff_global_cfg.dpdk.tso) {
            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                printf("TSO is supported\n");
                ff_global_cfg.dpdk.port_cfgs[i].hw_features.tx_tso = 1;
            }
        } else {
            printf("TSO is disabled\n");
        }

        if (dev_info.reta_size) {
            /* reta size must be power of 2 */
            assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

            rss_reta_size[port_id] = dev_info.reta_size;
            printf("port[%d]: rss table size: %d\n", port_id,
                dev_info.reta_size);
        }

        if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
            continue;
        }

        /* Currently, proc_id maps 1:1 to the queue id on each port. */
        int ret = rte_eth_dev_configure(port_id, nb_procs, nb_procs, &port_conf);
        if (ret != 0) {
            return ret;
        }

        uint16_t q;
        for (q = 0; q < nb_procs; q++) {
            ret = rte_eth_tx_queue_setup(port_id, q, TX_QUEUE_SIZE,
                socketid, &dev_info.default_txconf);
            if (ret < 0) {
                return ret;
            }

            ret = rte_eth_rx_queue_setup(port_id, q, RX_QUEUE_SIZE,
                socketid, &dev_info.default_rxconf, mbuf_pool);
            if (ret < 0) {
                return ret;
            }
        }

        ret = rte_eth_dev_start(port_id);
        if (ret < 0) {
            return ret;
        }

        if (nb_procs > 1) {
            /* set the HW RSS hash function to Toeplitz */
            if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                struct rte_eth_hash_filter_info info = {0};
                info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                    RTE_ETH_FILTER_SET, &info) < 0) {
                    rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                        port_id);
                }
            }

            set_rss_table(port_id, dev_info.reta_size, nb_procs);
        }

        /* Enable RX in promiscuous mode for the Ethernet device. */
        if (ff_global_cfg.dpdk.promiscuous) {
            rte_eth_promiscuous_enable(port_id);
            ret = rte_eth_promiscuous_get(port_id);
            if (ret == 1) {
                printf("set port %u to promiscuous mode ok\n", port_id);
            } else {
                printf("set port %u to promiscuous mode failed\n", port_id);
            }
        }

        /* Enable pcap dump */
        if (ff_global_cfg.dpdk.port_cfgs[i].pcap) {
            ff_enable_pcap(ff_global_cfg.dpdk.port_cfgs[i].pcap);
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}
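/*
 * Example: with freebsd.hz = 100, intrs = 1000 / 100 = 10 ms per tick,
 * so the timer period is ceil(tsc_hz / 1000) * 10 TSC cycles, i.e.
 * ff_hardclock_job() fires roughly every 10 ms.
 */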

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("invalid num_procs[%d] or proc_id[%d]!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    init_lcore_conf();

    init_mem_pool();

    init_arp_ring();

    init_msg_ring();

    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            /* drop and free the mbuf instead of leaking it */
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    /*
     * FIXME: should we save pkt->vlan_tci
     * if (pkt->ol_flags & PKT_RX_VLAN_PKT)
     */

    void *data = rte_pktmbuf_mtod(pkt, void*);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    /* walk the remaining segments, not the head, when chaining */
    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void*);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < sizeof(struct ether_hdr))
        return FILTER_UNKNOWN;

    const struct ether_hdr *hdr;
    hdr = (const struct ether_hdr *)data;

    if (ntohs(hdr->ether_type) == ETHER_TYPE_ARP)
        return FILTER_ARP;

    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ntohs(hdr->ether_type) != ETHER_TYPE_IPv4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data + sizeof(struct ether_hdr),
        len - sizeof(struct ether_hdr));
}
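/*
 * Dispatch policy applied by process_packets() below: FILTER_ARP frames
 * are cloned to every other process and to KNI before being handed to
 * the local stack; FILTER_KNI and FILTER_UNKNOWN packets go either to
 * KNI or to the stack, depending on whether kni.method is "accept".
 */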

static inline void
process_packets(uint8_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(qconf->pcap[port_id] != NULL)) {
            ff_dump_packets(qconf->pcap[port_id], rtem);
        }

        void *data = rte_pktmbuf_mtod(rtem, void*);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        enum FilterReturn filter = protocol_filter(data, len);
        if (filter == FILTER_ARP) {
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (pkts_from_ring == 0) {
                uint16_t j;
                for (j = 0; j < qconf->nb_procs; ++j) {
                    if (j == queue_id)
                        continue;

                    mbuf_pool = pktmbuf_pool[rte_lcore_to_socket_id(qconf->proc_lcore[j])];
                    mbuf_clone = rte_pktmbuf_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(arp_ring[j][port_id], mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = rte_pktmbuf_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }

            ff_veth_input(ctx, rtem);
        } else if (enable_kni && ((filter == FILTER_KNI && kni_accept) ||
            (filter == FILTER_UNKNOWN && !kni_accept))) {
            ff_kni_enqueue(port_id, rtem);
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_arp_ring(uint8_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* Read packets from the ARP ring buffer and process them */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(arp_ring[queue_id][port_id],
        (void **)pkts_burst, MAX_PKT_BURST);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg, uint16_t proc_id)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }

    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

static inline void
handle_ioctl_msg(struct ff_msg *msg, uint16_t proc_id)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_DGRAM, 0);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }

    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

static inline void
handle_route_msg(struct ff_msg *msg, uint16_t proc_id)
{
    msg->result = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);

    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

static struct ff_top_args ff_status;
static inline void
handle_top_msg(struct ff_msg *msg, uint16_t proc_id)
{
    msg->top = ff_status;
    msg->result = 0;

    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

static inline void
handle_default_msg(struct ff_msg *msg, uint16_t proc_id)
{
    msg->result = EINVAL;
    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg, proc_id);
            break;
        case FF_IOCTL:
            handle_ioctl_msg(msg, proc_id);
            break;
        case FF_ROUTE:
            handle_route_msg(msg, proc_id);
            break;
        case FF_TOP:
            handle_top_msg(msg, proc_id);
            break;
        default:
            handle_default_msg(msg, proc_id);
            break;
    }
}

static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(qconf->pcap[port] != NULL)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(qconf->pcap[port], m_table[i]);
        }
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
        } while (++ret < n);
    }

    return 0;
}
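/*
 * Note: packets the NIC cannot accept in this burst are freed
 * (tail-dropped) rather than requeued or retried.
 */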

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}
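/*
 * A partially filled tx queue is not stranded here: main_loop() drains
 * tx_mbufs every BURST_TX_DRAIN_US (~100 us) even if no further packet
 * is enqueued.
 */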

int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        void *data = rte_pktmbuf_mtod(cur, void*);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        prev = cur;

        cur->data_len = len;
        off += len;
        total -= len;
        head->nb_segs++;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    if (offload.ip_csum) {
        head->ol_flags |= PKT_TX_IP_CKSUM;
        head->l2_len = sizeof(struct ether_hdr);
        head->l3_len = sizeof(struct ipv4_hdr);
    }

    if (ctx->hw_features.tx_csum_l4) {
        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = sizeof(struct ether_hdr);
            head->l3_len = sizeof(struct ipv4_hdr);
        }

        if (offload.tso_seg_size) {
            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = sizeof(struct tcp_hdr);
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = sizeof(struct ether_hdr);
            head->l3_len = sizeof(struct ipv4_hdr);
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    unsigned lcore_id;
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc;
    int i, j, nb_rx, idle;
    uint8_t port_id, queue_id;
    struct lcore_conf *qconf;
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
        US_PER_S * BURST_TX_DRAIN_US;
    struct ff_dpdk_if_context *ctx;

    prev_tsc = 0;
    usch_tsc = 0;

    lcore_id = rte_lcore_id();
    qconf = &lcore_conf;

    if (qconf->nb_rx_queue == 0) {
        printf("lcore %u has nothing to do\n", lcore_id);
        return 0;
    }

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc > drain_tsc)) {
            /*
             * This could be optimized (use queueid instead of
             * portid), but it is not called so often
             */
            for (port_id = 0; port_id < RTE_MAX_ETHPORTS; port_id++) {
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;
                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packet from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }

            process_arp_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                        pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                        j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc > drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        end_tsc = rte_rdtsc();

        if (usch_tsc == cur_tsc) {
            usr_tsc = end_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_status.sys_tsc += sys_tsc;
        }

        ff_status.usr_tsc += usr_tsc;
        ff_status.work_tsc += end_tsc - cur_tsc;
        ff_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_status.loops++;
    }
}

int
ff_dpdk_if_up(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[i].port_id;
        veth_ctx[port_id] = ff_veth_attach(ff_global_cfg.dpdk.port_cfgs + i);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed\n");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg)
{
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    if (lr == NULL) {
        rte_exit(EXIT_FAILURE, "rte_malloc loop_routine failed\n");
    }
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1<<(7-b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i+4] & (1<<(7-b))))
                v |= 1;
        }
    }
    return (hash);
}

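/*
 * ff_rss_check() mirrors the NIC's flow dispatch in software: it hashes
 * the 4-tuple with the same Toeplitz key the hardware uses, reduces the
 * hash to a RETA bucket, and applies the same bucket % nb_procs mapping
 * that set_rss_table() programmed, returning non-zero exactly when this
 * process owns the flow's queue.
 */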
int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;

    if (qconf->nb_procs == 1) {
        return 1;
    }

    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t reta_size = rss_reta_size[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
        default_rsskey_40bytes, datalen, data);

    return ((hash & (reta_size - 1)) % qconf->nb_procs) == qconf->proc_id;
}
1416