/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"

#define MEMPOOL_CACHE_SIZE 256

#define ARP_RING_SIZE 2048

#define MSG_RING_SIZE 32

/*
 * Configurable number of RX/TX ring descriptors
 */
#define RX_QUEUE_SIZE 512
#define TX_QUEUE_SIZE 512

#define MAX_PKT_BURST 32
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

/*
 * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send.
 */
#define MAX_TX_BURST    (MAX_PKT_BURST / 2)

#define NB_SOCKETS 8

/* Configure how many packets ahead to prefetch when reading packets */
#define PREFETCH_OFFSET    3

#define MAX_RX_QUEUE_PER_LCORE 16
#define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
#define MAX_RX_QUEUE_PER_PORT 128

#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

static int enable_kni;
static int kni_accept;

static int numa_on;

static struct rte_timer freebsd_clock;

/* Default RSS key: the 40-byte key used by the Mellanox Linux driver. */
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};
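
/*
 * Note: besides being programmed into the NICs in init_port_start(), this key
 * is reused in software by ff_rss_check() below, so the hardware queue
 * selection and the stack's own flow-ownership test always agree.
 */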

static struct rte_eth_conf default_port_conf = {
    .rxmode = {
        .mq_mode = ETH_MQ_RX_RSS,
        .max_rx_pkt_len = ETHER_MAX_LEN,
        .split_hdr_size = 0, /**< hdr buf size */
        .header_split   = 0, /**< Header Split disabled */
        .hw_ip_checksum = 0, /**< IP checksum offload disabled */
        .hw_vlan_filter = 0, /**< VLAN filtering disabled */
        .hw_vlan_strip  = 0, /**< VLAN strip disabled. */
        .hw_vlan_extend = 0, /**< Extended VLAN disabled. */
        .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
        .hw_strip_crc   = 0, /**< CRC stripping by hardware disabled */
        .enable_lro     = 0, /**< LRO disabled */
    },
    .rx_adv_conf = {
        .rss_conf = {
            .rss_key = default_rsskey_40bytes,
            .rss_key_len = 40,
            .rss_hf = ETH_RSS_PROTO_MASK,
        },
    },
    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
};

struct mbuf_table {
    uint16_t len;
    struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct lcore_rx_queue {
    uint8_t port_id;
    uint8_t queue_id;
} __rte_cache_aligned;

struct lcore_conf {
    uint16_t proc_id;
    uint16_t nb_procs;
    uint16_t socket_id;
    uint16_t nb_rx_queue;
    uint16_t *proc_lcore;
    struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
    uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
    struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
    char *pcap[RTE_MAX_ETHPORTS];
} __rte_cache_aligned;

static struct lcore_conf lcore_conf;

static struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **arp_ring[RTE_MAX_LCORE];

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

struct ff_msg_ring {
    char ring_name[2][RTE_RING_NAMESIZE];
    /* ring[0]: the lcore receives msgs that other processes send to it. */
    /* ring[1]: the lcore sends msgs, other processes read them. */
    struct rte_ring *ring[2];
} __rte_cache_aligned;
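
/*
 * The two rings form a request/response channel: a control process obtains a
 * ff_msg from message_pool, enqueues it on ring[0], and the f-stack lcore
 * answers on ring[1] (see process_msg_ring() and the handle_*_msg() helpers
 * below). A minimal client-side sketch, assuming ring_in/ring_out were
 * obtained with rte_ring_lookup() on the FF_MSG_RING_IN / FF_MSG_RING_OUT
 * names and the pool with rte_mempool_lookup(FF_MSG_POOL):
 *
 *     struct ff_msg *msg;
 *     if (rte_mempool_get(message_pool, (void **)&msg) == 0) {
 *         msg->msg_type = FF_TOP;
 *         rte_ring_enqueue(ring_in, msg);
 *         void *reply;
 *         while (rte_ring_dequeue(ring_out, &reply) != 0)
 *             ;  // poll until the stack answers
 *         rte_mempool_put(message_pool, reply);
 *     }
 */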

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;

struct ff_dpdk_if_context {
    void *sc;
    void *ifp;
    uint16_t port_id;
    struct ff_hw_features hw_features;
} __rte_cache_aligned;

static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

extern void ff_hardclock(void);

static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg) {
    ff_hardclock();
    ff_update_current_ts();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
    #define CHECK_INTERVAL 100 /* 100ms */
    #define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint8_t portid = ff_global_cfg.dpdk.port_cfgs[i].port_id;
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u "
                        "Mbps - %s\n", (int)portid,
                        (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

static int
init_lcore_conf(void)
{
    uint8_t nb_ports = rte_eth_dev_count();
    if (nb_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;
    lcore_conf.nb_procs = ff_global_cfg.dpdk.nb_procs;

    lcore_conf.proc_lcore = rte_zmalloc(NULL,
        sizeof(uint16_t) * lcore_conf.nb_procs, 0);
    if (lcore_conf.proc_lcore == NULL) {
        rte_exit(EXIT_FAILURE, "rte_zmalloc proc_lcore failed\n");
    }
    rte_memcpy(lcore_conf.proc_lcore, ff_global_cfg.dpdk.proc_lcore,
        sizeof(uint16_t) * lcore_conf.nb_procs);
    uint16_t proc_id;
    for (proc_id = 0; proc_id < lcore_conf.nb_procs; proc_id++) {
        uint16_t lcore_id = lcore_conf.proc_lcore[proc_id];
        if (!lcore_config[lcore_id].detected) {
            rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
        }
    }

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    /* Currently, each proc id maps 1:1 to an rx/tx queue id per port. */
    uint8_t port_id, enabled_ports = 0;
    for (port_id = 0; port_id < nb_ports; port_id++) {
        if (ff_global_cfg.dpdk.port_mask &&
            (ff_global_cfg.dpdk.port_mask & (1 << port_id)) == 0) {
            printf("\nSkipping disabled port %d\n", port_id);
            continue;
        }

        if (port_id >= ff_global_cfg.dpdk.nb_ports) {
            printf("\nSkipping non-configured port %d\n", port_id);
            break;
        }

        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = lcore_conf.proc_id;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = lcore_conf.proc_id;
        lcore_conf.pcap[port_id] = ff_global_cfg.dpdk.port_cfgs[enabled_ports].pcap;

        ff_global_cfg.dpdk.port_cfgs[enabled_ports].port_id = port_id;

        enabled_ports++;
    }

    ff_global_cfg.dpdk.nb_ports = enabled_ports;

    return 0;
}

static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;

    unsigned nb_mbuf = RTE_MAX(
        (nb_rx_queue*RX_QUEUE_SIZE          +
        nb_ports*nb_lcores*MAX_PKT_BURST    +
        nb_ports*nb_tx_queue*TX_QUEUE_SIZE  +
        nb_lcores*MEMPOOL_CACHE_SIZE +
        nb_ports*KNI_MBUF_MAX +
        nb_ports*KNI_QUEUE_SIZE +
        nb_lcores*nb_ports*ARP_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < lcore_conf.nb_procs; i++) {
        lcore_id = lcore_conf.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %u of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("created mbuf pool on socket %d\n", socketid);
        }
    }

    return 0;
}

static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL)
        return NULL;

    /* If the ring was already created, just attach to it. */
    if (likely((ring = rte_ring_lookup(name)) != NULL))
        return ring;

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        return rte_ring_create(name, count, socket_id, flags);
    } else {
        return rte_ring_lookup(name);
    }
}
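
/*
 * Primary/secondary convention: only the primary process may create DPDK
 * objects, secondaries attach to them by name. init_arp_ring() and
 * init_msg_ring() below rely on this helper so the same code path works in
 * both process roles.
 */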

static int
init_arp_ring(void)
{
    int i, j;
    char name_buf[RTE_RING_NAMESIZE];
    int nb_procs = ff_global_cfg.dpdk.nb_procs;
    int proc_id = ff_global_cfg.dpdk.proc_id;

    /* Allocate the arp ring pointer arrays according to the eth dev count. */
    int nb_ports = rte_eth_dev_count();
    for (i = 0; i < nb_procs; ++i) {
        snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_%d_%d",
            proc_id, i);

        arp_ring[i] = rte_zmalloc(name_buf,
            sizeof(struct rte_ring *) * nb_ports,
             RTE_CACHE_LINE_SIZE);
        if (arp_ring[i] == NULL) {
            rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                "failed\n", name_buf);
        }
    }

    unsigned socketid = lcore_conf.socket_id;

    /* Create the rings according to the ports actually being used. */
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[j].port_id;

        for (i = 0; i < nb_procs; ++i) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "arp_ring_%d_%d", i, port_id);
            arp_ring[i][port_id] = create_ring(name_buf, ARP_RING_SIZE,
                socketid, RING_F_SC_DEQ);

            if (arp_ring[i][port_id] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("created ring:%s, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(arp_ring[i][port_id]));
        }
    }

    return 0;
}

static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
}

static int
init_msg_ring(void)
{
    uint16_t i;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
           MSG_RING_SIZE * 2 * nb_procs,
           MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
           NULL, NULL, ff_msg_init, NULL,
           socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        snprintf(msg_ring[i].ring_name[1], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_OUT, i);

        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[0]);

        msg_ring[i].ring[1] = create_ring(msg_ring[i].ring_name[1],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[1] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[1]);
    }

    return 0;
}

static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[i].port_id;
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}

static void
set_rss_table(uint8_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}
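
/*
 * Worked example: with reta_size = 128 and nb_queues = 4 the indirection
 * table is filled round-robin as 0,1,2,3,0,1,2,3,..., so a packet whose RSS
 * hash is h lands on queue (h % 128) % 4. ff_rss_check() repeats exactly
 * this arithmetic in software to predict which process owns a given flow.
 */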

static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = rte_lcore_to_socket_id(rte_lcore_id());
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socketid];
    uint16_t i;

    for (i = 0; i < nb_ports; i++) {
        uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[i].port_id;

        struct rte_eth_dev_info dev_info;
        rte_eth_dev_info_get(port_id, &dev_info);

        if (nb_procs > dev_info.max_rx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                nb_procs,
                dev_info.max_rx_queues);
        }

        if (nb_procs > dev_info.max_tx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                nb_procs,
                dev_info.max_tx_queues);
        }

        struct ether_addr addr;
        rte_eth_macaddr_get(port_id, &addr);
        printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                   " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                (unsigned)port_id,
                addr.addr_bytes[0], addr.addr_bytes[1],
                addr.addr_bytes[2], addr.addr_bytes[3],
                addr.addr_bytes[4], addr.addr_bytes[5]);

        rte_memcpy(ff_global_cfg.dpdk.port_cfgs[i].mac,
            addr.addr_bytes, ETHER_ADDR_LEN);

        /* Set txq_flags: we need neither multi-mempool nor refcounting. */
        dev_info.default_txconf.txq_flags = ETH_TXQ_FLAGS_NOMULTMEMP |
            ETH_TXQ_FLAGS_NOREFCOUNT;

        /* Disable features that are not supported by port's HW */
        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMUDP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMTCP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMSCTP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) &&
            !(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_TSO)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
        }

        struct rte_eth_conf port_conf = {0};

        /* Set RSS mode */
        port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
        port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK;
        port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
        port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;

        /* Set Rx VLAN stripping */
        if (ff_global_cfg.dpdk.vlan_strip) {
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                port_conf.rxmode.hw_vlan_strip = 1;
            }
        }

        /* Enable HW CRC stripping */
        port_conf.rxmode.hw_strip_crc = 1;

        /* FIXME: Enable TCP LRO? */
        #if 0
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
            printf("LRO is supported\n");
            port_conf.rxmode.enable_lro = 1;
            ff_global_cfg.dpdk.port_cfgs[i].hw_features.rx_lro = 1;
        }
        #endif

        /* Set Rx checksum checking */
        if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
            printf("RX checksum offload supported\n");
            port_conf.rxmode.hw_ip_checksum = 1;
            ff_global_cfg.dpdk.port_cfgs[i].hw_features.rx_csum = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
            printf("TX ip checksum offload supported\n");
            ff_global_cfg.dpdk.port_cfgs[i].hw_features.tx_csum_ip = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            printf("TX TCP&UDP checksum offload supported\n");
            ff_global_cfg.dpdk.port_cfgs[i].hw_features.tx_csum_l4 = 1;
        }

        if (ff_global_cfg.dpdk.tso) {
            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                printf("TSO is supported\n");
                ff_global_cfg.dpdk.port_cfgs[i].hw_features.tx_tso = 1;
            }
        } else {
            printf("TSO is disabled\n");
        }

        if (dev_info.reta_size) {
            /* The RETA size must be a power of 2. */
            assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

            rss_reta_size[port_id] = dev_info.reta_size;
            printf("port[%d]: rss table size: %d\n", port_id,
                dev_info.reta_size);
        }

        if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
            continue;
        }

        /* Currently, each proc id maps 1:1 to a queue id per port. */
        int ret = rte_eth_dev_configure(port_id, nb_procs, nb_procs, &port_conf);
        if (ret != 0) {
            return ret;
        }

        uint16_t q;
        for (q = 0; q < nb_procs; q++) {
            ret = rte_eth_tx_queue_setup(port_id, q, TX_QUEUE_SIZE,
                socketid, &dev_info.default_txconf);
            if (ret < 0) {
                return ret;
            }

            ret = rte_eth_rx_queue_setup(port_id, q, RX_QUEUE_SIZE,
                socketid, &dev_info.default_rxconf, mbuf_pool);
            if (ret < 0) {
                return ret;
            }
        }

        ret = rte_eth_dev_start(port_id);
        if (ret < 0) {
            return ret;
        }

        if (nb_procs > 1) {
            /* Set the HW RSS hash function to Toeplitz; a return value of 0
             * from rte_eth_dev_filter_supported() means the filter type is
             * supported. */
            if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                struct rte_eth_hash_filter_info info = {0};
                info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                    RTE_ETH_FILTER_SET, &info) < 0) {
                    rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                        port_id);
                }
            }

            set_rss_table(port_id, dev_info.reta_size, nb_procs);
        }

        /* Enable RX in promiscuous mode for the Ethernet device. */
        if (ff_global_cfg.dpdk.promiscuous) {
            rte_eth_promiscuous_enable(port_id);
            ret = rte_eth_promiscuous_get(port_id);
            if (ret == 1) {
                printf("set port %u to promiscuous mode ok\n", port_id);
            } else {
                printf("set port %u to promiscuous mode error\n", port_id);
            }
        }

        /* Enable pcap dump */
        if (ff_global_cfg.dpdk.port_cfgs[i].pcap) {
            ff_enable_pcap(ff_global_cfg.dpdk.port_cfgs[i].pcap);
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}
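
/*
 * Worked example (illustrative numbers): with freebsd.hz = 100, intrs is
 * 1000 / 100 = 10 ms per tick; on a machine whose timer frequency is
 * 2.5 GHz, tsc = ceil(2.5e9 / 1000) * 10 = 25,000,000 cycles, i.e.
 * ff_hardclock_job() fires every 10 ms of wall time.
 */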

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("invalid param: num_procs[%d] or proc_id[%d]\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    init_lcore_conf();

    init_mem_pool();

    init_arp_ring();

    init_msg_ring();

    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

    return 0;
}

static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    /*
     * FIXME: should we save pkt->vlan_tci
     * if (pkt->ol_flags & PKT_RX_VLAN_PKT)
     */

    void *data = rte_pktmbuf_mtod(pkt, void*);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    /* Walk the segment chain, appending each segment to the host mbuf. */
    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void*);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < sizeof(struct ether_hdr))
        return FILTER_UNKNOWN;

    const struct ether_hdr *hdr;
    hdr = (const struct ether_hdr *)data;

    if (ntohs(hdr->ether_type) == ETHER_TYPE_ARP)
        return FILTER_ARP;

    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ntohs(hdr->ether_type) != ETHER_TYPE_IPv4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data + sizeof(struct ether_hdr),
        len - sizeof(struct ether_hdr));
}
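
/*
 * Dispatch summary for process_packets() below: ARP frames are cloned to the
 * other processes' arp_rings (and to KNI on the primary) before entering the
 * stack, so every process learns the neighbor entry; FILTER_KNI and
 * FILTER_UNKNOWN frames go either to the kernel via KNI or to the stack,
 * depending on the configured kni_accept policy.
 */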

static inline void
process_packets(uint8_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(qconf->pcap[port_id] != NULL)) {
            ff_dump_packets(qconf->pcap[port_id], rtem);
        }

        void *data = rte_pktmbuf_mtod(rtem, void*);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        enum FilterReturn filter = protocol_filter(data, len);
        if (filter == FILTER_ARP) {
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (pkts_from_ring == 0) {
                /* Broadcast the ARP frame to every other process's ring. */
                uint16_t j;
                for (j = 0; j < qconf->nb_procs; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        socket_id = rte_lcore_to_socket_id(qconf->proc_lcore[j]);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = rte_pktmbuf_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(arp_ring[j][port_id], mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = rte_pktmbuf_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }

            ff_veth_input(ctx, rtem);
        } else if (enable_kni && ((filter == FILTER_KNI && kni_accept) ||
            (filter == FILTER_UNKNOWN && !kni_accept)) ) {
            ff_kni_enqueue(port_id, rtem);
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_arp_ring(uint8_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* Read packets from the ring buffer and process them. */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(arp_ring[queue_id][port_id],
        (void **)pkts_burst, MAX_PKT_BURST);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg, uint16_t proc_id)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }

    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

static inline void
handle_ioctl_msg(struct ff_msg *msg, uint16_t proc_id)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_DGRAM, 0);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }

    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

static inline void
handle_route_msg(struct ff_msg *msg, uint16_t proc_id)
{
    msg->result = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);

    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

static struct ff_top_args ff_status;
static inline void
handle_top_msg(struct ff_msg *msg, uint16_t proc_id)
{
    msg->top = ff_status;
    msg->result = 0;

    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

static inline void
handle_default_msg(struct ff_msg *msg, uint16_t proc_id)
{
    msg->result = EINVAL;
    rte_ring_enqueue(msg_ring[proc_id].ring[1], msg);
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg, proc_id);
            break;
        case FF_IOCTL:
            handle_ioctl_msg(msg, proc_id);
            break;
        case FF_ROUTE:
            handle_route_msg(msg, proc_id);
            break;
        case FF_TOP:
            handle_top_msg(msg, proc_id);
            break;
        default:
            handle_default_msg(msg, proc_id);
            break;
    }
}

static inline int
process_msg_ring(uint16_t proc_id)
{
    void *msg;
    int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);

    if (unlikely(ret == 0)) {
        handle_msg((struct ff_msg *)msg, proc_id);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(qconf->pcap[port] != NULL)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(qconf->pcap[port], m_table[i]);
        }
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    if (unlikely(ret < n)) {
        /* The NIC queue is full; drop whatever could not be sent. */
        do {
            rte_pktmbuf_free(m_table[ret]);
        } while (++ret < n);
    }

    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}
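
/*
 * Packets therefore leave the port either when a queue fills up to
 * MAX_PKT_BURST here, or when the periodic drain in main_loop() flushes a
 * partially filled queue after BURST_TX_DRAIN_US. Callers never block on
 * the NIC directly.
 */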

int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        void *data = rte_pktmbuf_mtod(cur, void*);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        prev = cur;

        cur->data_len = len;
        off += len;
        total -= len;
        head->nb_segs++;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    if (offload.ip_csum) {
        head->ol_flags |= PKT_TX_IP_CKSUM;
        head->l2_len = sizeof(struct ether_hdr);
        head->l3_len = sizeof(struct ipv4_hdr);
    }

    if (ctx->hw_features.tx_csum_l4) {
        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = sizeof(struct ether_hdr);
            head->l3_len = sizeof(struct ipv4_hdr);
        }

        if (offload.tso_seg_size) {
            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = sizeof(struct tcp_hdr);
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = sizeof(struct ether_hdr);
            head->l3_len = sizeof(struct ipv4_hdr);
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    unsigned lcore_id;
    uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc;
    int i, j, nb_rx, idle;
    uint8_t port_id, queue_id;
    struct lcore_conf *qconf;
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
        US_PER_S * BURST_TX_DRAIN_US;
    struct ff_dpdk_if_context *ctx;

    prev_tsc = 0;
    usch_tsc = 0;

    lcore_id = rte_lcore_id();
    qconf = &lcore_conf;

    if (qconf->nb_rx_queue == 0) {
        printf("lcore %u has nothing to do\n", lcore_id);
        return 0;
    }

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        idle = 1;
        sys_tsc = 0;
        usr_tsc = 0;

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc > drain_tsc)) {
            /*
             * This could be optimized (use queueid instead of
             * portid), but it is not called so often
             */
            for (port_id = 0; port_id < RTE_MAX_ETHPORTS; port_id++) {
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;

                idle = 0;
                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packets from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }

            process_arp_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            idle = 0;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                        pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                        j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        process_msg_ring(qconf->proc_id);

        div_tsc = rte_rdtsc();

        if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc > drain_tsc))) {
            usch_tsc = cur_tsc;
            lr->loop(lr->arg);
        }

        end_tsc = rte_rdtsc();

        if (usch_tsc == cur_tsc) {
            usr_tsc = end_tsc - div_tsc;
        }

        if (!idle) {
            sys_tsc = div_tsc - cur_tsc;
            ff_status.sys_tsc += sys_tsc;
        }

        ff_status.usr_tsc += usr_tsc;
        ff_status.work_tsc += end_tsc - cur_tsc;
        ff_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;

        ff_status.loops++;
    }
}

int
ff_dpdk_if_up(void) {
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[i].port_id;
        veth_ctx[port_id] = ff_veth_attach(ff_global_cfg.dpdk.port_cfgs + i);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed\n");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg) {
    struct loop_routine *lr = rte_malloc(NULL,
        sizeof(struct loop_routine), 0);
    if (lr == NULL) {
        rte_exit(EXIT_FAILURE, "rte_malloc loop_routine failed\n");
    }
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    rte_free(lr);
}

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1<<(7-b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i+4] & (1<<(7-b))))
                v |= 1;
        }
    }
    return (hash);
}
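
/*
 * The loop keeps v as a sliding 32-bit window over the key: while input bit
 * k (MSB first) is processed, v holds key bits k..k+31, and every set input
 * bit XORs the current window into the hash. For the 12-byte IPv4 4-tuple
 * hashed by ff_rss_check() below, only the first 12 + 4 = 16 bytes of the
 * 40-byte key are ever consumed.
 */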

int
ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
    uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;

    if (qconf->nb_procs == 1) {
        return 1;
    }

    struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
    uint16_t reta_size = rss_reta_size[ctx->port_id];

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
        default_rsskey_40bytes, datalen, data);

    return ((hash & (reta_size - 1)) % qconf->nb_procs) == qconf->proc_id;
}
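
/*
 * Typical use, as a sketch only: when the stack picks a local ephemeral
 * port, it can probe candidates until the resulting 4-tuple hashes back to
 * this process's own queue, so replies arrive on the lcore that owns the
 * connection. Names here (softc, saddr, daddr, dport, first, last) are
 * illustrative, and addresses/ports are assumed to be in network byte order:
 *
 *     uint16_t port;
 *     for (port = first; port <= last; port++) {
 *         if (ff_rss_check(softc, saddr, daddr, htons(port), dport))
 *             break;  // this tuple will be steered to our queue
 *     }
 */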