/* xref: /f-stack/lib/ff_dpdk_if.c (revision 2dfcd880) */
/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <ctype.h>
#include <inttypes.h>
#include <arpa/inet.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"

#define MEMPOOL_CACHE_SIZE 256

#define ARP_RING_SIZE 2048

/*
 * Configurable number of RX/TX ring descriptors
 */
#define RX_QUEUE_SIZE 512
#define TX_QUEUE_SIZE 256

#define MAX_PKT_BURST 32
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

/*
 * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send.
 */
#define MAX_TX_BURST    (MAX_PKT_BURST / 2)

#define NB_SOCKETS 8

/* Configure how many packets ahead to prefetch, when reading packets */
#define PREFETCH_OFFSET    3

#define MAX_RX_QUEUE_PER_LCORE 16
#define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
#define MAX_RX_QUEUE_PER_PORT 128

#define BITS_PER_HEX 4

static int enable_kni;
static int kni_accept;

static struct rte_timer freebsd_clock;

/* Default RSS hash key, as used by the Mellanox Linux driver. */
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

static struct rte_eth_conf default_port_conf = {
    .rxmode = {
        .mq_mode = ETH_MQ_RX_RSS,
        .max_rx_pkt_len = ETHER_MAX_LEN,
        .split_hdr_size = 0, /**< hdr buf size */
        .header_split   = 0, /**< Header Split disabled */
        .hw_ip_checksum = 0, /**< IP checksum offload disabled */
        .hw_vlan_filter = 0, /**< VLAN filtering disabled */
        .hw_vlan_strip  = 0, /**< VLAN strip disabled. */
        .hw_vlan_extend = 0, /**< Extended VLAN disabled. */
        .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
        .hw_strip_crc   = 0, /**< CRC stripped by hardware */
        .enable_lro     = 0, /**< LRO disabled */
    },
    .rx_adv_conf = {
        .rss_conf = {
            .rss_key = default_rsskey_40bytes,
            .rss_key_len = 40,
            .rss_hf = ETH_RSS_PROTO_MASK,
        },
    },
    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
};

struct mbuf_table {
    uint16_t len;
    struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct lcore_rx_queue {
    uint8_t port_id;
    uint8_t queue_id;
} __rte_cache_aligned;

struct lcore_conf {
    uint16_t proc_id;
    uint16_t nb_procs;
    uint16_t socket_id;
    uint16_t nb_rx_queue;
    uint16_t *lcore_proc;
    struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
    uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
    struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
    char *pcap[RTE_MAX_ETHPORTS];
} __rte_cache_aligned;

static struct lcore_conf lcore_conf;

static struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **arp_ring[RTE_MAX_LCORE];

struct ff_dpdk_if_context {
    void *sc;
    void *ifp;
    uint16_t port_id;
    struct ff_hw_features hw_features;
} __rte_cache_aligned;

static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

extern void ff_hardclock(void);

static void
freebsd_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg)
{
    ff_hardclock();
}

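/*
 * Register a network interface with the DPDK layer: bind the FreeBSD
 * softc/ifp pair to a DPDK port id and record the port's hardware
 * offload features in a per-interface context.
 */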
struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

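/*
 * Poll link state on all configured ports until every link is up or a
 * timeout expires; this follows the link-check pattern used by DPDK's
 * sample applications.
 */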
static void
check_all_ports_link_status(void)
{
    #define CHECK_INTERVAL 100 /* 100ms */
    #define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint8_t portid = ff_global_cfg.dpdk.port_cfgs[i].port_id;
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u "
                        "Mbps - %s\n", (int)portid,
                        (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

254 
255 static int
256 xdigit2val(unsigned char c)
257 {
258     int val;
259 
260     if (isdigit(c))
261         val = c - '0';
262     else if (isupper(c))
263         val = c - 'A' + 10;
264     else
265         val = c - 'a' + 10;
266     return val;
267 }
268 
static int
parse_lcore_mask(const char *coremask, uint16_t *lcore_proc,
    uint16_t nb_procs)
{
    int i, j, idx = 0;
    unsigned count = 0;
    char c;
    int val;

    if (coremask == NULL)
        return -1;

    /* Skip leading/trailing blanks, and a leading 0x/0X if present. */
    while (isblank(*coremask))
        coremask++;
    if (coremask[0] == '0' && ((coremask[1] == 'x')
        || (coremask[1] == 'X')))
        coremask += 2;

    i = strlen(coremask);
    while ((i > 0) && isblank(coremask[i - 1]))
        i--;

    if (i == 0)
        return -1;

    for (i = i - 1; i >= 0 && idx < RTE_MAX_LCORE && count < nb_procs; i--) {
        c = coremask[i];
        if (isxdigit(c) == 0) {
            return -1;
        }
        val = xdigit2val(c);
        for (j = 0; j < BITS_PER_HEX && idx < RTE_MAX_LCORE && count < nb_procs;
            j++, idx++) {
            if ((1 << j) & val) {
                if (!lcore_config[idx].detected) {
                    RTE_LOG(ERR, EAL, "lcore %u unavailable\n", idx);
                    return -1;
                }
                lcore_proc[count] = idx;
                count++;
            }
        }
    }

    for (; i >= 0; i--)
        if (coremask[i] != '0')
            return -1;

    if (count < nb_procs)
        return -1;

    return 0;
}

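/*
 * Build this process's lcore_conf from the global config: resolve the
 * proc id against the lcore mask, and map the proc id 1:1 to an RX/TX
 * queue id on every enabled port.
 */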
static int
init_lcore_conf(void)
{
    uint8_t nb_ports = rte_eth_dev_count();
    if (nb_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;
    lcore_conf.nb_procs = ff_global_cfg.dpdk.nb_procs;
    lcore_conf.lcore_proc = rte_zmalloc(NULL,
        sizeof(uint16_t)*lcore_conf.nb_procs, 0);
    if (lcore_conf.lcore_proc == NULL) {
        rte_exit(EXIT_FAILURE, "rte_zmalloc lcore_proc failed\n");
    }

    int ret = parse_lcore_mask(ff_global_cfg.dpdk.lcore_mask,
        lcore_conf.lcore_proc, lcore_conf.nb_procs);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "parse_lcore_mask failed:%s\n",
            ff_global_cfg.dpdk.lcore_mask);
    }

    uint16_t socket_id = 0;
    if (ff_global_cfg.dpdk.numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    /* Currently, proc id maps 1:1 to rx/tx queue id per port. */
    uint8_t port_id, enabled_ports = 0;
    for (port_id = 0; port_id < nb_ports; port_id++) {
        if (ff_global_cfg.dpdk.port_mask &&
            (ff_global_cfg.dpdk.port_mask & (1 << port_id)) == 0) {
            printf("\nSkipping disabled port %d\n", port_id);
            continue;
        }

        if (port_id >= ff_global_cfg.dpdk.nb_ports) {
            printf("\nSkipping non-configured port %d\n", port_id);
            break;
        }

        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = lcore_conf.proc_id;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = lcore_conf.proc_id;
        lcore_conf.pcap[port_id] = ff_global_cfg.dpdk.port_cfgs[enabled_ports].pcap;

        ff_global_cfg.dpdk.port_cfgs[enabled_ports].port_id = port_id;

        enabled_ports++;
    }

    ff_global_cfg.dpdk.nb_ports = enabled_ports;

    return 0;
}

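/*
 * Create (or, in a secondary process, look up) one pktmbuf pool per NUMA
 * socket in use. The pool is sized for the worst case: all RX descriptors,
 * one in-flight TX burst per port per lcore, all TX descriptors, plus the
 * per-lcore mempool cache, with a floor of 8192 mbufs.
 */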
static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;

    unsigned nb_mbuf = RTE_MAX(
        (nb_rx_queue*RX_QUEUE_SIZE          +
        nb_ports*nb_lcores*MAX_PKT_BURST    +
        nb_ports*nb_tx_queue*TX_QUEUE_SIZE  +
        nb_lcores*MEMPOOL_CACHE_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];
    int numa_on = ff_global_cfg.dpdk.numa_on;

    for (i = 0; i < lcore_conf.nb_procs; i++) {
        lcore_id = lcore_conf.lcore_proc[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %u of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }
    }

    return 0;
}

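/*
 * Because RSS steers an ARP reply to only one RX queue, each process
 * keeps a ring per (proc, port) pair; the receiving process clones ARP
 * packets into every other process's ring so all of them can update
 * their ARP tables.
 */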
static int
init_arp_ring(void)
{
    int i, j;
    char name_buf[RTE_RING_NAMESIZE];
    int nb_procs = ff_global_cfg.dpdk.nb_procs;
    int proc_id = ff_global_cfg.dpdk.proc_id;

    /* Allocate the per-proc ring pointer arrays, sized by probed port count. */
    int nb_ports = rte_eth_dev_count();
    for (i = 0; i < nb_procs; ++i) {
        snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_%d_%d",
            proc_id, i);

        arp_ring[i] = rte_zmalloc(name_buf,
            sizeof(struct rte_ring *) * nb_ports,
            RTE_CACHE_LINE_SIZE);
        if (arp_ring[i] == NULL) {
            rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                "failed\n", name_buf);
        }
    }

    unsigned socketid = lcore_conf.socket_id;

    /* Create one ring per proc for each port actually being used. */
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (i = 0; i < nb_ports; i++) {
        uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[i].port_id;

        for (j = 0; j < nb_procs; ++j) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_%d_%d", j, port_id);
            if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
                arp_ring[j][port_id] = rte_ring_create(name_buf,
                    ARP_RING_SIZE, socketid,
                    RING_F_SC_DEQ);
            } else {
                arp_ring[j][port_id] = rte_ring_lookup(name_buf);
            }

            if (arp_ring[j][port_id] == NULL)
                rte_panic("create arp ring:%s failed!\n", name_buf);

            if (rte_ring_lookup(name_buf) != arp_ring[j][port_id])
                rte_panic("lookup arp ring:%s failed!\n", name_buf);

            printf("create arp ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(arp_ring[j][port_id]));
        }
    }

    return 0;
}

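/*
 * Set up the KNI (kernel NIC interface) path: packets the F-Stack
 * stack should not consume, per the configured kni.method and port
 * filters, are handed to the kernel through per-port KNI devices.
 */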
static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[i].port_id;
        ff_kni_alloc(port_id, socket_id, mbuf_pool);
    }

    return 0;
}

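/*
 * Configure and start every enabled port: validate queue counts against
 * the device, probe TX/RX offload capabilities and record them in the
 * per-port hw_features, then (in the primary process only) configure
 * the device, set up one RX/TX queue pair per proc, and start it.
 */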
static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = rte_lcore_to_socket_id(rte_lcore_id());
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socketid];
    uint16_t i;

    for (i = 0; i < nb_ports; i++) {
        uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[i].port_id;

        struct rte_eth_dev_info dev_info;
        rte_eth_dev_info_get(port_id, &dev_info);

        if (nb_procs > dev_info.max_rx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                nb_procs,
                dev_info.max_rx_queues);
        }

        if (nb_procs > dev_info.max_tx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                nb_procs,
                dev_info.max_tx_queues);
        }

        struct ether_addr addr;
        rte_eth_macaddr_get(port_id, &addr);
        printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                   " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                (unsigned)port_id,
                addr.addr_bytes[0], addr.addr_bytes[1],
                addr.addr_bytes[2], addr.addr_bytes[3],
                addr.addr_bytes[4], addr.addr_bytes[5]);

        rte_memcpy(ff_global_cfg.dpdk.port_cfgs[i].mac,
            addr.addr_bytes, ETHER_ADDR_LEN);

        /* Set txq_flags - we need neither multi-mempool nor refcnt */
        dev_info.default_txconf.txq_flags = ETH_TXQ_FLAGS_NOMULTMEMP |
            ETH_TXQ_FLAGS_NOREFCOUNT;

        /* Disable features that are not supported by port's HW */
        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMUDP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMTCP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMSCTP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) &&
            !(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_TSO)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
        }

        struct rte_eth_conf port_conf = {0};

        /* Set RSS mode */
        port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
        port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK;
        port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
        port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;

        /* Set Rx VLAN stripping */
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
            port_conf.rxmode.hw_vlan_strip = 1;
        }

        /* Enable HW CRC stripping */
        port_conf.rxmode.hw_strip_crc = 1;

        /* FIXME: enable TCP LRO? */
        #if 0
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
            printf("LRO is supported\n");
            port_conf.rxmode.enable_lro = 1;
            ff_global_cfg.dpdk.port_cfgs[i].hw_features.rx_lro = 1;
        }
        #endif

        /* Set Rx checksum checking */
        if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
            printf("RX checksum offload supported\n");
            port_conf.rxmode.hw_ip_checksum = 1;
            ff_global_cfg.dpdk.port_cfgs[i].hw_features.rx_csum = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
            printf("TX ip checksum offload supported\n");
            ff_global_cfg.dpdk.port_cfgs[i].hw_features.tx_csum_ip = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            printf("TX TCP&UDP checksum offload supported\n");
            ff_global_cfg.dpdk.port_cfgs[i].hw_features.tx_csum_l4 = 1;
        }

        if (ff_global_cfg.dpdk.tso) {
            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                printf("TSO is supported\n");
                ff_global_cfg.dpdk.port_cfgs[i].hw_features.tx_tso = 1;
            }
        } else {
            printf("TSO is disabled\n");
        }

        /* Device configuration and queue setup are done by the primary only. */
        if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
            return 0;
        }

        /* Currently, proc id maps 1:1 to queue id per port. */
        int ret = rte_eth_dev_configure(port_id, nb_procs, nb_procs, &port_conf);
        if (ret != 0) {
            return ret;
        }

        uint16_t q;
        for (q = 0; q < nb_procs; q++) {
            ret = rte_eth_tx_queue_setup(port_id, q, TX_QUEUE_SIZE,
                socketid, &dev_info.default_txconf);
            if (ret < 0) {
                return ret;
            }

            ret = rte_eth_rx_queue_setup(port_id, q, RX_QUEUE_SIZE,
                socketid, &dev_info.default_rxconf, mbuf_pool);
            if (ret < 0) {
                return ret;
            }
        }

        ret = rte_eth_dev_start(port_id);
        if (ret < 0) {
            return ret;
        }

        /* Enable RX in promiscuous mode for the Ethernet device. */
        if (ff_global_cfg.dpdk.promiscuous) {
            rte_eth_promiscuous_enable(port_id);
            ret = rte_eth_promiscuous_get(port_id);
            if (ret == 1) {
                printf("set port %u to promiscuous mode ok\n", port_id);
            } else {
                printf("set port %u to promiscuous mode error\n", port_id);
            }
        }

        /* Enable pcap dump */
        if (ff_global_cfg.dpdk.port_cfgs[i].pcap) {
            ff_enable_pcap(ff_global_cfg.dpdk.port_cfgs[i].pcap);
        }
    }

    return 0;
}

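/*
 * Drive the FreeBSD stack's hardclock() from a DPDK periodic timer.
 * With freebsd.hz ticks per second, the period is MS_PER_S/hz
 * milliseconds, converted below to TSC cycles (rounded up per
 * millisecond). E.g. hz=100 gives a 10 ms period.
 */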
static int
init_freebsd_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &freebsd_hardclock_job, NULL);

    return 0;
}

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    init_lcore_conf();

    init_mem_pool();

    init_arp_ring();

    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    check_all_ports_link_status();

    init_freebsd_clock();

    return 0;
}

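/*
 * Hand a received rte_mbuf chain to the FreeBSD stack. Each DPDK
 * segment is wrapped in a FreeBSD mbuf that references the same data
 * (ff_mbuf_gethdr/ff_mbuf_get), so the payload is not copied.
 */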
static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    /*
     * FIXME: should we save pkt->vlan_tci
     * if (pkt->ol_flags & PKT_RX_VLAN_PKT)
     */

    void *data = rte_pktmbuf_mtod(pkt, void*);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    pkt = pkt->next;
    void *prev = hdr;
    while (pkt != NULL) {
        data = rte_pktmbuf_mtod(pkt, void*);
        len = rte_pktmbuf_data_len(pkt);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            return;
        }
        pkt = pkt->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

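/*
 * Classify a frame by ethertype: ARP frames are flagged for fan-out and
 * KNI, and when KNI is enabled, IPv4 frames are further inspected by
 * ff_kni_proto_filter (e.g. against the configured tcp/udp ports).
 */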
static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < sizeof(struct ether_hdr))
        return FILTER_UNKNOWN;

    const struct ether_hdr *hdr;
    hdr = (const struct ether_hdr *)data;

    if (ntohs(hdr->ether_type) == ETHER_TYPE_ARP)
        return FILTER_ARP;

    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ntohs(hdr->ether_type) != ETHER_TYPE_IPv4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data + sizeof(struct ether_hdr),
        len - sizeof(struct ether_hdr));
}

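/*
 * Deliver a burst of packets. ARP packets are cloned to the other
 * processes' rings (unless they already came from a ring) and to KNI,
 * since every process needs to see them; everything else goes either
 * to KNI or to the FreeBSD stack, depending on the filter verdict and
 * the configured kni_accept policy.
 */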
static inline void
process_packets(uint8_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(qconf->pcap[port_id] != NULL)) {
            ff_dump_packets(qconf->pcap[port_id], rtem);
        }

        void *data = rte_pktmbuf_mtod(rtem, void*);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        enum FilterReturn filter = protocol_filter(data, len);
        if (filter == FILTER_ARP) {
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (pkts_from_ring == 0) {
                uint16_t j;
                for (j = 0; j < qconf->nb_procs; ++j) {
                    if (j == queue_id)
                        continue;

                    mbuf_pool = pktmbuf_pool[rte_lcore_to_socket_id(qconf->lcore_proc[j])];
                    mbuf_clone = rte_pktmbuf_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(arp_ring[j][port_id], mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = rte_pktmbuf_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    /* Enqueue the clone; rtem itself still goes to the stack. */
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }

            ff_veth_input(ctx, rtem);
        } else if (enable_kni && ((filter == FILTER_KNI && kni_accept) ||
            (filter == FILTER_UNKNOWN && !kni_accept))) {
            ff_kni_enqueue(port_id, rtem);
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_arp_ring(uint8_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* Dequeue ARP packets cloned into our ring and process them. */
    uint16_t nb_rx;
    nb_rx = rte_ring_dequeue_burst(arp_ring[queue_id][port_id],
        (void **)pkts_burst, MAX_PKT_BURST);

    if (nb_rx > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rx, ctx, 1);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(qconf->pcap[port] != NULL)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(qconf->pcap[port], m_table[i]);
        }
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
        } while (++ret < n);
    }

    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}

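/*
 * Transmit a packet coming from the FreeBSD stack: copy the BSD mbuf
 * chain into a freshly allocated rte_mbuf chain, translate the stack's
 * offload requests (IP/TCP/UDP checksum, TSO) into ol_flags, then hand
 * the chain to the buffered TX path.
 */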
int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        void *data = rte_pktmbuf_mtod(cur, void*);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        prev = cur;

        cur->data_len = len;
        off += len;
        total -= len;
        head->nb_segs++;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    if (offload.ip_csum) {
        head->ol_flags |= PKT_TX_IP_CKSUM;
        head->l2_len = sizeof(struct ether_hdr);
        head->l3_len = sizeof(struct ipv4_hdr);
    }

    if (ctx->hw_features.tx_csum_l4) {
        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = sizeof(struct ether_hdr);
            head->l3_len = sizeof(struct ipv4_hdr);
        }

        if (offload.tso_seg_size) {
            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = sizeof(struct tcp_hdr);
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = sizeof(struct ether_hdr);
            head->l3_len = sizeof(struct ipv4_hdr);
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

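/*
 * Per-lcore event loop: run expired timers (the FreeBSD hardclock),
 * periodically drain buffered TX queues, poll KNI and the ARP ring,
 * then receive a burst from each assigned RX queue and process it with
 * software prefetching, finally invoking the user's loop callback.
 */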
static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    unsigned lcore_id;
    uint64_t prev_tsc, diff_tsc, cur_tsc;
    int i, j, nb_rx;
    uint8_t port_id, queue_id;
    struct lcore_conf *qconf;
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
        US_PER_S * BURST_TX_DRAIN_US;
    struct ff_dpdk_if_context *ctx;

    prev_tsc = 0;

    lcore_id = rte_lcore_id();
    qconf = &lcore_conf;

    if (qconf->nb_rx_queue == 0) {
        printf("lcore %u has nothing to do\n", lcore_id);
        return 0;
    }

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc > drain_tsc)) {
            /*
             * This could be optimized (use queueid instead of
             * portid), but it is not called so often
             */
            for (port_id = 0; port_id < RTE_MAX_ETHPORTS; port_id++) {
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;
                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packet from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }

            process_arp_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                        pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                        j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        if (likely(lr->loop != NULL)) {
            lr->loop(lr->arg);
        }
    }
}

int
ff_dpdk_if_up(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[i].port_id;
        veth_ctx[port_id] = ff_veth_attach(ff_global_cfg.dpdk.port_cfgs + i);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed\n");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg)
{
    struct loop_routine *lr = malloc(sizeof(struct loop_routine));
    if (lr == NULL) {
        rte_exit(EXIT_FAILURE, "malloc loop_routine failed\n");
    }
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    free(lr);
}

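/*
 * Typical call sequence for an application (a sketch; ff_init and
 * my_loop are application-provided and hypothetical here):
 *
 *     ff_init(...);                 // loads ff_global_cfg
 *     ff_dpdk_init(argc, argv);     // EAL, pools, rings, ports
 *     ff_dpdk_if_up();              // attach veth contexts
 *     ff_dpdk_run(my_loop, NULL);   // never returns in practice
 */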
void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

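/*
 * Software Toeplitz hash over the RSS key; this mirrors the classic
 * FreeBSD implementation, processing the data bit by bit against a
 * sliding 32-bit window of the key.
 */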
static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1<<(7-b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i+4] & (1<<(7-b))))
                v |= 1;
        }
    }
    return (hash);
}

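/*
 * Check whether a 4-tuple would be RSS-steered to this process's queue,
 * by recomputing the Toeplitz hash over (saddr, daddr, sport, dport)
 * with the same key the ports were configured with. The stack can use
 * this when choosing local ports so its connections stay on this lcore.
 */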
int
ff_rss_check(uint32_t saddr, uint32_t daddr, uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;

    if (qconf->nb_procs == 1) {
        return 1;
    }

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
        default_rsskey_40bytes, datalen, data);

    return (hash % qconf->nb_procs) == qconf->proc_id;
}