/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <ctype.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <sys/types.h>
#include <arpa/inet.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"

#define MEMPOOL_CACHE_SIZE 256

#define ARP_RING_SIZE 2048

/*
 * Configurable number of RX/TX ring descriptors
 */
#define RX_QUEUE_SIZE 512
#define TX_QUEUE_SIZE 256

#define MAX_PKT_BURST 32
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

/*
 * Try to avoid TX buffering if we have at least MAX_TX_BURST packets to send.
 */
#define MAX_TX_BURST    (MAX_PKT_BURST / 2)

#define NB_SOCKETS 8

/* Configure how many packets ahead to prefetch, when reading packets */
#define PREFETCH_OFFSET    3

#define MAX_RX_QUEUE_PER_LCORE 16
#define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
#define MAX_RX_QUEUE_PER_PORT 128

#define BITS_PER_HEX 4

static int enable_kni;
static int kni_accept;

static struct rte_timer freebsd_clock;

/* RSS hash key, the same one used by the Mellanox Linux driver. */
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};
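/*
 * Note: this key is programmed into every port's RSS configuration and is
 * also fed to the software Toeplitz hash in ff_rss_check() below, so
 * hardware queue selection and the software check stay consistent.
 */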

static struct rte_eth_conf default_port_conf = {
    .rxmode = {
        .mq_mode = ETH_MQ_RX_RSS,
        .max_rx_pkt_len = ETHER_MAX_LEN,
        .split_hdr_size = 0, /**< hdr buf size */
        .header_split   = 0, /**< Header Split disabled */
        .hw_ip_checksum = 0, /**< IP checksum offload disabled */
        .hw_vlan_filter = 0, /**< VLAN filtering disabled */
        .hw_vlan_strip  = 0, /**< VLAN strip disabled. */
        .hw_vlan_extend = 0, /**< Extended VLAN disabled. */
        .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
        .hw_strip_crc   = 0, /**< CRC stripping by hardware disabled */
        .enable_lro     = 0, /**< LRO disabled */
    },
    .rx_adv_conf = {
        .rss_conf = {
            .rss_key = default_rsskey_40bytes,
            .rss_key_len = 40,
            .rss_hf = ETH_RSS_PROTO_MASK,
        },
    },
    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
};

struct mbuf_table {
    uint16_t len;
    struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct lcore_rx_queue {
    uint8_t port_id;
    uint8_t queue_id;
} __rte_cache_aligned;

struct lcore_conf {
    uint16_t proc_id;
    uint16_t nb_procs;
    uint16_t socket_id;
    uint16_t nb_rx_queue;
    uint16_t *lcore_proc;
    struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
    uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
    struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
    char *pcap[RTE_MAX_ETHPORTS];
} __rte_cache_aligned;

static struct lcore_conf lcore_conf;

static struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static struct rte_ring **arp_ring[RTE_MAX_LCORE];

struct ff_dpdk_if_context {
    void *sc;
    void *ifp;
    uint16_t port_id;
    struct ff_hw_features hw_features;
} __rte_cache_aligned;

static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

extern void ff_hardclock(void);

static void
freebsd_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg)
{
    ff_hardclock();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

static void
check_all_ports_link_status(void)
{
    #define CHECK_INTERVAL 100 /* 100ms */
    #define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint8_t portid = ff_global_cfg.dpdk.port_cfgs[i].port_id;
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u "
                        "Mbps - %s\n", (int)portid,
                        (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

static int
xdigit2val(unsigned char c)
{
    int val;

    if (isdigit(c))
        val = c - '0';
    else if (isupper(c))
        val = c - 'A' + 10;
    else
        val = c - 'a' + 10;
    return val;
}

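/*
 * Parse a hexadecimal core mask (e.g. "f" or "0xf0") into an array of
 * lcore ids, least-significant bit first. For example, mask "f" with
 * nb_procs == 4 yields lcore_proc = {0, 1, 2, 3}. Returns 0 on success,
 * -1 on a malformed mask or one that selects fewer than nb_procs cores.
 */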
static int
parse_lcore_mask(const char *coremask, uint16_t *lcore_proc,
    uint16_t nb_procs)
{
    int i, j, idx = 0;
    unsigned count = 0;
    char c;
    int val;

    if (coremask == NULL)
        return -1;

    /* Strip leading and trailing blanks, and a leading 0x/0X if present. */
    while (isblank(*coremask))
        coremask++;
    if (coremask[0] == '0' && ((coremask[1] == 'x')
        || (coremask[1] == 'X')))
        coremask += 2;

    i = strlen(coremask);
    while ((i > 0) && isblank(coremask[i - 1]))
        i--;

    if (i == 0)
        return -1;

    for (i = i - 1; i >= 0 && idx < RTE_MAX_LCORE && count < nb_procs; i--) {
        c = coremask[i];
        if (isxdigit(c) == 0) {
            return -1;
        }
        val = xdigit2val(c);
        for (j = 0; j < BITS_PER_HEX && idx < RTE_MAX_LCORE && count < nb_procs;
            j++, idx++) {
            if ((1 << j) & val) {
                if (!lcore_config[idx].detected) {
                    RTE_LOG(ERR, EAL, "lcore %u unavailable\n", idx);
                    return -1;
                }
                lcore_proc[count] = idx;
                count++;
            }
        }
    }

    for (; i >= 0; i--)
        if (coremask[i] != '0')
            return -1;

    if (count < nb_procs)
        return -1;

    return 0;
}

static int
init_lcore_conf(void)
{
    uint8_t nb_ports = rte_eth_dev_count();
    if (nb_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;
    lcore_conf.nb_procs = ff_global_cfg.dpdk.nb_procs;
    lcore_conf.lcore_proc = rte_zmalloc(NULL,
        sizeof(uint16_t)*lcore_conf.nb_procs, 0);
    if (lcore_conf.lcore_proc == NULL) {
        rte_exit(EXIT_FAILURE, "rte_zmalloc lcore_proc failed\n");
    }

    int ret = parse_lcore_mask(ff_global_cfg.dpdk.lcore_mask,
        lcore_conf.lcore_proc, lcore_conf.nb_procs);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "parse_lcore_mask failed:%s\n",
            ff_global_cfg.dpdk.lcore_mask);
    }

    uint16_t socket_id = 0;
    if (ff_global_cfg.dpdk.numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    /* Currently, proc_id maps 1:1 to the rx/tx queue id on each port. */
    uint8_t port_id, enabled_ports = 0;
    for (port_id = 0; port_id < nb_ports; port_id++) {
        if (ff_global_cfg.dpdk.port_mask &&
            (ff_global_cfg.dpdk.port_mask & (1 << port_id)) == 0) {
            printf("\nSkipping disabled port %d\n", port_id);
            continue;
        }

        if (port_id >= ff_global_cfg.dpdk.nb_ports) {
            printf("\nSkipping non-configured port %d\n", port_id);
            break;
        }

        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = lcore_conf.proc_id;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = lcore_conf.proc_id;
        lcore_conf.pcap[port_id] = ff_global_cfg.dpdk.port_cfgs[enabled_ports].pcap;

        ff_global_cfg.dpdk.port_cfgs[enabled_ports].port_id = port_id;

        enabled_ports++;
    }

    ff_global_cfg.dpdk.nb_ports = enabled_ports;

    return 0;
}

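/*
 * Create (primary process) or look up (secondary process) one mbuf pool
 * per NUMA socket in use. The pool is sized to cover all RX descriptors,
 * in-flight TX descriptors, per-lcore burst buffers and mempool caches,
 * with a floor of 8192 mbufs.
 */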
static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;

    unsigned nb_mbuf = RTE_MAX(
        (nb_rx_queue*RX_QUEUE_SIZE          +
        nb_ports*nb_lcores*MAX_PKT_BURST    +
        nb_ports*nb_tx_queue*TX_QUEUE_SIZE  +
        nb_lcores*MEMPOOL_CACHE_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];
    int numa_on = ff_global_cfg.dpdk.numa_on;

    for (i = 0; i < lcore_conf.nb_procs; i++) {
        lcore_id = lcore_conf.lcore_proc[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }
    }

    return 0;
}

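/*
 * With RSS, an ARP reply lands on only one queue, i.e. in one process.
 * Each process therefore owns a per-port ring onto which the receiving
 * process clones ARP packets for everyone else (see process_packets()).
 * Rings are created by the primary process and looked up by secondaries.
 */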
static int
init_arp_ring(void)
{
    int i, j;
    char name_buf[RTE_RING_NAMESIZE];
    int nb_procs = ff_global_cfg.dpdk.nb_procs;
    int proc_id = ff_global_cfg.dpdk.proc_id;

    /* Allocate arp ring ptr according to eth dev count. */
    int nb_ports = rte_eth_dev_count();
    for (i = 0; i < nb_procs; ++i) {
        snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_%d_%d",
            proc_id, i);

        arp_ring[i] = rte_zmalloc(name_buf,
            sizeof(struct rte_ring *) * nb_ports,
             RTE_CACHE_LINE_SIZE);
        if (arp_ring[i] == NULL) {
            rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                "failed\n", name_buf);
        }
    }

    unsigned socketid = lcore_conf.socket_id;

    /* Create rings according to the ports actually in use. */
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (i = 0; i < nb_ports; i++) {
        uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[i].port_id;

        for (j = 0; j < nb_procs; ++j) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_%d_%d", j, port_id);
            if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
                arp_ring[j][port_id] = rte_ring_create(name_buf,
                    ARP_RING_SIZE, socketid,
                    RING_F_SC_DEQ);
                if (rte_ring_lookup(name_buf) != arp_ring[j][port_id])
                    rte_panic("lookup arp ring:%s failed!\n", name_buf);
            } else {
                arp_ring[j][port_id] = rte_ring_lookup(name_buf);
            }

            if (arp_ring[j][port_id] == NULL)
                rte_panic("create arp ring:%s failed!\n", name_buf);

            printf("create arp ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(arp_ring[j][port_id]));
        }
    }

    return 0;
}

static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[i].port_id;
        ff_kni_alloc(port_id, socket_id, mbuf_pool);
    }

    return 0;
}

static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = rte_lcore_to_socket_id(rte_lcore_id());
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socketid];
    uint16_t i;

    for (i = 0; i < nb_ports; i++) {
        uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[i].port_id;

        struct rte_eth_dev_info dev_info;
        rte_eth_dev_info_get(port_id, &dev_info);

        if (nb_procs > dev_info.max_rx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] greater than max_rx_queues[%d]\n",
                nb_procs,
                dev_info.max_rx_queues);
        }

        if (nb_procs > dev_info.max_tx_queues) {
            rte_exit(EXIT_FAILURE, "num_procs[%d] greater than max_tx_queues[%d]\n",
                nb_procs,
                dev_info.max_tx_queues);
        }

        struct ether_addr addr;
        rte_eth_macaddr_get(port_id, &addr);
        printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                   " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                (unsigned)port_id,
                addr.addr_bytes[0], addr.addr_bytes[1],
                addr.addr_bytes[2], addr.addr_bytes[3],
                addr.addr_bytes[4], addr.addr_bytes[5]);

        rte_memcpy(ff_global_cfg.dpdk.port_cfgs[i].mac,
            addr.addr_bytes, ETHER_ADDR_LEN);

        /* Clear txq_flags - we do not need multi-mempool and refcnt */
        dev_info.default_txconf.txq_flags = ETH_TXQ_FLAGS_NOMULTMEMP |
            ETH_TXQ_FLAGS_NOREFCOUNT;

        /* Disable features that are not supported by port's HW */
        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMUDP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMTCP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMSCTP;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL;
        }

        if (!(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) &&
            !(dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_TSO)) {
            dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
        }

        struct rte_eth_conf port_conf = {0};

        /* Set RSS mode */
        port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
        port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK;
        port_conf.rx_adv_conf.rss_conf.rss_key = default_rsskey_40bytes;
        port_conf.rx_adv_conf.rss_conf.rss_key_len = 40;

        /* Set Rx VLAN stripping */
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
            port_conf.rxmode.hw_vlan_strip = 1;
        }

        /* Enable HW CRC stripping */
        port_conf.rxmode.hw_strip_crc = 1;

        /* FIXME: Enable TCP LRO ? */
        #if 0
        if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
            printf("LRO is supported\n");
            port_conf.rxmode.enable_lro = 1;
            ff_global_cfg.dpdk.port_cfgs[i].hw_features.rx_lro = 1;
        }
        #endif

        /* Set Rx checksum checking */
        if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
            printf("RX checksum offload supported\n");
            port_conf.rxmode.hw_ip_checksum = 1;
            ff_global_cfg.dpdk.port_cfgs[i].hw_features.rx_csum = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
            printf("TX ip checksum offload supported\n");
            ff_global_cfg.dpdk.port_cfgs[i].hw_features.tx_csum_ip = 1;
        }

        if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
            (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
            printf("TX TCP&UDP checksum offload supported\n");
            ff_global_cfg.dpdk.port_cfgs[i].hw_features.tx_csum_l4 = 1;
        }

        if (ff_global_cfg.dpdk.tso) {
            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                printf("TSO is supported\n");
                ff_global_cfg.dpdk.port_cfgs[i].hw_features.tx_tso = 1;
            }
        } else {
            printf("TSO is disabled\n");
        }

        if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
            return 0;
        }

        /* Currently, each proc_id maps 1:1 to a queue id on every port. */
        int ret = rte_eth_dev_configure(port_id, nb_procs, nb_procs, &port_conf);
        if (ret != 0) {
            return ret;
        }

        uint16_t q;
        for (q = 0; q < nb_procs; q++) {
            ret = rte_eth_tx_queue_setup(port_id, q, TX_QUEUE_SIZE,
                socketid, &dev_info.default_txconf);
            if (ret < 0) {
                return ret;
            }

            ret = rte_eth_rx_queue_setup(port_id, q, RX_QUEUE_SIZE,
                socketid, &dev_info.default_rxconf, mbuf_pool);
            if (ret < 0) {
                return ret;
            }
        }

        ret = rte_eth_dev_start(port_id);
        if (ret < 0) {
            return ret;
        }

        /* Enable RX in promiscuous mode for the Ethernet device. */
        if (ff_global_cfg.dpdk.promiscuous) {
            rte_eth_promiscuous_enable(port_id);
            ret = rte_eth_promiscuous_get(port_id);
            if (ret == 1) {
                printf("set port %u to promiscuous mode ok\n", port_id);
            } else {
                printf("set port %u to promiscuous mode error\n", port_id);
            }
        }

        /* Enable pcap dump */
        if (ff_global_cfg.dpdk.port_cfgs[i].pcap) {
            ff_enable_pcap(ff_global_cfg.dpdk.port_cfgs[i].pcap);
        }
    }

    return 0;
}

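/*
 * Arm a periodic rte_timer that calls ff_hardclock() every
 * 1000/freebsd.hz milliseconds; the interval is converted below from
 * milliseconds to TSC cycles (rounded up).
 */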
static int
init_freebsd_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &freebsd_hardclock_job, NULL);

    return 0;
}

int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("invalid param num_procs[%d] or proc_id[%d]\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    init_lcore_conf();

    init_mem_pool();

    init_arp_ring();

    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    check_all_ports_link_status();

    init_freebsd_clock();

    return 0;
}

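/*
 * Hand a received rte_mbuf chain to the FreeBSD stack: wrap the first
 * segment in an ff mbuf header, chain the remaining segments behind it,
 * and pass the result to ff_veth_process_packet(). Packets flagged with
 * a bad hardware checksum are dropped here.
 */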
static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    /*
     * FIXME: should we save pkt->vlan_tci
     * if (pkt->ol_flags & PKT_RX_VLAN_PKT)
     */

    void *data = rte_pktmbuf_mtod(pkt, void*);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    pkt = pkt->next;
    void *prev = hdr;
    while (pkt != NULL) {
        data = rte_pktmbuf_mtod(pkt, void*);
        len = rte_pktmbuf_data_len(pkt);

        void *mb = ff_mbuf_get(prev, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            return;
        }
        pkt = pkt->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    if (len < sizeof(struct ether_hdr))
        return FILTER_UNKNOWN;

    const struct ether_hdr *hdr;
    hdr = (const struct ether_hdr *)data;

    if (ntohs(hdr->ether_type) == ETHER_TYPE_ARP)
        return FILTER_ARP;

    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ntohs(hdr->ether_type) != ETHER_TYPE_IPv4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter((const char *)data + sizeof(struct ether_hdr),
        len - sizeof(struct ether_hdr));
}

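/*
 * Dispatch a burst of packets: ARP packets are cloned to the other
 * processes' rings (and to KNI) so every stack instance sees them,
 * KNI-filtered packets are handed to the kernel, and everything else
 * goes to the FreeBSD stack via ff_veth_input().
 */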
static inline void
process_packets(uint8_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(qconf->pcap[port_id] != NULL)) {
            ff_dump_packets(qconf->pcap[port_id], rtem);
        }

        void *data = rte_pktmbuf_mtod(rtem, void*);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        enum FilterReturn filter = protocol_filter(data, len);
        if (filter == FILTER_ARP) {
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (pkts_from_ring == 0) {
                uint16_t j;
                for (j = 0; j < qconf->nb_procs; ++j) {
                    if (j == queue_id)
                        continue;

                    mbuf_pool = pktmbuf_pool[rte_lcore_to_socket_id(qconf->lcore_proc[j])];
                    mbuf_clone = rte_pktmbuf_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(arp_ring[j][port_id], mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = rte_pktmbuf_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }

            ff_veth_input(ctx, rtem);
        } else if (enable_kni && ((filter == FILTER_KNI && kni_accept) ||
            (filter == FILTER_UNKNOWN && !kni_accept)) ) {
            ff_kni_enqueue(port_id, rtem);
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

static inline int
process_arp_ring(uint8_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* Dequeue a burst of cloned ARP packets from the ring and process them. */
    uint16_t nb_rx;
    nb_rx = rte_ring_dequeue_burst(arp_ring[queue_id][port_id],
        (void **)pkts_burst, MAX_PKT_BURST);

    if (nb_rx > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rx, ctx, 1);
    }

    return 0;
}

/* Send burst of packets on an output interface */
static inline int
send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
{
    struct rte_mbuf **m_table;
    int ret;
    uint16_t queueid;

    queueid = qconf->tx_queue_id[port];
    m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;

    if (unlikely(qconf->pcap[port] != NULL)) {
        uint16_t i;
        for (i = 0; i < n; i++) {
            ff_dump_packets(qconf->pcap[port], m_table[i]);
        }
    }

    ret = rte_eth_tx_burst(port, queueid, m_table, n);
    if (unlikely(ret < n)) {
        do {
            rte_pktmbuf_free(m_table[ret]);
        } while (++ret < n);
    }

    return 0;
}

/* Enqueue a single packet, and send burst if queue is filled */
static inline int
send_single_packet(struct rte_mbuf *m, uint8_t port)
{
    uint16_t len;
    struct lcore_conf *qconf;

    qconf = &lcore_conf;
    len = qconf->tx_mbufs[port].len;
    qconf->tx_mbufs[port].m_table[len] = m;
    len++;

    /* enough pkts to be sent */
    if (unlikely(len == MAX_PKT_BURST)) {
        send_burst(qconf, MAX_PKT_BURST, port);
        len = 0;
    }

    qconf->tx_mbufs[port].len = len;
    return 0;
}

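/*
 * Copy a FreeBSD mbuf chain into a chain of DPDK mbufs, translate the
 * stack's TX offload requests (IP/TCP/UDP checksum, TSO) into ol_flags,
 * then queue the packet for transmission on the context's port.
 */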
int
ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
    int total)
{
    struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
    struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
    if (head == NULL) {
        ff_mbuf_free(m);
        return -1;
    }

    head->pkt_len = total;
    head->nb_segs = 0;

    int off = 0;
    struct rte_mbuf *cur = head, *prev = NULL;
    while (total > 0) {
        if (cur == NULL) {
            cur = rte_pktmbuf_alloc(mbuf_pool);
            if (cur == NULL) {
                rte_pktmbuf_free(head);
                ff_mbuf_free(m);
                return -1;
            }
        }

        void *data = rte_pktmbuf_mtod(cur, void*);
        int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
        int ret = ff_mbuf_copydata(m, data, off, len);
        if (ret < 0) {
            rte_pktmbuf_free(head);
            ff_mbuf_free(m);
            return -1;
        }

        if (prev != NULL) {
            prev->next = cur;
        }
        prev = cur;

        cur->data_len = len;
        off += len;
        total -= len;
        head->nb_segs++;
        cur = NULL;
    }

    struct ff_tx_offload offload = {0};
    ff_mbuf_tx_offload(m, &offload);

    if (offload.ip_csum) {
        head->ol_flags |= PKT_TX_IP_CKSUM;
        head->l2_len = sizeof(struct ether_hdr);
        head->l3_len = sizeof(struct ipv4_hdr);
    }

    if (ctx->hw_features.tx_csum_l4) {
        if (offload.tcp_csum) {
            head->ol_flags |= PKT_TX_TCP_CKSUM;
            head->l2_len = sizeof(struct ether_hdr);
            head->l3_len = sizeof(struct ipv4_hdr);
        }

        if (offload.tso_seg_size) {
            head->ol_flags |= PKT_TX_TCP_SEG;
            head->l4_len = sizeof(struct tcp_hdr);
            head->tso_segsz = offload.tso_seg_size;
        }

        if (offload.udp_csum) {
            head->ol_flags |= PKT_TX_UDP_CKSUM;
            head->l2_len = sizeof(struct ether_hdr);
            head->l3_len = sizeof(struct ipv4_hdr);
        }
    }

    ff_mbuf_free(m);

    return send_single_packet(head, ctx->port_id);
}

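/*
 * Per-lcore forwarding loop: service the rte_timer driving the FreeBSD
 * clock, flush buffered TX packets every BURST_TX_DRAIN_US, then for
 * each RX queue pump KNI, drain the ARP clone ring, and poll the NIC in
 * bursts with prefetching. The user-supplied loop callback runs once
 * per iteration.
 */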
static int
main_loop(void *arg)
{
    struct loop_routine *lr = (struct loop_routine *)arg;

    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    unsigned lcore_id;
    uint64_t prev_tsc, diff_tsc, cur_tsc;
    int i, j, nb_rx;
    uint8_t port_id, queue_id;
    struct lcore_conf *qconf;
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
        US_PER_S * BURST_TX_DRAIN_US;
    struct ff_dpdk_if_context *ctx;

    prev_tsc = 0;

    lcore_id = rte_lcore_id();
    qconf = &lcore_conf;

    if (qconf->nb_rx_queue == 0) {
        printf("lcore %u has nothing to do\n", lcore_id);
        return 0;
    }

    while (1) {
        cur_tsc = rte_rdtsc();
        if (unlikely(freebsd_clock.expire < cur_tsc)) {
            rte_timer_manage();
        }

        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc > drain_tsc)) {
            /*
             * This could be optimized (use queueid instead of
             * portid), but it is not called so often
             */
            for (port_id = 0; port_id < RTE_MAX_ETHPORTS; port_id++) {
                if (qconf->tx_mbufs[port_id].len == 0)
                    continue;
                send_burst(qconf,
                    qconf->tx_mbufs[port_id].len,
                    port_id);
                qconf->tx_mbufs[port_id].len = 0;
            }

            prev_tsc = cur_tsc;
        }

        /*
         * Read packet from RX queues
         */
        for (i = 0; i < qconf->nb_rx_queue; ++i) {
            port_id = qconf->rx_queue_list[i].port_id;
            queue_id = qconf->rx_queue_list[i].queue_id;
            ctx = veth_ctx[port_id];

            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
            }

            process_arp_ring(port_id, queue_id, pkts_burst, ctx);

            nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
                MAX_PKT_BURST);
            if (nb_rx == 0)
                continue;

            /* Prefetch first packets */
            for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
                rte_prefetch0(rte_pktmbuf_mtod(
                        pkts_burst[j], void *));
            }

            /* Prefetch and handle already prefetched packets */
            for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
                rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
                        j + PREFETCH_OFFSET], void *));
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }

            /* Handle remaining prefetched packets */
            for (; j < nb_rx; j++) {
                process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
            }
        }

        if (likely(lr->loop != NULL)) {
            lr->loop(lr->arg);
        }
    }
}

int
ff_dpdk_if_up(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint8_t port_id = ff_global_cfg.dpdk.port_cfgs[i].port_id;
        veth_ctx[port_id] = ff_veth_attach(ff_global_cfg.dpdk.port_cfgs + i);
        if (veth_ctx[port_id] == NULL) {
            rte_exit(EXIT_FAILURE, "ff_veth_attach failed\n");
        }
    }

    return 0;
}

void
ff_dpdk_run(loop_func_t loop, void *arg)
{
    struct loop_routine *lr = malloc(sizeof(struct loop_routine));
    if (lr == NULL) {
        rte_exit(EXIT_FAILURE, "malloc loop_routine failed\n");
    }
    lr->loop = loop;
    lr->arg = arg;
    rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
    rte_eal_mp_wait_lcore();
    free(lr);
}
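
/*
 * Typical embedding, sketched under the assumption that ff_global_cfg
 * has already been populated by the configuration loader before any of
 * these calls (the loader lives in ff_config.c, not here). The loop
 * callback below is illustrative; match the exact loop_func_t signature
 * declared in ff_dpdk_if.h:
 *
 *     int my_loop(void *arg) {
 *         // application work, invoked once per poll iteration
 *         return 0;
 *     }
 *
 *     ff_dpdk_init(dpdk_argc, dpdk_argv);   // EAL, pools, rings, ports
 *     ff_dpdk_if_up();                      // attach veth contexts
 *     ff_dpdk_run(my_loop, NULL);           // polls forever
 */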

void
ff_dpdk_pktmbuf_free(void *m)
{
    rte_pktmbuf_free((struct rte_mbuf *)m);
}

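/*
 * Software implementation of the Microsoft Toeplitz hash used for RSS,
 * apparently carried over from the FreeBSD sources (note the XXXRW
 * comment). ff_rss_check() below recomputes the hash the NIC would
 * produce for a 4-tuple, so that when choosing a local address/port
 * each process only accepts tuples that RSS would steer to its own
 * queue.
 */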
static uint32_t
toeplitz_hash(unsigned keylen, const uint8_t *key,
    unsigned datalen, const uint8_t *data)
{
    uint32_t hash = 0, v;
    u_int i, b;

    /* XXXRW: Perhaps an assertion about key length vs. data length? */

    v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
    for (i = 0; i < datalen; i++) {
        for (b = 0; b < 8; b++) {
            if (data[i] & (1<<(7-b)))
                hash ^= v;
            v <<= 1;
            if ((i + 4) < keylen &&
                (key[i+4] & (1<<(7-b))))
                v |= 1;
        }
    }
    return (hash);
}

int
ff_rss_check(uint32_t saddr, uint32_t daddr, uint16_t sport, uint16_t dport)
{
    struct lcore_conf *qconf = &lcore_conf;

    if (qconf->nb_procs == 1) {
        return 1;
    }

    uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
        sizeof(dport)];

    unsigned datalen = 0;

    bcopy(&saddr, &data[datalen], sizeof(saddr));
    datalen += sizeof(saddr);

    bcopy(&daddr, &data[datalen], sizeof(daddr));
    datalen += sizeof(daddr);

    bcopy(&sport, &data[datalen], sizeof(sport));
    datalen += sizeof(sport);

    bcopy(&dport, &data[datalen], sizeof(dport));
    datalen += sizeof(dport);

    uint32_t hash = toeplitz_hash(sizeof(default_rsskey_40bytes),
        default_rsskey_40bytes, datalen, data);

    return (hash % qconf->nb_procs) == qconf->proc_id;
}
1199