xref: /f-stack/lib/ff_dpdk_if.c (revision 73bdce41)
/*
 * Copyright (C) 2017 THL A29 Limited, a Tencent company.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>

#include <rte_common.h>
#include <rte_byteorder.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memcpy.h>
#include <rte_memzone.h>
#include <rte_config.h>
#include <rte_eal.h>
#include <rte_pci.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>
#include <rte_launch.h>
#include <rte_ethdev.h>
#include <rte_debug.h>
#include <rte_ether.h>
#include <rte_malloc.h>
#include <rte_cycles.h>
#include <rte_timer.h>
#include <rte_thash.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_udp.h>
#include <rte_eth_bond.h>

#include "ff_dpdk_if.h"
#include "ff_dpdk_pcap.h"
#include "ff_dpdk_kni.h"
#include "ff_config.h"
#include "ff_veth.h"
#include "ff_host_interface.h"
#include "ff_msg.h"
#include "ff_api.h"
#include "ff_memory.h"

#ifdef FF_KNI
#define KNI_MBUF_MAX 2048
#define KNI_QUEUE_SIZE 2048

int enable_kni;
static int kni_accept;
static int knictl_action = FF_KNICTL_ACTION_DEFAULT;
#endif

static int numa_on;

static unsigned idle_sleep;
static unsigned pkt_tx_delay;

static struct rte_timer freebsd_clock;

/* Default RSS key used by Mellanox's Linux driver */
static uint8_t default_rsskey_40bytes[40] = {
    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
};

static uint8_t default_rsskey_52bytes[52] = {
    0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23,
    0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30,
    0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02,
    0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c,
    0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55,
    0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e,
    0x81, 0x15, 0x03, 0x66
};

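/*
 * Repeating the 16-bit pattern 0x6d5a produces a symmetric Toeplitz hash,
 * so both directions of a TCP/UDP flow land on the same RSS queue.
 */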
static uint8_t symmetric_rsskey[52] = {
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a, 0x6d, 0x5a,
    0x6d, 0x5a, 0x6d, 0x5a
};

static int rsskey_len = sizeof(default_rsskey_40bytes);
static uint8_t *rsskey = default_rsskey_40bytes;

struct lcore_conf lcore_conf;

struct rte_mempool *pktmbuf_pool[NB_SOCKETS];

static pcblddr_func_t pcblddr_fun;

static struct rte_ring **dispatch_ring[RTE_MAX_ETHPORTS];
static dispatch_func_t packet_dispatcher;

static uint16_t rss_reta_size[RTE_MAX_ETHPORTS];

#define BOND_DRIVER_NAME    "net_bonding"

static inline int send_single_packet(struct rte_mbuf *m, uint8_t port);

struct ff_msg_ring {
    char ring_name[FF_MSG_NUM][RTE_RING_NAMESIZE];
    /* ring[0]: requests sent to the lcore; others enqueue, the lcore dequeues */
    /* ring[1..]: replies sent by the lcore; others dequeue */
    struct rte_ring *ring[FF_MSG_NUM];
} __rte_cache_aligned;

static struct ff_msg_ring msg_ring[RTE_MAX_LCORE];
static struct rte_mempool *message_pool;
static struct ff_dpdk_if_context *veth_ctx[RTE_MAX_ETHPORTS];

static struct ff_top_args ff_top_status;
static struct ff_traffic_args ff_traffic;
extern void ff_hardclock(void);

static void
ff_hardclock_job(__rte_unused struct rte_timer *timer,
    __rte_unused void *arg)
{
    ff_hardclock();
    ff_update_current_ts();
}

struct ff_dpdk_if_context *
ff_dpdk_register_if(void *sc, void *ifp, struct ff_port_cfg *cfg)
{
    struct ff_dpdk_if_context *ctx;

    ctx = calloc(1, sizeof(struct ff_dpdk_if_context));
    if (ctx == NULL)
        return NULL;

    ctx->sc = sc;
    ctx->ifp = ifp;
    ctx->port_id = cfg->port_id;
    ctx->hw_features = cfg->hw_features;

    return ctx;
}

void
ff_dpdk_deregister_if(struct ff_dpdk_if_context *ctx)
{
    free(ctx);
}

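/*
 * Poll the link status of every configured port, printing a dot every
 * CHECK_INTERVAL ms until all links are up or MAX_CHECK_TIME is reached.
 */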
static void
check_all_ports_link_status(void)
{
    #define CHECK_INTERVAL 100 /* 100ms */
    #define MAX_CHECK_TIME 90  /* 9s (90 * 100ms) in total */

    uint8_t count, all_ports_up, print_flag = 0;
    struct rte_eth_link link;

    printf("\nChecking link status");
    fflush(stdout);

    int i, nb_ports;
    nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (count = 0; count <= MAX_CHECK_TIME; count++) {
        all_ports_up = 1;
        for (i = 0; i < nb_ports; i++) {
            uint16_t portid = ff_global_cfg.dpdk.portid_list[i];
            memset(&link, 0, sizeof(link));
            rte_eth_link_get_nowait(portid, &link);

            /* print link status if flag set */
            if (print_flag == 1) {
                if (link.link_status) {
                    printf("Port %d Link Up - speed %u "
                        "Mbps - %s\n", (int)portid,
                        (unsigned)link.link_speed,
                        (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                        ("full-duplex") : ("half-duplex"));
                } else {
                    printf("Port %d Link Down\n", (int)portid);
                }
                continue;
            }
            /* clear all_ports_up flag if any link down */
            if (link.link_status == 0) {
                all_ports_up = 0;
                break;
            }
        }

        /* after finally printing all link status, get out */
        if (print_flag == 1)
            break;

        if (all_ports_up == 0) {
            printf(".");
            fflush(stdout);
            rte_delay_ms(CHECK_INTERVAL);
        }

        /* set the print_flag if all ports up or timeout */
        if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
            print_flag = 1;
            printf("done\n");
        }
    }
}

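/*
 * Build this process's lcore_conf: every port whose lcore_list contains the
 * current lcore gets one RX and one TX queue here, with the queue index
 * equal to the lcore's position in the port's lcore_list.
 */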
static int
init_lcore_conf(void)
{
    uint8_t nb_dev_ports = rte_eth_dev_count_avail();
    if (nb_dev_ports == 0) {
        rte_exit(EXIT_FAILURE, "No probed ethernet devices\n");
    }

    if (ff_global_cfg.dpdk.max_portid >= nb_dev_ports) {
        rte_exit(EXIT_FAILURE, "this machine doesn't have port %d.\n",
                 ff_global_cfg.dpdk.max_portid);
    }

    lcore_conf.port_cfgs = ff_global_cfg.dpdk.port_cfgs;
    lcore_conf.proc_id = ff_global_cfg.dpdk.proc_id;

    uint16_t socket_id = 0;
    if (numa_on) {
        socket_id = rte_lcore_to_socket_id(rte_lcore_id());
    }

    lcore_conf.socket_id = socket_id;

    uint16_t lcore_id = ff_global_cfg.dpdk.proc_lcore[lcore_conf.proc_id];
    if (!rte_lcore_is_enabled(lcore_id)) {
        rte_exit(EXIT_FAILURE, "lcore %u unavailable\n", lcore_id);
    }

    int j;
    for (j = 0; j < ff_global_cfg.dpdk.nb_ports; ++j) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];

        int queueid = -1;
        int i;
        for (i = 0; i < pconf->nb_lcores; i++) {
            if (pconf->lcore_list[i] == lcore_id) {
                queueid = i;
            }
        }
        if (queueid < 0) {
            continue;
        }
        printf("lcore: %u, port: %u, queue: %d\n", lcore_id, port_id, queueid);
        uint16_t nb_rx_queue = lcore_conf.nb_rx_queue;
        lcore_conf.rx_queue_list[nb_rx_queue].port_id = port_id;
        lcore_conf.rx_queue_list[nb_rx_queue].queue_id = queueid;
        lcore_conf.nb_rx_queue++;

        lcore_conf.tx_queue_id[port_id] = queueid;
        lcore_conf.tx_port_id[lcore_conf.nb_tx_port] = port_id;
        lcore_conf.nb_tx_port++;

        /* Enable pcap dump */
        if (ff_global_cfg.pcap.enable) {
            ff_enable_pcap(ff_global_cfg.pcap.save_path, ff_global_cfg.pcap.snap_len);
        }

        lcore_conf.nb_queue_list[port_id] = pconf->nb_lcores;
    }

    if (lcore_conf.nb_rx_queue == 0) {
        rte_exit(EXIT_FAILURE, "lcore %u has nothing to do\n", lcore_id);
    }

    return 0;
}

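/*
 * Create (or, in secondary processes, look up) one mbuf pool per NUMA
 * socket, sized to cover RX/TX descriptors, per-lcore bursts, dispatch
 * rings and, when enabled, the KNI queues.
 */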
static int
init_mem_pool(void)
{
    uint8_t nb_ports = ff_global_cfg.dpdk.nb_ports;
    uint32_t nb_lcores = ff_global_cfg.dpdk.nb_procs;
    uint32_t nb_tx_queue = nb_lcores;
    uint32_t nb_rx_queue = lcore_conf.nb_rx_queue * nb_lcores;
    uint16_t max_portid = ff_global_cfg.dpdk.max_portid;

    unsigned nb_mbuf = RTE_ALIGN_CEIL(
        (nb_rx_queue * (max_portid + 1) * 2 * RX_QUEUE_SIZE +
        nb_ports * (max_portid + 1) * 2 * nb_lcores * MAX_PKT_BURST +
        nb_ports * (max_portid + 1) * 2 * nb_tx_queue * TX_QUEUE_SIZE +
        nb_lcores * MEMPOOL_CACHE_SIZE +
#ifdef FF_KNI
        nb_ports * KNI_MBUF_MAX +
        nb_ports * KNI_QUEUE_SIZE +
#endif
        nb_lcores * nb_ports * DISPATCH_RING_SIZE),
        (unsigned)8192);

    unsigned socketid = 0;
    uint16_t i, lcore_id;
    char s[64];

    for (i = 0; i < ff_global_cfg.dpdk.nb_procs; i++) {
        lcore_id = ff_global_cfg.dpdk.proc_lcore[i];
        if (numa_on) {
            socketid = rte_lcore_to_socket_id(lcore_id);
        }

        if (socketid >= NB_SOCKETS) {
            rte_exit(EXIT_FAILURE, "Socket %d of lcore %u is out of range %d\n",
                socketid, lcore_id, NB_SOCKETS);
        }

        if (pktmbuf_pool[socketid] != NULL) {
            continue;
        }

        if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] =
                rte_pktmbuf_pool_create(s, nb_mbuf,
                    MEMPOOL_CACHE_SIZE, 0,
                    RTE_MBUF_DEFAULT_BUF_SIZE, socketid);
        } else {
            snprintf(s, sizeof(s), "mbuf_pool_%d", socketid);
            pktmbuf_pool[socketid] = rte_mempool_lookup(s);
        }

        if (pktmbuf_pool[socketid] == NULL) {
            rte_exit(EXIT_FAILURE, "Cannot create mbuf pool on socket %d\n", socketid);
        } else {
            printf("create mbuf pool on socket %d\n", socketid);
        }

#ifdef FF_USE_PAGE_ARRAY
        nb_mbuf = RTE_ALIGN_CEIL(
            nb_ports * nb_lcores * MAX_PKT_BURST +
            nb_ports * nb_tx_queue * TX_QUEUE_SIZE +
            nb_lcores * MEMPOOL_CACHE_SIZE,
            (unsigned)4096);
        ff_init_ref_pool(nb_mbuf, socketid);
#endif
    }

    return 0;
}

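/*
 * Primary processes create the named ring; secondary processes look it up,
 * so every f-stack process shares the same ring objects.
 */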
static struct rte_ring *
create_ring(const char *name, unsigned count, int socket_id, unsigned flags)
{
    struct rte_ring *ring;

    if (name == NULL) {
        rte_exit(EXIT_FAILURE, "create ring failed, no name!\n");
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        ring = rte_ring_create(name, count, socket_id, flags);
    } else {
        ring = rte_ring_lookup(name);
    }

    if (ring == NULL) {
        rte_exit(EXIT_FAILURE, "create ring:%s failed!\n", name);
    }

    return ring;
}

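/*
 * One rte_ring per (port, queue): packets that RSS or the user-supplied
 * dispatcher assigns to another queue are handed over through these rings.
 */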
static int
init_dispatch_ring(void)
{
    int j;
    char name_buf[RTE_RING_NAMESIZE];
    int queueid;

    unsigned socketid = lcore_conf.socket_id;

    /* Create ring according to ports actually being used. */
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    for (j = 0; j < nb_ports; j++) {
        uint16_t portid = ff_global_cfg.dpdk.portid_list[j];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[portid];
        int nb_queues = pconf->nb_lcores;
        if (dispatch_ring[portid] == NULL) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "ring_ptr_p%d", portid);

            dispatch_ring[portid] = rte_zmalloc(name_buf,
                sizeof(struct rte_ring *) * nb_queues,
                RTE_CACHE_LINE_SIZE);
            if (dispatch_ring[portid] == NULL) {
                rte_exit(EXIT_FAILURE, "rte_zmalloc(%s (struct rte_ring*)) "
                    "failed\n", name_buf);
            }
        }

        for (queueid = 0; queueid < nb_queues; ++queueid) {
            snprintf(name_buf, RTE_RING_NAMESIZE, "dispatch_ring_p%d_q%d",
                portid, queueid);
            dispatch_ring[portid][queueid] = create_ring(name_buf,
                DISPATCH_RING_SIZE, socketid, RING_F_SC_DEQ);

            if (dispatch_ring[portid][queueid] == NULL)
                rte_panic("create ring:%s failed!\n", name_buf);

            printf("create ring:%s success, %u ring entries are now free!\n",
                name_buf, rte_ring_free_count(dispatch_ring[portid][queueid]));
        }
    }

    return 0;
}

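/* Mempool constructor: point each ff_msg's data buffer just past its header. */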
static void
ff_msg_init(struct rte_mempool *mp,
    __attribute__((unused)) void *opaque_arg,
    void *obj, __attribute__((unused)) unsigned i)
{
    struct ff_msg *msg = (struct ff_msg *)obj;
    msg->msg_type = FF_UNKNOWN;
    msg->buf_addr = (char *)msg + sizeof(struct ff_msg);
    msg->buf_len = mp->elt_size - sizeof(struct ff_msg);
    msg->original_buf = NULL;
    msg->original_buf_len = 0;
}

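/*
 * Set up the message mempool and the per-process request/reply rings used
 * by the f-stack control utilities (ff_top, ff_sysctl, etc.) to talk to
 * the stack.
 */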
static int
init_msg_ring(void)
{
    uint16_t i, j;
    uint16_t nb_procs = ff_global_cfg.dpdk.nb_procs;
    unsigned socketid = lcore_conf.socket_id;

    /* Create message buffer pool */
    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        message_pool = rte_mempool_create(FF_MSG_POOL,
           MSG_RING_SIZE * 2 * nb_procs,
           MAX_MSG_BUF_SIZE, MSG_RING_SIZE / 2, 0,
           NULL, NULL, ff_msg_init, NULL,
           socketid, 0);
    } else {
        message_pool = rte_mempool_lookup(FF_MSG_POOL);
    }

    if (message_pool == NULL) {
        rte_panic("Create msg mempool failed\n");
    }

    for (i = 0; i < nb_procs; ++i) {
        snprintf(msg_ring[i].ring_name[0], RTE_RING_NAMESIZE,
            "%s%u", FF_MSG_RING_IN, i);
        msg_ring[i].ring[0] = create_ring(msg_ring[i].ring_name[0],
            MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
        if (msg_ring[i].ring[0] == NULL)
            rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[0]);

        for (j = FF_SYSCTL; j < FF_MSG_NUM; j++) {
            snprintf(msg_ring[i].ring_name[j], RTE_RING_NAMESIZE,
                "%s%u_%u", FF_MSG_RING_OUT, i, j);
            msg_ring[i].ring[j] = create_ring(msg_ring[i].ring_name[j],
                MSG_RING_SIZE, socketid, RING_F_SP_ENQ | RING_F_SC_DEQ);
            if (msg_ring[i].ring[j] == NULL)
                rte_panic("create ring:%s failed!\n", msg_ring[i].ring_name[j]);
        }
    }

    return 0;
}

#ifdef FF_KNI

static enum FF_KNICTL_CMD
get_kni_action(const char *c)
{
    if (!c)
        return FF_KNICTL_ACTION_DEFAULT;
    if (0 == strcasecmp(c, "alltokni")) {
        return FF_KNICTL_ACTION_ALL_TO_KNI;
    } else if (0 == strcasecmp(c, "alltoff")) {
        return FF_KNICTL_ACTION_ALL_TO_FF;
    } else if (0 == strcasecmp(c, "default")) {
        return FF_KNICTL_ACTION_DEFAULT;
    } else {
        return FF_KNICTL_ACTION_DEFAULT;
    }
}

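/*
 * Allocate one KNI device per configured port so traffic that f-stack does
 * not handle can be forwarded to the kernel network stack.
 */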
static int
init_kni(void)
{
    int nb_ports = rte_eth_dev_count_avail();
    kni_accept = 0;
    if (strcasecmp(ff_global_cfg.kni.method, "accept") == 0)
        kni_accept = 1;

    knictl_action = get_kni_action(ff_global_cfg.kni.kni_action);

    ff_kni_init(nb_ports, ff_global_cfg.kni.tcp_port,
        ff_global_cfg.kni.udp_port);

    unsigned socket_id = lcore_conf.socket_id;
    struct rte_mempool *mbuf_pool = pktmbuf_pool[socket_id];

    nb_ports = ff_global_cfg.dpdk.nb_ports;
    int i;
    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id = ff_global_cfg.dpdk.portid_list[i];
        ff_kni_alloc(port_id, socket_id, mbuf_pool, KNI_QUEUE_SIZE);
    }

    return 0;
}
#endif

/* The RSS RETA update will fail when flow isolation is enabled */
#ifndef FF_FLOW_ISOLATE
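/* Fill the HW indirection table round-robin across the port's RX queues. */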
static void
set_rss_table(uint16_t port_id, uint16_t reta_size, uint16_t nb_queues)
{
    if (reta_size == 0) {
        return;
    }

    int reta_conf_size = RTE_MAX(1, reta_size / RTE_RETA_GROUP_SIZE);
    struct rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];

    /* config HW indirection table */
    unsigned i, j, hash = 0;
    for (i = 0; i < reta_conf_size; i++) {
        reta_conf[i].mask = ~0ULL;
        for (j = 0; j < RTE_RETA_GROUP_SIZE; j++) {
            reta_conf[i].reta[j] = hash++ % nb_queues;
        }
    }

    if (rte_eth_dev_rss_reta_update(port_id, reta_conf, reta_size)) {
        rte_exit(EXIT_FAILURE, "port[%d], failed to update rss table\n",
            port_id);
    }
}
#endif

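/*
 * Full per-port bring-up: query device capabilities, choose the RSS key,
 * enable the offloads both sides support, set up one RX/TX queue pair per
 * lcore, then start the port. Bonding slaves are initialized before their
 * master device.
 */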
static int
init_port_start(void)
{
    int nb_ports = ff_global_cfg.dpdk.nb_ports;
    unsigned socketid = 0;
    struct rte_mempool *mbuf_pool;
    uint16_t i, j;

    for (i = 0; i < nb_ports; i++) {
        uint16_t port_id, u_port_id = ff_global_cfg.dpdk.portid_list[i];
        struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[u_port_id];
        uint16_t nb_queues = pconf->nb_lcores;

        for (j = 0; j <= pconf->nb_slaves; j++) {
            if (j < pconf->nb_slaves) {
                port_id = pconf->slave_portid_list[j];
                printf("To init %s's slave %d: port[%d]\n",
                        ff_global_cfg.dpdk.bond_cfgs->name,
                        j, port_id);
            } else {
                port_id = u_port_id;
            }

            struct rte_eth_dev_info dev_info;
            struct rte_eth_conf port_conf = {0};
            struct rte_eth_rxconf rxq_conf;
            struct rte_eth_txconf txq_conf;

            int ret = rte_eth_dev_info_get(port_id, &dev_info);
            if (ret != 0)
                rte_exit(EXIT_FAILURE,
                    "Error during getting device (port %u) info: %s\n",
                    port_id, strerror(-ret));

            if (nb_queues > dev_info.max_rx_queues) {
                rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_rx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_rx_queues);
            }

            if (nb_queues > dev_info.max_tx_queues) {
                rte_exit(EXIT_FAILURE, "num_procs[%d] bigger than max_tx_queues[%d]\n",
                    nb_queues,
                    dev_info.max_tx_queues);
            }

            struct rte_ether_addr addr;
            rte_eth_macaddr_get(port_id, &addr);
            printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                       " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                    (unsigned)port_id,
                    addr.addr_bytes[0], addr.addr_bytes[1],
                    addr.addr_bytes[2], addr.addr_bytes[3],
                    addr.addr_bytes[4], addr.addr_bytes[5]);

            rte_memcpy(pconf->mac,
                addr.addr_bytes, RTE_ETHER_ADDR_LEN);

            /* Set RSS mode */
            uint64_t default_rss_hf = ETH_RSS_PROTO_MASK;
            port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
            port_conf.rx_adv_conf.rss_conf.rss_hf = default_rss_hf;
            if (dev_info.hash_key_size == 52) {
                rsskey = default_rsskey_52bytes;
                rsskey_len = 52;
            }
            if (ff_global_cfg.dpdk.symmetric_rss) {
                printf("Use symmetric Receive-side Scaling (RSS) key\n");
                rsskey = symmetric_rsskey;
            }
            port_conf.rx_adv_conf.rss_conf.rss_key = rsskey;
            port_conf.rx_adv_conf.rss_conf.rss_key_len = rsskey_len;
            port_conf.rx_adv_conf.rss_conf.rss_hf &= dev_info.flow_type_rss_offloads;
            if (port_conf.rx_adv_conf.rss_conf.rss_hf !=
                    ETH_RSS_PROTO_MASK) {
                printf("Port %u modified RSS hash function based on hardware support, "
                        "requested:%#"PRIx64" configured:%#"PRIx64"\n",
                        port_id, default_rss_hf,
                        port_conf.rx_adv_conf.rss_conf.rss_hf);
            }

            if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) {
                port_conf.txmode.offloads |=
                    DEV_TX_OFFLOAD_MBUF_FAST_FREE;
            }

            /* Set Rx VLAN stripping */
            if (ff_global_cfg.dpdk.vlan_strip) {
                if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
                    port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
                }
            }

            /* Enable HW CRC stripping */
            port_conf.rxmode.offloads &= ~DEV_RX_OFFLOAD_KEEP_CRC;

            /* FIXME: Enable TCP LRO? */
            #if 0
            if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) {
                printf("LRO is supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
                pconf->hw_features.rx_lro = 1;
            }
            #endif

            /* Set Rx checksum checking */
            if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_UDP_CKSUM) &&
                (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
                printf("RX checksum offload supported\n");
                port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
                pconf->hw_features.rx_csum = 1;
            }

            if (ff_global_cfg.dpdk.tx_csum_offoad_skip == 0) {
                if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
                    printf("TX ip checksum offload supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
                    pconf->hw_features.tx_csum_ip = 1;
                }

                if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM) &&
                    (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
                    printf("TX TCP&UDP checksum offload supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM;
                    pconf->hw_features.tx_csum_l4 = 1;
                }
            } else {
                printf("TX checksum offload is disabled\n");
            }

            if (ff_global_cfg.dpdk.tso) {
                if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
                    printf("TSO is supported\n");
                    port_conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO;
                    pconf->hw_features.tx_tso = 1;
                }
            } else {
                printf("TSO is disabled\n");
            }

            if (dev_info.reta_size) {
                /* reta size must be power of 2 */
                assert((dev_info.reta_size & (dev_info.reta_size - 1)) == 0);

                rss_reta_size[port_id] = dev_info.reta_size;
                printf("port[%d]: rss table size: %d\n", port_id,
                    dev_info.reta_size);
            }

            if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
                continue;
            }

            ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &port_conf);
            if (ret != 0) {
                return ret;
            }

            static uint16_t nb_rxd = RX_QUEUE_SIZE;
            static uint16_t nb_txd = TX_QUEUE_SIZE;
            ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
            if (ret < 0)
                printf("Could not adjust number of descriptors "
                        "for port%u (%d)\n", (unsigned)port_id, ret);

            uint16_t q;
            for (q = 0; q < nb_queues; q++) {
                if (numa_on) {
                    uint16_t lcore_id = lcore_conf.port_cfgs[port_id].lcore_list[q];
                    socketid = rte_lcore_to_socket_id(lcore_id);
                }
                mbuf_pool = pktmbuf_pool[socketid];

                txq_conf = dev_info.default_txconf;
                txq_conf.offloads = port_conf.txmode.offloads;
                ret = rte_eth_tx_queue_setup(port_id, q, nb_txd,
                    socketid, &txq_conf);
                if (ret < 0) {
                    return ret;
                }

                rxq_conf = dev_info.default_rxconf;
                rxq_conf.offloads = port_conf.rxmode.offloads;
                ret = rte_eth_rx_queue_setup(port_id, q, nb_rxd,
                    socketid, &rxq_conf, mbuf_pool);
                if (ret < 0) {
                    return ret;
                }
            }

            if (strncmp(dev_info.driver_name, BOND_DRIVER_NAME,
                    strlen(dev_info.driver_name)) == 0) {

                rte_eth_macaddr_get(port_id, &addr);
                printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
                           " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
                        (unsigned)port_id,
                        addr.addr_bytes[0], addr.addr_bytes[1],
                        addr.addr_bytes[2], addr.addr_bytes[3],
                        addr.addr_bytes[4], addr.addr_bytes[5]);

                rte_memcpy(pconf->mac,
                    addr.addr_bytes, RTE_ETHER_ADDR_LEN);

                int mode, count, x;
                uint16_t slaves[RTE_MAX_ETHPORTS], len = RTE_MAX_ETHPORTS;

                mode = rte_eth_bond_mode_get(port_id);
                printf("Port %u, bond mode:%d\n", port_id, mode);

                count = rte_eth_bond_slaves_get(port_id, slaves, len);
                printf("Port %u, %s's slave ports count:%d\n", port_id,
                            ff_global_cfg.dpdk.bond_cfgs->name, count);
                for (x = 0; x < count; x++) {
                    printf("Port %u, %s's slave port[%u]\n", port_id,
                            ff_global_cfg.dpdk.bond_cfgs->name, slaves[x]);
                }
            }

            ret = rte_eth_dev_start(port_id);
            if (ret < 0) {
                return ret;
            }

    /* The RSS RETA update will fail when flow isolation is enabled */
    #ifndef FF_FLOW_ISOLATE
            if (nb_queues > 1) {
                /* set HW rss hash function to Toeplitz. */
                if (!rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_HASH)) {
                    struct rte_eth_hash_filter_info info = {0};
                    info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
                    info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;

                    if (rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_HASH,
                        RTE_ETH_FILTER_SET, &info) < 0) {
                        rte_exit(EXIT_FAILURE, "port[%d] set hash func failed\n",
                            port_id);
                    }
                }

                set_rss_table(port_id, dev_info.reta_size, nb_queues);
            }
    #endif

            /* Enable RX in promiscuous mode for the Ethernet device. */
            if (ff_global_cfg.dpdk.promiscuous) {
                ret = rte_eth_promiscuous_enable(port_id);
                if (ret == 0) {
                    printf("set port %u to promiscuous mode ok\n", port_id);
                } else {
                    printf("set port %u to promiscuous mode error\n", port_id);
                }
            }
        }
    }

    if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
        check_all_ports_link_status();
    }

    return 0;
}

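/*
 * Drive the FreeBSD hardclock from a periodic DPDK timer that fires every
 * 1000 / freebsd.hz milliseconds on this lcore.
 */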
static int
init_clock(void)
{
    rte_timer_subsystem_init();
    uint64_t hz = rte_get_timer_hz();
    uint64_t intrs = MS_PER_S / ff_global_cfg.freebsd.hz;
    uint64_t tsc = (hz + MS_PER_S - 1) / MS_PER_S * intrs;

    rte_timer_init(&freebsd_clock);
    rte_timer_reset(&freebsd_clock, tsc, PERIODICAL,
        rte_lcore_id(), &ff_hardclock_job, NULL);

    ff_update_current_ts();

    return 0;
}

#ifdef FF_FLOW_ISOLATE
/** Print a message out of a flow error. */
static int
port_flow_complain(struct rte_flow_error *error)
{
    static const char *const errstrlist[] = {
        [RTE_FLOW_ERROR_TYPE_NONE] = "no error",
        [RTE_FLOW_ERROR_TYPE_UNSPECIFIED] = "cause unspecified",
        [RTE_FLOW_ERROR_TYPE_HANDLE] = "flow rule (handle)",
        [RTE_FLOW_ERROR_TYPE_ATTR_GROUP] = "group field",
        [RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY] = "priority field",
        [RTE_FLOW_ERROR_TYPE_ATTR_INGRESS] = "ingress field",
        [RTE_FLOW_ERROR_TYPE_ATTR_EGRESS] = "egress field",
        [RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER] = "transfer field",
        [RTE_FLOW_ERROR_TYPE_ATTR] = "attributes structure",
        [RTE_FLOW_ERROR_TYPE_ITEM_NUM] = "pattern length",
        [RTE_FLOW_ERROR_TYPE_ITEM_SPEC] = "item specification",
        [RTE_FLOW_ERROR_TYPE_ITEM_LAST] = "item specification range",
        [RTE_FLOW_ERROR_TYPE_ITEM_MASK] = "item specification mask",
        [RTE_FLOW_ERROR_TYPE_ITEM] = "specific pattern item",
        [RTE_FLOW_ERROR_TYPE_ACTION_NUM] = "number of actions",
        [RTE_FLOW_ERROR_TYPE_ACTION_CONF] = "action configuration",
        [RTE_FLOW_ERROR_TYPE_ACTION] = "specific action",
    };
    const char *errstr;
    char buf[32];
    int err = rte_errno;

    if ((unsigned int)error->type >= RTE_DIM(errstrlist) ||
        !errstrlist[error->type])
        errstr = "unknown type";
    else
        errstr = errstrlist[error->type];
    printf("Caught error type %d (%s): %s%s: %s\n",
           error->type, errstr,
           error->cause ? (snprintf(buf, sizeof(buf), "cause: %p, ",
                                    error->cause), buf) : "",
           error->message ? error->message : "(no stated reason)",
           rte_strerror(err));
    return -err;
}

static int
port_flow_isolate(uint16_t port_id, int set)
{
    struct rte_flow_error error;

    /* Poisoning to make sure PMDs update it in case of error. */
    memset(&error, 0x66, sizeof(error));
    if (rte_flow_isolate(port_id, set, &error))
        return port_flow_complain(&error);
    printf("Ingress traffic on port %u is %s to the defined flow rules\n",
           port_id,
           set ? "now restricted" : "not restricted anymore");
    return 0;
}

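/*
 * Install two rte_flow rules that RSS-spread IPv4/TCP traffic matching
 * tcp_port across all configured queues: one rule matches the destination
 * port, the other the source port.
 */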
static int
create_tcp_flow(uint16_t port_id, uint16_t tcp_port)
{
    struct rte_flow_attr attr = {.ingress = 1};
    struct ff_port_cfg *pconf = &ff_global_cfg.dpdk.port_cfgs[port_id];
    int nb_queues = pconf->nb_lcores;
    uint16_t queue[RTE_MAX_QUEUES_PER_PORT];
    int i = 0, j = 0;
    for (i = 0, j = 0; i < nb_queues; ++i)
        queue[j++] = i;
    struct rte_flow_action_rss rss = {
        .types = ETH_RSS_NONFRAG_IPV4_TCP,
        .key_len = rsskey_len,
        .key = rsskey,
        .queue_num = j,
        .queue = queue,
    };

    struct rte_eth_dev_info dev_info;
    int ret = rte_eth_dev_info_get(port_id, &dev_info);
    if (ret != 0)
        rte_exit(EXIT_FAILURE,
            "Error during getting device (port %u) info: %s\n",
            port_id, strerror(-ret));

    struct rte_flow_item pattern[3];
    struct rte_flow_action action[2];
    struct rte_flow_item_tcp tcp_spec;
    struct rte_flow_item_tcp tcp_mask = {
        .hdr = {
            .src_port = RTE_BE16(0x0000),
            .dst_port = RTE_BE16(0xffff),
        },
    };
    struct rte_flow_error error;

    memset(pattern, 0, sizeof(pattern));
    memset(action, 0, sizeof(action));

    /* match any IPv4 packet ... */
    pattern[0].type = RTE_FLOW_ITEM_TYPE_IPV4;

    /* ... whose TCP destination port equals tcp_port */
    memset(&tcp_spec, 0, sizeof(struct rte_flow_item_tcp));
    tcp_spec.hdr.dst_port = rte_cpu_to_be_16(tcp_port);
    pattern[1].type = RTE_FLOW_ITEM_TYPE_TCP;
    pattern[1].spec = &tcp_spec;
    pattern[1].mask = &tcp_mask;

    /* end the pattern array */
    pattern[2].type = RTE_FLOW_ITEM_TYPE_END;

    /* create the action */
    action[0].type = RTE_FLOW_ACTION_TYPE_RSS;
    action[0].conf = &rss;
    action[1].type = RTE_FLOW_ACTION_TYPE_END;

    struct rte_flow *flow;
    /* validate and create the flow rule */
    if (!rte_flow_validate(port_id, &attr, pattern, action, &error)) {
        flow = rte_flow_create(port_id, &attr, pattern, action, &error);
        if (!flow) {
            return port_flow_complain(&error);
        }
    }

    memset(pattern, 0, sizeof(pattern));

    /* second rule: match on the TCP source port instead */
    pattern[0].type = RTE_FLOW_ITEM_TYPE_IPV4;

    struct rte_flow_item_tcp tcp_src_mask = {
        .hdr = {
            .src_port = RTE_BE16(0xffff),
            .dst_port = RTE_BE16(0x0000),
        },
    };

    memset(&tcp_spec, 0, sizeof(struct rte_flow_item_tcp));
    tcp_spec.hdr.src_port = rte_cpu_to_be_16(tcp_port);
    pattern[1].type = RTE_FLOW_ITEM_TYPE_TCP;
    pattern[1].spec = &tcp_spec;
    pattern[1].mask = &tcp_src_mask;

    /* end the pattern array */
    pattern[2].type = RTE_FLOW_ITEM_TYPE_END;

    /* validate and create the flow rule */
    if (!rte_flow_validate(port_id, &attr, pattern, action, &error)) {
        flow = rte_flow_create(port_id, &attr, pattern, action, &error);
        if (!flow) {
            return port_flow_complain(&error);
        }
    }

    return 1;
}

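/*
 * Flow-isolation setup for a single port: TCP RSS rules for tcp_port plus
 * a rule steering ARP frames (ethertype 0x0806) to queue 0.
 */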
static int
init_flow(uint16_t port_id, uint16_t tcp_port)
{
    // struct ff_flow_cfg fcfg = ff_global_cfg.dpdk.flow_cfgs[0];

    // int i;
    // for (i = 0; i < fcfg.nb_port; i++) {
    //     if(!create_tcp_flow(fcfg.port_id, fcfg.tcp_ports[i])) {
    //         return 0;
    //     }
    // }

    if (!create_tcp_flow(port_id, tcp_port)) {
        rte_exit(EXIT_FAILURE, "create tcp flow failed\n");
        return -1;
    }

    /* ARP rule */
    struct rte_flow_attr attr = {.ingress = 1};
    struct rte_flow_action_queue queue = {.index = 0};

    struct rte_flow_item pattern_[2];
    struct rte_flow_action action[2];
    struct rte_flow_item_eth eth_type = {.type = RTE_BE16(0x0806)};
    struct rte_flow_item_eth eth_mask = {
        .type = RTE_BE16(0xffff)
    };

    memset(pattern_, 0, sizeof(pattern_));
    memset(action, 0, sizeof(action));

    pattern_[0].type = RTE_FLOW_ITEM_TYPE_ETH;
    pattern_[0].spec = &eth_type;
    pattern_[0].mask = &eth_mask;

    pattern_[1].type = RTE_FLOW_ITEM_TYPE_END;

    /* create the action */
    action[0].type = RTE_FLOW_ACTION_TYPE_QUEUE;
    action[0].conf = &queue;
    action[1].type = RTE_FLOW_ACTION_TYPE_END;

    struct rte_flow *flow;
    struct rte_flow_error error;
    /* validate and create the flow rule */
    if (!rte_flow_validate(port_id, &attr, pattern_, action, &error)) {
        flow = rte_flow_create(port_id, &attr, pattern_, action, &error);
        if (!flow) {
            return port_flow_complain(&error);
        }
    }

    return 1;
}

#endif

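/*
 * Public entry point for DPDK-side initialization. Applications normally
 * reach this through ff_init(), which parses the config file before passing
 * the EAL arguments here.
 */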
int
ff_dpdk_init(int argc, char **argv)
{
    if (ff_global_cfg.dpdk.nb_procs < 1 ||
        ff_global_cfg.dpdk.nb_procs > RTE_MAX_LCORE ||
        ff_global_cfg.dpdk.proc_id >= ff_global_cfg.dpdk.nb_procs ||
        ff_global_cfg.dpdk.proc_id < 0) {
        printf("param num_procs[%d] or proc_id[%d] error!\n",
            ff_global_cfg.dpdk.nb_procs,
            ff_global_cfg.dpdk.proc_id);
        exit(1);
    }

    int ret = rte_eal_init(argc, argv);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
    }

    numa_on = ff_global_cfg.dpdk.numa_on;

    idle_sleep = ff_global_cfg.dpdk.idle_sleep;
    pkt_tx_delay = ff_global_cfg.dpdk.pkt_tx_delay > BURST_TX_DRAIN_US ?
        BURST_TX_DRAIN_US : ff_global_cfg.dpdk.pkt_tx_delay;

    init_lcore_conf();

    init_mem_pool();

    init_dispatch_ring();

    init_msg_ring();

#ifdef FF_KNI
    enable_kni = ff_global_cfg.kni.enable;
    if (enable_kni) {
        init_kni();
    }
#endif

#ifdef FF_USE_PAGE_ARRAY
    ff_mmap_init();
#endif

#ifdef FF_FLOW_ISOLATE
    /* run once, in the primary process only */
    if (0 == lcore_conf.tx_queue_id[0]) {
        ret = port_flow_isolate(0, 1);
        if (ret < 0)
            rte_exit(EXIT_FAILURE, "init_port_isolate failed\n");
    }
#endif

    ret = init_port_start();
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_start failed\n");
    }

    init_clock();

#ifdef FF_FLOW_ISOLATE
    /*
     * Example usage only: port_id = 0, tcp_port = 80.
     * Recommended improvements:
     * 1. init_flow() should replace set_rss_table() in the init_port_start()
     *    loop, so that every port in port_id_list is configured instead of
     *    only device (port_id) 0.
     * 2. Take the TCP port from a `tcp_port` config option instead of the
     *    magic number 80.
     */
    ret = init_flow(0, 80);
    if (ret < 0) {
        rte_exit(EXIT_FAILURE, "init_port_flow failed\n");
    }
#endif
    return 0;
}

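/*
 * Hand one (possibly multi-segment) received mbuf to the FreeBSD stack:
 * drop it early if HW checksum validation failed, wrap every segment in an
 * f-stack mbuf, then inject the chain into the virtual ethernet interface.
 */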
static void
ff_veth_input(const struct ff_dpdk_if_context *ctx, struct rte_mbuf *pkt)
{
    uint8_t rx_csum = ctx->hw_features.rx_csum;
    if (rx_csum) {
        if (pkt->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
            rte_pktmbuf_free(pkt);
            return;
        }
    }

    void *data = rte_pktmbuf_mtod(pkt, void*);
    uint16_t len = rte_pktmbuf_data_len(pkt);

    void *hdr = ff_mbuf_gethdr(pkt, pkt->pkt_len, data, len, rx_csum);
    if (hdr == NULL) {
        rte_pktmbuf_free(pkt);
        return;
    }

    if (pkt->ol_flags & PKT_RX_VLAN_STRIPPED) {
        ff_mbuf_set_vlan_info(hdr, pkt->vlan_tci);
    }

    struct rte_mbuf *pn = pkt->next;
    void *prev = hdr;
    while (pn != NULL) {
        data = rte_pktmbuf_mtod(pn, void*);
        len = rte_pktmbuf_data_len(pn);

        void *mb = ff_mbuf_get(prev, pn, data, len);
        if (mb == NULL) {
            ff_mbuf_free(hdr);
            rte_pktmbuf_free(pkt);
            return;
        }
        pn = pn->next;
        prev = mb;
    }

    ff_veth_process_packet(ctx->ifp, hdr);
}

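/*
 * Classify a frame by its ethertype, skipping one VLAN tag if present.
 * ARP is always flagged so it can be broadcast to all queues (and KNI);
 * IPv4/IPv6 frames may additionally be diverted by ff_kni_proto_filter().
 */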
static enum FilterReturn
protocol_filter(const void *data, uint16_t len)
{
    /* need at least a full Ethernet header */
    if (len < RTE_ETHER_HDR_LEN)
        return FILTER_UNKNOWN;

    const struct rte_ether_hdr *hdr;
    const struct rte_vlan_hdr *vlanhdr;
    hdr = (const struct rte_ether_hdr *)data;
    uint16_t ether_type = rte_be_to_cpu_16(hdr->ether_type);
    data += RTE_ETHER_HDR_LEN;
    len -= RTE_ETHER_HDR_LEN;

    if (ether_type == RTE_ETHER_TYPE_VLAN) {
        vlanhdr = (struct rte_vlan_hdr *)data;
        ether_type = rte_be_to_cpu_16(vlanhdr->eth_proto);
        data += sizeof(struct rte_vlan_hdr);
        len -= sizeof(struct rte_vlan_hdr);
    }

    if (ether_type == RTE_ETHER_TYPE_ARP)
        return FILTER_ARP;

#ifdef INET6
    if (ether_type == RTE_ETHER_TYPE_IPV6) {
        return ff_kni_proto_filter(data,
            len, ether_type);
    }
#endif

#ifndef FF_KNI
    return FILTER_UNKNOWN;
#else
    if (!enable_kni) {
        return FILTER_UNKNOWN;
    }

    if (ether_type != RTE_ETHER_TYPE_IPV4)
        return FILTER_UNKNOWN;

    return ff_kni_proto_filter(data,
        len, ether_type);
#endif
}

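/* Deep-copy one segment's payload and metadata from m into mi. */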
static inline void
pktmbuf_deep_attach(struct rte_mbuf *mi, const struct rte_mbuf *m)
{
    struct rte_mbuf *md;
    void *src, *dst;

    dst = rte_pktmbuf_mtod(mi, void *);
    src = rte_pktmbuf_mtod(m, void *);

    mi->data_len = m->data_len;
    rte_memcpy(dst, src, m->data_len);

    mi->port = m->port;
    mi->vlan_tci = m->vlan_tci;
    mi->vlan_tci_outer = m->vlan_tci_outer;
    mi->tx_offload = m->tx_offload;
    mi->hash = m->hash;
    mi->ol_flags = m->ol_flags;
    mi->packet_type = m->packet_type;
}

/* modeled on rte_pktmbuf_clone, but deep-copies every segment */
static inline struct rte_mbuf *
pktmbuf_deep_clone(const struct rte_mbuf *md,
    struct rte_mempool *mp)
{
    struct rte_mbuf *mc, *mi, **prev;
    uint32_t pktlen;
    uint8_t nseg;

    if (unlikely((mc = rte_pktmbuf_alloc(mp)) == NULL))
        return NULL;

    mi = mc;
    prev = &mi->next;
    pktlen = md->pkt_len;
    nseg = 0;

    do {
        nseg++;
        pktmbuf_deep_attach(mi, md);
        *prev = mi;
        prev = &mi->next;
    } while ((md = md->next) != NULL &&
        (mi = rte_pktmbuf_alloc(mp)) != NULL);

    *prev = NULL;
    mc->nb_segs = nseg;
    mc->pkt_len = pktlen;

    /* Allocation of a new segment failed */
    if (unlikely(mi == NULL)) {
        rte_pktmbuf_free(mc);
        return NULL;
    }

    __rte_mbuf_sanity_check(mc, 1);
    return mc;
}

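/*
 * Per-burst RX handler: optionally dump to pcap and update traffic counters,
 * run the user dispatch callback (which may answer in place, drop, or
 * re-queue to another lcore), broadcast ARP/NDP clones to the other queues
 * and KNI, and hand everything else to ff_veth_input() or KNI.
 */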
static inline void
process_packets(uint16_t port_id, uint16_t queue_id, struct rte_mbuf **bufs,
    uint16_t count, const struct ff_dpdk_if_context *ctx, int pkts_from_ring)
{
    struct lcore_conf *qconf = &lcore_conf;
    uint16_t nb_queues = qconf->nb_queue_list[port_id];

    uint16_t i;
    for (i = 0; i < count; i++) {
        struct rte_mbuf *rtem = bufs[i];

        if (unlikely(ff_global_cfg.pcap.enable)) {
            if (!pkts_from_ring) {
                ff_dump_packets(ff_global_cfg.pcap.save_path, rtem,
                    ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
            }
        }

        void *data = rte_pktmbuf_mtod(rtem, void*);
        uint16_t len = rte_pktmbuf_data_len(rtem);

        if (!pkts_from_ring) {
            ff_traffic.rx_packets++;
            ff_traffic.rx_bytes += len;
        }

        if (!pkts_from_ring && packet_dispatcher) {
            int ret = (*packet_dispatcher)(data, &len, queue_id, nb_queues);
            if (ret == FF_DISPATCH_RESPONSE) {
                rte_pktmbuf_pkt_len(rtem) = rte_pktmbuf_data_len(rtem) = len;

                /*
                 * Outgoing VLAN tag stripping is not supported yet, so
                 * re-insert the VLAN header before sending the response.
                 */
                if (rtem->vlan_tci) {
                    data = rte_pktmbuf_prepend(rtem, sizeof(struct rte_vlan_hdr));
                    if (data != NULL) {
                        memmove(data, data + sizeof(struct rte_vlan_hdr), RTE_ETHER_HDR_LEN);
                        struct rte_ether_hdr *etherhdr = (struct rte_ether_hdr *)data;
                        struct rte_vlan_hdr *vlanhdr = (struct rte_vlan_hdr *)(data + RTE_ETHER_HDR_LEN);
                        vlanhdr->vlan_tci = rte_cpu_to_be_16(rtem->vlan_tci);
                        vlanhdr->eth_proto = etherhdr->ether_type;
                        etherhdr->ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN);
                    }
                }
                send_single_packet(rtem, port_id);
                continue;
            }

            if (ret == FF_DISPATCH_ERROR || ret >= nb_queues) {
                rte_pktmbuf_free(rtem);
                continue;
            }

            if (ret != queue_id) {
                ret = rte_ring_enqueue(dispatch_ring[port_id][ret], rtem);
                if (ret < 0)
                    rte_pktmbuf_free(rtem);

                continue;
            }
        }

        enum FilterReturn filter = protocol_filter(data, len);
#ifdef INET6
        if (filter == FILTER_ARP || filter == FILTER_NDP) {
#else
        if (filter == FILTER_ARP) {
#endif
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *mbuf_clone;
            if (!pkts_from_ring) {
                uint16_t j;
                for (j = 0; j < nb_queues; ++j) {
                    if (j == queue_id)
                        continue;

                    unsigned socket_id = 0;
                    if (numa_on) {
                        uint16_t lcore_id = qconf->port_cfgs[port_id].lcore_list[j];
                        socket_id = rte_lcore_to_socket_id(lcore_id);
                    }
                    mbuf_pool = pktmbuf_pool[socket_id];
                    mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                    if (mbuf_clone) {
                        int ret = rte_ring_enqueue(dispatch_ring[port_id][j],
                            mbuf_clone);
                        if (ret < 0)
                            rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }

#ifdef FF_KNI
            if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
                mbuf_pool = pktmbuf_pool[qconf->socket_id];
                mbuf_clone = pktmbuf_deep_clone(rtem, mbuf_pool);
                if (mbuf_clone) {
                    ff_kni_enqueue(port_id, mbuf_clone);
                }
            }
#endif
            ff_veth_input(ctx, rtem);
#ifdef FF_KNI
        } else if (enable_kni) {
            if (knictl_action == FF_KNICTL_ACTION_ALL_TO_KNI) {
                ff_kni_enqueue(port_id, rtem);
            } else if (knictl_action == FF_KNICTL_ACTION_ALL_TO_FF) {
                ff_veth_input(ctx, rtem);
            } else if (knictl_action == FF_KNICTL_ACTION_DEFAULT) {
                if (enable_kni &&
                        ((filter == FILTER_KNI && kni_accept) ||
                        (filter == FILTER_UNKNOWN && !kni_accept))) {
                    ff_kni_enqueue(port_id, rtem);
                } else {
                    ff_veth_input(ctx, rtem);
                }
            } else {
                ff_veth_input(ctx, rtem);
            }
#endif
        } else {
            ff_veth_input(ctx, rtem);
        }
    }
}

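/* Drain packets that other lcores queued for us and process them locally. */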
static inline int
process_dispatch_ring(uint16_t port_id, uint16_t queue_id,
    struct rte_mbuf **pkts_burst, const struct ff_dpdk_if_context *ctx)
{
    /* dequeue a burst from the dispatch ring and process it */
    uint16_t nb_rb;
    nb_rb = rte_ring_dequeue_burst(dispatch_ring[port_id][queue_id],
        (void **)pkts_burst, MAX_PKT_BURST, NULL);

    if (nb_rb > 0) {
        process_packets(port_id, queue_id, pkts_burst, nb_rb, ctx, 1);
    }

    return 0;
}

static inline void
handle_sysctl_msg(struct ff_msg *msg)
{
    int ret = ff_sysctl(msg->sysctl.name, msg->sysctl.namelen,
        msg->sysctl.old, msg->sysctl.oldlenp, msg->sysctl.new,
        msg->sysctl.newlen);

    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_ioctl_msg(struct ff_msg *msg)
{
    int fd, ret;
#ifdef INET6
    if (msg->msg_type == FF_IOCTL6) {
        fd = ff_socket(AF_INET6, SOCK_DGRAM, 0);
    } else
#endif
        fd = ff_socket(AF_INET, SOCK_DGRAM, 0);

    if (fd < 0) {
        ret = -1;
        goto done;
    }

    ret = ff_ioctl_freebsd(fd, msg->ioctl.cmd, msg->ioctl.data);

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_route_msg(struct ff_msg *msg)
{
    int ret = ff_rtioctl(msg->route.fib, msg->route.data,
        &msg->route.len, msg->route.maxlen);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}

static inline void
handle_top_msg(struct ff_msg *msg)
{
    msg->top = ff_top_status;
    msg->result = 0;
}

#ifdef FF_NETGRAPH
static inline void
handle_ngctl_msg(struct ff_msg *msg)
{
    int ret = ff_ngctl(msg->ngctl.cmd, msg->ngctl.data);
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
        msg->ngctl.ret = ret;
    }
}
#endif

#ifdef FF_IPFW
static inline void
handle_ipfw_msg(struct ff_msg *msg)
{
    int fd, ret;
    fd = ff_socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
    if (fd < 0) {
        ret = -1;
        goto done;
    }

    switch (msg->ipfw.cmd) {
        case FF_IPFW_GET:
            ret = ff_getsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                msg->ipfw.optlen);
            break;
        case FF_IPFW_SET:
            ret = ff_setsockopt_freebsd(fd, msg->ipfw.level,
                msg->ipfw.optname, msg->ipfw.optval,
                *(msg->ipfw.optlen));
            break;
        default:
            ret = -1;
            errno = ENOTSUP;
            break;
    }

    ff_close(fd);

done:
    if (ret < 0) {
        msg->result = errno;
    } else {
        msg->result = 0;
    }
}
#endif

static inline void
handle_traffic_msg(struct ff_msg *msg)
{
    msg->traffic = ff_traffic;
    msg->result = 0;
}

#ifdef FF_KNI
static inline void
handle_knictl_msg(struct ff_msg *msg)
{
    if (msg->knictl.kni_cmd == FF_KNICTL_CMD_SET) {
        switch (msg->knictl.kni_action) {
            case FF_KNICTL_ACTION_ALL_TO_FF:
                knictl_action = FF_KNICTL_ACTION_ALL_TO_FF;
                msg->result = 0;
                printf("new kni action: alltoff\n");
                break;
            case FF_KNICTL_ACTION_ALL_TO_KNI:
                knictl_action = FF_KNICTL_ACTION_ALL_TO_KNI;
                msg->result = 0;
                printf("new kni action: alltokni\n");
                break;
            case FF_KNICTL_ACTION_DEFAULT:
                knictl_action = FF_KNICTL_ACTION_DEFAULT;
                msg->result = 0;
                printf("new kni action: default\n");
                break;
            default:
                msg->result = -1;
        }
    } else if (msg->knictl.kni_cmd == FF_KNICTL_CMD_GET) {
        msg->knictl.kni_action = knictl_action;
        msg->result = 0;
    } else {
        msg->result = -2;
    }
}
#endif

static inline void
handle_default_msg(struct ff_msg *msg)
{
    msg->result = ENOTSUP;
}

static inline void
handle_msg(struct ff_msg *msg, uint16_t proc_id)
{
    switch (msg->msg_type) {
        case FF_SYSCTL:
            handle_sysctl_msg(msg);
            break;
        case FF_IOCTL:
#ifdef INET6
        case FF_IOCTL6:
#endif
            handle_ioctl_msg(msg);
            break;
        case FF_ROUTE:
            handle_route_msg(msg);
            break;
        case FF_TOP:
            handle_top_msg(msg);
            break;
#ifdef FF_NETGRAPH
        case FF_NGCTL:
            handle_ngctl_msg(msg);
            break;
#endif
#ifdef FF_IPFW
        case FF_IPFW_CTL:
            handle_ipfw_msg(msg);
            break;
#endif
        case FF_TRAFFIC:
            handle_traffic_msg(msg);
            break;
#ifdef FF_KNI
        case FF_KNICTL:
            handle_knictl_msg(msg);
            break;
#endif
        default:
            handle_default_msg(msg);
            break;
    }
    rte_ring_enqueue(msg_ring[proc_id].ring[msg->msg_type], msg);
}

1606 static inline int
1607 process_msg_ring(uint16_t proc_id)
1608 {
1609     void *msg;
1610     int ret = rte_ring_dequeue(msg_ring[proc_id].ring[0], &msg);
1611 
1612     if (unlikely(ret == 0)) {
1613         handle_msg((struct ff_msg *)msg, proc_id);
1614     }
1615 
1616     return 0;
1617 }
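
/*
 * Illustrative sketch (not part of the original source): the tool side of
 * this message protocol would enqueue a request on ring[0] and poll the
 * per-type ring for the reply, roughly as below. The function name and the
 * busy-wait are hypothetical.
 */
#if 0
static int
example_send_request(struct ff_msg *msg, uint16_t proc_id)
{
    void *reply;

    if (rte_ring_enqueue(msg_ring[proc_id].ring[0], msg) < 0)
        return -1;

    /* Spin until handle_msg() posts the reply on the per-type ring. */
    while (rte_ring_dequeue(msg_ring[proc_id].ring[msg->msg_type], &reply) != 0)
        ;

    return ((struct ff_msg *)reply)->result;
}
#endif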
1618 
1619 /* Send burst of packets on an output interface */
1620 static inline int
1621 send_burst(struct lcore_conf *qconf, uint16_t n, uint8_t port)
1622 {
1623     struct rte_mbuf **m_table;
1624     int ret;
1625     uint16_t queueid;
1626 
1627     queueid = qconf->tx_queue_id[port];
1628     m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;
1629 
1630     if (unlikely(ff_global_cfg.pcap.enable)) {
1631         uint16_t i;
1632         for (i = 0; i < n; i++) {
1633             ff_dump_packets(ff_global_cfg.pcap.save_path, m_table[i],
1634                 ff_global_cfg.pcap.snap_len, ff_global_cfg.pcap.save_len);
1635         }
1636     }
1637 
1638     ret = rte_eth_tx_burst(port, queueid, m_table, n);
1639     ff_traffic.tx_packets += ret;
1640     uint16_t i;
1641     for (i = 0; i < ret; i++) {
1642         ff_traffic.tx_bytes += rte_pktmbuf_pkt_len(m_table[i]);
1643 #ifdef FF_USE_PAGE_ARRAY
1644         if (qconf->tx_mbufs[port].bsd_m_table[i])
1645             ff_enq_tx_bsdmbuf(port, qconf->tx_mbufs[port].bsd_m_table[i], m_table[i]->nb_segs);
1646 #endif
1647     }
1648     if (unlikely(ret < n)) {
1649         do {
1650             rte_pktmbuf_free(m_table[ret]);
1651 #ifdef FF_USE_PAGE_ARRAY
1652             if (qconf->tx_mbufs[port].bsd_m_table[ret])
1653                 ff_mbuf_free(qconf->tx_mbufs[port].bsd_m_table[ret]);
1654 #endif
1655         } while (++ret < n);
1656     }
1657     return 0;
1658 }
1659 
1660 /* Enqueue a single packet, and send burst if queue is filled */
1661 static inline int
1662 send_single_packet(struct rte_mbuf *m, uint8_t port)
1663 {
1664     uint16_t len;
1665     struct lcore_conf *qconf;
1666 
1667     qconf = &lcore_conf;
1668     len = qconf->tx_mbufs[port].len;
1669     qconf->tx_mbufs[port].m_table[len] = m;
1670     len++;
1671 
1672     /* enough pkts to be sent */
1673     if (unlikely(len == MAX_PKT_BURST)) {
1674         send_burst(qconf, MAX_PKT_BURST, port);
1675         len = 0;
1676     }
1677 
1678     qconf->tx_mbufs[port].len = len;
1679     return 0;
1680 }
1681 
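/*
 * Transmit a FreeBSD mbuf chain: copy its payload into one or more DPDK
 * mbuf segments, translate the stack's checksum/TSO requests into rte_mbuf
 * offload flags, and enqueue the result on this lcore's TX queue. When
 * FF_USE_PAGE_ARRAY is defined, ff_if_send_onepkt() takes over and the
 * per-segment copy below is skipped.
 */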
1682 int
1683 ff_dpdk_if_send(struct ff_dpdk_if_context *ctx, void *m,
1684     int total)
1685 {
1686 #ifdef FF_USE_PAGE_ARRAY
1687     struct lcore_conf *qconf = &lcore_conf;
1688     int len = 0;
1689 
1690     len = ff_if_send_onepkt(ctx, m, total);
1691     if (unlikely(len == MAX_PKT_BURST)) {
1692         send_burst(qconf, MAX_PKT_BURST, ctx->port_id);
1693         len = 0;
1694     }
1695     qconf->tx_mbufs[ctx->port_id].len = len;
1696     return 0;
1697 #endif
1698     struct rte_mempool *mbuf_pool = pktmbuf_pool[lcore_conf.socket_id];
1699     struct rte_mbuf *head = rte_pktmbuf_alloc(mbuf_pool);
1700     if (head == NULL) {
1701         ff_mbuf_free(m);
1702         return -1;
1703     }
1704 
1705     head->pkt_len = total;
1706     head->nb_segs = 0;
1707 
1708     int off = 0;
1709     struct rte_mbuf *cur = head, *prev = NULL;
1710     while (total > 0) {
1711         if (cur == NULL) {
1712             cur = rte_pktmbuf_alloc(mbuf_pool);
1713             if (cur == NULL) {
1714                 rte_pktmbuf_free(head);
1715                 ff_mbuf_free(m);
1716                 return -1;
1717             }
1718         }
1719 
1720         if (prev != NULL) {
1721             prev->next = cur;
1722         }
1723         head->nb_segs++;
1724 
1725         prev = cur;
1726         void *data = rte_pktmbuf_mtod(cur, void*);
1727         int len = total > RTE_MBUF_DEFAULT_DATAROOM ? RTE_MBUF_DEFAULT_DATAROOM : total;
1728         int ret = ff_mbuf_copydata(m, data, off, len);
1729         if (ret < 0) {
1730             rte_pktmbuf_free(head);
1731             ff_mbuf_free(m);
1732             return -1;
1733         }
1734 
1736         cur->data_len = len;
1737         off += len;
1738         total -= len;
1739         cur = NULL;
1740     }
1741 
1742     struct ff_tx_offload offload = {0};
1743     ff_mbuf_tx_offload(m, &offload);
1744 
1745     void *data = rte_pktmbuf_mtod(head, void*);
1746 
1747     if (offload.ip_csum) {
1748         /* ipv6 not supported yet */
1749         struct rte_ipv4_hdr *iph;
1750         int iph_len;
1751         iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN);
1752         iph_len = (iph->version_ihl & 0x0f) << 2;
1753 
1754         head->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
1755         head->l2_len = RTE_ETHER_HDR_LEN;
1756         head->l3_len = iph_len;
1757     }
1758 
1759     if (ctx->hw_features.tx_csum_l4) {
1760         struct rte_ipv4_hdr *iph;
1761         int iph_len;
1762         iph = (struct rte_ipv4_hdr *)(data + RTE_ETHER_HDR_LEN);
1763         iph_len = (iph->version_ihl & 0x0f) << 2;
1764 
1765         if (offload.tcp_csum) {
1766             head->ol_flags |= PKT_TX_TCP_CKSUM;
1767             head->l2_len = RTE_ETHER_HDR_LEN;
1768             head->l3_len = iph_len;
1769         }
1770 
1771         /*
1772          *  TCP segmentation offload.
1773          *
1774          *  - set the PKT_TX_TCP_SEG flag in mbuf->ol_flags (this flag
1775          *    implies PKT_TX_TCP_CKSUM)
1776          *  - set the flag PKT_TX_IPV4 or PKT_TX_IPV6
1777          *  - if it's IPv4, set the PKT_TX_IP_CKSUM flag and
1778          *    write the IP checksum to 0 in the packet
1779          *  - fill the mbuf offload information: l2_len,
1780          *    l3_len, l4_len, tso_segsz
1781          *  - calculate the pseudo header checksum without taking ip_len
1782          *    in account, and set it in the TCP header. Refer to
1783          *    rte_ipv4_phdr_cksum() and rte_ipv6_phdr_cksum() that can be
1784          *    used as helpers.
1785          */
1786         if (offload.tso_seg_size) {
1787             struct rte_tcp_hdr *tcph;
1788             int tcph_len;
1789             tcph = (struct rte_tcp_hdr *)((char *)iph + iph_len);
1790             tcph_len = (tcph->data_off & 0xf0) >> 2;
1791             tcph->cksum = rte_ipv4_phdr_cksum(iph, PKT_TX_TCP_SEG);
1792 
1793             head->ol_flags |= PKT_TX_TCP_SEG;
1794             head->l4_len = tcph_len;
1795             head->tso_segsz = offload.tso_seg_size;
1796         }
1797 
1798         if (offload.udp_csum) {
1799             head->ol_flags |= PKT_TX_UDP_CKSUM;
1800             head->l2_len = RTE_ETHER_HDR_LEN;
1801             head->l3_len = iph_len;
1802         }
1803     }
1804 
1805     ff_mbuf_free(m);
1806 
1807     return send_single_packet(head, ctx->port_id);
1808 }
1809 
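/*
 * Per-lcore event loop: expire timers, drain the TX burst queues once
 * pkt_tx_delay has elapsed, poll every assigned RX queue (plus KNI on the
 * primary process), service the control message ring, invoke the user's
 * loop callback, and optionally usleep() after an idle pass. The tail of
 * each iteration books the elapsed cycles into ff_top_status.
 */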
1810 static int
1811 main_loop(void *arg)
1812 {
1813     struct loop_routine *lr = (struct loop_routine *)arg;
1814 
1815     struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1816     uint64_t prev_tsc, diff_tsc, cur_tsc, usch_tsc, div_tsc, usr_tsc, sys_tsc, end_tsc, idle_sleep_tsc;
1817     int i, j, nb_rx, idle;
1818     uint16_t port_id, queue_id;
1819     struct lcore_conf *qconf;
1820     uint64_t drain_tsc = 0;
1821     struct ff_dpdk_if_context *ctx;
1822 
1823     if (pkt_tx_delay) {
1824         drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * pkt_tx_delay;
1825     }
1826 
1827     prev_tsc = 0;
1828     usch_tsc = 0;
1829 
1830     qconf = &lcore_conf;
1831 
1832     while (1) {
1833         cur_tsc = rte_rdtsc();
1834         if (unlikely(freebsd_clock.expire < cur_tsc)) {
1835             rte_timer_manage();
1836         }
1837 
1838         idle = 1;
1839         sys_tsc = 0;
1840         usr_tsc = 0;
1841 
1842         /*
1843          * TX burst queue drain
1844          */
1845         diff_tsc = cur_tsc - prev_tsc;
1846         if (unlikely(diff_tsc >= drain_tsc)) {
1847             for (i = 0; i < qconf->nb_tx_port; i++) {
1848                 port_id = qconf->tx_port_id[i];
1849                 if (qconf->tx_mbufs[port_id].len == 0)
1850                     continue;
1851 
1852                 idle = 0;
1853 
1854                 send_burst(qconf,
1855                     qconf->tx_mbufs[port_id].len,
1856                     port_id);
1857                 qconf->tx_mbufs[port_id].len = 0;
1858             }
1859 
1860             prev_tsc = cur_tsc;
1861         }
1862 
1863         /*
1864          * Read packet from RX queues
1865          */
1866         for (i = 0; i < qconf->nb_rx_queue; ++i) {
1867             port_id = qconf->rx_queue_list[i].port_id;
1868             queue_id = qconf->rx_queue_list[i].queue_id;
1869             ctx = veth_ctx[port_id];
1870 
1871 #ifdef FF_KNI
1872             if (enable_kni && rte_eal_process_type() == RTE_PROC_PRIMARY) {
1873                 ff_kni_process(port_id, queue_id, pkts_burst, MAX_PKT_BURST);
1874             }
1875 #endif
1876 
1877             process_dispatch_ring(port_id, queue_id, pkts_burst, ctx);
1878 
1879             nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts_burst,
1880                 MAX_PKT_BURST);
1881             if (nb_rx == 0)
1882                 continue;
1883 
1884             idle = 0;
1885 
1886             /* Prefetch first packets */
1887             for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++) {
1888                 rte_prefetch0(rte_pktmbuf_mtod(
1889                         pkts_burst[j], void *));
1890             }
1891 
1892             /* Prefetch and handle already prefetched packets */
1893             for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
1894                 rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
1895                         j + PREFETCH_OFFSET], void *));
1896                 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
1897             }
1898 
1899             /* Handle remaining prefetched packets */
1900             for (; j < nb_rx; j++) {
1901                 process_packets(port_id, queue_id, &pkts_burst[j], 1, ctx, 0);
1902             }
1903         }
1904 
1905         process_msg_ring(qconf->proc_id);
1906 
1907         div_tsc = rte_rdtsc();
1908 
1909         if (likely(lr->loop != NULL && (!idle || cur_tsc - usch_tsc >= drain_tsc))) {
1910             usch_tsc = cur_tsc;
1911             lr->loop(lr->arg);
1912         }
1913 
1914         idle_sleep_tsc = rte_rdtsc();
1915         if (likely(idle && idle_sleep)) {
1916             usleep(idle_sleep);
1917             end_tsc = rte_rdtsc();
1918         } else {
1919             end_tsc = idle_sleep_tsc;
1920         }
1921 
1922         if (usch_tsc == cur_tsc) {
1923             usr_tsc = idle_sleep_tsc - div_tsc;
1924         }
1925 
1926         if (!idle) {
1927             sys_tsc = div_tsc - cur_tsc;
1928             ff_top_status.sys_tsc += sys_tsc;
1929         }
1930 
1931         ff_top_status.usr_tsc += usr_tsc;
1932         ff_top_status.work_tsc += end_tsc - cur_tsc;
1933         ff_top_status.idle_tsc += end_tsc - cur_tsc - usr_tsc - sys_tsc;
1934 
1935         ff_top_status.loops++;
1936     }
1937 
1938     return 0;
1939 }
1940 
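/* Attach a veth context to every TX port owned by this lcore. */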
1941 int
1942 ff_dpdk_if_up(void) {
1943     int i;
1944     struct lcore_conf *qconf = &lcore_conf;
1945     for (i = 0; i < qconf->nb_tx_port; i++) {
1946         uint16_t port_id = qconf->tx_port_id[i];
1947 
1948         struct ff_port_cfg *pconf = &qconf->port_cfgs[port_id];
1949         veth_ctx[port_id] = ff_veth_attach(pconf);
1950         if (veth_ctx[port_id] == NULL) {
1951             rte_exit(EXIT_FAILURE, "ff_veth_attach failed");
1952         }
1953     }
1954 
1955     return 0;
1956 }
1957 
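/*
 * Launch main_loop() on all configured lcores, including the master, and
 * block until they return.
 */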
1958 void
1959 ff_dpdk_run(loop_func_t loop, void *arg) {
1960     struct loop_routine *lr = rte_malloc(NULL,
1961         sizeof(struct loop_routine), 0);
1962     lr->loop = loop;
1963     lr->arg = arg;
1964     rte_eal_mp_remote_launch(main_loop, lr, CALL_MASTER);
1965     rte_eal_mp_wait_lcore();
1966     rte_free(lr);
1967 }
1968 
1969 void
1970 ff_dpdk_pktmbuf_free(void *m)
1971 {
1972     rte_pktmbuf_free_seg((struct rte_mbuf *)m);
1973 }
1974 
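/*
 * Software Toeplitz hash over 'data' with the given RSS key; it mirrors
 * the hash the NIC computes, so it can predict RSS queue placement.
 */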
1975 static uint32_t
1976 toeplitz_hash(unsigned keylen, const uint8_t *key,
1977     unsigned datalen, const uint8_t *data)
1978 {
1979     uint32_t hash = 0, v;
1980     u_int i, b;
1981 
1982     /* XXXRW: Perhaps an assertion about key length vs. data length? */
1983 
1984     v = (key[0]<<24) + (key[1]<<16) + (key[2]<<8) + key[3];
1985     for (i = 0; i < datalen; i++) {
1986         for (b = 0; b < 8; b++) {
1987             if (data[i] & (1<<(7-b)))
1988                 hash ^= v;
1989             v <<= 1;
1990             if ((i + 4) < keylen &&
1991                 (key[i+4] & (1<<(7-b))))
1992                 v |= 1;
1993         }
1994     }
1995     return (hash);
1996 }
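
/*
 * Usage sketch (illustrative, not from the original source): hashing an
 * IPv4 source/destination pair with the 40-byte default key defined at the
 * top of this file. Inputs must be in network byte order and in the same
 * field order the NIC hashes. The function name is hypothetical.
 */
#if 0
static uint32_t
example_toeplitz_2tuple(uint32_t saddr_be, uint32_t daddr_be)
{
    uint8_t data[8];

    bcopy(&saddr_be, &data[0], sizeof(saddr_be));
    bcopy(&daddr_be, &data[4], sizeof(daddr_be));

    return toeplitz_hash(sizeof(default_rsskey_40bytes),
        default_rsskey_40bytes, sizeof(data), data);
}
#endif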
1997 
1998 int
1999 ff_in_pcbladdr(uint16_t family, void *faddr, uint16_t fport, void *laddr)
2000 {
2001     int ret = 0;
2002     uint16_t fa;
2003 
2004     if (!pcblddr_fun)
2005         return ret;
2006 
2007     if (family == AF_INET)
2008         fa = AF_INET;
2009     else if (family == AF_INET6_FREEBSD)
2010         fa = AF_INET6_LINUX;
2011     else
2012         return EADDRNOTAVAIL;
2013 
2014     ret = (*pcblddr_fun)(fa, faddr, fport, laddr);
2015 
2016     return ret;
2017 }
2018 
2019 void
2020 ff_regist_pcblddr_fun(pcblddr_func_t func)
2021 {
2022     pcblddr_fun = func;
2023 }
2024 
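/*
 * Check whether a locally chosen 4-tuple would be steered to this lcore's
 * own queue by hardware RSS: rebuild the NIC's hash input in network byte
 * order, run the same Toeplitz hash, and map it through the redirection
 * table onto a queue id.
 */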
2025 int
2026 ff_rss_check(void *softc, uint32_t saddr, uint32_t daddr,
2027     uint16_t sport, uint16_t dport)
2028 {
2029     struct lcore_conf *qconf = &lcore_conf;
2030     struct ff_dpdk_if_context *ctx = ff_veth_softc_to_hostc(softc);
2031     uint16_t nb_queues = qconf->nb_queue_list[ctx->port_id];
2032 
2033     if (nb_queues <= 1) {
2034         return 1;
2035     }
2036 
2037     uint16_t reta_size = rss_reta_size[ctx->port_id];
2038     uint16_t queueid = qconf->tx_queue_id[ctx->port_id];
2039 
2040     uint8_t data[sizeof(saddr) + sizeof(daddr) + sizeof(sport) +
2041         sizeof(dport)];
2042 
2043     unsigned datalen = 0;
2044 
2045     bcopy(&saddr, &data[datalen], sizeof(saddr));
2046     datalen += sizeof(saddr);
2047 
2048     bcopy(&daddr, &data[datalen], sizeof(daddr));
2049     datalen += sizeof(daddr);
2050 
2051     bcopy(&sport, &data[datalen], sizeof(sport));
2052     datalen += sizeof(sport);
2053 
2054     bcopy(&dport, &data[datalen], sizeof(dport));
2055     datalen += sizeof(dport);
2056 
2057     uint32_t hash = 0;
2058     hash = toeplitz_hash(rsskey_len, rsskey, datalen, data);
2059 
2060     return ((hash & (reta_size - 1)) % nb_queues) == queueid;
2061 }
2062 
2063 void
2064 ff_regist_packet_dispatcher(dispatch_func_t func)
2065 {
2066     packet_dispatcher = func;
2067 }
2068 
2069 uint64_t
2070 ff_get_tsc_ns(void)
2071 {
2072     uint64_t cur_tsc = rte_rdtsc();
2073     uint64_t hz = rte_get_tsc_hz();
2074     return ((double)cur_tsc / (double)hz) * NS_PER_S;
2075 }
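
/*
 * Illustrative alternative (not part of the original source): the same
 * conversion in pure integer arithmetic, which avoids the precision loss
 * of the double division above for very large TSC values. The function
 * name is hypothetical.
 */
#if 0
static uint64_t
example_tsc_to_ns(uint64_t tsc)
{
    uint64_t hz = rte_get_tsc_hz();

    /* Split the division so tsc * NS_PER_S cannot overflow 64 bits. */
    return tsc / hz * NS_PER_S + tsc % hz * NS_PER_S / hz;
}
#endif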
2076 
2077