/* for io_module_func def'ns */
#include "io_module.h"
/* for mtcp related def'ns */
#include "mtcp.h"
/* for errno */
#include <errno.h>
/* for logging */
#include "debug.h"
/* for num_devices_* */
#include "config.h"
/* for rte_max_eth_ports */
#include <rte_common.h>
/* for rte_eth_rxconf */
#include <rte_ethdev.h>
/* for delay funcs */
#include <rte_cycles.h>
/* for ip pseudo-chksum */
#include <rte_ip.h>
#define ENABLE_STATS_IOCTL		1
#ifdef ENABLE_STATS_IOCTL
/* for close */
#include <unistd.h>
/* for open */
#include <fcntl.h>
/* for ioctl */
#include <sys/ioctl.h>
#endif /* ENABLE_STATS_IOCTL */
/*----------------------------------------------------------------------------*/
/* Essential macros */
#define MAX_RX_QUEUE_PER_LCORE		MAX_CPUS
#define MAX_TX_QUEUE_PER_PORT		MAX_CPUS

#define MBUF_SIZE 			(2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
#define NB_MBUF				8192
#define MEMPOOL_CACHE_SIZE		256
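/*
 * Each lcore gets its own pool of NB_MBUF mbufs (created in
 * dpdk_load_module_lower_half()); MBUF_SIZE leaves 2048 bytes of packet
 * data room after the rte_mbuf metadata and RTE_PKTMBUF_HEADROOM.
 */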
//#define RX_IDLE_ENABLE			1
#define RX_IDLE_TIMEOUT			1	/* in micro-seconds */
#define RX_IDLE_THRESH			64
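/*
 * With RX_IDLE_ENABLE, a core that polls an empty RX queue more than
 * RX_IDLE_THRESH times in a row sleeps for RX_IDLE_TIMEOUT microseconds
 * in dpdk_select() instead of spinning.
 */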

/*
 * RX and TX Prefetch, Host, and Write-back threshold values should be
 * carefully set for optimal performance. Consult the network
 * controller's datasheet and supporting DPDK documentation for guidance
 * on how these parameters should be set.
 */
#define RX_PTHRESH 			8 /**< Default values of RX prefetch threshold reg. */
#define RX_HTHRESH 			8 /**< Default values of RX host threshold reg. */
#define RX_WTHRESH 			4 /**< Default values of RX write-back threshold reg. */

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
#define TX_PTHRESH 			36 /**< Default values of TX prefetch threshold reg. */
#define TX_HTHRESH			0  /**< Default values of TX host threshold reg. */
#define TX_WTHRESH			0  /**< Default values of TX write-back threshold reg. */

#define MAX_PKT_BURST			64

/*
 * Configurable number of RX/TX ring descriptors
 */
#define RTE_TEST_RX_DESC_DEFAULT	128
#define RTE_TEST_TX_DESC_DEFAULT	512

static uint16_t nb_rxd = RTE_TEST_RX_DESC_DEFAULT;
static uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT;
/*----------------------------------------------------------------------------*/
/* packet memory pools for storing packet bufs */
static struct rte_mempool *pktmbuf_pool[MAX_CPUS] = {NULL};
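/* per-port map from lcore id to RX/TX queue index; 0xFF marks a core with
 * no queue assigned on that port (see dpdk_load_module_lower_half()) */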
static uint8_t cpu_qid_map[RTE_MAX_ETHPORTS][MAX_CPUS] = {{0}};

//#define DEBUG				1
#ifdef DEBUG
/* ethernet addresses of ports */
static struct ether_addr ports_eth_addr[RTE_MAX_ETHPORTS];
#endif

static struct rte_eth_conf port_conf = {
	.rxmode = {
		.mq_mode	= 	ETH_MQ_RX_RSS,
		.max_rx_pkt_len = 	ETHER_MAX_LEN,
		.split_hdr_size = 	0,
		.header_split   = 	0, /**< Header Split disabled */
		.hw_ip_checksum = 	1, /**< IP checksum offload enabled */
		.hw_vlan_filter = 	0, /**< VLAN filtering disabled */
		.jumbo_frame    = 	0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 	1, /**< CRC stripped by hardware */
	},
	.rx_adv_conf = {
		.rss_conf = {
			.rss_key = 	NULL,
			.rss_hf = 	ETH_RSS_TCP
		},
	},
	.txmode = {
		.mq_mode = 		ETH_MQ_TX_NONE,
	},
#if 0
	.fdir_conf = {
		.mode = RTE_FDIR_MODE_PERFECT,
		.pballoc = RTE_FDIR_PBALLOC_256K,
		.status = RTE_FDIR_REPORT_STATUS_ALWAYS,
		//.flexbytes_offset = 0x6,
		.drop_queue = 127,
	},
#endif
};

static const struct rte_eth_rxconf rx_conf = {
	.rx_thresh = {
		.pthresh = 		RX_PTHRESH, /* RX prefetch threshold reg */
		.hthresh = 		RX_HTHRESH, /* RX host threshold reg */
		.wthresh = 		RX_WTHRESH, /* RX write-back threshold reg */
	},
	.rx_free_thresh = 		32,
};

static const struct rte_eth_txconf tx_conf = {
	.tx_thresh = {
		.pthresh = 		TX_PTHRESH, /* TX prefetch threshold reg */
		.hthresh = 		TX_HTHRESH, /* TX host threshold reg */
		.wthresh = 		TX_WTHRESH, /* TX write-back threshold reg */
	},
	.tx_free_thresh = 		0, /* Use PMD default values */
	.tx_rs_thresh = 		0, /* Use PMD default values */
	/*
	 * Keep txq_flags clear so that no TX capabilities are disabled:
	 * multi-segment mbufs and the offloads used in dpdk_dev_ioctl()
	 * (IP/TCP checksum) remain available.
	 */
	.txq_flags = 			0x0,
};

struct mbuf_table {
	unsigned len; /* number of mbufs queued in m_table */
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};
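
/*
 * Per-thread I/O bookkeeping: rmbufs[port] tracks the mbufs received in
 * the current burst (freed via free_pkts() before the next burst), while
 * wmbufs[port] stages up to MAX_PKT_BURST outgoing mbufs until
 * dpdk_send_pkts() flushes them to the wire.
 */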
struct dpdk_private_context {
	struct mbuf_table rmbufs[RTE_MAX_ETHPORTS];
	struct mbuf_table wmbufs[RTE_MAX_ETHPORTS];
	struct rte_mempool *pktmbuf_pool;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
#ifdef RX_IDLE_ENABLE
	uint8_t rx_idle;
#endif /* RX_IDLE_ENABLE */
#ifdef ENABLE_STATS_IOCTL
	int fd;
#endif /* ENABLE_STATS_IOCTL */
} __rte_cache_aligned;

#ifdef ENABLE_STATS_IOCTL
/**
 * stats snapshot passed from user space to the dpdk-iface kernel driver
 * via ioctl() on /dev/dpdk-iface
 */
struct stats_struct {
	uint64_t tx_bytes;
	uint64_t tx_pkts;
	uint64_t rx_bytes;
	uint64_t rx_pkts;
	uint8_t qid;
	uint8_t dev;
};
#endif /* ENABLE_STATS_IOCTL */
/*----------------------------------------------------------------------------*/
void
dpdk_init_handle(struct mtcp_thread_context *ctxt)
{
	struct dpdk_private_context *dpc;
	int i, j;

	/* create and initialize private I/O module context */
	ctxt->io_private_context = calloc(1, sizeof(struct dpdk_private_context));
	if (ctxt->io_private_context == NULL) {
		TRACE_ERROR("Failed to initialize ctxt->io_private_context: "
			    "Can't allocate memory\n");
		exit(EXIT_FAILURE);
	}

	dpc = (struct dpdk_private_context *)ctxt->io_private_context;
	dpc->pktmbuf_pool = pktmbuf_pool[ctxt->cpu];

	/* pre-allocate one burst worth of TX mbufs for each registered port */
	for (j = 0; j < g_config.mos->netdev_table->num; j++) {
		for (i = 0; i < MAX_PKT_BURST; i++) {
			dpc->wmbufs[j].m_table[i] = rte_pktmbuf_alloc(pktmbuf_pool[ctxt->cpu]);
			if (dpc->wmbufs[j].m_table[i] == NULL) {
				TRACE_ERROR("Failed to allocate %d:wmbuf[%d] on device %d!\n",
					    ctxt->cpu, i, j);
				exit(EXIT_FAILURE);
			}
		}
		/* the write queue starts out empty */
		dpc->wmbufs[j].len = 0;
	}
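	/*
	 * NOTE: these TX mbufs are reused across bursts: dpdk_get_wptr()
	 * hands out their data rooms for writing, and (unless SHARE_IO_BUFFER
	 * is defined) dpdk_send_pkts() replaces transmitted ones with fresh
	 * allocations.
	 */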

#ifdef ENABLE_STATS_IOCTL
	dpc->fd = open("/dev/dpdk-iface", O_RDWR);
	if (dpc->fd == -1) {
		/* non-fatal: we only lose the stats updates */
		TRACE_ERROR("Can't open /dev/dpdk-iface for ctxt->cpu %d! "
			    "Are you using the mlx4/mlx5 driver?\n",
			    ctxt->cpu);
	}
#endif /* ENABLE_STATS_IOCTL */
}
/*----------------------------------------------------------------------------*/
int
dpdk_send_pkts(struct mtcp_thread_context *ctxt, int nif)
{
	struct dpdk_private_context *dpc;
	mtcp_manager_t mtcp;
	int ret;
	int qid;

	dpc = (struct dpdk_private_context *)ctxt->io_private_context;
	mtcp = ctxt->mtcp_manager;
	ret = 0;
	qid = cpu_qid_map[nif][ctxt->cpu];

	/* if this core has no queue on the port, skip it */
	if (unlikely(qid == 0xFF))
		return 0;

	/* if there are packets in the queue... flush them out to the wire */
	if (dpc->wmbufs[nif].len > 0) {
		struct rte_mbuf **pkts;
#ifdef ENABLE_STATS_IOCTL
		struct stats_struct ss;
#endif /* ENABLE_STATS_IOCTL */
		int cnt = dpc->wmbufs[nif].len;
		pkts = dpc->wmbufs[nif].m_table;
#ifdef NETSTAT
		mtcp->nstat.tx_packets[nif] += cnt;
#ifdef ENABLE_STATS_IOCTL
		if (likely(dpc->fd >= 0)) {
			ss.tx_pkts = mtcp->nstat.tx_packets[nif];
			ss.tx_bytes = mtcp->nstat.tx_bytes[nif];
			ss.rx_pkts = mtcp->nstat.rx_packets[nif];
			ss.rx_bytes = mtcp->nstat.rx_bytes[nif];
			ss.qid = ctxt->cpu;
			ss.dev = nif;
			ioctl(dpc->fd, 0, &ss);
		}
#endif /* ENABLE_STATS_IOCTL */
#endif /* NETSTAT */
		do {
			/* transmit cnt packets; retry with the leftovers
			 * until the PMD has accepted them all */
			ret = rte_eth_tx_burst(nif, qid, pkts, cnt);
			pkts += ret;
			cnt -= ret;
		} while (cnt > 0);

#ifndef SHARE_IO_BUFFER
		int i;
		/* allocate fresh mbufs to replace the transmitted ones */
		for (i = 0; i < dpc->wmbufs[nif].len; i++) {
			dpc->wmbufs[nif].m_table[i] = rte_pktmbuf_alloc(pktmbuf_pool[ctxt->cpu]);
			if (unlikely(dpc->wmbufs[nif].m_table[i] == NULL)) {
				TRACE_ERROR("Failed to allocate %d:wmbuf[%d] on device %d!\n",
					    ctxt->cpu, i, nif);
				exit(EXIT_FAILURE);
			}
		}
#endif
		/* reset the write-queue length after the flush */
		dpc->wmbufs[nif].len = 0;
	}

	return ret;
}
/*----------------------------------------------------------------------------*/
uint8_t *
dpdk_get_wptr(struct mtcp_thread_context *ctxt, int nif, uint16_t pktsize)
{
	struct dpdk_private_context *dpc;
	mtcp_manager_t mtcp;
	struct rte_mbuf *m;
	uint8_t *ptr;
	int len_of_mbuf;

	dpc = (struct dpdk_private_context *) ctxt->io_private_context;
	mtcp = ctxt->mtcp_manager;

	/* if the burst is already full, the caller must flush first
	 * (via dpdk_send_pkts()) before grabbing another write slot */
	if (unlikely(dpc->wmbufs[nif].len == MAX_PKT_BURST))
		return NULL;

	len_of_mbuf = dpc->wmbufs[nif].len;
	m = dpc->wmbufs[nif].m_table[len_of_mbuf];

	/* retrieve the right write offset */
	ptr = rte_pktmbuf_mtod(m, uint8_t *);
	m->pkt_len = m->data_len = pktsize;
	m->nb_segs = 1;
	m->next = NULL;

#ifdef NETSTAT
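	/* the extra 24 bytes correspond to per-frame Ethernet wire overhead:
	 * preamble (7) + SFD (1) + FCS (4) + inter-frame gap (12) */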
	mtcp->nstat.tx_bytes[nif] += pktsize + 24;
#endif

	/* increment the write-queue length */
	dpc->wmbufs[nif].len = len_of_mbuf + 1;

	return ptr;
}
/*----------------------------------------------------------------------------*/
void
dpdk_set_wptr(struct mtcp_thread_context *ctxt, int out_nif, int in_nif, int index)
{
	struct dpdk_private_context *dpc;
	mtcp_manager_t mtcp;
	int len_of_mbuf;

	dpc = (struct dpdk_private_context *) ctxt->io_private_context;
	mtcp = ctxt->mtcp_manager;

	/* if the burst is already full, the packet is dropped;
	 * the caller should flush via dpdk_send_pkts() first */
	if (unlikely(dpc->wmbufs[out_nif].len == MAX_PKT_BURST))
		return;

	len_of_mbuf = dpc->wmbufs[out_nif].len;
	dpc->wmbufs[out_nif].m_table[len_of_mbuf] =
		dpc->rmbufs[in_nif].m_table[index];

	/* mark the mbuf as forwarded so that free_pkts() skips it */
	dpc->wmbufs[out_nif].m_table[len_of_mbuf]->udata64 = 0;

#ifdef NETSTAT
	mtcp->nstat.tx_bytes[out_nif] += dpc->rmbufs[in_nif].m_table[index]->pkt_len + 24;
#endif

	/* increment the write-queue length */
	dpc->wmbufs[out_nif].len = len_of_mbuf + 1;

	return;
}
/*----------------------------------------------------------------------------*/
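/*
 * udata64 tagging protocol: dpdk_get_rptr() sets udata64 = 1 on every
 * received mbuf ("consumed locally"), and dpdk_set_wptr() clears it to 0
 * when the mbuf is re-queued for forwarding. free_pkts() therefore frees
 * only the locally consumed mbufs; ownership of forwarded ones passes to
 * the TX path.
 */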
static inline void
free_pkts(struct rte_mbuf **mtable, unsigned len)
{
	unsigned i;

	/* free all packets that were consumed locally */
	for (i = 0; i < len; i++) {
		if (mtable[i]->udata64 == 1) {
			rte_pktmbuf_free_seg(mtable[i]);
			/* prefetch the next entry, staying within bounds */
			if (i + 1 < len)
				RTE_MBUF_PREFETCH_TO_FREE(mtable[i + 1]);
		}
	}
}
/*----------------------------------------------------------------------------*/
int32_t
dpdk_recv_pkts(struct mtcp_thread_context *ctxt, int ifidx)
{
	struct dpdk_private_context *dpc;
	int ret;
	uint8_t qid;

	dpc = (struct dpdk_private_context *) ctxt->io_private_context;
	qid = cpu_qid_map[ifidx][ctxt->cpu];

	/* if this core has no queue on the port, skip it */
	if (qid == 0xFF)
		return 0;

	/* release the mbufs of the previous burst before fetching a new one */
	if (dpc->rmbufs[ifidx].len != 0) {
		free_pkts(dpc->rmbufs[ifidx].m_table, dpc->rmbufs[ifidx].len);
		dpc->rmbufs[ifidx].len = 0;
	}

	ret = rte_eth_rx_burst((uint8_t)ifidx, qid,
			       dpc->pkts_burst, MAX_PKT_BURST);
#ifdef RX_IDLE_ENABLE
	dpc->rx_idle = (likely(ret != 0)) ? 0 : dpc->rx_idle + 1;
#endif
	dpc->rmbufs[ifidx].len = ret;

	return ret;
}
/*----------------------------------------------------------------------------*/
uint8_t *
dpdk_get_rptr(struct mtcp_thread_context *ctxt, int ifidx, int index, uint16_t *len)
{
	struct dpdk_private_context *dpc;
	struct rte_mbuf *m;
	uint8_t *pktbuf;

	dpc = (struct dpdk_private_context *) ctxt->io_private_context;

	m = dpc->pkts_burst[index];
	/* tag to check if the packet is a local or a forwarded pkt */
	m->udata64 = 1;
	/* don't enable pre-fetching... performance goes down */
	//rte_prefetch0(rte_pktmbuf_mtod(m, void *));
	*len = m->pkt_len;
	pktbuf = rte_pktmbuf_mtod(m, uint8_t *);

	/* remember the mbuf so it can be freed after this burst */
	dpc->rmbufs[ifidx].m_table[index] = m;

	return pktbuf;
}
/*----------------------------------------------------------------------------*/
int
dpdk_get_nif(struct ifreq *ifr)
{
	int i;
	static int num_dev = -1;
	static struct ether_addr ports_eth_addr[RTE_MAX_ETHPORTS];

	/* fetch the mac addrs of 'detected' dpdk ports once, then cache them */
	if (num_dev < 0) {
		num_dev = rte_eth_dev_count();
		for (i = 0; i < num_dev; i++)
			rte_eth_macaddr_get(i, &ports_eth_addr[i]);
	}

	for (i = 0; i < num_dev; i++)
		if (!memcmp(&ifr->ifr_addr.sa_data[0], &ports_eth_addr[i], ETH_ALEN))
			return i;

	return -1;
}
/*----------------------------------------------------------------------------*/
int32_t
dpdk_select(struct mtcp_thread_context *ctxt)
{
#ifdef RX_IDLE_ENABLE
	struct dpdk_private_context *dpc;

	dpc = (struct dpdk_private_context *) ctxt->io_private_context;
	if (dpc->rx_idle > RX_IDLE_THRESH) {
		dpc->rx_idle = 0;
		usleep(RX_IDLE_TIMEOUT);
	}
#endif
	return 0;
}
/*----------------------------------------------------------------------------*/
void
dpdk_destroy_handle(struct mtcp_thread_context *ctxt)
{
	struct dpdk_private_context *dpc;
	int i;

	dpc = (struct dpdk_private_context *) ctxt->io_private_context;

	/* free wmbufs */
	for (i = 0; i < g_config.mos->netdev_table->num; i++)
		free_pkts(dpc->wmbufs[i].m_table, MAX_PKT_BURST);

#ifdef ENABLE_STATS_IOCTL
	/* close the stats fd */
	if (dpc->fd >= 0)
		close(dpc->fd);
#endif /* ENABLE_STATS_IOCTL */

	/* free it all up */
	free(dpc);
}
/*----------------------------------------------------------------------------*/
static void
check_all_ports_link_status(uint8_t port_num, uint32_t port_mask)
{
#define CHECK_INTERVAL 			100 /* 100ms */
#define MAX_CHECK_TIME 			90 /* 9s (90 * 100ms) in total */

	uint8_t portid, count, all_ports_up, print_flag = 0;
	struct rte_eth_link link;

	printf("\nChecking link status");
	fflush(stdout);
	for (count = 0; count <= MAX_CHECK_TIME; count++) {
		all_ports_up = 1;
		for (portid = 0; portid < port_num; portid++) {
			if ((port_mask & (1 << portid)) == 0)
				continue;
			memset(&link, 0, sizeof(link));
			rte_eth_link_get_nowait(portid, &link);
			/* print link status if flag set */
			if (print_flag == 1) {
				if (link.link_status)
					printf("Port %d Link Up - speed %u Mbps - %s\n",
					       (uint8_t)portid,
					       (unsigned)link.link_speed,
					       (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
					       "full-duplex" : "half-duplex");
				else
					printf("Port %d Link Down\n",
					       (uint8_t)portid);
				continue;
			}
			/* clear all_ports_up flag if any link down */
			if (link.link_status == 0) {
				all_ports_up = 0;
				break;
			}
		}
		/* after finally printing all link status, get out */
		if (print_flag == 1)
			break;

		if (all_ports_up == 0) {
			printf(".");
			fflush(stdout);
			rte_delay_ms(CHECK_INTERVAL);
		}

		/* set the print_flag if all ports up or timeout */
		if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
			print_flag = 1;
			printf("done\n");
		}
	}
}
/*----------------------------------------------------------------------------*/
#if 0
static void
dpdk_enable_fdir(int portid, uint8_t is_master)
{
	struct rte_fdir_masks fdir_masks;
	struct rte_fdir_filter fdir_filter;
	int ret;

	memset(&fdir_filter, 0, sizeof(struct rte_fdir_filter));
	fdir_filter.iptype = RTE_FDIR_IPTYPE_IPV4;
	fdir_filter.l4type = RTE_FDIR_L4TYPE_TCP;
	fdir_filter.ip_dst.ipv4_addr = g_config.mos->netdev_table->ent[portid]->ip_addr;

	if (is_master) {
		memset(&fdir_masks, 0, sizeof(struct rte_fdir_masks));
		fdir_masks.src_ipv4_mask = 0x0;
		fdir_masks.dst_ipv4_mask = 0xFFFFFFFF;
		fdir_masks.src_port_mask = 0x0;
		fdir_masks.dst_port_mask = 0x0;

		/*
		 * enable the following if the filter is IP-only
		 * (non-TCP, non-UDP)
		 */
		/* fdir_masks.only_ip_flow = 1; */
		rte_eth_dev_fdir_set_masks(portid, &fdir_masks);
		ret = rte_eth_dev_fdir_add_perfect_filter(portid,
							  &fdir_filter,
							  0,
							  g_config.mos->multiprocess_curr_core,
							  0);
	} else {
		ret = rte_eth_dev_fdir_update_perfect_filter(portid,
							     &fdir_filter,
							     0,
							     g_config.mos->multiprocess_curr_core,
							     0);
	}
	if (ret < 0) {
		rte_exit(EXIT_FAILURE,
			 "fdir_add_perfect_filter_t call failed!: %d\n",
			 ret);
	}
	fprintf(stderr, "Filter for device ifidx: %d added\n", portid);
}
#endif
/*----------------------------------------------------------------------------*/
int32_t
dpdk_dev_ioctl(struct mtcp_thread_context *ctx, int nif, int cmd, void *argp)
{
	struct dpdk_private_context *dpc;
	struct rte_mbuf *m;
	int len_of_mbuf;
	struct iphdr *iph;
	struct tcphdr *tcph;
	RssInfo *rss_i;

	iph = (struct iphdr *)argp;
	dpc = (struct dpdk_private_context *)ctx->io_private_context;
	len_of_mbuf = dpc->wmbufs[nif].len;
	rss_i = NULL;

	switch (cmd) {
	case PKT_TX_IP_CSUM:
		m = dpc->wmbufs[nif].m_table[len_of_mbuf - 1];
		m->ol_flags = PKT_TX_IP_CKSUM | PKT_TX_IPV4;
		m->l2_len = sizeof(struct ether_hdr);
		m->l3_len = (iph->ihl << 2);
		break;
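	/*
	 * For hardware TCP checksum offload, DPDK requires the TCP checksum
	 * field to be pre-seeded with the IPv4 pseudo-header checksum; the
	 * NIC fills in the final value on transmit.
	 */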
	case PKT_TX_TCP_CSUM:
		m = dpc->wmbufs[nif].m_table[len_of_mbuf - 1];
		tcph = (struct tcphdr *)((unsigned char *)iph + (iph->ihl << 2));
		m->ol_flags |= PKT_TX_TCP_CKSUM;
		tcph->check = rte_ipv4_phdr_cksum((struct ipv4_hdr *)iph, m->ol_flags);
		break;
	case PKT_RX_RSS:
		rss_i = (RssInfo *)argp;
		m = dpc->pkts_burst[rss_i->pktidx];
		rss_i->hash_value = m->hash.rss;
		break;
	default:
		goto dev_ioctl_err;
	}

	return 0;
 dev_ioctl_err:
	return -1;
}
/*----------------------------------------------------------------------------*/
void
dpdk_load_module_upper_half(void)
{
	int cpu = g_config.mos->num_cores, ret;
	uint32_t cpumask = 0;
	char cpumaskbuf[10];
	char mem_channels[5];

	/* set the log level */
	rte_set_log_type(RTE_LOGTYPE_PMD, 0);
	rte_set_log_type(RTE_LOGTYPE_MALLOC, 0);
	rte_set_log_type(RTE_LOGTYPE_MEMPOOL, 0);
	rte_set_log_type(RTE_LOGTYPE_RING, 0);
	rte_set_log_level(RTE_LOG_WARNING);

	/* build the coremask for cores 0..(cpu - 1) */
	for (ret = 0; ret < cpu; ret++)
		cpumask = (cpumask | (1 << ret));
	sprintf(cpumaskbuf, "%X", cpumask);

	/* get the mem channels per socket */
	if (g_config.mos->nb_mem_channels == 0) {
		TRACE_ERROR("DPDK module requires # of memory channels "
				"per socket parameter!\n");
		exit(EXIT_FAILURE);
	}
	sprintf(mem_channels, "%d", g_config.mos->nb_mem_channels);

	/* EAL arguments: coremask, memory channels, and auto process type */
	char *argv[] = {"",
			"-c",
			cpumaskbuf,
			"-n",
			mem_channels,
			"--proc-type=auto",
			""
	};
	const int argc = 6;

	/*
	 * re-set the getopt extern variable optind.
	 * rte_eal_init() internally uses the getopt() library call;
	 * mtcp applications that also use an `external' getopt will
	 * crash if optind is not reset to zero before calling the
	 * function below. see getopt(3) for details.
	 */
	optind = 0;

	/* initialize the dpdk eal env */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid EAL args!\n");
}
/*----------------------------------------------------------------------------*/
void
dpdk_load_module_lower_half(void)
{
	int portid, rxlcore_id, ret;
	struct rte_eth_fc_conf fc_conf;	/* for Ethernet flow control settings */
	/* setting the rss key */
	static const uint8_t key[] = {
		0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
		0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
		0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
		0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05
	};
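	/*
	 * note: with every key byte identical, the Toeplitz RSS hash is
	 * symmetric, so both directions of a TCP connection hash to the
	 * same RX queue (and hence the same core).
	 */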

	port_conf.rx_adv_conf.rss_conf.rss_key = (uint8_t *)key;
	port_conf.rx_adv_conf.rss_conf.rss_key_len = sizeof(key);

	/* reset the cpu_qid mapping; 0xFF marks an unassigned queue */
	memset(cpu_qid_map, 0xFF, sizeof(cpu_qid_map));

	if (!g_config.mos->multiprocess || g_config.mos->multiprocess_is_master) {
		for (rxlcore_id = 0; rxlcore_id < g_config.mos->num_cores; rxlcore_id++) {
			char name[20];
			sprintf(name, "mbuf_pool-%d", rxlcore_id);
			/* create the mbuf pools, one per core */
			pktmbuf_pool[rxlcore_id] =
				rte_mempool_create(name, NB_MBUF,
						   MBUF_SIZE, MEMPOOL_CACHE_SIZE,
						   sizeof(struct rte_pktmbuf_pool_private),
						   rte_pktmbuf_pool_init, NULL,
						   rte_pktmbuf_init, NULL,
						   rte_lcore_to_socket_id(rxlcore_id), 0);
			if (pktmbuf_pool[rxlcore_id] == NULL)
				rte_exit(EXIT_FAILURE, "Cannot init mbuf pool\n");
		}

		/* Initialise each port */
		for (portid = 0; portid < g_config.mos->netdev_table->num; portid++) {
			int num_queue = 0, eth_idx, i, queue_id;
			for (eth_idx = 0; eth_idx < g_config.mos->netdev_table->num; eth_idx++)
				if (portid == g_config.mos->netdev_table->ent[eth_idx]->ifindex)
					break;
			if (eth_idx == g_config.mos->netdev_table->num)
				continue;
			/* count the cores assigned to this port in its cpu_mask */
			for (i = 0; i < (int)(sizeof(uint64_t) * 8); i++)
				if (g_config.mos->netdev_table->ent[eth_idx]->cpu_mask & ((uint64_t)1 << i))
					num_queue++;

			/* set 'num_queues' (used for GetRSSCPUCore() in util.c) */
			num_queues = num_queue;

			/* init port */
			printf("Initializing port %u... ", (unsigned) portid);
			fflush(stdout);
			ret = rte_eth_dev_configure(portid, num_queue, num_queue,
						    &port_conf);
			if (ret < 0)
				rte_exit(EXIT_FAILURE, "Cannot configure device: "
					 "err=%d, port=%u\n",
					 ret, (unsigned) portid);

			/* init one RX queue per assigned CPU */
			fflush(stdout);
#ifdef DEBUG
			rte_eth_macaddr_get(portid, &ports_eth_addr[portid]);
#endif
			queue_id = 0;
			for (rxlcore_id = 0; rxlcore_id < g_config.mos->num_cores; rxlcore_id++) {
				if (!(g_config.mos->netdev_table->ent[eth_idx]->cpu_mask & ((uint64_t)1 << rxlcore_id)))
					continue;
				ret = rte_eth_rx_queue_setup(portid, queue_id, nb_rxd,
						rte_eth_dev_socket_id(portid), &rx_conf,
						pktmbuf_pool[rxlcore_id]);
				if (ret < 0)
					rte_exit(EXIT_FAILURE, "rte_eth_rx_queue_setup: "
						 "err=%d, port=%u, queueid: %d\n",
						 ret, (unsigned) portid, queue_id);
				cpu_qid_map[portid][rxlcore_id] = queue_id++;
			}
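			/* queue ids are handed out densely in increasing core
			 * order, even when cpu_mask has gaps; cpu_qid_map
			 * records the resulting core-to-queue binding used by
			 * the RX/TX fast path */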

			/* init one TX queue on each port per CPU (this is redundant for
			 * this app) */
			fflush(stdout);
			queue_id = 0;
			for (rxlcore_id = 0; rxlcore_id < g_config.mos->num_cores; rxlcore_id++) {
				if (!(g_config.mos->netdev_table->ent[eth_idx]->cpu_mask & ((uint64_t)1 << rxlcore_id)))
					continue;
				ret = rte_eth_tx_queue_setup(portid, queue_id++, nb_txd,
						rte_eth_dev_socket_id(portid), &tx_conf);
				if (ret < 0)
					rte_exit(EXIT_FAILURE, "rte_eth_tx_queue_setup: "
						 "err=%d, port=%u, queueid: %d\n",
						 ret, (unsigned) portid, queue_id - 1);
			}

			/* Start device */
			ret = rte_eth_dev_start(portid);
			if (ret < 0)
				rte_exit(EXIT_FAILURE, "rte_eth_dev_start:err=%d, port=%u\n",
					 ret, (unsigned) portid);

			printf("done\n");
			rte_eth_promiscuous_enable(portid);

			/* retrieve current flow control settings per port */
			memset(&fc_conf, 0, sizeof(fc_conf));
			ret = rte_eth_dev_flow_ctrl_get(portid, &fc_conf);
			if (ret != 0) {
				rte_exit(EXIT_FAILURE, "Failed to get flow control info!\n");
			}

			/* and just disable the rx/tx flow control */
			fc_conf.mode = RTE_FC_NONE;
			ret = rte_eth_dev_flow_ctrl_set(portid, &fc_conf);
			if (ret != 0) {
				rte_exit(EXIT_FAILURE, "Failed to set flow control info!: errno: %d\n",
					 ret);
			}

#ifdef DEBUG
			printf("Port %u, MAC address: %02X:%02X:%02X:%02X:%02X:%02X\n\n",
					(unsigned) portid,
					ports_eth_addr[portid].addr_bytes[0],
					ports_eth_addr[portid].addr_bytes[1],
					ports_eth_addr[portid].addr_bytes[2],
					ports_eth_addr[portid].addr_bytes[3],
					ports_eth_addr[portid].addr_bytes[4],
					ports_eth_addr[portid].addr_bytes[5]);
#endif
#if 0
			/* if multi-process support is enabled, then turn on FDIR */
			if (g_config.mos->multiprocess)
				dpdk_enable_fdir(portid, g_config.mos->multiprocess_is_master);
#endif
		}
	} else { /* g_config.mos->multiprocess && !g_config.mos->multiprocess_is_master */
		for (rxlcore_id = 0; rxlcore_id < g_config.mos->num_cores; rxlcore_id++) {
			char name[20];
			sprintf(name, "mbuf_pool-%d", rxlcore_id);
			/* secondary processes look up the pools the master created */
			pktmbuf_pool[rxlcore_id] =
				rte_mempool_lookup(name);
			if (pktmbuf_pool[rxlcore_id] == NULL)
				rte_exit(EXIT_FAILURE, "Cannot find mbuf pool\n");
		}
#if 0
		for (portid = 0; portid < g_config.mos->netdev_table->num; portid++)
			dpdk_enable_fdir(portid, g_config.mos->multiprocess_is_master);
#endif
	}

	check_all_ports_link_status(g_config.mos->netdev_table->num, 0xFFFFFFFF);
}
/*----------------------------------------------------------------------------*/
io_module_func dpdk_module_func = {
	.load_module_upper_half	= dpdk_load_module_upper_half,
	.load_module_lower_half	= dpdk_load_module_lower_half,
	.init_handle		= dpdk_init_handle,
	.link_devices		= NULL,
	.release_pkt		= NULL,
	.send_pkts		= dpdk_send_pkts,
	.get_wptr		= dpdk_get_wptr,
	.recv_pkts		= dpdk_recv_pkts,
	.get_rptr		= dpdk_get_rptr,
	.get_nif		= dpdk_get_nif,
	.select			= dpdk_select,
	.destroy_handle		= dpdk_destroy_handle,
	.dev_ioctl		= dpdk_dev_ioctl,
	.set_wptr		= dpdk_set_wptr,
};
/*----------------------------------------------------------------------------*/