/* for io_module_func def'ns */
#include "io_module.h"
/* for mtcp related def'ns */
#include "mtcp.h"
/* for errno */
#include <errno.h>
/* for logging */
#include "debug.h"
/* for num_devices_* */
#include "config.h"
/* for RTE_MAX_ETHPORTS */
#include <rte_common.h>
/* for rte_eth_rxconf */
#include <rte_ethdev.h>
/* for delay funcs */
#include <rte_cycles.h>
/* for IP pseudo-header checksum */
#include <rte_ip.h>
#define ENABLE_STATS_IOCTL		1
#ifdef ENABLE_STATS_IOCTL
/* for close */
#include <unistd.h>
/* for open */
#include <fcntl.h>
/* for ioctl */
#include <sys/ioctl.h>
#endif /* !ENABLE_STATS_IOCTL */
/*----------------------------------------------------------------------------*/
/* Essential macros */
#define MAX_RX_QUEUE_PER_LCORE		MAX_CPUS
#define MAX_TX_QUEUE_PER_PORT		MAX_CPUS

#define MBUF_SIZE 			(2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
#define NB_MBUF				8192
#define MEMPOOL_CACHE_SIZE		256
//#define RX_IDLE_ENABLE			1
#define RX_IDLE_TIMEOUT			1	/* in microseconds */
#define RX_IDLE_THRESH			64
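/*
 * RX idle heuristic (only when RX_IDLE_ENABLE is defined): dpdk_recv_pkts()
 * counts consecutive empty polls in rx_idle, and once the count exceeds
 * RX_IDLE_THRESH, dpdk_select() sleeps for RX_IDLE_TIMEOUT microseconds to
 * avoid burning CPU while the queues are idle.
 */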

/*
 * RX and TX Prefetch, Host, and Write-back threshold values should be
 * carefully set for optimal performance. Consult the network
 * controller's datasheet and supporting DPDK documentation for guidance
 * on how these parameters should be set.
 */
#define RX_PTHRESH 			8 /**< Default values of RX prefetch threshold reg. */
#define RX_HTHRESH 			8 /**< Default values of RX host threshold reg. */
#define RX_WTHRESH 			4 /**< Default values of RX write-back threshold reg. */

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
#define TX_PTHRESH 			36 /**< Default values of TX prefetch threshold reg. */
#define TX_HTHRESH			0  /**< Default values of TX host threshold reg. */
#define TX_WTHRESH			0  /**< Default values of TX write-back threshold reg. */

#define MAX_PKT_BURST			64

/*
 * Configurable number of RX/TX ring descriptors
 */
#define RTE_TEST_RX_DESC_DEFAULT	128
#define RTE_TEST_TX_DESC_DEFAULT	512

static uint16_t nb_rxd = RTE_TEST_RX_DESC_DEFAULT;
static uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT;
/*----------------------------------------------------------------------------*/
/* packet memory pools for storing packet bufs */
static struct rte_mempool *pktmbuf_pool[MAX_CPUS] = {NULL};
static uint8_t cpu_qid_map[RTE_MAX_ETHPORTS][MAX_CPUS] = {{0}};
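/*
 * cpu_qid_map[port][cpu] maps an mTCP thread (CPU id) to the RX/TX queue id
 * it owns on a given port; entries left at 0xFF mean the CPU has no queue on
 * that port, and its send/recv paths simply skip the port.
 */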

//#define DEBUG				1
#ifdef DEBUG
/* ethernet addresses of ports */
static struct ether_addr ports_eth_addr[RTE_MAX_ETHPORTS];
#endif

static struct rte_eth_conf port_conf = {
	.rxmode = {
		.mq_mode	= 	ETH_MQ_RX_RSS,
		.max_rx_pkt_len = 	ETHER_MAX_LEN,
		.split_hdr_size = 	0,
		.header_split   = 	0, /**< Header Split disabled */
		.hw_ip_checksum = 	1, /**< IP checksum offload enabled */
		.hw_vlan_filter = 	0, /**< VLAN filtering disabled */
		.jumbo_frame    = 	0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 	1, /**< CRC stripped by hardware */
	},
	.rx_adv_conf = {
		.rss_conf = {
			.rss_key = 	NULL,
			.rss_hf = 	ETH_RSS_TCP
		},
	},
	.txmode = {
		.mq_mode = 		ETH_MQ_TX_NONE,
	},
};

static const struct rte_eth_rxconf rx_conf = {
	.rx_thresh = {
		.pthresh = 		RX_PTHRESH, /* RX prefetch threshold reg */
		.hthresh = 		RX_HTHRESH, /* RX host threshold reg */
		.wthresh = 		RX_WTHRESH, /* RX write-back threshold reg */
	},
	.rx_free_thresh = 		32,
};

static const struct rte_eth_txconf tx_conf = {
	.tx_thresh = {
		.pthresh = 		TX_PTHRESH, /* TX prefetch threshold reg */
		.hthresh = 		TX_HTHRESH, /* TX host threshold reg */
		.wthresh = 		TX_WTHRESH, /* TX write-back threshold reg */
	},
	.tx_free_thresh = 		0, /* Use PMD default values */
	.tx_rs_thresh = 		0, /* Use PMD default values */
	/*
	 * Keep txq_flags clear so the PMD does not disable multi-segment
	 * transmission or TX offloads; checksum offload is requested per
	 * packet via ol_flags in dpdk_dev_ioctl().
	 */
	.txq_flags = 			0x0,
};

struct mbuf_table {
	unsigned len; /* length of queued packets */
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct dpdk_private_context {
	struct mbuf_table rmbufs[RTE_MAX_ETHPORTS];
	struct mbuf_table wmbufs[RTE_MAX_ETHPORTS];
	struct rte_mempool *pktmbuf_pool;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
#ifdef RX_IDLE_ENABLE
	uint8_t rx_idle;
#endif
#ifdef ENABLE_STATS_IOCTL
	int fd;
#endif /* !ENABLE_STATS_IOCTL */
} __rte_cache_aligned;

#ifdef ENABLE_STATS_IOCTL
/**
 * stats struct passed on from user space to the driver
 */
struct stats_struct {
	uint64_t tx_bytes;
	uint64_t tx_pkts;
	uint64_t rx_bytes;
	uint64_t rx_pkts;
	uint8_t qid;
	uint8_t dev;
};
#endif /* !ENABLE_STATS_IOCTL */
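/*
 * When ENABLE_STATS_IOCTL is set, each thread opens /dev/dpdk-iface in
 * dpdk_init_handle() and, on every TX flush in dpdk_send_pkts(), pushes its
 * per-port packet/byte counters to the dpdk-iface kernel module via ioctl(),
 * presumably so the counters can be inspected outside the DPDK application.
 */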
/*----------------------------------------------------------------------------*/
void
dpdk_init_handle(struct mtcp_thread_context *ctxt)
{
	struct dpdk_private_context *dpc;
	int i, j;
	char mempool_name[20];

	/* create and initialize private I/O module context */
	ctxt->io_private_context = calloc(1, sizeof(struct dpdk_private_context));
	if (ctxt->io_private_context == NULL) {
		TRACE_ERROR("Failed to initialize ctxt->io_private_context: "
			    "Can't allocate memory\n");
		exit(EXIT_FAILURE);
	}

	sprintf(mempool_name, "mbuf_pool-%d", ctxt->cpu);
	dpc = (struct dpdk_private_context *)ctxt->io_private_context;
	dpc->pktmbuf_pool = pktmbuf_pool[ctxt->cpu];

	/* set wmbufs correctly */
	for (j = 0; j < g_config.mos->netdev_table->num; j++) {
		/* Allocate wmbufs for each registered port */
		for (i = 0; i < MAX_PKT_BURST; i++) {
			dpc->wmbufs[j].m_table[i] = rte_pktmbuf_alloc(pktmbuf_pool[ctxt->cpu]);
			if (dpc->wmbufs[j].m_table[i] == NULL) {
				TRACE_ERROR("Failed to allocate %d:wmbuf[%d] on device %d!\n",
					    ctxt->cpu, i, j);
				exit(EXIT_FAILURE);
			}
		}
		/* set mbufs queue length to 0 to begin with */
		dpc->wmbufs[j].len = 0;
	}

#ifdef ENABLE_STATS_IOCTL
	dpc->fd = open("/dev/dpdk-iface", O_RDWR);
	if (dpc->fd == -1) {
		TRACE_ERROR("Can't open /dev/dpdk-iface for context->cpu: %d! "
			    "Are you using mlx4/mlx5 driver?\n",
			    ctxt->cpu);
	}
#endif /* !ENABLE_STATS_IOCTL */
}
/*----------------------------------------------------------------------------*/
int
dpdk_send_pkts(struct mtcp_thread_context *ctxt, int nif)
{
	struct dpdk_private_context *dpc;
	mtcp_manager_t mtcp;
	int ret;
	int qid;

	dpc = (struct dpdk_private_context *)ctxt->io_private_context;
	mtcp = ctxt->mtcp_manager;
	ret = 0;
	qid = cpu_qid_map[nif][ctxt->cpu];

	/* if queue is unassigned, skip it.. */
	if (unlikely(qid == 0xFF))
		return 0;

	/* if there are packets in the queue... flush them out to the wire */
	if (dpc->wmbufs[nif].len > 0) {
		struct rte_mbuf **pkts;
#ifdef ENABLE_STATS_IOCTL
		struct stats_struct ss;
#endif /* !ENABLE_STATS_IOCTL */
		int cnt = dpc->wmbufs[nif].len;
		pkts = dpc->wmbufs[nif].m_table;
#ifdef NETSTAT
		mtcp->nstat.tx_packets[nif] += cnt;
#ifdef ENABLE_STATS_IOCTL
		if (likely(dpc->fd >= 0)) {
			ss.tx_pkts = mtcp->nstat.tx_packets[nif];
			ss.tx_bytes = mtcp->nstat.tx_bytes[nif];
			ss.rx_pkts = mtcp->nstat.rx_packets[nif];
			ss.rx_bytes = mtcp->nstat.rx_bytes[nif];
			ss.qid = ctxt->cpu;
			ss.dev = nif;
			ioctl(dpc->fd, 0, &ss);
		}
#endif /* !ENABLE_STATS_IOCTL */
#endif
		do {
			/* tx cnt # of packets */
			ret = rte_eth_tx_burst(nif, qid,
					       pkts, cnt);
			pkts += ret;
			cnt -= ret;
			/* if not all pkts were sent... then repeat the cycle */
		} while (cnt > 0);

#ifndef SHARE_IO_BUFFER
		int i;
		/* time to allocate fresh mbufs for the queue */
		for (i = 0; i < dpc->wmbufs[nif].len; i++) {
			dpc->wmbufs[nif].m_table[i] = rte_pktmbuf_alloc(pktmbuf_pool[ctxt->cpu]);
			/* error checking */
			if (unlikely(dpc->wmbufs[nif].m_table[i] == NULL)) {
				TRACE_ERROR("Failed to allocate %d:wmbuf[%d] on device %d!\n",
					    ctxt->cpu, i, nif);
				exit(EXIT_FAILURE);
			}
		}
#endif
		/* reset the len of mbufs var after flushing of packets */
		dpc->wmbufs[nif].len = 0;
	}

	return ret;
}
/*----------------------------------------------------------------------------*/
uint8_t *
dpdk_get_wptr(struct mtcp_thread_context *ctxt, int nif, uint16_t pktsize)
{
	struct dpdk_private_context *dpc;
	mtcp_manager_t mtcp;
	struct rte_mbuf *m;
	uint8_t *ptr;
	int len_of_mbuf;

	dpc = (struct dpdk_private_context *) ctxt->io_private_context;
	mtcp = ctxt->mtcp_manager;

	/* sanity check */
	if (unlikely(dpc->wmbufs[nif].len == MAX_PKT_BURST))
		return NULL;

	len_of_mbuf = dpc->wmbufs[nif].len;
	m = dpc->wmbufs[nif].m_table[len_of_mbuf];

	/* retrieve the right write offset */
	ptr = (void *)rte_pktmbuf_mtod(m, struct ether_hdr *);
	m->pkt_len = m->data_len = pktsize;
	m->nb_segs = 1;
	m->next = NULL;

#ifdef NETSTAT
	/* +24 presumably accounts for per-frame wire overhead (preamble, IFG, FCS) */
	mtcp->nstat.tx_bytes[nif] += pktsize + 24;
#endif

	/* increment the len_of_mbuf var */
	dpc->wmbufs[nif].len = len_of_mbuf + 1;

	return (uint8_t *)ptr;
}
/*----------------------------------------------------------------------------*/
void
dpdk_set_wptr(struct mtcp_thread_context *ctxt, int out_nif, int in_nif, int index)
{
	struct dpdk_private_context *dpc;
	mtcp_manager_t mtcp;
	int len_of_mbuf;

	dpc = (struct dpdk_private_context *) ctxt->io_private_context;
	mtcp = ctxt->mtcp_manager;

	/* sanity check */
	if (unlikely(dpc->wmbufs[out_nif].len == MAX_PKT_BURST))
		return;

	len_of_mbuf = dpc->wmbufs[out_nif].len;
	dpc->wmbufs[out_nif].m_table[len_of_mbuf] =
		dpc->rmbufs[in_nif].m_table[index];

	dpc->wmbufs[out_nif].m_table[len_of_mbuf]->udata64 = 0;

#ifdef NETSTAT
	mtcp->nstat.tx_bytes[out_nif] += dpc->rmbufs[in_nif].m_table[index]->pkt_len + 24;
#endif

	/* increment the len_of_mbuf var */
	dpc->wmbufs[out_nif].len = len_of_mbuf + 1;

	return;
}
/*----------------------------------------------------------------------------*/
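/*
 * udata64 is used as an ownership tag on received mbufs: dpdk_get_rptr()
 * marks a packet with 1 (consumed locally, to be freed here), while
 * dpdk_set_wptr() clears it to 0 when the mbuf is handed to a TX queue for
 * forwarding, so free_pkts() below skips mbufs that the PMD will free after
 * transmission.
 */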
static inline void
free_pkts(struct rte_mbuf **mtable, unsigned len)
{
	int i;

	/* free the freaking packets */
	for (i = 0; i < len; i++) {
		if (mtable[i]->udata64 == 1) {
			rte_pktmbuf_free_seg(mtable[i]);
			RTE_MBUF_PREFETCH_TO_FREE(mtable[i+1]);
		}
	}
}
/*----------------------------------------------------------------------------*/
int32_t
dpdk_recv_pkts(struct mtcp_thread_context *ctxt, int ifidx)
{
	struct dpdk_private_context *dpc;
	int ret;
	uint8_t qid;

	dpc = (struct dpdk_private_context *) ctxt->io_private_context;
	qid = cpu_qid_map[ifidx][ctxt->cpu];

	/* if queue is unassigned, skip it.. */
	if (qid == 0xFF)
		return 0;

	if (dpc->rmbufs[ifidx].len != 0) {
		free_pkts(dpc->rmbufs[ifidx].m_table, dpc->rmbufs[ifidx].len);
		dpc->rmbufs[ifidx].len = 0;
	}

	ret = rte_eth_rx_burst((uint8_t)ifidx, qid,
			       dpc->pkts_burst, MAX_PKT_BURST);
#ifdef RX_IDLE_ENABLE
	dpc->rx_idle = (likely(ret != 0)) ? 0 : dpc->rx_idle + 1;
#endif
	dpc->rmbufs[ifidx].len = ret;

	return ret;
}
/*----------------------------------------------------------------------------*/
uint8_t *
dpdk_get_rptr(struct mtcp_thread_context *ctxt, int ifidx, int index, uint16_t *len)
{
	struct dpdk_private_context *dpc;
	struct rte_mbuf *m;
	uint8_t *pktbuf;

	dpc = (struct dpdk_private_context *) ctxt->io_private_context;

	m = dpc->pkts_burst[index];
	/* tag to check if the packet is a local or a forwarded pkt */
	m->udata64 = 1;
	/* don't enable pre-fetching... performance goes down */
	//rte_prefetch0(rte_pktmbuf_mtod(m, void *));
	*len = m->pkt_len;
	pktbuf = rte_pktmbuf_mtod(m, uint8_t *);

	/* enqueue the pkt ptr in mbuf */
	dpc->rmbufs[ifidx].m_table[index] = m;

	return pktbuf;
}
/*----------------------------------------------------------------------------*/
int
dpdk_get_nif(struct ifreq *ifr)
{
	int i;
	static int num_dev = -1;
	static struct ether_addr ports_eth_addr[RTE_MAX_ETHPORTS];
	/* get mac addr entries of 'detected' dpdk ports */
	if (num_dev < 0) {
		num_dev = rte_eth_dev_count();
		for (i = 0; i < num_dev; i++)
			rte_eth_macaddr_get(i, &ports_eth_addr[i]);
	}

	for (i = 0; i < num_dev; i++)
		if (!memcmp(&ifr->ifr_addr.sa_data[0], &ports_eth_addr[i], ETH_ALEN))
			return i;

	return -1;
}
/*----------------------------------------------------------------------------*/
int32_t
dpdk_select(struct mtcp_thread_context *ctxt)
{
#ifdef RX_IDLE_ENABLE
	struct dpdk_private_context *dpc;

	dpc = (struct dpdk_private_context *) ctxt->io_private_context;
	if (dpc->rx_idle > RX_IDLE_THRESH) {
		dpc->rx_idle = 0;
		usleep(RX_IDLE_TIMEOUT);
	}
#endif
	return 0;
}
/*----------------------------------------------------------------------------*/
void
dpdk_destroy_handle(struct mtcp_thread_context *ctxt)
{
	struct dpdk_private_context *dpc;
	int i;

	dpc = (struct dpdk_private_context *) ctxt->io_private_context;

	/* free wmbufs */
	for (i = 0; i < g_config.mos->netdev_table->num; i++)
		free_pkts(dpc->wmbufs[i].m_table, MAX_PKT_BURST);

#ifdef ENABLE_STATS_IOCTL
	/* free fd */
	if (dpc->fd >= 0)
		close(dpc->fd);
#endif /* !ENABLE_STATS_IOCTL */

	/* free it all up */
	free(dpc);
}
/*----------------------------------------------------------------------------*/
static void
check_all_ports_link_status(uint8_t port_num, uint32_t port_mask)
{
#define CHECK_INTERVAL 			100 /* 100ms */
#define MAX_CHECK_TIME 			90 /* 9s (90 * 100ms) in total */

	uint8_t portid, count, all_ports_up, print_flag = 0;
	struct rte_eth_link link;

	printf("\nChecking link status");
	fflush(stdout);
	for (count = 0; count <= MAX_CHECK_TIME; count++) {
		all_ports_up = 1;
		for (portid = 0; portid < port_num; portid++) {
			if ((port_mask & (1 << portid)) == 0)
				continue;
			memset(&link, 0, sizeof(link));
			rte_eth_link_get_nowait(portid, &link);
			/* print link status if flag set */
			if (print_flag == 1) {
				if (link.link_status)
					printf("Port %d Link Up - speed %u Mbps - %s\n",
					       (uint8_t)portid,
					       (unsigned)link.link_speed,
					       (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
					       ("full-duplex") : ("half-duplex"));
				else
					printf("Port %d Link Down\n",
					       (uint8_t)portid);
				continue;
			}
			/* clear all_ports_up flag if any link down */
			if (link.link_status == 0) {
				all_ports_up = 0;
				break;
			}
		}
		/* after finally printing all link status, get out */
		if (print_flag == 1)
			break;

		if (all_ports_up == 0) {
			printf(".");
			fflush(stdout);
			rte_delay_ms(CHECK_INTERVAL);
		}

		/* set the print_flag if all ports up or timeout */
		if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
			print_flag = 1;
			printf("done\n");
		}
	}
}
/*----------------------------------------------------------------------------*/
int32_t
dpdk_dev_ioctl(struct mtcp_thread_context *ctx, int nif, int cmd, void *argp)
{
	struct dpdk_private_context *dpc;
	struct rte_mbuf *m;
	int len_of_mbuf;
	struct iphdr *iph;
	struct tcphdr *tcph;
	RssInfo *rss_i;

	iph = (struct iphdr *)argp;
	dpc = (struct dpdk_private_context *)ctx->io_private_context;
	len_of_mbuf = dpc->wmbufs[nif].len;
	rss_i = NULL;

	switch (cmd) {
	case PKT_TX_IP_CSUM:
		m = dpc->wmbufs[nif].m_table[len_of_mbuf - 1];
		m->ol_flags = PKT_TX_IP_CKSUM | PKT_TX_IPV4;
		m->l2_len = sizeof(struct ether_hdr);
		m->l3_len = (iph->ihl<<2);
		break;
	case PKT_TX_TCP_CSUM:
		m = dpc->wmbufs[nif].m_table[len_of_mbuf - 1];
		tcph = (struct tcphdr *)((unsigned char *)iph + (iph->ihl<<2));
		m->ol_flags |= PKT_TX_TCP_CKSUM;
		/* hardware TX checksum offload expects the TCP checksum field
		 * to be pre-filled with the IPv4 pseudo-header checksum */
		tcph->check = rte_ipv4_phdr_cksum((struct ipv4_hdr *)iph, m->ol_flags);
		break;
	case PKT_RX_RSS:
		rss_i = (RssInfo *)argp;
		m = dpc->pkts_burst[rss_i->pktidx];
		rss_i->hash_value = m->hash.rss;
		break;
	default:
		goto dev_ioctl_err;
	}

	return 0;
 dev_ioctl_err:
	return -1;
}
/*----------------------------------------------------------------------------*/
void
dpdk_load_module_upper_half(void)
{
	int cpu = g_config.mos->num_cores, ret;
	uint32_t cpumask = 0;
	char cpumaskbuf[10];
	char mem_channels[5];

	/* set the log level */
	rte_set_log_type(RTE_LOGTYPE_PMD, 0);
	rte_set_log_type(RTE_LOGTYPE_MALLOC, 0);
	rte_set_log_type(RTE_LOGTYPE_MEMPOOL, 0);
	rte_set_log_type(RTE_LOGTYPE_RING, 0);
	rte_set_log_level(RTE_LOG_WARNING);

	/* get the cpu mask */
	for (ret = 0; ret < cpu; ret++)
		cpumask = (cpumask | (1 << ret));
	sprintf(cpumaskbuf, "%X", cpumask);

	/* get the mem channels per socket */
	if (g_config.mos->nb_mem_channels == 0) {
		TRACE_ERROR("DPDK module requires # of memory channels "
				"per socket parameter!\n");
		exit(EXIT_FAILURE);
	}
	sprintf(mem_channels, "%d", g_config.mos->nb_mem_channels);

	/* initialize the rte env first, what a waste of implementation effort! */
	char *argv[] = {"",
			"-c",
			cpumaskbuf,
			"-n",
			mem_channels,
			"--proc-type=auto",
			""
	};
	const int argc = 6;

	/*
	 * Reset the getopt extern variable optind (this issue was painful to
	 * debug): rte_eal_init() internally uses getopt(), and mtcp
	 * applications that also use an `external' getopt will crash if
	 * optind is not reset to zero before the call below.
	 * See man getopt(3) for more details.
	 */
	optind = 0;

	/* initialize the dpdk eal env */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid EAL args!\n");
}
/*----------------------------------------------------------------------------*/
void
dpdk_load_module_lower_half(void)
{
	int portid, rxlcore_id, ret;
	struct rte_eth_fc_conf fc_conf;	/* for Ethernet flow control settings */
	/* setting the rss key */
	static const uint8_t key[] = {
		0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
		0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
		0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
		0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05
	};
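	/*
	 * Note: a key made of a single repeated byte makes the Toeplitz RSS
	 * hash symmetric, presumably so that both directions of a TCP
	 * connection hash to the same queue (and thus the same mTCP core).
	 */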

	port_conf.rx_adv_conf.rss_conf.rss_key = (uint8_t *)&key;
	port_conf.rx_adv_conf.rss_conf.rss_key_len = sizeof(key);

	/* reset the cpu-to-queue-id mapping (0xFF marks an unassigned queue) */
	memset(cpu_qid_map, 0xFF, sizeof(cpu_qid_map));

	if (!g_config.mos->multiprocess
			|| (g_config.mos->multiprocess && g_config.mos->multiprocess_is_master)) {
		for (rxlcore_id = 0; rxlcore_id < g_config.mos->num_cores; rxlcore_id++) {
			char name[20];
			sprintf(name, "mbuf_pool-%d", rxlcore_id);
			/* create the mbuf pools */
			pktmbuf_pool[rxlcore_id] =
				rte_mempool_create(name, NB_MBUF,
						   MBUF_SIZE, MEMPOOL_CACHE_SIZE,
						   sizeof(struct rte_pktmbuf_pool_private),
						   rte_pktmbuf_pool_init, NULL,
						   rte_pktmbuf_init, NULL,
						   rte_lcore_to_socket_id(rxlcore_id), 0);
			if (pktmbuf_pool[rxlcore_id] == NULL)
				rte_exit(EXIT_FAILURE, "Cannot init mbuf pool\n");
		}

		/* Initialise each port */
		for (portid = 0; portid < g_config.mos->netdev_table->num; portid++) {
			int num_queue = 0, eth_idx, i, queue_id;

			for (eth_idx = 0; eth_idx < g_config.mos->netdev_table->num; eth_idx++)
				if (portid == g_config.mos->netdev_table->ent[eth_idx]->ifindex)
					break;
			if (eth_idx == g_config.mos->netdev_table->num)
				continue;
			/* count the CPUs set in this port's cpu_mask */
			for (i = 0; i < sizeof(uint64_t) * 8; i++)
				if (g_config.mos->netdev_table->ent[eth_idx]->cpu_mask & (1L << i))
					num_queue++;

			/* set 'num_queues' (used for GetRSSCPUCore() in util.c) */
			num_queues = num_queue;

			/* init port */
			printf("Initializing port %u... ", (unsigned) portid);
			fflush(stdout);
			ret = rte_eth_dev_configure(portid, num_queue, num_queue,
						    &port_conf);
			if (ret < 0)
				rte_exit(EXIT_FAILURE, "Cannot configure device: "
					 "err=%d, port=%u\n",
					 ret, (unsigned) portid);

			/* init one RX queue per CPU */
			fflush(stdout);
#ifdef DEBUG
			rte_eth_macaddr_get(portid, &ports_eth_addr[portid]);
#endif
			queue_id = 0;
			for (rxlcore_id = 0; rxlcore_id < g_config.mos->num_cores; rxlcore_id++) {
				if (!(g_config.mos->netdev_table->ent[eth_idx]->cpu_mask & (1L << rxlcore_id)))
					continue;
				ret = rte_eth_rx_queue_setup(portid, queue_id, nb_rxd,
							     rte_eth_dev_socket_id(portid), &rx_conf,
							     pktmbuf_pool[rxlcore_id]);
				if (ret < 0)
					rte_exit(EXIT_FAILURE, "rte_eth_rx_queue_setup: "
						 "err=%d, port=%u, queueid: %d\n",
						 ret, (unsigned) portid, queue_id);
				cpu_qid_map[portid][rxlcore_id] = queue_id++;
			}

			/* init one TX queue on each port per CPU (this is redundant for
			 * this app) */
			fflush(stdout);
			queue_id = 0;
			for (rxlcore_id = 0; rxlcore_id < g_config.mos->num_cores; rxlcore_id++) {
				if (!(g_config.mos->netdev_table->ent[eth_idx]->cpu_mask & (1L << rxlcore_id)))
					continue;
				ret = rte_eth_tx_queue_setup(portid, queue_id, nb_txd,
							     rte_eth_dev_socket_id(portid), &tx_conf);
				if (ret < 0)
					rte_exit(EXIT_FAILURE, "rte_eth_tx_queue_setup: "
						 "err=%d, port=%u, queueid: %d\n",
						 ret, (unsigned) portid, queue_id);
				queue_id++;
			}

			/* Start device */
			ret = rte_eth_dev_start(portid);
			if (ret < 0)
				rte_exit(EXIT_FAILURE, "rte_eth_dev_start: err=%d, port=%u\n",
					 ret, (unsigned) portid);

			printf("done: \n");
			rte_eth_promiscuous_enable(portid);

			/* retrieve current flow control settings per port */
			memset(&fc_conf, 0, sizeof(fc_conf));
			ret = rte_eth_dev_flow_ctrl_get(portid, &fc_conf);
			if (ret != 0) {
				rte_exit(EXIT_FAILURE, "Failed to get flow control info!\n");
			}

			/* and just disable the rx/tx flow control */
			fc_conf.mode = RTE_FC_NONE;
			ret = rte_eth_dev_flow_ctrl_set(portid, &fc_conf);
			if (ret != 0) {
				rte_exit(EXIT_FAILURE, "Failed to set flow control info!: errno: %d\n",
					 ret);
			}

#ifdef DEBUG
			printf("Port %u, MAC address: %02X:%02X:%02X:%02X:%02X:%02X\n\n",
			       (unsigned) portid,
			       ports_eth_addr[portid].addr_bytes[0],
			       ports_eth_addr[portid].addr_bytes[1],
			       ports_eth_addr[portid].addr_bytes[2],
			       ports_eth_addr[portid].addr_bytes[3],
			       ports_eth_addr[portid].addr_bytes[4],
			       ports_eth_addr[portid].addr_bytes[5]);
#endif
		}
	} else { /* g_config.mos->multiprocess && !g_config.mos->multiprocess_is_master */
		for (rxlcore_id = 0; rxlcore_id < g_config.mos->num_cores; rxlcore_id++) {
			char name[20];
			sprintf(name, "mbuf_pool-%d", rxlcore_id);
			/* look up the mbuf pools created by the master process */
			pktmbuf_pool[rxlcore_id] =
				rte_mempool_lookup(name);
			if (pktmbuf_pool[rxlcore_id] == NULL)
				rte_exit(EXIT_FAILURE, "Cannot init mbuf pool\n");
		}
	}

	check_all_ports_link_status(g_config.mos->netdev_table->num, 0xFFFFFFFF);
}
/*----------------------------------------------------------------------------*/
io_module_func dpdk_module_func = {
	.load_module_upper_half	= dpdk_load_module_upper_half,
	.load_module_lower_half	= dpdk_load_module_lower_half,
	.init_handle		= dpdk_init_handle,
	.link_devices		= NULL,
	.release_pkt		= NULL,
	.send_pkts		= dpdk_send_pkts,
	.get_wptr		= dpdk_get_wptr,
	.recv_pkts		= dpdk_recv_pkts,
	.get_rptr		= dpdk_get_rptr,
	.get_nif		= dpdk_get_nif,
	.select			= dpdk_select,
	.destroy_handle		= dpdk_destroy_handle,
	.dev_ioctl		= dpdk_dev_ioctl,
	.set_wptr		= dpdk_set_wptr,
};
/*----------------------------------------------------------------------------*/