/* for io_module_func def'ns */
#include "io_module.h"
/* for mtcp related def'ns */
#include "mtcp.h"
/* for errno */
#include <errno.h>
/* for close/optind */
#include <unistd.h>
/* for logging */
#include "debug.h"
/* for num_devices_* */
#include "config.h"
/* for rte_max_eth_ports */
#include <rte_common.h>
/* for rte_eth_rxconf */
#include <rte_ethdev.h>
/* for delay funcs */
#include <rte_cycles.h>
/* for ip pseudo-chksum */
#include <rte_ip.h>
#define ENABLE_STATS_IOCTL		1
#ifdef ENABLE_STATS_IOCTL
/* for open */
#include <fcntl.h>
/* for ioctl */
#include <sys/ioctl.h>
#endif /* !ENABLE_STATS_IOCTL */
/*----------------------------------------------------------------------------*/
/* Essential macros */
#define MAX_RX_QUEUE_PER_LCORE		MAX_CPUS
#define MAX_TX_QUEUE_PER_PORT		MAX_CPUS

#define MBUF_SIZE 			(2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
#define NB_MBUF				8192
#define MEMPOOL_CACHE_SIZE		256
//#define RX_IDLE_ENABLE			1
#define RX_IDLE_TIMEOUT			1	/* in micro-seconds */
#define RX_IDLE_THRESH			64

/*
 * RX and TX Prefetch, Host, and Write-back threshold values should be
 * carefully set for optimal performance. Consult the network
 * controller's datasheet and supporting DPDK documentation for guidance
 * on how these parameters should be set.
 */
#define RX_PTHRESH 			8 /**< Default values of RX prefetch threshold reg. */
#define RX_HTHRESH 			8 /**< Default values of RX host threshold reg. */
#define RX_WTHRESH 			4 /**< Default values of RX write-back threshold reg. */

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
#define TX_PTHRESH 			36 /**< Default values of TX prefetch threshold reg. */
#define TX_HTHRESH			0  /**< Default values of TX host threshold reg. */
#define TX_WTHRESH			0  /**< Default values of TX write-back threshold reg. */

#define MAX_PKT_BURST			64

/*
 * Configurable number of RX/TX ring descriptors
 */
#define RTE_TEST_RX_DESC_DEFAULT	128
#define RTE_TEST_TX_DESC_DEFAULT	512

static uint16_t nb_rxd = RTE_TEST_RX_DESC_DEFAULT;
static uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT;
/*----------------------------------------------------------------------------*/
/* packet memory pools for storing packet bufs */
static struct rte_mempool *pktmbuf_pool[MAX_CPUS] = {NULL};
static uint8_t cpu_qid_map[RTE_MAX_ETHPORTS][MAX_CPUS] = {{0}};

//#define DEBUG				1
#ifdef DEBUG
/* ethernet addresses of ports */
static struct ether_addr ports_eth_addr[RTE_MAX_ETHPORTS];
#endif

static struct rte_eth_conf port_conf = {
	.rxmode = {
		.mq_mode	= 	ETH_MQ_RX_RSS,
		.max_rx_pkt_len = 	ETHER_MAX_LEN,
		.split_hdr_size = 	0,
		.header_split   = 	0, /**< Header Split disabled */
		.hw_ip_checksum = 	1, /**< IP checksum offload enabled */
		.hw_vlan_filter = 	0, /**< VLAN filtering disabled */
		.jumbo_frame    = 	0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 	1, /**< CRC stripped by hardware */
	},
	.rx_adv_conf = {
		.rss_conf = {
			.rss_key = 	NULL,
			.rss_hf = 	ETH_RSS_TCP | ETH_RSS_UDP |
					ETH_RSS_IP | ETH_RSS_L2_PAYLOAD
		},
	},
	.txmode = {
		.mq_mode = 		ETH_MQ_TX_NONE,
	},
};
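
/*
 * Illustration only (not compiled): with ETH_MQ_RX_RSS above, the NIC runs
 * the Toeplitz hash over each flow's 4-tuple and steers the packet to an RX
 * queue. A minimal software sketch of that mapping using DPDK's rte_softrss()
 * from <rte_thash.h> -- the helper name and plain modulo reduction are
 * assumptions for illustration (real NICs index an indirection table with the
 * low hash bits):
 */
#if 0
#include <rte_thash.h>

static uint16_t
sketch_rss_queue(uint32_t src_ip, uint32_t dst_ip,
		 uint16_t src_port, uint16_t dst_port,
		 const uint8_t *rss_key, uint16_t nb_rx_queues)
{
	/* Toeplitz input tuple (host byte order): saddr, daddr, ports */
	uint32_t tuple[3];

	tuple[0] = src_ip;
	tuple[1] = dst_ip;
	tuple[2] = ((uint32_t)src_port << 16) | dst_port;

	/* same hash value the NIC would deliver in mbuf->hash.rss */
	return (uint16_t)(rte_softrss(tuple, 3, rss_key) % nb_rx_queues);
}
#endif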

static const struct rte_eth_rxconf rx_conf = {
	.rx_thresh = {
		.pthresh = 		RX_PTHRESH, /* RX prefetch threshold reg */
		.hthresh = 		RX_HTHRESH, /* RX host threshold reg */
		.wthresh = 		RX_WTHRESH, /* RX write-back threshold reg */
	},
	.rx_free_thresh = 		32,
};

static const struct rte_eth_txconf tx_conf = {
	.tx_thresh = {
		.pthresh = 		TX_PTHRESH, /* TX prefetch threshold reg */
		.hthresh = 		TX_HTHRESH, /* TX host threshold reg */
		.wthresh = 		TX_WTHRESH, /* TX write-back threshold reg */
	},
	.tx_free_thresh = 		0, /* Use PMD default values */
	.tx_rs_thresh = 		0, /* Use PMD default values */
	/*
	 * Leave txq_flags clear (0x0) so that multi-segment mbufs and TX
	 * offloads stay enabled; checksum offload is requested per packet
	 * via dpdk_dev_ioctl() below.
	 */
	.txq_flags = 			0x0,
};

struct mbuf_table {
	unsigned len; /* length of queued packets */
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct dpdk_private_context {
	struct mbuf_table rmbufs[RTE_MAX_ETHPORTS];
	struct mbuf_table wmbufs[RTE_MAX_ETHPORTS];
	struct rte_mempool *pktmbuf_pool;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
#ifdef RX_IDLE_ENABLE
	uint8_t rx_idle;
#endif
#ifdef ENABLE_STATS_IOCTL
	int fd;
#endif /* !ENABLE_STATS_IOCTL */
} __rte_cache_aligned;

#ifdef ENABLE_STATS_IOCTL
/**
 * stats struct passed on from user space to the driver
 */
struct stats_struct {
	uint64_t tx_bytes;
	uint64_t tx_pkts;
	uint64_t rx_bytes;
	uint64_t rx_pkts;
	uint8_t qid;
	uint8_t dev;
};
#endif /* !ENABLE_STATS_IOCTL */
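
/*
 * For reference only: a hypothetical sketch of the receiving end inside the
 * dpdk-iface kernel module (which lives outside this file), illustrating how
 * stats_struct crosses the user/kernel boundary. The handler name is made up;
 * the cmd value 0 mirrors the ioctl() call in dpdk_send_pkts() below.
 */
#if 0
static long
dpdk_iface_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct stats_struct ss;

	if (copy_from_user(&ss, (void __user *)arg, sizeof(ss)))
		return -EFAULT;
	/* ... fold ss.{rx,tx}_{pkts,bytes} into the per-qid/per-dev
	 * counters the module exposes to ifconfig/ethtool ... */
	return 0;
}
#endif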
/*----------------------------------------------------------------------------*/
void
dpdk_init_handle(struct mtcp_thread_context *ctxt)
{
	struct dpdk_private_context *dpc;
	int i, j;
	char mempool_name[20];

	/* create and initialize private I/O module context */
	ctxt->io_private_context = calloc(1, sizeof(struct dpdk_private_context));
	if (ctxt->io_private_context == NULL) {
		TRACE_ERROR("Failed to initialize ctxt->io_private_context: "
			    "Can't allocate memory\n");
		exit(EXIT_FAILURE);
	}

	sprintf(mempool_name, "mbuf_pool-%d", ctxt->cpu);
	dpc = (struct dpdk_private_context *)ctxt->io_private_context;
	dpc->pktmbuf_pool = pktmbuf_pool[ctxt->cpu];

	/* pre-allocate wmbufs for each registered port */
	for (j = 0; j < g_config.mos->netdev_table->num; j++) {
		for (i = 0; i < MAX_PKT_BURST; i++) {
			dpc->wmbufs[j].m_table[i] = rte_pktmbuf_alloc(pktmbuf_pool[ctxt->cpu]);
			if (dpc->wmbufs[j].m_table[i] == NULL) {
				TRACE_ERROR("Failed to allocate %d:wmbuf[%d] on device %d!\n",
					    ctxt->cpu, i, j);
				exit(EXIT_FAILURE);
			}
		}
		/* set mbuf queue length to 0 to begin with */
		dpc->wmbufs[j].len = 0;
	}

#ifdef ENABLE_STATS_IOCTL
	dpc->fd = open("/dev/dpdk-iface", O_RDWR);
	if (dpc->fd == -1) {
		TRACE_ERROR("Can't open /dev/dpdk-iface for ctxt->cpu: %d! "
			    "Are you using the mlx4/mlx5 driver?\n",
			    ctxt->cpu);
	}
#endif /* !ENABLE_STATS_IOCTL */
}
/*----------------------------------------------------------------------------*/
int
dpdk_send_pkts(struct mtcp_thread_context *ctxt, int nif)
{
	struct dpdk_private_context *dpc;
	mtcp_manager_t mtcp;
	int ret;
	int qid;

	dpc = (struct dpdk_private_context *)ctxt->io_private_context;
	mtcp = ctxt->mtcp_manager;
	ret = 0;
	qid = cpu_qid_map[nif][ctxt->cpu];

	/* if the queue is unassigned, skip it */
	if (unlikely(qid == 0xFF))
		return 0;

	/* if there are packets in the queue... flush them out to the wire */
	if (dpc->wmbufs[nif].len > 0) {
		struct rte_mbuf **pkts;
#ifdef ENABLE_STATS_IOCTL
		struct stats_struct ss;
#endif /* !ENABLE_STATS_IOCTL */
		int cnt = dpc->wmbufs[nif].len;
		pkts = dpc->wmbufs[nif].m_table;
#ifdef NETSTAT
		mtcp->nstat.tx_packets[nif] += cnt;
#ifdef ENABLE_STATS_IOCTL
		if (likely(dpc->fd >= 0)) {
			ss.tx_pkts = mtcp->nstat.tx_packets[nif];
			ss.tx_bytes = mtcp->nstat.tx_bytes[nif];
			ss.rx_pkts = mtcp->nstat.rx_packets[nif];
			ss.rx_bytes = mtcp->nstat.rx_bytes[nif];
			ss.qid = ctxt->cpu;
			ss.dev = nif;
			ioctl(dpc->fd, 0, &ss);
		}
#endif /* !ENABLE_STATS_IOCTL */
#endif
		do {
			/* tx cnt # of packets */
			ret = rte_eth_tx_burst(nif, qid,
					       pkts, cnt);
			pkts += ret;
			cnt -= ret;
			/* if not all pkts were sent... then repeat the cycle */
		} while (cnt > 0);

#ifndef SHARE_IO_BUFFER
		int i;
		/* time to allocate fresh mbufs for the queue */
		for (i = 0; i < dpc->wmbufs[nif].len; i++) {
			dpc->wmbufs[nif].m_table[i] = rte_pktmbuf_alloc(pktmbuf_pool[ctxt->cpu]);
			/* error checking */
			if (unlikely(dpc->wmbufs[nif].m_table[i] == NULL)) {
				TRACE_ERROR("Failed to allocate %d:wmbuf[%d] on device %d!\n",
					    ctxt->cpu, i, nif);
				exit(EXIT_FAILURE);
			}
		}
#endif
		/* reset the queue length after the flush */
		dpc->wmbufs[nif].len = 0;
	}

	return ret;
}
/*----------------------------------------------------------------------------*/
uint8_t *
dpdk_get_wptr(struct mtcp_thread_context *ctxt, int nif, uint16_t pktsize)
{
	struct dpdk_private_context *dpc;
	mtcp_manager_t mtcp;
	struct rte_mbuf *m;
	uint8_t *ptr;
	int len_of_mbuf;

	dpc = (struct dpdk_private_context *) ctxt->io_private_context;
	mtcp = ctxt->mtcp_manager;

	/* sanity check: the burst queue may already be full */
	if (unlikely(dpc->wmbufs[nif].len == MAX_PKT_BURST))
		return NULL;

	len_of_mbuf = dpc->wmbufs[nif].len;
	m = dpc->wmbufs[nif].m_table[len_of_mbuf];

	/* retrieve the right write offset */
	ptr = rte_pktmbuf_mtod(m, uint8_t *);
	m->pkt_len = m->data_len = pktsize;
	m->nb_segs = 1;
	m->next = NULL;

#ifdef NETSTAT
	mtcp->nstat.tx_bytes[nif] += pktsize + ETHER_OVR;
#endif

	/* increment the queue length */
	dpc->wmbufs[nif].len = len_of_mbuf + 1;

	return ptr;
}
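
/*
 * Usage sketch (assumed caller context, not part of this module): the stack
 * drains a packet by grabbing a write pointer, building the frame in place,
 * and flushing the per-port burst queue. In practice the flush is typically
 * deferred until the event loop ends or the burst queue fills; `frame' and
 * `pktsize' are placeholders:
 */
#if 0
	uint8_t *buf = dpdk_get_wptr(ctxt, nif, pktsize);
	if (buf != NULL) {
		memcpy(buf, frame, pktsize);	/* build ethernet frame in place */
		dpdk_send_pkts(ctxt, nif);	/* flush queued mbufs to the wire */
	}
#endif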
/*----------------------------------------------------------------------------*/
void
dpdk_set_wptr(struct mtcp_thread_context *ctxt, int out_nif, int in_nif, int index)
{
	struct dpdk_private_context *dpc;
	mtcp_manager_t mtcp;
	int len_of_mbuf;

	dpc = (struct dpdk_private_context *) ctxt->io_private_context;
	mtcp = ctxt->mtcp_manager;

	/* sanity check: the burst queue may already be full */
	if (unlikely(dpc->wmbufs[out_nif].len == MAX_PKT_BURST))
		return;

	len_of_mbuf = dpc->wmbufs[out_nif].len;
	dpc->wmbufs[out_nif].m_table[len_of_mbuf] =
		dpc->rmbufs[in_nif].m_table[index];

	/* mark as forwarded so that free_pkts() skips it on the rx side */
	dpc->wmbufs[out_nif].m_table[len_of_mbuf]->udata64 = 0;

#ifdef NETSTAT
	mtcp->nstat.tx_bytes[out_nif] += dpc->rmbufs[in_nif].m_table[index]->pkt_len + ETHER_OVR;
#endif

	/* increment the queue length */
	dpc->wmbufs[out_nif].len = len_of_mbuf + 1;

	return;
}
/*----------------------------------------------------------------------------*/
static inline void
free_pkts(struct rte_mbuf **mtable, unsigned len)
{
	unsigned i;

	/* free only the packets tagged as local (udata64 == 1);
	 * forwarded packets (udata64 == 0) are owned by the tx path */
	for (i = 0; i < len; i++) {
		if (mtable[i]->udata64 == 1) {
			rte_pktmbuf_free_seg(mtable[i]);
			if (i + 1 < len)
				RTE_MBUF_PREFETCH_TO_FREE(mtable[i+1]);
		}
	}
}
/*----------------------------------------------------------------------------*/
int32_t
dpdk_recv_pkts(struct mtcp_thread_context *ctxt, int ifidx)
{
	struct dpdk_private_context *dpc;
	int ret;
	uint8_t qid;

	dpc = (struct dpdk_private_context *) ctxt->io_private_context;
	qid = cpu_qid_map[ifidx][ctxt->cpu];

	/* if the queue is unassigned, skip it */
	if (qid == 0xFF)
		return 0;

	/* free the mbufs consumed in the previous cycle */
	if (dpc->rmbufs[ifidx].len != 0) {
		free_pkts(dpc->rmbufs[ifidx].m_table, dpc->rmbufs[ifidx].len);
		dpc->rmbufs[ifidx].len = 0;
	}

	ret = rte_eth_rx_burst((uint8_t)ifidx, qid,
			       dpc->pkts_burst, MAX_PKT_BURST);
#ifdef RX_IDLE_ENABLE
	dpc->rx_idle = (likely(ret != 0)) ? 0 : dpc->rx_idle + 1;
#endif
	dpc->rmbufs[ifidx].len = ret;

	return ret;
}
/*----------------------------------------------------------------------------*/
uint8_t *
dpdk_get_rptr(struct mtcp_thread_context *ctxt, int ifidx, int index, uint16_t *len)
{
	struct dpdk_private_context *dpc;
	struct rte_mbuf *m;
	uint8_t *pktbuf;

	dpc = (struct dpdk_private_context *) ctxt->io_private_context;

	m = dpc->pkts_burst[index];
	/* tag to check if the packet is a local or a forwarded pkt */
	m->udata64 = 1;
	/* don't enable pre-fetching... performance goes down */
	//rte_prefetch0(rte_pktmbuf_mtod(m, void *));
	*len = m->pkt_len;
	pktbuf = rte_pktmbuf_mtod(m, uint8_t *);

	/* enqueue the pkt ptr in the mbuf table */
	dpc->rmbufs[ifidx].m_table[index] = m;

	return pktbuf;
}
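
/*
 * Usage sketch (assumed caller context): the receive side pairs one
 * dpdk_recv_pkts() call with one dpdk_get_rptr() per received index:
 */
#if 0
	uint16_t len;
	int idx, cnt = dpdk_recv_pkts(ctxt, ifidx);
	for (idx = 0; idx < cnt; idx++) {
		uint8_t *pkt = dpdk_get_rptr(ctxt, ifidx, idx, &len);
		/* ... hand (pkt, len) to the TCP/IP stack ... */
	}
#endif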
/*----------------------------------------------------------------------------*/
int
dpdk_get_nif(struct ifreq *ifr)
{
	int i;
	static int num_dev = -1;
	static struct ether_addr ports_eth_addr[RTE_MAX_ETHPORTS];

	/* fetch mac addr entries of 'detected' dpdk ports (once) */
	if (num_dev < 0) {
		num_dev = rte_eth_dev_count();
		for (i = 0; i < num_dev; i++)
			rte_eth_macaddr_get(i, &ports_eth_addr[i]);
	}

	for (i = 0; i < num_dev; i++)
		if (!memcmp(&ifr->ifr_addr.sa_data[0], &ports_eth_addr[i], ETH_ALEN))
			return i;

	return -1;
}
/*----------------------------------------------------------------------------*/
int32_t
dpdk_select(struct mtcp_thread_context *ctxt)
{
#ifdef RX_IDLE_ENABLE
	struct dpdk_private_context *dpc;

	dpc = (struct dpdk_private_context *) ctxt->io_private_context;
	/* back off briefly once the rx queues have been idle for a while */
	if (dpc->rx_idle > RX_IDLE_THRESH) {
		dpc->rx_idle = 0;
		usleep(RX_IDLE_TIMEOUT);
	}
#endif
	return 0;
}
/*----------------------------------------------------------------------------*/
void
dpdk_destroy_handle(struct mtcp_thread_context *ctxt)
{
	struct dpdk_private_context *dpc;
	int i;

	dpc = (struct dpdk_private_context *) ctxt->io_private_context;

	/* free wmbufs */
	for (i = 0; i < g_config.mos->netdev_table->num; i++)
		free_pkts(dpc->wmbufs[i].m_table, MAX_PKT_BURST);

#ifdef ENABLE_STATS_IOCTL
	/* close the stats fd */
	if (dpc->fd >= 0)
		close(dpc->fd);
#endif /* !ENABLE_STATS_IOCTL */

	/* free it all up */
	free(dpc);
}
/*----------------------------------------------------------------------------*/
static void
check_all_ports_link_status(uint8_t port_num, uint32_t port_mask)
{
#define CHECK_INTERVAL 			100 /* 100ms */
#define MAX_CHECK_TIME 			90 /* 9s (90 * 100ms) in total */

	uint8_t portid, count, all_ports_up, print_flag = 0;
	struct rte_eth_link link;

	printf("\nChecking link status");
	fflush(stdout);
	for (count = 0; count <= MAX_CHECK_TIME; count++) {
		all_ports_up = 1;
		for (portid = 0; portid < port_num; portid++) {
			if ((port_mask & (1 << portid)) == 0)
				continue;
			memset(&link, 0, sizeof(link));
			rte_eth_link_get_nowait(portid, &link);
			/* print link status if flag set */
			if (print_flag == 1) {
				if (link.link_status)
					printf("Port %d Link Up - speed %u "
					       "Mbps - %s\n", (uint8_t)portid,
					       (unsigned)link.link_speed,
					       (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
					       ("full-duplex") : ("half-duplex"));
				else
					printf("Port %d Link Down\n",
					       (uint8_t)portid);
				continue;
			}
			/* clear all_ports_up flag if any link down */
			if (link.link_status == 0) {
				all_ports_up = 0;
				break;
			}
		}
		/* after finally printing all link status, get out */
		if (print_flag == 1)
			break;

		if (all_ports_up == 0) {
			printf(".");
			fflush(stdout);
			rte_delay_ms(CHECK_INTERVAL);
		}

		/* set the print_flag if all ports up or timeout */
		if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
			print_flag = 1;
			printf("done\n");
		}
	}
}
/*----------------------------------------------------------------------------*/
int32_t
dpdk_dev_ioctl(struct mtcp_thread_context *ctx, int nif, int cmd, void *argp)
{
	struct dpdk_private_context *dpc;
	struct rte_mbuf *m;
	int len_of_mbuf;
	struct iphdr *iph;
	struct tcphdr *tcph;
	RssInfo *rss_i;

	iph = (struct iphdr *)argp;
	dpc = (struct dpdk_private_context *)ctx->io_private_context;
	len_of_mbuf = dpc->wmbufs[nif].len;
	rss_i = NULL;

	switch (cmd) {
	case PKT_TX_IP_CSUM:
		m = dpc->wmbufs[nif].m_table[len_of_mbuf - 1];
		m->ol_flags = PKT_TX_IP_CKSUM | PKT_TX_IPV4;
		m->l2_len = sizeof(struct ether_hdr);
		m->l3_len = (iph->ihl<<2);
		break;
	case PKT_TX_TCP_CSUM:
		m = dpc->wmbufs[nif].m_table[len_of_mbuf - 1];
		tcph = (struct tcphdr *)((unsigned char *)iph + (iph->ihl<<2));
		m->ol_flags |= PKT_TX_TCP_CKSUM;
		/* hw tcp checksum offload expects the pseudo-header
		 * checksum pre-seeded in the checksum field */
		tcph->check = rte_ipv4_phdr_cksum((struct ipv4_hdr *)iph, m->ol_flags);
		break;
	case PKT_RX_RSS:
		rss_i = (RssInfo *)argp;
		m = dpc->pkts_burst[rss_i->pktidx];
		rss_i->hash_value = m->hash.rss;
		break;
	default:
		goto dev_ioctl_err;
	}

	return 0;
 dev_ioctl_err:
	return -1;
}
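
/*
 * Usage sketch (assumed caller context): for hardware checksum offload the
 * caller builds the IP/TCP headers in the buffer returned by dpdk_get_wptr()
 * first, then requests both offloads on that (last-queued) packet; `pktbuf'
 * is a placeholder for that buffer:
 */
#if 0
	struct iphdr *iph = (struct iphdr *)(pktbuf + sizeof(struct ether_hdr));
	dpdk_dev_ioctl(ctxt, nif, PKT_TX_IP_CSUM, iph);
	dpdk_dev_ioctl(ctxt, nif, PKT_TX_TCP_CSUM, iph);
#endif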
/*----------------------------------------------------------------------------*/
void
dpdk_load_module_upper_half(void)
{
	int cpu = g_config.mos->num_cores, ret;
	uint32_t cpumask = 0;
	char cpumaskbuf[10];
	char mem_channels[5];

	/* set the log level */
	rte_set_log_type(RTE_LOGTYPE_PMD, 0);
	rte_set_log_type(RTE_LOGTYPE_MALLOC, 0);
	rte_set_log_type(RTE_LOGTYPE_MEMPOOL, 0);
	rte_set_log_type(RTE_LOGTYPE_RING, 0);
	rte_set_log_level(RTE_LOG_WARNING);

	/* build the cpu mask */
	for (ret = 0; ret < cpu; ret++)
		cpumask = (cpumask | (1 << ret));
	sprintf(cpumaskbuf, "%X", cpumask);

	/* get the mem channels per socket */
	if (g_config.mos->nb_mem_channels == 0) {
		TRACE_ERROR("DPDK module requires the number of memory channels "
			    "per socket as a parameter!\n");
		exit(EXIT_FAILURE);
	}
	sprintf(mem_channels, "%d", g_config.mos->nb_mem_channels);

	/* initialize the rte env first */
	char *argv[] = {"",
			"-c",
			cpumaskbuf,
			"-n",
			mem_channels,
			"--proc-type=auto",
			""
	};
	const int argc = 6;

	/*
	 * Re-set the getopt extern variable optind.
	 * rte_eal_init() internally uses the getopt() library call;
	 * mtcp applications that also use an `external' getopt will
	 * crash if optind is not reset to zero before the call below.
	 * See man getopt(3) for details.
	 */
	optind = 0;

	/* initialize the dpdk eal env */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid EAL args!\n");
}
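
/*
 * For example, with num_cores = 8 and nb_mem_channels = 4 the code above is
 * equivalent to launching the process as:
 *
 *   <app> -c FF -n 4 --proc-type=auto
 */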
/*----------------------------------------------------------------------------*/
void
dpdk_load_module_lower_half(void)
{
	int portid, rxlcore_id, ret;
	struct rte_eth_fc_conf fc_conf;	/* for Ethernet flow control settings */
	/* the rss key below (repeated 0x05 bytes) makes the Toeplitz hash
	 * symmetric, so both directions of a connection land on the same
	 * rx queue */
	static const uint8_t key[] = {
		0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
		0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
		0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
		0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05
	};

	port_conf.rx_adv_conf.rss_conf.rss_key = (uint8_t *)&key;
	port_conf.rx_adv_conf.rss_conf.rss_key_len = sizeof(key);

	/* reset the cpu-to-qid mapping (0xFF marks an unassigned queue) */
	memset(cpu_qid_map, 0xFF, sizeof(cpu_qid_map));

	if (!g_config.mos->multiprocess
			|| (g_config.mos->multiprocess && g_config.mos->multiprocess_is_master)) {
		for (rxlcore_id = 0; rxlcore_id < g_config.mos->num_cores; rxlcore_id++) {
			char name[20];
			sprintf(name, "mbuf_pool-%d", rxlcore_id);
			/* create the mbuf pools */
			pktmbuf_pool[rxlcore_id] =
				rte_mempool_create(name, NB_MBUF,
						   MBUF_SIZE, MEMPOOL_CACHE_SIZE,
						   sizeof(struct rte_pktmbuf_pool_private),
						   rte_pktmbuf_pool_init, NULL,
						   rte_pktmbuf_init, NULL,
						   rte_lcore_to_socket_id(rxlcore_id), 0);
			if (pktmbuf_pool[rxlcore_id] == NULL)
				rte_exit(EXIT_FAILURE, "Cannot init mbuf pool\n");
		}

		/* initialize each port */
		for (portid = 0; portid < g_config.mos->netdev_table->num; portid++) {
			int num_queue = 0, eth_idx, i, queue_id;
			for (eth_idx = 0; eth_idx < g_config.mos->netdev_table->num; eth_idx++)
				if (portid == g_config.mos->netdev_table->ent[eth_idx]->ifindex)
					break;
			if (eth_idx == g_config.mos->netdev_table->num)
				continue;
			for (i = 0; i < sizeof(uint64_t) * 8; i++)
				if (g_config.mos->netdev_table->ent[eth_idx]->cpu_mask & (1L << i))
					num_queue++;

			/* set 'num_queues' (used for GetRSSCPUCore() in util.c) */
			num_queues = num_queue;

			/* init port */
			printf("Initializing port %u... ", (unsigned) portid);
			fflush(stdout);
			ret = rte_eth_dev_configure(portid, num_queue, num_queue,
						    &port_conf);
			if (ret < 0)
				rte_exit(EXIT_FAILURE, "Cannot configure device: "
					 "err=%d, port=%u\n",
					 ret, (unsigned) portid);

			/* init one RX queue per CPU */
			fflush(stdout);
#ifdef DEBUG
			rte_eth_macaddr_get(portid, &ports_eth_addr[portid]);
#endif
			queue_id = 0;
			for (rxlcore_id = 0; rxlcore_id < g_config.mos->num_cores; rxlcore_id++) {
				if (!(g_config.mos->netdev_table->ent[eth_idx]->cpu_mask & (1L << rxlcore_id)))
					continue;
				ret = rte_eth_rx_queue_setup(portid, queue_id, nb_rxd,
						rte_eth_dev_socket_id(portid), &rx_conf,
						pktmbuf_pool[rxlcore_id]);
				if (ret < 0)
					rte_exit(EXIT_FAILURE, "rte_eth_rx_queue_setup: "
						 "err=%d, port=%u, queueid: %d\n",
						 ret, (unsigned) portid, rxlcore_id);
				cpu_qid_map[portid][rxlcore_id] = queue_id++;
			}

			/* init one TX queue on each port per CPU (this is redundant for
			 * this app) */
			fflush(stdout);
			queue_id = 0;
			for (rxlcore_id = 0; rxlcore_id < g_config.mos->num_cores; rxlcore_id++) {
				if (!(g_config.mos->netdev_table->ent[eth_idx]->cpu_mask & (1L << rxlcore_id)))
					continue;
				ret = rte_eth_tx_queue_setup(portid, queue_id++, nb_txd,
						rte_eth_dev_socket_id(portid), &tx_conf);
				if (ret < 0)
					rte_exit(EXIT_FAILURE, "rte_eth_tx_queue_setup: "
						 "err=%d, port=%u, queueid: %d\n",
						 ret, (unsigned) portid, rxlcore_id);
			}

			/* start device */
			ret = rte_eth_dev_start(portid);
			if (ret < 0)
				rte_exit(EXIT_FAILURE, "rte_eth_dev_start: err=%d, port=%u\n",
					 ret, (unsigned) portid);

			printf("done:\n");
			rte_eth_promiscuous_enable(portid);

			/* retrieve current flow control settings per port */
			memset(&fc_conf, 0, sizeof(fc_conf));
			ret = rte_eth_dev_flow_ctrl_get(portid, &fc_conf);
			if (ret != 0) {
				rte_exit(EXIT_FAILURE, "Failed to get flow control info!\n");
			}

			/* and just disable the rx/tx flow control */
			fc_conf.mode = RTE_FC_NONE;
			ret = rte_eth_dev_flow_ctrl_set(portid, &fc_conf);
			if (ret != 0) {
				rte_exit(EXIT_FAILURE, "Failed to set flow control info!: errno: %d\n",
					 ret);
			}

#ifdef DEBUG
			printf("Port %u, MAC address: %02X:%02X:%02X:%02X:%02X:%02X\n\n",
			       (unsigned) portid,
			       ports_eth_addr[portid].addr_bytes[0],
			       ports_eth_addr[portid].addr_bytes[1],
			       ports_eth_addr[portid].addr_bytes[2],
			       ports_eth_addr[portid].addr_bytes[3],
			       ports_eth_addr[portid].addr_bytes[4],
			       ports_eth_addr[portid].addr_bytes[5]);
#endif
			/* only check for link status if the thread is master */
			check_all_ports_link_status(g_config.mos->netdev_table->num, 0xFFFFFFFF);
		}
	} else { /* g_config.mos->multiprocess && !g_config.mos->multiprocess_is_master */
		for (rxlcore_id = 0; rxlcore_id < g_config.mos->num_cores; rxlcore_id++) {
			char name[20];
			sprintf(name, "mbuf_pool-%d", rxlcore_id);
			/* look up the mbuf pools created by the master process */
			pktmbuf_pool[rxlcore_id] =
				rte_mempool_lookup(name);
			if (pktmbuf_pool[rxlcore_id] == NULL)
				rte_exit(EXIT_FAILURE, "Cannot find mbuf pool\n");
			for (portid = 0; portid < g_config.mos->netdev_table->num; portid++)
				cpu_qid_map[portid][rxlcore_id] = rxlcore_id;
		}
		/* set 'num_queues' (used for GetRSSCPUCore() in util.c) */
		num_queues = g_config.mos->num_cores;
	}
}
/*----------------------------------------------------------------------------*/
io_module_func dpdk_module_func = {
	.load_module_upper_half	= dpdk_load_module_upper_half,
	.load_module_lower_half	= dpdk_load_module_lower_half,
	.init_handle		= dpdk_init_handle,
	.link_devices		= NULL,
	.release_pkt		= NULL,
	.send_pkts		= dpdk_send_pkts,
	.get_wptr		= dpdk_get_wptr,
	.recv_pkts		= dpdk_recv_pkts,
	.get_rptr		= dpdk_get_rptr,
	.get_nif		= dpdk_get_nif,
	.select			= dpdk_select,
	.destroy_handle		= dpdk_destroy_handle,
	.dev_ioctl		= dpdk_dev_ioctl,
	.set_wptr		= dpdk_set_wptr,
};
/*----------------------------------------------------------------------------*/