/* for io_module_func def'ns */
#include "io_module.h"
/* for mtcp related def'ns */
#include "mtcp.h"
/* for errno */
#include <errno.h>
/* for logging */
#include "debug.h"
/* for num_devices_* */
#include "config.h"
/* for rte_max_eth_ports */
#include <rte_common.h>
/* for rte_eth_rxconf */
#include <rte_ethdev.h>
/* for delay funcs */
#include <rte_cycles.h>
/* for ip pseudo-chksum */
#include <rte_ip.h>
#define ENABLE_STATS_IOCTL		1
#ifdef ENABLE_STATS_IOCTL
/* for close */
#include <unistd.h>
/* for open */
#include <fcntl.h>
/* for ioctl */
#include <sys/ioctl.h>
#endif /* !ENABLE_STATS_IOCTL */
/*----------------------------------------------------------------------------*/
/* Essential macros */
#define MAX_RX_QUEUE_PER_LCORE		MAX_CPUS
#define MAX_TX_QUEUE_PER_PORT		MAX_CPUS

#define MBUF_SIZE 			(2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
#define NB_MBUF				8192
#define MEMPOOL_CACHE_SIZE		256
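
/*
 * Sizing note: each mbuf carries 2048B of packet data room plus the
 * rte_mbuf header and RTE_PKTMBUF_HEADROOM, enough for a full non-jumbo
 * Ethernet frame. NB_MBUF mbufs are allocated per per-core pool, each
 * pool with a MEMPOOL_CACHE_SIZE per-lcore cache to cut contention on
 * the pool ring.
 */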

/*
 * RX and TX Prefetch, Host, and Write-back threshold values should be
 * carefully set for optimal performance. Consult the network
 * controller's datasheet and supporting DPDK documentation for guidance
 * on how these parameters should be set.
 */
#define RX_PTHRESH 			8 /**< Default value of RX prefetch threshold reg. */
#define RX_HTHRESH 			8 /**< Default value of RX host threshold reg. */
#define RX_WTHRESH 			4 /**< Default value of RX write-back threshold reg. */

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
#define TX_PTHRESH 			36 /**< Default value of TX prefetch threshold reg. */
#define TX_HTHRESH			0  /**< Default value of TX host threshold reg. */
#define TX_WTHRESH			0  /**< Default value of TX write-back threshold reg. */

#define MAX_PKT_BURST			64

/*
 * Configurable number of RX/TX ring descriptors
 */
#define RTE_TEST_RX_DESC_DEFAULT	128
#define RTE_TEST_TX_DESC_DEFAULT	512

static uint16_t nb_rxd = RTE_TEST_RX_DESC_DEFAULT;
static uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT;
/*----------------------------------------------------------------------------*/
/* packet memory pools for storing packet bufs */
static struct rte_mempool *pktmbuf_pool[MAX_CPUS] = {NULL};
static uint8_t cpu_qid_map[RTE_MAX_ETHPORTS][MAX_CPUS] = {{0}};
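
/*
 * cpu_qid_map[port][cpu] maps a (port, lcore) pair to its RX/TX queue id.
 * Entries are reset to 0xFF ("no queue assigned") in
 * dpdk_load_module_lower_half() and filled in during RX queue setup;
 * the send/recv paths skip any pair that is still unassigned.
 */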

//#define DEBUG				1
#ifdef DEBUG
/* ethernet addresses of ports */
static struct ether_addr ports_eth_addr[RTE_MAX_ETHPORTS];
#endif

static struct rte_eth_conf port_conf = {
	.rxmode = {
		.mq_mode	= 	ETH_MQ_RX_RSS,
		.max_rx_pkt_len = 	ETHER_MAX_LEN,
		.split_hdr_size = 	0,
		.header_split   = 	0, /**< Header Split disabled */
		.hw_ip_checksum = 	1, /**< IP checksum offload enabled */
		.hw_vlan_filter = 	0, /**< VLAN filtering disabled */
		.jumbo_frame    = 	0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 	1, /**< CRC stripped by hardware */
	},
	.rx_adv_conf = {
		.rss_conf = {
			.rss_key = 	NULL,
			.rss_hf = 	ETH_RSS_TCP
		},
	},
	.txmode = {
		.mq_mode = 		ETH_MQ_TX_NONE,
	},
#if 0
	.fdir_conf = {
		.mode = RTE_FDIR_MODE_PERFECT,
		.pballoc = RTE_FDIR_PBALLOC_256K,
		.status = RTE_FDIR_REPORT_STATUS_ALWAYS,
		//.flexbytes_offset = 0x6,
		.drop_queue = 127,
	},
#endif
};

static const struct rte_eth_rxconf rx_conf = {
	.rx_thresh = {
		.pthresh = 		RX_PTHRESH, /* RX prefetch threshold reg */
		.hthresh = 		RX_HTHRESH, /* RX host threshold reg */
		.wthresh = 		RX_WTHRESH, /* RX write-back threshold reg */
	},
	.rx_free_thresh = 		32,
};

static const struct rte_eth_txconf tx_conf = {
	.tx_thresh = {
		.pthresh = 		TX_PTHRESH, /* TX prefetch threshold reg */
		.hthresh = 		TX_HTHRESH, /* TX host threshold reg */
		.wthresh = 		TX_WTHRESH, /* TX write-back threshold reg */
	},
	.tx_free_thresh = 		0, /* Use PMD default values */
	.tx_rs_thresh = 		0, /* Use PMD default values */
	/*
	 * Keep txq_flags clear (0x0) so the PMD retains its full-featured
	 * TX path, including multi-segment mbufs and checksum offloads.
	 */
	.txq_flags = 			0x0,
};

struct mbuf_table {
	unsigned len; /* length of queued packets */
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct dpdk_private_context {
	struct mbuf_table rmbufs[RTE_MAX_ETHPORTS];
	struct mbuf_table wmbufs[RTE_MAX_ETHPORTS];
	struct rte_mempool *pktmbuf_pool;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
#ifdef ENABLE_STATS_IOCTL
	int fd; /* file descriptor for /dev/dpdk-iface stats ioctls */
#endif /* !ENABLE_STATS_IOCTL */
} __rte_cache_aligned;

#ifdef ENABLE_STATS_IOCTL
/**
 * stats struct passed on from user space to the driver
 */
struct stats_struct {
	uint64_t tx_bytes;
	uint64_t tx_pkts;
	uint64_t rx_bytes;
	uint64_t rx_pkts;
	uint8_t qid;
	uint8_t dev;
};
#endif /* !ENABLE_STATS_IOCTL */
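
/*
 * A stats_struct snapshot per (dev, qid) is pushed to the dpdk-iface
 * kernel module via ioctl() in dpdk_send_pkts(); the module presumably
 * aggregates these so standard Linux tools can report per-port counters.
 */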
/*----------------------------------------------------------------------------*/
void
dpdk_init_handle(struct mtcp_thread_context *ctxt)
{
	struct dpdk_private_context *dpc;
	int i, j;
	char mempool_name[20];

	/* create and initialize private I/O module context */
	ctxt->io_private_context = calloc(1, sizeof(struct dpdk_private_context));
	if (ctxt->io_private_context == NULL) {
		TRACE_ERROR("Failed to initialize ctxt->io_private_context: "
			    "Can't allocate memory\n");
		exit(EXIT_FAILURE);
	}

	sprintf(mempool_name, "mbuf_pool-%d", ctxt->cpu);
	dpc = (struct dpdk_private_context *)ctxt->io_private_context;
	dpc->pktmbuf_pool = pktmbuf_pool[ctxt->cpu];

	/* set wmbufs correctly */
	for (j = 0; j < g_config.mos->netdev_table->num; j++) {
		/* Allocate wmbufs for each registered port */
		for (i = 0; i < MAX_PKT_BURST; i++) {
			dpc->wmbufs[j].m_table[i] = rte_pktmbuf_alloc(pktmbuf_pool[ctxt->cpu]);
			if (dpc->wmbufs[j].m_table[i] == NULL) {
				TRACE_ERROR("Failed to allocate %d:wmbuf[%d] on device %d!\n",
					    ctxt->cpu, i, j);
				exit(EXIT_FAILURE);
			}
		}
		/* set mbufs queue length to 0 to begin with */
		dpc->wmbufs[j].len = 0;
	}

#ifdef ENABLE_STATS_IOCTL
	dpc->fd = open("/dev/dpdk-iface", O_RDWR);
	if (dpc->fd == -1) {
		TRACE_ERROR("Can't open /dev/dpdk-iface for context->cpu: %d!\n",
			    ctxt->cpu);
		exit(EXIT_FAILURE);
	}
#endif /* !ENABLE_STATS_IOCTL */
}
/*----------------------------------------------------------------------------*/
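/*
 * TX path: mtcp claims the next free wmbuf slot through dpdk_get_wptr()
 * (or hands over a received mbuf through dpdk_set_wptr()), and
 * dpdk_send_pkts() then flushes the queued burst with rte_eth_tx_burst(),
 * looping until the PMD has accepted every packet.
 */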
int
dpdk_send_pkts(struct mtcp_thread_context *ctxt, int nif)
{
	struct dpdk_private_context *dpc;
	mtcp_manager_t mtcp;
	int ret;
	int qid;

	dpc = (struct dpdk_private_context *)ctxt->io_private_context;
	mtcp = ctxt->mtcp_manager;
	ret = 0;
	qid = cpu_qid_map[nif][ctxt->cpu];

	/* if the queue is unassigned, skip it */
	if (unlikely(qid == 0xFF))
		return 0;

	/* if there are packets in the queue... flush them out to the wire */
	if (dpc->wmbufs[nif].len > 0) {
		struct rte_mbuf **pkts;
#ifdef ENABLE_STATS_IOCTL
		struct stats_struct ss;
#endif /* !ENABLE_STATS_IOCTL */
		int cnt = dpc->wmbufs[nif].len;
		pkts = dpc->wmbufs[nif].m_table;
#ifdef NETSTAT
		mtcp->nstat.tx_packets[nif] += cnt;
#ifdef ENABLE_STATS_IOCTL
		ss.tx_pkts = mtcp->nstat.tx_packets[nif];
		ss.tx_bytes = mtcp->nstat.tx_bytes[nif];
		ss.rx_pkts = mtcp->nstat.rx_packets[nif];
		ss.rx_bytes = mtcp->nstat.rx_bytes[nif];
		ss.qid = ctxt->cpu;
		ss.dev = nif;
		ioctl(dpc->fd, 0, &ss);
#endif /* !ENABLE_STATS_IOCTL */
#endif
		do {
			/* tx cnt # of packets */
			ret = rte_eth_tx_burst(nif, qid,
					       pkts, cnt);
			pkts += ret;
			cnt -= ret;
			/* if not all pkts were sent... then repeat the cycle */
		} while (cnt > 0);

#ifndef SHARE_IO_BUFFER
		int i;
		/* time to allocate fresh mbufs for the queue */
		for (i = 0; i < dpc->wmbufs[nif].len; i++) {
			dpc->wmbufs[nif].m_table[i] = rte_pktmbuf_alloc(pktmbuf_pool[ctxt->cpu]);
			/* error checking */
			if (unlikely(dpc->wmbufs[nif].m_table[i] == NULL)) {
				TRACE_ERROR("Failed to allocate %d:wmbuf[%d] on device %d!\n",
					    ctxt->cpu, i, nif);
				exit(EXIT_FAILURE);
			}
		}
#endif
		/* reset the len of mbufs var after flushing of packets */
		dpc->wmbufs[nif].len = 0;
	}

	return ret;
}
/*----------------------------------------------------------------------------*/
uint8_t *
dpdk_get_wptr(struct mtcp_thread_context *ctxt, int nif, uint16_t pktsize)
{
	struct dpdk_private_context *dpc;
	mtcp_manager_t mtcp;
	struct rte_mbuf *m;
	uint8_t *ptr;
	int len_of_mbuf;

	dpc = (struct dpdk_private_context *) ctxt->io_private_context;
	mtcp = ctxt->mtcp_manager;

	/* sanity check */
	if (unlikely(dpc->wmbufs[nif].len == MAX_PKT_BURST))
		return NULL;

	len_of_mbuf = dpc->wmbufs[nif].len;
	m = dpc->wmbufs[nif].m_table[len_of_mbuf];

	/* retrieve the right write offset */
	ptr = (void *)rte_pktmbuf_mtod(m, struct ether_hdr *);
	m->pkt_len = m->data_len = pktsize;
	m->nb_segs = 1;
	m->next = NULL;

#ifdef NETSTAT
	/* + 24 accounts for the on-wire overhead per Ethernet frame
	 * (preamble + SFD + CRC + inter-frame gap) */
	mtcp->nstat.tx_bytes[nif] += pktsize + 24;
#endif

	/* increment the len_of_mbuf var */
	dpc->wmbufs[nif].len = len_of_mbuf + 1;

	return (uint8_t *)ptr;
}
/*----------------------------------------------------------------------------*/
void
dpdk_set_wptr(struct mtcp_thread_context *ctxt, int out_nif, int in_nif, int index)
{
	struct dpdk_private_context *dpc;
	mtcp_manager_t mtcp;
	int len_of_mbuf;

	dpc = (struct dpdk_private_context *) ctxt->io_private_context;
	mtcp = ctxt->mtcp_manager;

	/* sanity check */
	if (unlikely(dpc->wmbufs[out_nif].len == MAX_PKT_BURST))
		return;

	len_of_mbuf = dpc->wmbufs[out_nif].len;
	dpc->wmbufs[out_nif].m_table[len_of_mbuf] =
		dpc->rmbufs[in_nif].m_table[index];

	/* mark the mbuf as forwarded so free_pkts() won't release it */
	dpc->wmbufs[out_nif].m_table[len_of_mbuf]->udata64 = 0;

#ifdef NETSTAT
	/* + 24 accounts for on-wire overhead (preamble + SFD + CRC + IFG) */
	mtcp->nstat.tx_bytes[out_nif] += dpc->rmbufs[in_nif].m_table[index]->pkt_len + 24;
#endif

	/* increment the len_of_mbuf var */
	dpc->wmbufs[out_nif].len = len_of_mbuf + 1;

	return;
}
/*----------------------------------------------------------------------------*/
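/*
 * udata64 serves as an ownership tag: dpdk_get_rptr() marks a received
 * mbuf with 1 (to be consumed locally), while dpdk_set_wptr() clears it
 * to 0 when the mbuf is moved to a TX queue for forwarding. free_pkts()
 * below therefore only releases mbufs that were not forwarded.
 */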
static inline void
free_pkts(struct rte_mbuf **mtable, unsigned len)
{
	int i;

	/* free only the packets that were consumed locally */
	for (i = 0; i < len; i++) {
		if (mtable[i]->udata64 == 1) {
			rte_pktmbuf_free_seg(mtable[i]);
			RTE_MBUF_PREFETCH_TO_FREE(mtable[i+1]);
		}
	}
}
/*----------------------------------------------------------------------------*/
int32_t
dpdk_recv_pkts(struct mtcp_thread_context *ctxt, int ifidx)
{
	struct dpdk_private_context *dpc;
	int ret;
	uint8_t qid;

	dpc = (struct dpdk_private_context *) ctxt->io_private_context;
	qid = cpu_qid_map[ifidx][ctxt->cpu];

	/* if the queue is unassigned, skip it */
	if (qid == 0xFF)
		return 0;

	if (dpc->rmbufs[ifidx].len != 0) {
		free_pkts(dpc->rmbufs[ifidx].m_table, dpc->rmbufs[ifidx].len);
		dpc->rmbufs[ifidx].len = 0;
	}

	ret = rte_eth_rx_burst((uint8_t)ifidx, qid,
			       dpc->pkts_burst, MAX_PKT_BURST);

	dpc->rmbufs[ifidx].len = ret;

	return ret;
}
/*----------------------------------------------------------------------------*/
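/*
 * RX path: dpdk_recv_pkts() releases the previous burst and refills
 * pkts_burst via rte_eth_rx_burst(); dpdk_get_rptr() then hands each
 * packet's payload pointer to the mtcp stack.
 */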
uint8_t *
dpdk_get_rptr(struct mtcp_thread_context *ctxt, int ifidx, int index, uint16_t *len)
{
	struct dpdk_private_context *dpc;
	struct rte_mbuf *m;
	uint8_t *pktbuf;

	dpc = (struct dpdk_private_context *) ctxt->io_private_context;

	m = dpc->pkts_burst[index];
	/* tag the packet as locally consumed (cleared in dpdk_set_wptr()
	 * if it gets forwarded instead) */
	m->udata64 = 1;
	/* don't enable pre-fetching... performance goes down */
	//rte_prefetch0(rte_pktmbuf_mtod(m, void *));
	*len = m->pkt_len;
	pktbuf = rte_pktmbuf_mtod(m, uint8_t *);

	/* enqueue the pkt ptr in mbuf */
	dpc->rmbufs[ifidx].m_table[index] = m;

	return pktbuf;
}
/*----------------------------------------------------------------------------*/
int
dpdk_get_nif(struct ifreq *ifr)
{
	int i;
	static int num_dev = -1;
	static struct ether_addr ports_eth_addr[RTE_MAX_ETHPORTS];

	/* get mac addr entries of 'detected' dpdk ports */
	if (num_dev < 0) {
		num_dev = rte_eth_dev_count();
		for (i = 0; i < num_dev; i++)
			rte_eth_macaddr_get(i, &ports_eth_addr[i]);
	}

	for (i = 0; i < num_dev; i++)
		if (!memcmp(&ifr->ifr_addr.sa_data[0], &ports_eth_addr[i], ETH_ALEN))
			return i;

	return -1;
}
/*----------------------------------------------------------------------------*/
void
dpdk_destroy_handle(struct mtcp_thread_context *ctxt)
{
	struct dpdk_private_context *dpc;
	int i;

	dpc = (struct dpdk_private_context *) ctxt->io_private_context;

	/* free wmbufs */
	for (i = 0; i < g_config.mos->netdev_table->num; i++)
		free_pkts(dpc->wmbufs[i].m_table, MAX_PKT_BURST);

#ifdef ENABLE_STATS_IOCTL
	/* close the stats fd */
	close(dpc->fd);
#endif /* !ENABLE_STATS_IOCTL */

	/* free it all up */
	free(dpc);
}
/*----------------------------------------------------------------------------*/
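/*
 * Link-status poll loop, essentially the stock helper from the DPDK
 * sample applications: poll every CHECK_INTERVAL ms until all masked
 * ports report link up, or give up after MAX_CHECK_TIME iterations.
 */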
static void
check_all_ports_link_status(uint8_t port_num, uint32_t port_mask)
{
#define CHECK_INTERVAL 			100 /* 100ms */
#define MAX_CHECK_TIME 			90 /* 9s (90 * 100ms) in total */

	uint8_t portid, count, all_ports_up, print_flag = 0;
	struct rte_eth_link link;

	printf("\nChecking link status");
	fflush(stdout);
	for (count = 0; count <= MAX_CHECK_TIME; count++) {
		all_ports_up = 1;
		for (portid = 0; portid < port_num; portid++) {
			if ((port_mask & (1 << portid)) == 0)
				continue;
			memset(&link, 0, sizeof(link));
			rte_eth_link_get_nowait(portid, &link);
			/* print link status if flag set */
			if (print_flag == 1) {
				if (link.link_status)
					printf("Port %d Link Up - speed %u "
					       "Mbps - %s\n", (uint8_t)portid,
					       (unsigned)link.link_speed,
					       (link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
					       ("full-duplex") : ("half-duplex"));
				else
					printf("Port %d Link Down\n",
					       (uint8_t)portid);
				continue;
			}
			/* clear all_ports_up flag if any link down */
			if (link.link_status == 0) {
				all_ports_up = 0;
				break;
			}
		}
		/* after finally printing all link status, get out */
		if (print_flag == 1)
			break;

		if (all_ports_up == 0) {
			printf(".");
			fflush(stdout);
			rte_delay_ms(CHECK_INTERVAL);
		}

		/* set the print_flag if all ports up or timeout */
		if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
			print_flag = 1;
			printf("done\n");
		}
	}
}
/*----------------------------------------------------------------------------*/
#if 0
static void
dpdk_enable_fdir(int portid, uint8_t is_master)
{
	struct rte_fdir_masks fdir_masks;
	struct rte_fdir_filter fdir_filter;
	int ret;

	memset(&fdir_filter, 0, sizeof(struct rte_fdir_filter));
	fdir_filter.iptype = RTE_FDIR_IPTYPE_IPV4;
	fdir_filter.l4type = RTE_FDIR_L4TYPE_TCP;
	fdir_filter.ip_dst.ipv4_addr = g_config.mos->netdev_table->ent[portid]->ip_addr;

	if (is_master) {
		memset(&fdir_masks, 0, sizeof(struct rte_fdir_masks));
		fdir_masks.src_ipv4_mask = 0x0;
		fdir_masks.dst_ipv4_mask = 0xFFFFFFFF;
		fdir_masks.src_port_mask = 0x0;
		fdir_masks.dst_port_mask = 0x0;

		/*
		 * enable the following if the filter is IP-only
		 * (non-TCP, non-UDP)
		 */
		/* fdir_masks.only_ip_flow = 1; */
		rte_eth_dev_fdir_set_masks(portid, &fdir_masks);
		ret = rte_eth_dev_fdir_add_perfect_filter(portid,
							  &fdir_filter,
							  0,
							  g_config.mos->multiprocess_curr_core,
							  0);
	} else {
		ret = rte_eth_dev_fdir_update_perfect_filter(portid,
							     &fdir_filter,
							     0,
							     g_config.mos->multiprocess_curr_core,
							     0);
	}
	if (ret < 0) {
		rte_exit(EXIT_FAILURE,
			 "fdir_add_perfect_filter_t call failed!: %d\n",
			 ret);
	}
	fprintf(stderr, "Filter for device ifidx: %d added\n", portid);
}
#endif
/*----------------------------------------------------------------------------*/
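/*
 * Per-packet offload hooks. Note the DPDK checksum-offload convention:
 * with PKT_TX_TCP_CKSUM set, the hardware expects the TCP checksum field
 * to be pre-seeded with the IPv4 pseudo-header checksum, which is exactly
 * what rte_ipv4_phdr_cksum() computes in the PKT_TX_TCP_CSUM case below.
 */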
int32_t
dpdk_dev_ioctl(struct mtcp_thread_context *ctx, int nif, int cmd, void *argp)
{
	struct dpdk_private_context *dpc;
	struct rte_mbuf *m;
	int len_of_mbuf;
	struct iphdr *iph;
	struct tcphdr *tcph;
	RssInfo *rss_i;

	iph = (struct iphdr *)argp;
	dpc = (struct dpdk_private_context *)ctx->io_private_context;
	len_of_mbuf = dpc->wmbufs[nif].len;
	rss_i = NULL;

	switch (cmd) {
	case PKT_TX_IP_CSUM:
		m = dpc->wmbufs[nif].m_table[len_of_mbuf - 1];
		m->ol_flags = PKT_TX_OUTER_IP_CKSUM |
			PKT_TX_IP_CKSUM | PKT_TX_IPV4;
		m->l2_len = sizeof(struct ether_hdr);
		m->l3_len = (iph->ihl<<2);
		break;
	case PKT_TX_TCP_CSUM:
		m = dpc->wmbufs[nif].m_table[len_of_mbuf - 1];
		tcph = (struct tcphdr *)((unsigned char *)iph + (iph->ihl<<2));
		m->ol_flags |= PKT_TX_TCP_CKSUM;
		tcph->check = rte_ipv4_phdr_cksum((struct ipv4_hdr *)iph, m->ol_flags);
		break;
	case PKT_RX_RSS:
		rss_i = (RssInfo *)argp;
		m = dpc->pkts_burst[rss_i->pktidx];
		rss_i->hash_value = m->hash.rss;
		break;
	default:
		goto dev_ioctl_err;
	}

	return 0;
 dev_ioctl_err:
	return -1;
}
/*----------------------------------------------------------------------------*/
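/*
 * Module initialization is split in two halves: the upper half bootstraps
 * the DPDK EAL (and must run before any other DPDK call), while the lower
 * half does per-port setup and is meant to run only after the mOS
 * configuration (netdev_table, per-port CPU masks) has been parsed.
 */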
void
dpdk_load_module_upper_half(void)
{
	int cpu = g_config.mos->num_cores, ret;
	uint32_t cpumask = 0;
	char cpumaskbuf[10];
	char mem_channels[5];

	/* set the log level */
	rte_set_log_type(RTE_LOGTYPE_PMD, 0);
	rte_set_log_type(RTE_LOGTYPE_MALLOC, 0);
	rte_set_log_type(RTE_LOGTYPE_MEMPOOL, 0);
	rte_set_log_type(RTE_LOGTYPE_RING, 0);
	rte_set_log_level(RTE_LOG_WARNING);

	/* get the cpu mask */
	for (ret = 0; ret < cpu; ret++)
		cpumask = (cpumask | (1 << ret));
	sprintf(cpumaskbuf, "%X", cpumask);

	/* get the mem channels per socket */
	if (g_config.mos->nb_mem_channels == 0) {
		TRACE_ERROR("DPDK module requires # of memory channels "
				"per socket parameter!\n");
		exit(EXIT_FAILURE);
	}
	sprintf(mem_channels, "%d", g_config.mos->nb_mem_channels);

	/* build the rte env arguments first */
	char *argv[] = {"",
			"-c",
			cpumaskbuf,
			"-n",
			mem_channels,
			"--proc-type=auto",
			""
	};
	const int argc = 6;

	/*
	 * re-set the getopt extern variable optind.
	 * rte_eal_init() internally uses getopt(); mtcp applications
	 * that also use an `external' getopt will crash violently if
	 * optind is not reset to zero prior to calling the func below.
	 * see getopt(3) for more details.
	 */
	optind = 0;

	/* initialize the dpdk eal env */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid EAL args!\n");
}
/*----------------------------------------------------------------------------*/
void
dpdk_load_module_lower_half(void)
{
	int portid, rxlcore_id, ret;
	struct rte_eth_fc_conf fc_conf;	/* for Ethernet flow control settings */
	/* setting the rss key */
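	/*
	 * A Toeplitz key of repeated 0x05 bytes makes the RSS hash symmetric:
	 * both directions of a TCP connection hash to the same value, so
	 * they land on the same RX queue (and hence the same mtcp core).
	 */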
	static const uint8_t key[] = {
		0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
		0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
		0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
		0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05
	};

	port_conf.rx_adv_conf.rss_conf.rss_key = (uint8_t *)&key;
	port_conf.rx_adv_conf.rss_conf.rss_key_len = sizeof(key);

	/* reset the cpu_qid mapping (0xFF marks an unassigned queue) */
	memset(cpu_qid_map, 0xFF, sizeof(cpu_qid_map));

	if (!g_config.mos->multiprocess
			|| (g_config.mos->multiprocess && g_config.mos->multiprocess_is_master)) {
		for (rxlcore_id = 0; rxlcore_id < g_config.mos->num_cores; rxlcore_id++) {
			char name[20];
			sprintf(name, "mbuf_pool-%d", rxlcore_id);
			/* create the mbuf pools */
			pktmbuf_pool[rxlcore_id] =
				rte_mempool_create(name, NB_MBUF,
						   MBUF_SIZE, MEMPOOL_CACHE_SIZE,
						   sizeof(struct rte_pktmbuf_pool_private),
						   rte_pktmbuf_pool_init, NULL,
						   rte_pktmbuf_init, NULL,
						   rte_lcore_to_socket_id(rxlcore_id), 0);
			if (pktmbuf_pool[rxlcore_id] == NULL)
				rte_exit(EXIT_FAILURE, "Cannot init mbuf pool\n");
		}

		/* Initialize each port */
		for (portid = 0; portid < g_config.mos->netdev_table->num; portid++) {
			int num_queue = 0, eth_idx, i, queue_id;
			for (eth_idx = 0; eth_idx < g_config.mos->netdev_table->num; eth_idx++)
				if (portid == g_config.mos->netdev_table->ent[eth_idx]->ifindex)
					break;
			if (eth_idx == g_config.mos->netdev_table->num)
				continue;
			/* count the cores assigned to this port via its cpu_mask */
			for (i = 0; i < sizeof(uint64_t) * 8; i++)
				if (g_config.mos->netdev_table->ent[eth_idx]->cpu_mask & (1L << i))
					num_queue++;

			/* set 'num_queues' (used for GetRSSCPUCore() in util.c) */
			num_queues = num_queue;

			/* init port */
			printf("Initializing port %u... ", (unsigned) portid);
			fflush(stdout);
			ret = rte_eth_dev_configure(portid, num_queue, num_queue,
						    &port_conf);
			if (ret < 0)
				rte_exit(EXIT_FAILURE, "Cannot configure device: "
					 "err=%d, port=%u\n",
					 ret, (unsigned) portid);

			/* init one RX queue per CPU */
			fflush(stdout);
#ifdef DEBUG
			rte_eth_macaddr_get(portid, &ports_eth_addr[portid]);
#endif
			queue_id = 0;
			for (rxlcore_id = 0; rxlcore_id < g_config.mos->num_cores; rxlcore_id++) {
				if (!(g_config.mos->netdev_table->ent[eth_idx]->cpu_mask & (1L << rxlcore_id)))
					continue;
				ret = rte_eth_rx_queue_setup(portid, queue_id, nb_rxd,
						rte_eth_dev_socket_id(portid), &rx_conf,
						pktmbuf_pool[rxlcore_id]);
				if (ret < 0)
					rte_exit(EXIT_FAILURE, "rte_eth_rx_queue_setup: "
						 "err=%d, port=%u, queueid: %d\n",
						 ret, (unsigned) portid, rxlcore_id);
				cpu_qid_map[portid][rxlcore_id] = queue_id++;
			}

			/* init one TX queue on each port per CPU (this is redundant for
			 * this app) */
			fflush(stdout);
			queue_id = 0;
			for (rxlcore_id = 0; rxlcore_id < g_config.mos->num_cores; rxlcore_id++) {
				if (!(g_config.mos->netdev_table->ent[eth_idx]->cpu_mask & (1L << rxlcore_id)))
					continue;
				ret = rte_eth_tx_queue_setup(portid, queue_id++, nb_txd,
						rte_eth_dev_socket_id(portid), &tx_conf);
				if (ret < 0)
					rte_exit(EXIT_FAILURE, "rte_eth_tx_queue_setup: "
						 "err=%d, port=%u, queueid: %d\n",
						 ret, (unsigned) portid, rxlcore_id);
			}

			/* Start device */
			ret = rte_eth_dev_start(portid);
			if (ret < 0)
				rte_exit(EXIT_FAILURE, "rte_eth_dev_start: err=%d, port=%u\n",
					 ret, (unsigned) portid);

			printf("done.\n");
			rte_eth_promiscuous_enable(portid);

			/* retrieve current flow control settings per port */
			memset(&fc_conf, 0, sizeof(fc_conf));
			ret = rte_eth_dev_flow_ctrl_get(portid, &fc_conf);
			if (ret != 0) {
				rte_exit(EXIT_FAILURE, "Failed to get flow control info!\n");
			}

			/* and just disable the rx/tx flow control */
			fc_conf.mode = RTE_FC_NONE;
			ret = rte_eth_dev_flow_ctrl_set(portid, &fc_conf);
			if (ret != 0) {
				rte_exit(EXIT_FAILURE, "Failed to set flow control info!: errno: %d\n",
					 ret);
			}

#ifdef DEBUG
			printf("Port %u, MAC address: %02X:%02X:%02X:%02X:%02X:%02X\n\n",
					(unsigned) portid,
					ports_eth_addr[portid].addr_bytes[0],
					ports_eth_addr[portid].addr_bytes[1],
					ports_eth_addr[portid].addr_bytes[2],
					ports_eth_addr[portid].addr_bytes[3],
					ports_eth_addr[portid].addr_bytes[4],
					ports_eth_addr[portid].addr_bytes[5]);
#endif
#if 0
			/* if multi-process support is enabled, then turn on FDIR */
			if (g_config.mos->multiprocess)
				dpdk_enable_fdir(portid, g_config.mos->multiprocess_is_master);
#endif
		}
	} else { /* g_config.mos->multiprocess && !g_config.mos->multiprocess_is_master */
		for (rxlcore_id = 0; rxlcore_id < g_config.mos->num_cores; rxlcore_id++) {
			char name[20];
			sprintf(name, "mbuf_pool-%d", rxlcore_id);
			/* secondary process: look up the pools created by the master */
			pktmbuf_pool[rxlcore_id] =
				rte_mempool_lookup(name);
			if (pktmbuf_pool[rxlcore_id] == NULL)
				rte_exit(EXIT_FAILURE, "Cannot init mbuf pool\n");
		}
#if 0
		for (portid = 0; portid < g_config.mos->netdev_table->num; portid++)
			dpdk_enable_fdir(portid, g_config.mos->multiprocess_is_master);
#endif
	}

	check_all_ports_link_status(g_config.mos->netdev_table->num, 0xFFFFFFFF);
}
/*----------------------------------------------------------------------------*/
io_module_func dpdk_module_func = {
	.load_module_upper_half	= dpdk_load_module_upper_half,
	.load_module_lower_half	= dpdk_load_module_lower_half,
	.init_handle		= dpdk_init_handle,
	.link_devices		= NULL,
	.release_pkt		= NULL,
	.send_pkts		= dpdk_send_pkts,
	.get_wptr		= dpdk_get_wptr,
	.recv_pkts		= dpdk_recv_pkts,
	.get_rptr		= dpdk_get_rptr,
	.get_nif		= dpdk_get_nif,
	.select			= NULL,
	.destroy_handle		= dpdk_destroy_handle,
	.dev_ioctl		= dpdk_dev_ioctl,
	.set_wptr		= dpdk_set_wptr,
};
/*----------------------------------------------------------------------------*/