/* for io_module_func def'ns */
#include "io_module.h"
/* for mtcp related def'ns */
#include "mtcp.h"
/* for errno */
#include <errno.h>
/* for logging */
#include "debug.h"
/* for num_devices_* */
#include "config.h"
/* for rte_max_eth_ports */
#include <rte_common.h>
/* for rte_eth_rxconf */
#include <rte_ethdev.h>
/* for delay funcs */
#include <rte_cycles.h>
/* for ip pseudo-chksum */
#include <rte_ip.h>
#define ENABLE_STATS_IOCTL		1
#ifdef ENABLE_STATS_IOCTL
/* for close */
#include <unistd.h>
/* for open */
#include <fcntl.h>
/* for ioctl */
#include <sys/ioctl.h>
#endif /* !ENABLE_STATS_IOCTL */
/*----------------------------------------------------------------------------*/
/* Essential macros */
#define MAX_RX_QUEUE_PER_LCORE		MAX_CPUS
#define MAX_TX_QUEUE_PER_PORT		MAX_CPUS

#define MBUF_SIZE			(2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
#define NB_MBUF				8192
#define MEMPOOL_CACHE_SIZE		256
/*
 * RX and TX Prefetch, Host, and Write-back threshold values should be
 * carefully set for optimal performance. Consult the network
 * controller's datasheet and supporting DPDK documentation for guidance
 * on how these parameters should be set.
 */
#define RX_PTHRESH			8 /**< Default values of RX prefetch threshold reg. */
#define RX_HTHRESH			8 /**< Default values of RX host threshold reg. */
#define RX_WTHRESH			4 /**< Default values of RX write-back threshold reg. */

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
#define TX_PTHRESH			36 /**< Default values of TX prefetch threshold reg. */
#define TX_HTHRESH			0  /**< Default values of TX host threshold reg. */
#define TX_WTHRESH			0  /**< Default values of TX write-back threshold reg. */
#define MAX_PKT_BURST			64

/*
 * Configurable number of RX/TX ring descriptors
 */
#define RTE_TEST_RX_DESC_DEFAULT	128
#define RTE_TEST_TX_DESC_DEFAULT	512

static uint16_t nb_rxd = RTE_TEST_RX_DESC_DEFAULT;
static uint16_t nb_txd = RTE_TEST_TX_DESC_DEFAULT;
/*----------------------------------------------------------------------------*/
/* packet memory pools for storing packet bufs */
static struct rte_mempool *pktmbuf_pool[MAX_CPUS] = {NULL};
/* per-port queue id assigned to each CPU; 0xFF marks an unassigned queue */
static uint8_t cpu_qid_map[RTE_MAX_ETHPORTS][MAX_CPUS] = {{0}};

//#define DEBUG				1
#ifdef DEBUG
/* ethernet addresses of ports */
static struct ether_addr ports_eth_addr[RTE_MAX_ETHPORTS];
#endif

static struct rte_eth_conf port_conf = {
	.rxmode = {
		.mq_mode	=	ETH_MQ_RX_RSS,
		.max_rx_pkt_len =	ETHER_MAX_LEN,
		.split_hdr_size =	0,
		.header_split   =	0, /**< Header Split disabled */
		.hw_ip_checksum =	1, /**< IP checksum offload enabled */
		.hw_vlan_filter =	0, /**< VLAN filtering disabled */
		.jumbo_frame    =	0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   =	1, /**< CRC stripped by hardware */
	},
	.rx_adv_conf = {
		.rss_conf = {
			.rss_key =	NULL,
			.rss_hf =	ETH_RSS_TCP
		},
	},
	.txmode = {
		.mq_mode =		ETH_MQ_TX_NONE,
	},
#if 0
	.fdir_conf = {
		.mode = RTE_FDIR_MODE_PERFECT,
		.pballoc = RTE_FDIR_PBALLOC_256K,
		.status = RTE_FDIR_REPORT_STATUS_ALWAYS,
		//.flexbytes_offset = 0x6,
		.drop_queue = 127,
	},
#endif
};

static const struct rte_eth_rxconf rx_conf = {
	.rx_thresh = {
		.pthresh =		RX_PTHRESH, /* RX prefetch threshold reg */
		.hthresh =		RX_HTHRESH, /* RX host threshold reg */
		.wthresh =		RX_WTHRESH, /* RX write-back threshold reg */
	},
	.rx_free_thresh =		32,
};

static const struct rte_eth_txconf tx_conf = {
	.tx_thresh = {
		.pthresh =		TX_PTHRESH, /* TX prefetch threshold reg */
		.hthresh =		TX_HTHRESH, /* TX host threshold reg */
		.wthresh =		TX_WTHRESH, /* TX write-back threshold reg */
	},
	.tx_free_thresh =		0, /* Use PMD default values */
	.tx_rs_thresh =			0, /* Use PMD default values */
	/*
	 * txq_flags is left at 0 so the PMD keeps multi-segment
	 * transmission and TX offloads enabled; the checksum offloads
	 * set up in dpdk_dev_ioctl() below rely on this.
	 */
	.txq_flags =			0x0,
};

struct mbuf_table {
	unsigned len; /* number of packets currently queued in m_table */
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct dpdk_private_context {
	struct mbuf_table rmbufs[RTE_MAX_ETHPORTS];
	struct mbuf_table wmbufs[RTE_MAX_ETHPORTS];
	struct rte_mempool *pktmbuf_pool;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
#ifdef ENABLE_STATS_IOCTL
	int fd; /* descriptor for /dev/dpdk-iface stats ioctls */
#endif /* !ENABLE_STATS_IOCTL */
} __rte_cache_aligned;

#ifdef ENABLE_STATS_IOCTL
/**
 * stats struct passed from user space to the dpdk-iface kernel driver
 */
struct stats_struct {
	uint64_t tx_bytes;
	uint64_t tx_pkts;
	uint64_t rx_bytes;
	uint64_t rx_pkts;
	uint8_t qid;
	uint8_t dev;
};
#endif /* !ENABLE_STATS_IOCTL */
/*----------------------------------------------------------------------------*/
void
dpdk_init_handle(struct mtcp_thread_context *ctxt)
{
	struct dpdk_private_context *dpc;
	int i, j;

	/* create and initialize private I/O module context */
	ctxt->io_private_context = calloc(1, sizeof(struct dpdk_private_context));
	if (ctxt->io_private_context == NULL) {
		TRACE_ERROR("Failed to initialize ctxt->io_private_context: "
			    "Can't allocate memory\n");
		exit(EXIT_FAILURE);
	}

	dpc = (struct dpdk_private_context *)ctxt->io_private_context;
	dpc->pktmbuf_pool = pktmbuf_pool[ctxt->cpu];

	/* set wmbufs correctly */
	for (j = 0; j < g_config.mos->netdev_table->num; j++) {
		/* allocate wmbufs for each registered port */
		for (i = 0; i < MAX_PKT_BURST; i++) {
			dpc->wmbufs[j].m_table[i] = rte_pktmbuf_alloc(pktmbuf_pool[ctxt->cpu]);
			if (dpc->wmbufs[j].m_table[i] == NULL) {
				TRACE_ERROR("Failed to allocate %d:wmbuf[%d] on device %d!\n",
					    ctxt->cpu, i, j);
				exit(EXIT_FAILURE);
			}
		}
		/* set mbuf queue length to 0 to begin with */
		dpc->wmbufs[j].len = 0;
	}

#ifdef ENABLE_STATS_IOCTL
	dpc->fd = open("/dev/dpdk-iface", O_RDWR);
	if (dpc->fd == -1) {
		TRACE_ERROR("Can't open /dev/dpdk-iface for ctxt->cpu: %d! "
			    "Are you using the mlx4/mlx5 driver?\n",
			    ctxt->cpu);
	}
#endif /* !ENABLE_STATS_IOCTL */
}
/*----------------------------------------------------------------------------*/
int
dpdk_send_pkts(struct mtcp_thread_context *ctxt, int nif)
{
	struct dpdk_private_context *dpc;
	mtcp_manager_t mtcp;
	int ret;
	int qid;

	dpc = (struct dpdk_private_context *)ctxt->io_private_context;
	mtcp = ctxt->mtcp_manager;
	ret = 0;
	qid = cpu_qid_map[nif][ctxt->cpu];

	/* if the queue is unassigned, skip it */
	if (unlikely(qid == 0xFF))
		return 0;

	/* if there are packets in the queue... flush them out to the wire */
	if (dpc->wmbufs[nif].len > 0) {
		struct rte_mbuf **pkts;
#ifdef ENABLE_STATS_IOCTL
		struct stats_struct ss;
#endif /* !ENABLE_STATS_IOCTL */
		int cnt = dpc->wmbufs[nif].len;
		pkts = dpc->wmbufs[nif].m_table;
#ifdef NETSTAT
		mtcp->nstat.tx_packets[nif] += cnt;
#ifdef ENABLE_STATS_IOCTL
		if (likely(dpc->fd >= 0)) {
			ss.tx_pkts = mtcp->nstat.tx_packets[nif];
			ss.tx_bytes = mtcp->nstat.tx_bytes[nif];
			ss.rx_pkts = mtcp->nstat.rx_packets[nif];
			ss.rx_bytes = mtcp->nstat.rx_bytes[nif];
			ss.qid = ctxt->cpu;
			ss.dev = nif;
			/* push the accumulated counters to the dpdk-iface driver */
			ioctl(dpc->fd, 0, &ss);
		}
#endif /* !ENABLE_STATS_IOCTL */
#endif
		do {
			/* tx cnt # of packets */
			ret = rte_eth_tx_burst(nif, qid, pkts, cnt);
			pkts += ret;
			cnt -= ret;
			/* if not all pkts were sent... then repeat the cycle */
		} while (cnt > 0);

#ifndef SHARE_IO_BUFFER
		int i;
		/* time to allocate fresh mbufs for the queue */
		for (i = 0; i < dpc->wmbufs[nif].len; i++) {
			dpc->wmbufs[nif].m_table[i] = rte_pktmbuf_alloc(pktmbuf_pool[ctxt->cpu]);
			/* error checking */
			if (unlikely(dpc->wmbufs[nif].m_table[i] == NULL)) {
				TRACE_ERROR("Failed to allocate %d:wmbuf[%d] on device %d!\n",
					    ctxt->cpu, i, nif);
				exit(EXIT_FAILURE);
			}
		}
#endif
		/* reset the mbuf queue length after the flush */
		dpc->wmbufs[nif].len = 0;
	}

	return ret;
}
/*----------------------------------------------------------------------------*/
uint8_t *
dpdk_get_wptr(struct mtcp_thread_context *ctxt, int nif, uint16_t pktsize)
{
	struct dpdk_private_context *dpc;
	mtcp_manager_t mtcp;
	struct rte_mbuf *m;
	uint8_t *ptr;
	int len_of_mbuf;

	dpc = (struct dpdk_private_context *) ctxt->io_private_context;
	mtcp = ctxt->mtcp_manager;

	/* sanity check: the burst queue is full */
	if (unlikely(dpc->wmbufs[nif].len == MAX_PKT_BURST))
		return NULL;

	len_of_mbuf = dpc->wmbufs[nif].len;
	m = dpc->wmbufs[nif].m_table[len_of_mbuf];

	/* retrieve the right write offset */
	ptr = rte_pktmbuf_mtod(m, uint8_t *);
	m->pkt_len = m->data_len = pktsize;
	m->nb_segs = 1;
	m->next = NULL;

#ifdef NETSTAT
	/* +24B accounts for per-frame wire overhead (preamble/SFD, FCS, IFG) */
	mtcp->nstat.tx_bytes[nif] += pktsize + 24;
#endif

	/* increment the queue length */
	dpc->wmbufs[nif].len = len_of_mbuf + 1;

	return ptr;
}
/*----------------------------------------------------------------------------*/
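/*
 * Illustrative TX-path usage (a minimal sketch, not part of the module):
 * a caller reserves a write pointer for the next packet slot, writes the
 * frame, and later flushes the per-port queue with dpdk_send_pkts().
 * `frame' and `frame_len' are hypothetical names; dpdk_get_wptr() returns
 * NULL once MAX_PKT_BURST packets are queued, in which case the queue
 * must be flushed first.
 */
#if 0
	uint8_t *buf = dpdk_get_wptr(ctxt, nif, frame_len);
	if (buf == NULL) {
		dpdk_send_pkts(ctxt, nif);		/* drain the full queue */
		buf = dpdk_get_wptr(ctxt, nif, frame_len);
	}
	memcpy(buf, frame, frame_len);			/* write the Ethernet frame */
	dpdk_send_pkts(ctxt, nif);			/* flush to the wire */
#endif
/*----------------------------------------------------------------------------*/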
void
dpdk_set_wptr(struct mtcp_thread_context *ctxt, int out_nif, int in_nif, int index)
{
	struct dpdk_private_context *dpc;
	mtcp_manager_t mtcp;
	int len_of_mbuf;

	dpc = (struct dpdk_private_context *) ctxt->io_private_context;
	mtcp = ctxt->mtcp_manager;

	/* sanity check: the burst queue is full */
	if (unlikely(dpc->wmbufs[out_nif].len == MAX_PKT_BURST))
		return;

	len_of_mbuf = dpc->wmbufs[out_nif].len;
	dpc->wmbufs[out_nif].m_table[len_of_mbuf] =
		dpc->rmbufs[in_nif].m_table[index];

	/* mark the mbuf as forwarded so that free_pkts() skips it */
	dpc->wmbufs[out_nif].m_table[len_of_mbuf]->udata64 = 0;

#ifdef NETSTAT
	mtcp->nstat.tx_bytes[out_nif] += dpc->rmbufs[in_nif].m_table[index]->pkt_len + 24;
#endif

	/* increment the queue length */
	dpc->wmbufs[out_nif].len = len_of_mbuf + 1;

	return;
}
/*----------------------------------------------------------------------------*/
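/*
 * mbuf ownership note: dpdk_get_rptr() tags received mbufs with
 * udata64 == 1, meaning they still belong to the RX path and must be
 * freed here. dpdk_set_wptr() clears the tag to 0 when an mbuf is handed
 * over for forwarding; those mbufs are freed by the PMD after
 * transmission, so free_pkts() skips them.
 */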
static inline void
free_pkts(struct rte_mbuf **mtable, unsigned len)
{
	unsigned i;

	/* free the packets */
	for (i = 0; i < len; i++) {
		if (mtable[i]->udata64 == 1) {
			rte_pktmbuf_free_seg(mtable[i]);
			/* prefetch the next entry, but never past the table end */
			if (i + 1 < len)
				RTE_MBUF_PREFETCH_TO_FREE(mtable[i + 1]);
		}
	}
}
/*----------------------------------------------------------------------------*/
int32_t
dpdk_recv_pkts(struct mtcp_thread_context *ctxt, int ifidx)
{
	struct dpdk_private_context *dpc;
	int ret;
	uint8_t qid;

	dpc = (struct dpdk_private_context *) ctxt->io_private_context;
	qid = cpu_qid_map[ifidx][ctxt->cpu];

	/* if the queue is unassigned, skip it */
	if (qid == 0xFF)
		return 0;

	/* free the mbufs left over from the previous burst */
	if (dpc->rmbufs[ifidx].len != 0) {
		free_pkts(dpc->rmbufs[ifidx].m_table, dpc->rmbufs[ifidx].len);
		dpc->rmbufs[ifidx].len = 0;
	}

	ret = rte_eth_rx_burst((uint8_t)ifidx, qid,
			       dpc->pkts_burst, MAX_PKT_BURST);

	dpc->rmbufs[ifidx].len = ret;

	return ret;
}
/*----------------------------------------------------------------------------*/
uint8_t *
dpdk_get_rptr(struct mtcp_thread_context *ctxt, int ifidx, int index, uint16_t *len)
{
	struct dpdk_private_context *dpc;
	struct rte_mbuf *m;
	uint8_t *pktbuf;

	dpc = (struct dpdk_private_context *) ctxt->io_private_context;

	m = dpc->pkts_burst[index];
	/* tag to check if the packet is a local or a forwarded pkt */
	m->udata64 = 1;
	/* don't enable pre-fetching... performance goes down */
	//rte_prefetch0(rte_pktmbuf_mtod(m, void *));
	*len = m->pkt_len;
	pktbuf = rte_pktmbuf_mtod(m, uint8_t *);

	/* enqueue the pkt ptr in mbuf */
	dpc->rmbufs[ifidx].m_table[index] = m;

	return pktbuf;
}
/*----------------------------------------------------------------------------*/
int
dpdk_get_nif(struct ifreq *ifr)
{
	int i;
	static int num_dev = -1;
	static struct ether_addr ports_eth_addr[RTE_MAX_ETHPORTS];

	/* get mac addr entries of 'detected' dpdk ports */
	if (num_dev < 0) {
		num_dev = rte_eth_dev_count();
		for (i = 0; i < num_dev; i++)
			rte_eth_macaddr_get(i, &ports_eth_addr[i]);
	}

	for (i = 0; i < num_dev; i++)
		if (!memcmp(&ifr->ifr_addr.sa_data[0], &ports_eth_addr[i], ETH_ALEN))
			return i;

	return -1;
}
/*----------------------------------------------------------------------------*/
void
dpdk_destroy_handle(struct mtcp_thread_context *ctxt)
{
	struct dpdk_private_context *dpc;
	int i;

	dpc = (struct dpdk_private_context *) ctxt->io_private_context;

	/* free wmbufs */
	for (i = 0; i < g_config.mos->netdev_table->num; i++)
		free_pkts(dpc->wmbufs[i].m_table, MAX_PKT_BURST);

#ifdef ENABLE_STATS_IOCTL
	/* close the stats fd */
	if (dpc->fd >= 0)
		close(dpc->fd);
#endif /* !ENABLE_STATS_IOCTL */

	/* free it all up */
	free(dpc);
}
/*----------------------------------------------------------------------------*/
static void
check_all_ports_link_status(uint8_t port_num, uint32_t port_mask)
{
#define CHECK_INTERVAL			100 /* 100ms */
#define MAX_CHECK_TIME			90 /* 9s (90 * 100ms) in total */

	uint8_t portid, count, all_ports_up, print_flag = 0;
	struct rte_eth_link link;

	printf("\nChecking link status");
	fflush(stdout);
	for (count = 0; count <= MAX_CHECK_TIME; count++) {
		all_ports_up = 1;
		for (portid = 0; portid < port_num; portid++) {
			if ((port_mask & (1 << portid)) == 0)
				continue;
			memset(&link, 0, sizeof(link));
			rte_eth_link_get_nowait(portid, &link);
			/* print link status if flag set */
			if (print_flag == 1) {
				if (link.link_status)
					printf("Port %d Link Up - speed %u Mbps - %s\n",
						(uint8_t)portid,
						(unsigned)link.link_speed,
						(link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
						"full-duplex" : "half-duplex");
				else
					printf("Port %d Link Down\n",
						(uint8_t)portid);
				continue;
			}
			/* clear all_ports_up flag if any link is down */
			if (link.link_status == 0) {
				all_ports_up = 0;
				break;
			}
		}
		/* after finally printing all link statuses, get out */
		if (print_flag == 1)
			break;

		if (all_ports_up == 0) {
			printf(".");
			fflush(stdout);
			rte_delay_ms(CHECK_INTERVAL);
		}

		/* set the print_flag if all ports are up or the timeout expired */
		if (all_ports_up == 1 || count == (MAX_CHECK_TIME - 1)) {
			print_flag = 1;
			printf("done\n");
		}
	}
}
/*----------------------------------------------------------------------------*/
#if 0
static void
dpdk_enable_fdir(int portid, uint8_t is_master)
{
	struct rte_fdir_masks fdir_masks;
	struct rte_fdir_filter fdir_filter;
	int ret;

	memset(&fdir_filter, 0, sizeof(struct rte_fdir_filter));
	fdir_filter.iptype = RTE_FDIR_IPTYPE_IPV4;
	fdir_filter.l4type = RTE_FDIR_L4TYPE_TCP;
	fdir_filter.ip_dst.ipv4_addr = g_config.mos->netdev_table->ent[portid]->ip_addr;

	if (is_master) {
		memset(&fdir_masks, 0, sizeof(struct rte_fdir_masks));
		fdir_masks.src_ipv4_mask = 0x0;
		fdir_masks.dst_ipv4_mask = 0xFFFFFFFF;
		fdir_masks.src_port_mask = 0x0;
		fdir_masks.dst_port_mask = 0x0;

		/*
		 * enable the following if the filter is IP-only
		 * (non-TCP, non-UDP)
		 */
		/* fdir_masks.only_ip_flow = 1; */
		rte_eth_dev_fdir_set_masks(portid, &fdir_masks);
		ret = rte_eth_dev_fdir_add_perfect_filter(portid,
							  &fdir_filter,
							  0,
							  g_config.mos->multiprocess_curr_core,
							  0);
	} else {
		ret = rte_eth_dev_fdir_update_perfect_filter(portid,
							     &fdir_filter,
							     0,
							     g_config.mos->multiprocess_curr_core,
							     0);
	}
	if (ret < 0) {
		rte_exit(EXIT_FAILURE,
			 "fdir_add_perfect_filter_t call failed!: %d\n",
			 ret);
	}
	fprintf(stderr, "Filter for device ifidx: %d added\n", portid);
}
#endif
/*----------------------------------------------------------------------------*/
int32_t
dpdk_dev_ioctl(struct mtcp_thread_context *ctx, int nif, int cmd, void *argp)
{
	struct dpdk_private_context *dpc;
	struct rte_mbuf *m;
	int len_of_mbuf;
	struct iphdr *iph;
	struct tcphdr *tcph;
	RssInfo *rss_i;

	iph = (struct iphdr *)argp;
	dpc = (struct dpdk_private_context *)ctx->io_private_context;
	len_of_mbuf = dpc->wmbufs[nif].len;
	rss_i = NULL;

	switch (cmd) {
	case PKT_TX_IP_CSUM:
		/* offload the IP checksum of the last queued packet */
		m = dpc->wmbufs[nif].m_table[len_of_mbuf - 1];
		m->ol_flags = PKT_TX_IP_CKSUM | PKT_TX_IPV4;
		m->l2_len = sizeof(struct ether_hdr);
		m->l3_len = (iph->ihl << 2);
		break;
	case PKT_TX_TCP_CSUM:
		/* hardware TCP checksum offload expects the pseudo-header
		 * checksum in the checksum field */
		m = dpc->wmbufs[nif].m_table[len_of_mbuf - 1];
		tcph = (struct tcphdr *)((unsigned char *)iph + (iph->ihl << 2));
		m->ol_flags |= PKT_TX_TCP_CKSUM;
		tcph->check = rte_ipv4_phdr_cksum((struct ipv4_hdr *)iph, m->ol_flags);
		break;
	case PKT_RX_RSS:
		/* report the RSS hash the NIC computed for a received packet */
		rss_i = (RssInfo *)argp;
		m = dpc->pkts_burst[rss_i->pktidx];
		rss_i->hash_value = m->hash.rss;
		break;
	default:
		goto dev_ioctl_err;
	}

	return 0;
 dev_ioctl_err:
	return -1;
}
/*----------------------------------------------------------------------------*/
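/*
 * Illustrative checksum-offload usage (a minimal sketch, not part of the
 * module): once the IP/TCP headers have been written into the buffer
 * returned by dpdk_get_wptr(), the stack can hand checksum computation of
 * the most recently queued packet to the NIC. `iph' (pointing at the IP
 * header inside that buffer) is a hypothetical name here.
 */
#if 0
	dpdk_dev_ioctl(ctxt, nif, PKT_TX_IP_CSUM, iph);		/* IP csum in HW */
	dpdk_dev_ioctl(ctxt, nif, PKT_TX_TCP_CSUM, iph);	/* TCP csum in HW */
#endif
/*----------------------------------------------------------------------------*/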
void
dpdk_load_module_upper_half(void)
{
	int cpu = g_config.mos->num_cores, ret;
	uint32_t cpumask = 0;
	char cpumaskbuf[10];
	char mem_channels[5];

	/* set the log level */
	rte_set_log_type(RTE_LOGTYPE_PMD, 0);
	rte_set_log_type(RTE_LOGTYPE_MALLOC, 0);
	rte_set_log_type(RTE_LOGTYPE_MEMPOOL, 0);
	rte_set_log_type(RTE_LOGTYPE_RING, 0);
	rte_set_log_level(RTE_LOG_WARNING);

	/* build the cpu mask */
	for (ret = 0; ret < cpu; ret++)
		cpumask = (cpumask | (1 << ret));
	sprintf(cpumaskbuf, "%X", cpumask);

	/* get the mem channels per socket */
	if (g_config.mos->nb_mem_channels == 0) {
		TRACE_ERROR("DPDK module requires # of memory channels "
				"per socket parameter!\n");
		exit(EXIT_FAILURE);
	}
	sprintf(mem_channels, "%d", g_config.mos->nb_mem_channels);

	/* EAL arguments; equivalent to running
	 * `<app> -c <cpumask> -n <mem_channels> --proc-type=auto' */
	char *argv[] = {"",
			"-c",
			cpumaskbuf,
			"-n",
			mem_channels,
			"--proc-type=auto",
			""
	};
	const int argc = 6;

	/*
	 * Re-set the getopt extern variable optind. This was painful to
	 * debug: rte_eal_init() internally uses the getopt() library call,
	 * so mtcp applications that also use an `external' getopt will
	 * crash violently if optind is not reset to zero prior to calling
	 * the function below. See getopt(3) for more details.
	 */
	optind = 0;

	/* initialize the dpdk eal env */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid EAL args!\n");
}
/*----------------------------------------------------------------------------*/
void
dpdk_load_module_lower_half(void)
{
	int portid, rxlcore_id, ret;
	struct rte_eth_fc_conf fc_conf;	/* for Ethernet flow control settings */
	/*
	 * RSS key with identical bytes throughout, so that the computed
	 * hash is symmetric and both directions of a connection map to
	 * the same RX queue
	 */
	static const uint8_t key[] = {
		0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
		0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
		0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
		0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05
	};

	port_conf.rx_adv_conf.rss_conf.rss_key = (uint8_t *)key;
	port_conf.rx_adv_conf.rss_conf.rss_key_len = sizeof(key);

	/* reset the cpu_qid mapping (0xFF = unassigned) */
	memset(cpu_qid_map, 0xFF, sizeof(cpu_qid_map));

	if (!g_config.mos->multiprocess
			|| (g_config.mos->multiprocess && g_config.mos->multiprocess_is_master)) {
		for (rxlcore_id = 0; rxlcore_id < g_config.mos->num_cores; rxlcore_id++) {
			char name[20];
			sprintf(name, "mbuf_pool-%d", rxlcore_id);
			/* create the per-core mbuf pools */
			pktmbuf_pool[rxlcore_id] =
				rte_mempool_create(name, NB_MBUF,
						   MBUF_SIZE, MEMPOOL_CACHE_SIZE,
						   sizeof(struct rte_pktmbuf_pool_private),
						   rte_pktmbuf_pool_init, NULL,
						   rte_pktmbuf_init, NULL,
						   rte_lcore_to_socket_id(rxlcore_id), 0);
			if (pktmbuf_pool[rxlcore_id] == NULL)
				rte_exit(EXIT_FAILURE, "Cannot init mbuf pool\n");
		}

		/* initialize each port */
		for (portid = 0; portid < g_config.mos->netdev_table->num; portid++) {
			int num_queue = 0, eth_idx, i, queue_id;
			for (eth_idx = 0; eth_idx < g_config.mos->netdev_table->num; eth_idx++)
				if (portid == g_config.mos->netdev_table->ent[eth_idx]->ifindex)
					break;
			if (eth_idx == g_config.mos->netdev_table->num)
				continue;
			/* count the CPUs that serve this port */
			for (i = 0; i < sizeof(uint64_t) * 8; i++)
				if (g_config.mos->netdev_table->ent[eth_idx]->cpu_mask & (1L << i))
					num_queue++;

			/* set 'num_queues' (used for GetRSSCPUCore() in util.c) */
			num_queues = num_queue;

			/* init port */
			printf("Initializing port %u... ", (unsigned) portid);
			fflush(stdout);
			ret = rte_eth_dev_configure(portid, num_queue, num_queue,
						    &port_conf);
			if (ret < 0)
				rte_exit(EXIT_FAILURE, "Cannot configure device: "
					 "err=%d, port=%u\n",
					 ret, (unsigned) portid);

			/* init one RX queue per CPU */
			fflush(stdout);
#ifdef DEBUG
			rte_eth_macaddr_get(portid, &ports_eth_addr[portid]);
#endif
			queue_id = 0;
			for (rxlcore_id = 0; rxlcore_id < g_config.mos->num_cores; rxlcore_id++) {
				if (!(g_config.mos->netdev_table->ent[eth_idx]->cpu_mask & (1L << rxlcore_id)))
					continue;
				ret = rte_eth_rx_queue_setup(portid, queue_id, nb_rxd,
							     rte_eth_dev_socket_id(portid), &rx_conf,
							     pktmbuf_pool[rxlcore_id]);
				if (ret < 0)
					rte_exit(EXIT_FAILURE, "rte_eth_rx_queue_setup: "
						 "err=%d, port=%u, queueid: %d\n",
						 ret, (unsigned) portid, rxlcore_id);
				/* record the queue id assigned to this CPU */
				cpu_qid_map[portid][rxlcore_id] = queue_id++;
			}

			/* init one TX queue on each port per CPU (this is redundant for
			 * this app) */
			fflush(stdout);
			queue_id = 0;
			for (rxlcore_id = 0; rxlcore_id < g_config.mos->num_cores; rxlcore_id++) {
				if (!(g_config.mos->netdev_table->ent[eth_idx]->cpu_mask & (1L << rxlcore_id)))
					continue;
				ret = rte_eth_tx_queue_setup(portid, queue_id++, nb_txd,
							     rte_eth_dev_socket_id(portid), &tx_conf);
				if (ret < 0)
					rte_exit(EXIT_FAILURE, "rte_eth_tx_queue_setup: "
						 "err=%d, port=%u, queueid: %d\n",
						 ret, (unsigned) portid, rxlcore_id);
			}

			/* start device */
			ret = rte_eth_dev_start(portid);
			if (ret < 0)
				rte_exit(EXIT_FAILURE, "rte_eth_dev_start: err=%d, port=%u\n",
					 ret, (unsigned) portid);

			printf("done\n");
			rte_eth_promiscuous_enable(portid);

			/* retrieve current flow control settings per port */
			memset(&fc_conf, 0, sizeof(fc_conf));
			ret = rte_eth_dev_flow_ctrl_get(portid, &fc_conf);
			if (ret != 0) {
				rte_exit(EXIT_FAILURE, "Failed to get flow control info!\n");
			}

			/* ...and just disable the rx/tx flow control */
			fc_conf.mode = RTE_FC_NONE;
			ret = rte_eth_dev_flow_ctrl_set(portid, &fc_conf);
			if (ret != 0) {
				rte_exit(EXIT_FAILURE, "Failed to set flow control info!: errno: %d\n",
					 ret);
			}

#ifdef DEBUG
			printf("Port %u, MAC address: %02X:%02X:%02X:%02X:%02X:%02X\n\n",
					(unsigned) portid,
					ports_eth_addr[portid].addr_bytes[0],
					ports_eth_addr[portid].addr_bytes[1],
					ports_eth_addr[portid].addr_bytes[2],
					ports_eth_addr[portid].addr_bytes[3],
					ports_eth_addr[portid].addr_bytes[4],
					ports_eth_addr[portid].addr_bytes[5]);
#endif
#if 0
			/* if multi-process support is enabled, then turn on FDIR */
			if (g_config.mos->multiprocess)
				dpdk_enable_fdir(portid, g_config.mos->multiprocess_is_master);
#endif
		}
	} else { /* g_config.mos->multiprocess && !g_config.mos->multiprocess_is_master */
		for (rxlcore_id = 0; rxlcore_id < g_config.mos->num_cores; rxlcore_id++) {
			char name[20];
			sprintf(name, "mbuf_pool-%d", rxlcore_id);
			/* secondary processes look up the pools the master created */
			pktmbuf_pool[rxlcore_id] =
				rte_mempool_lookup(name);
			if (pktmbuf_pool[rxlcore_id] == NULL)
				rte_exit(EXIT_FAILURE, "Cannot init mbuf pool\n");
		}
#if 0
		for (portid = 0; portid < g_config.mos->netdev_table->num; portid++)
			dpdk_enable_fdir(portid, g_config.mos->multiprocess_is_master);
#endif
	}

	check_all_ports_link_status(g_config.mos->netdev_table->num, 0xFFFFFFFF);
}
/*----------------------------------------------------------------------------*/
io_module_func dpdk_module_func = {
	.load_module_upper_half	= dpdk_load_module_upper_half,
	.load_module_lower_half	= dpdk_load_module_lower_half,
	.init_handle		= dpdk_init_handle,
	.link_devices		= NULL,
	.release_pkt		= NULL,
	.send_pkts		= dpdk_send_pkts,
	.get_wptr		= dpdk_get_wptr,
	.recv_pkts		= dpdk_recv_pkts,
	.get_rptr		= dpdk_get_rptr,
	.get_nif		= dpdk_get_nif,
	.select			= NULL,
	.destroy_handle		= dpdk_destroy_handle,
	.dev_ioctl		= dpdk_dev_ioctl,
	.set_wptr		= dpdk_set_wptr,
};
/*----------------------------------------------------------------------------*/
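/*
 * Illustrative driver loop (a minimal sketch, not part of the module):
 * the mtcp core is expected to invoke the hooks above roughly in this
 * order; `ctxt' (a per-thread mtcp_thread_context) and `ifidx' are
 * hypothetical names here.
 */
#if 0
	dpdk_module_func.load_module_upper_half();	/* EAL init, once per process */
	dpdk_module_func.load_module_lower_half();	/* pools, ports, queues */
	dpdk_module_func.init_handle(ctxt);		/* per-thread private context */
	for (;;) {
		int i, cnt = dpdk_module_func.recv_pkts(ctxt, ifidx);
		for (i = 0; i < cnt; i++) {
			uint16_t len;
			uint8_t *pkt = dpdk_module_func.get_rptr(ctxt, ifidx, i, &len);
			/* ... process pkt, possibly queueing TX via get_wptr()/set_wptr() ... */
		}
		dpdk_module_func.send_pkts(ctxt, ifidx);	/* flush queued TX */
	}
	dpdk_module_func.destroy_handle(ctxt);
#endif
/*----------------------------------------------------------------------------*/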