/* xref: /f-stack/dpdk/app/test/test_stack_perf.c (revision 2d9fd380) */
14418919fSjohnjiang /* SPDX-License-Identifier: BSD-3-Clause
24418919fSjohnjiang  * Copyright(c) 2019 Intel Corporation
34418919fSjohnjiang  */
44418919fSjohnjiang 
54418919fSjohnjiang 
64418919fSjohnjiang #include <stdio.h>
74418919fSjohnjiang #include <inttypes.h>
84418919fSjohnjiang 
94418919fSjohnjiang #include <rte_atomic.h>
104418919fSjohnjiang #include <rte_cycles.h>
114418919fSjohnjiang #include <rte_launch.h>
124418919fSjohnjiang #include <rte_pause.h>
134418919fSjohnjiang #include <rte_stack.h>
144418919fSjohnjiang 
154418919fSjohnjiang #include "test.h"
164418919fSjohnjiang 
#define STACK_NAME "STACK_PERF"
#define MAX_BURST 32
/* Sized so every lcore can hold a full burst on the stack at once. */
#define STACK_SIZE (RTE_MAX_LCORE * MAX_BURST)

/*
 * Push/pop bulk sizes, marked volatile so they aren't treated as compile-time
 * constants.
 */
static volatile unsigned int bulk_sizes[] = {8, MAX_BURST};

/* Count-down start barrier: each participant decrements it, then spins
 * until it reaches zero, so paired lcores begin measuring together.
 */
static rte_atomic32_t lcore_barrier;
284418919fSjohnjiang 
294418919fSjohnjiang struct lcore_pair {
304418919fSjohnjiang 	unsigned int c1;
314418919fSjohnjiang 	unsigned int c2;
324418919fSjohnjiang };
334418919fSjohnjiang 
344418919fSjohnjiang static int
get_two_hyperthreads(struct lcore_pair * lcp)354418919fSjohnjiang get_two_hyperthreads(struct lcore_pair *lcp)
364418919fSjohnjiang {
374418919fSjohnjiang 	unsigned int socket[2];
384418919fSjohnjiang 	unsigned int core[2];
394418919fSjohnjiang 	unsigned int id[2];
404418919fSjohnjiang 
414418919fSjohnjiang 	RTE_LCORE_FOREACH(id[0]) {
424418919fSjohnjiang 		RTE_LCORE_FOREACH(id[1]) {
434418919fSjohnjiang 			if (id[0] == id[1])
444418919fSjohnjiang 				continue;
454418919fSjohnjiang 			core[0] = rte_lcore_to_cpu_id(id[0]);
464418919fSjohnjiang 			core[1] = rte_lcore_to_cpu_id(id[1]);
474418919fSjohnjiang 			socket[0] = rte_lcore_to_socket_id(id[0]);
484418919fSjohnjiang 			socket[1] = rte_lcore_to_socket_id(id[1]);
494418919fSjohnjiang 			if ((core[0] == core[1]) && (socket[0] == socket[1])) {
504418919fSjohnjiang 				lcp->c1 = id[0];
514418919fSjohnjiang 				lcp->c2 = id[1];
524418919fSjohnjiang 				return 0;
534418919fSjohnjiang 			}
544418919fSjohnjiang 		}
554418919fSjohnjiang 	}
564418919fSjohnjiang 
574418919fSjohnjiang 	return 1;
584418919fSjohnjiang }
594418919fSjohnjiang 
604418919fSjohnjiang static int
get_two_cores(struct lcore_pair * lcp)614418919fSjohnjiang get_two_cores(struct lcore_pair *lcp)
624418919fSjohnjiang {
634418919fSjohnjiang 	unsigned int socket[2];
644418919fSjohnjiang 	unsigned int core[2];
654418919fSjohnjiang 	unsigned int id[2];
664418919fSjohnjiang 
674418919fSjohnjiang 	RTE_LCORE_FOREACH(id[0]) {
684418919fSjohnjiang 		RTE_LCORE_FOREACH(id[1]) {
694418919fSjohnjiang 			if (id[0] == id[1])
704418919fSjohnjiang 				continue;
714418919fSjohnjiang 			core[0] = rte_lcore_to_cpu_id(id[0]);
724418919fSjohnjiang 			core[1] = rte_lcore_to_cpu_id(id[1]);
734418919fSjohnjiang 			socket[0] = rte_lcore_to_socket_id(id[0]);
744418919fSjohnjiang 			socket[1] = rte_lcore_to_socket_id(id[1]);
754418919fSjohnjiang 			if ((core[0] != core[1]) && (socket[0] == socket[1])) {
764418919fSjohnjiang 				lcp->c1 = id[0];
774418919fSjohnjiang 				lcp->c2 = id[1];
784418919fSjohnjiang 				return 0;
794418919fSjohnjiang 			}
804418919fSjohnjiang 		}
814418919fSjohnjiang 	}
824418919fSjohnjiang 
834418919fSjohnjiang 	return 1;
844418919fSjohnjiang }
854418919fSjohnjiang 
864418919fSjohnjiang static int
get_two_sockets(struct lcore_pair * lcp)874418919fSjohnjiang get_two_sockets(struct lcore_pair *lcp)
884418919fSjohnjiang {
894418919fSjohnjiang 	unsigned int socket[2];
904418919fSjohnjiang 	unsigned int id[2];
914418919fSjohnjiang 
924418919fSjohnjiang 	RTE_LCORE_FOREACH(id[0]) {
934418919fSjohnjiang 		RTE_LCORE_FOREACH(id[1]) {
944418919fSjohnjiang 			if (id[0] == id[1])
954418919fSjohnjiang 				continue;
964418919fSjohnjiang 			socket[0] = rte_lcore_to_socket_id(id[0]);
974418919fSjohnjiang 			socket[1] = rte_lcore_to_socket_id(id[1]);
984418919fSjohnjiang 			if (socket[0] != socket[1]) {
994418919fSjohnjiang 				lcp->c1 = id[0];
1004418919fSjohnjiang 				lcp->c2 = id[1];
1014418919fSjohnjiang 				return 0;
1024418919fSjohnjiang 			}
1034418919fSjohnjiang 		}
1044418919fSjohnjiang 	}
1054418919fSjohnjiang 
1064418919fSjohnjiang 	return 1;
1074418919fSjohnjiang }
1084418919fSjohnjiang 
1094418919fSjohnjiang /* Measure the cycle cost of popping an empty stack. */
1104418919fSjohnjiang static void
test_empty_pop(struct rte_stack * s)1114418919fSjohnjiang test_empty_pop(struct rte_stack *s)
1124418919fSjohnjiang {
1134418919fSjohnjiang 	unsigned int iterations = 100000000;
1144418919fSjohnjiang 	void *objs[MAX_BURST];
1154418919fSjohnjiang 	unsigned int i;
1164418919fSjohnjiang 
1174418919fSjohnjiang 	uint64_t start = rte_rdtsc();
1184418919fSjohnjiang 
1194418919fSjohnjiang 	for (i = 0; i < iterations; i++)
1204418919fSjohnjiang 		rte_stack_pop(s, objs, bulk_sizes[0]);
1214418919fSjohnjiang 
1224418919fSjohnjiang 	uint64_t end = rte_rdtsc();
1234418919fSjohnjiang 
1244418919fSjohnjiang 	printf("Stack empty pop: %.2F\n",
1254418919fSjohnjiang 	       (double)(end - start) / iterations);
1264418919fSjohnjiang }
1274418919fSjohnjiang 
/* Per-lcore arguments and result slot for the measurement workers. */
struct thread_args {
	struct rte_stack *s;	/* stack under test */
	unsigned int sz;	/* push/pop burst size for this run */
	double avg;		/* out: average cycles per object */
};
1334418919fSjohnjiang 
1344418919fSjohnjiang /* Measure the average per-pointer cycle cost of stack push and pop */
1354418919fSjohnjiang static int
bulk_push_pop(void * p)1364418919fSjohnjiang bulk_push_pop(void *p)
1374418919fSjohnjiang {
1384418919fSjohnjiang 	unsigned int iterations = 1000000;
1394418919fSjohnjiang 	struct thread_args *args = p;
1404418919fSjohnjiang 	void *objs[MAX_BURST] = {0};
1414418919fSjohnjiang 	unsigned int size, i;
1424418919fSjohnjiang 	struct rte_stack *s;
1434418919fSjohnjiang 
1444418919fSjohnjiang 	s = args->s;
1454418919fSjohnjiang 	size = args->sz;
1464418919fSjohnjiang 
1474418919fSjohnjiang 	rte_atomic32_sub(&lcore_barrier, 1);
1484418919fSjohnjiang 	while (rte_atomic32_read(&lcore_barrier) != 0)
1494418919fSjohnjiang 		rte_pause();
1504418919fSjohnjiang 
1514418919fSjohnjiang 	uint64_t start = rte_rdtsc();
1524418919fSjohnjiang 
1534418919fSjohnjiang 	for (i = 0; i < iterations; i++) {
1544418919fSjohnjiang 		rte_stack_push(s, objs, size);
1554418919fSjohnjiang 		rte_stack_pop(s, objs, size);
1564418919fSjohnjiang 	}
1574418919fSjohnjiang 
1584418919fSjohnjiang 	uint64_t end = rte_rdtsc();
1594418919fSjohnjiang 
1604418919fSjohnjiang 	args->avg = ((double)(end - start))/(iterations * size);
1614418919fSjohnjiang 
1624418919fSjohnjiang 	return 0;
1634418919fSjohnjiang }
1644418919fSjohnjiang 
1654418919fSjohnjiang /*
1664418919fSjohnjiang  * Run bulk_push_pop() simultaneously on pairs of cores, to measure stack
1674418919fSjohnjiang  * perf when between hyperthread siblings, cores on the same socket, and cores
1684418919fSjohnjiang  * on different sockets.
1694418919fSjohnjiang  */
1704418919fSjohnjiang static void
run_on_core_pair(struct lcore_pair * cores,struct rte_stack * s,lcore_function_t fn)1714418919fSjohnjiang run_on_core_pair(struct lcore_pair *cores, struct rte_stack *s,
1724418919fSjohnjiang 		 lcore_function_t fn)
1734418919fSjohnjiang {
1744418919fSjohnjiang 	struct thread_args args[2];
1754418919fSjohnjiang 	unsigned int i;
1764418919fSjohnjiang 
1774418919fSjohnjiang 	for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
1784418919fSjohnjiang 		rte_atomic32_set(&lcore_barrier, 2);
1794418919fSjohnjiang 
1804418919fSjohnjiang 		args[0].sz = args[1].sz = bulk_sizes[i];
1814418919fSjohnjiang 		args[0].s = args[1].s = s;
1824418919fSjohnjiang 
183*2d9fd380Sjfb8856606 		if (cores->c1 == rte_get_main_lcore()) {
1844418919fSjohnjiang 			rte_eal_remote_launch(fn, &args[1], cores->c2);
1854418919fSjohnjiang 			fn(&args[0]);
1864418919fSjohnjiang 			rte_eal_wait_lcore(cores->c2);
1874418919fSjohnjiang 		} else {
1884418919fSjohnjiang 			rte_eal_remote_launch(fn, &args[0], cores->c1);
1894418919fSjohnjiang 			rte_eal_remote_launch(fn, &args[1], cores->c2);
1904418919fSjohnjiang 			rte_eal_wait_lcore(cores->c1);
1914418919fSjohnjiang 			rte_eal_wait_lcore(cores->c2);
1924418919fSjohnjiang 		}
1934418919fSjohnjiang 
1944418919fSjohnjiang 		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
1954418919fSjohnjiang 		       bulk_sizes[i], (args[0].avg + args[1].avg) / 2);
1964418919fSjohnjiang 	}
1974418919fSjohnjiang }
1984418919fSjohnjiang 
/* Run bulk_push_pop() simultaneously on 1+ cores. */
static void
run_on_n_cores(struct rte_stack *s, lcore_function_t fn, int n)
{
	struct thread_args args[RTE_MAX_LCORE];
	unsigned int i;

	for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
		unsigned int lcore_id;
		int cnt = 0;
		double avg;

		/* All n participants decrement this barrier before timing. */
		rte_atomic32_set(&lcore_barrier, n);

		/* Launch up to n - 1 worker lcores; the nth participant is
		 * this (main) lcore, which runs fn() inline below.
		 */
		RTE_LCORE_FOREACH_WORKER(lcore_id) {
			if (++cnt >= n)
				break;

			args[lcore_id].s = s;
			args[lcore_id].sz = bulk_sizes[i];

			if (rte_eal_remote_launch(fn, &args[lcore_id],
						  lcore_id))
				rte_panic("Failed to launch lcore %d\n",
					  lcore_id);
		}

		lcore_id = rte_lcore_id();

		args[lcore_id].s = s;
		args[lcore_id].sz = bulk_sizes[i];

		fn(&args[lcore_id]);

		rte_eal_mp_wait_lcore();

		/* Start with the main lcore's result... */
		avg = args[rte_lcore_id()].avg;

		/* ...then add the same workers launched above (identical
		 * iteration order and cutoff), and average over all n.
		 */
		cnt = 0;
		RTE_LCORE_FOREACH_WORKER(lcore_id) {
			if (++cnt >= n)
				break;
			avg += args[lcore_id].avg;
		}

		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
		       bulk_sizes[i], avg / n);
	}
}
2484418919fSjohnjiang 
/*
 * Measure the cycle cost of pushing and popping a single pointer on a single
 * lcore.
 */
static void
test_single_push_pop(struct rte_stack *s)
{
	const unsigned int iterations = 16000000;
	void *obj = NULL;
	unsigned int n;
	uint64_t t0, t1;

	t0 = rte_rdtsc();

	for (n = 0; n < iterations; n++) {
		rte_stack_push(s, &obj, 1);
		rte_stack_pop(s, &obj, 1);
	}

	t1 = rte_rdtsc();

	printf("Average cycles per single object push/pop: %.2F\n",
	       ((double)(t1 - t0)) / iterations);
}
2724418919fSjohnjiang 
2734418919fSjohnjiang /* Measure the cycle cost of bulk pushing and popping on a single lcore. */
2744418919fSjohnjiang static void
test_bulk_push_pop(struct rte_stack * s)2754418919fSjohnjiang test_bulk_push_pop(struct rte_stack *s)
2764418919fSjohnjiang {
2774418919fSjohnjiang 	unsigned int iterations = 8000000;
2784418919fSjohnjiang 	void *objs[MAX_BURST];
2794418919fSjohnjiang 	unsigned int sz, i;
2804418919fSjohnjiang 
2814418919fSjohnjiang 	for (sz = 0; sz < RTE_DIM(bulk_sizes); sz++) {
2824418919fSjohnjiang 		uint64_t start = rte_rdtsc();
2834418919fSjohnjiang 
2844418919fSjohnjiang 		for (i = 0; i < iterations; i++) {
2854418919fSjohnjiang 			rte_stack_push(s, objs, bulk_sizes[sz]);
2864418919fSjohnjiang 			rte_stack_pop(s, objs, bulk_sizes[sz]);
2874418919fSjohnjiang 		}
2884418919fSjohnjiang 
2894418919fSjohnjiang 		uint64_t end = rte_rdtsc();
2904418919fSjohnjiang 
2914418919fSjohnjiang 		double avg = ((double)(end - start) /
2924418919fSjohnjiang 			      (iterations * bulk_sizes[sz]));
2934418919fSjohnjiang 
2944418919fSjohnjiang 		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
2954418919fSjohnjiang 		       bulk_sizes[sz], avg);
2964418919fSjohnjiang 	}
2974418919fSjohnjiang }
2984418919fSjohnjiang 
/*
 * Create a stack with the given flags (0 for lock-based, RTE_STACK_F_LF for
 * lock-free) and run the full perf suite on it: single-object and bulk
 * push/pop on one lcore, empty pop, core pairs at each topology distance
 * that exists on this machine, and all lcores at once.
 *
 * Returns 0 on success, -1 if the stack cannot be created.
 */
static int
__test_stack_perf(uint32_t flags)
{
	struct lcore_pair cores;
	struct rte_stack *s;

	rte_atomic32_init(&lcore_barrier);

	s = rte_stack_create(STACK_NAME, STACK_SIZE, rte_socket_id(), flags);
	if (s == NULL) {
		printf("[%s():%u] failed to create a stack\n",
		       __func__, __LINE__);
		return -1;
	}

	printf("### Testing single element push/pop ###\n");
	test_single_push_pop(s);

	printf("\n### Testing empty pop ###\n");
	test_empty_pop(s);

	printf("\n### Testing using a single lcore ###\n");
	test_bulk_push_pop(s);

	/* Pair tests are skipped when the topology lacks a suitable pair. */
	if (get_two_hyperthreads(&cores) == 0) {
		printf("\n### Testing using two hyperthreads ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}
	if (get_two_cores(&cores) == 0) {
		printf("\n### Testing using two physical cores ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}
	if (get_two_sockets(&cores) == 0) {
		printf("\n### Testing using two NUMA nodes ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}

	printf("\n### Testing on all %u lcores ###\n", rte_lcore_count());
	run_on_n_cores(s, bulk_push_pop, rte_lcore_count());

	rte_stack_free(s);
	return 0;
}
3424418919fSjohnjiang 
/* Entry point: run the perf suite on the default (lock-based) stack. */
static int
test_stack_perf(void)
{
	return __test_stack_perf(0);
}
3484418919fSjohnjiang 
/* Entry point: run the perf suite on the lock-free stack variant. */
static int
test_lf_stack_perf(void)
{
	return __test_stack_perf(RTE_STACK_F_LF);
}
3544418919fSjohnjiang 
/* Register both variants with the DPDK test harness. */
REGISTER_TEST_COMMAND(stack_perf_autotest, test_stack_perf);
REGISTER_TEST_COMMAND(stack_lf_perf_autotest, test_lf_stack_perf);