/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */

#include <stdio.h>
#include <inttypes.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_debug.h>
#include <rte_launch.h>
#include <rte_pause.h>
#include <rte_stack.h>

#include "test.h"

174418919fSjohnjiang #define STACK_NAME "STACK_PERF"
184418919fSjohnjiang #define MAX_BURST 32
194418919fSjohnjiang #define STACK_SIZE (RTE_MAX_LCORE * MAX_BURST)
204418919fSjohnjiang
214418919fSjohnjiang /*
224418919fSjohnjiang * Push/pop bulk sizes, marked volatile so they aren't treated as compile-time
234418919fSjohnjiang * constants.
244418919fSjohnjiang */
254418919fSjohnjiang static volatile unsigned int bulk_sizes[] = {8, MAX_BURST};
264418919fSjohnjiang
274418919fSjohnjiang static rte_atomic32_t lcore_barrier;
284418919fSjohnjiang
294418919fSjohnjiang struct lcore_pair {
304418919fSjohnjiang unsigned int c1;
314418919fSjohnjiang unsigned int c2;
324418919fSjohnjiang };
334418919fSjohnjiang
344418919fSjohnjiang static int
get_two_hyperthreads(struct lcore_pair * lcp)354418919fSjohnjiang get_two_hyperthreads(struct lcore_pair *lcp)
364418919fSjohnjiang {
374418919fSjohnjiang unsigned int socket[2];
384418919fSjohnjiang unsigned int core[2];
394418919fSjohnjiang unsigned int id[2];
404418919fSjohnjiang
414418919fSjohnjiang RTE_LCORE_FOREACH(id[0]) {
424418919fSjohnjiang RTE_LCORE_FOREACH(id[1]) {
434418919fSjohnjiang if (id[0] == id[1])
444418919fSjohnjiang continue;
454418919fSjohnjiang core[0] = rte_lcore_to_cpu_id(id[0]);
464418919fSjohnjiang core[1] = rte_lcore_to_cpu_id(id[1]);
474418919fSjohnjiang socket[0] = rte_lcore_to_socket_id(id[0]);
484418919fSjohnjiang socket[1] = rte_lcore_to_socket_id(id[1]);
494418919fSjohnjiang if ((core[0] == core[1]) && (socket[0] == socket[1])) {
504418919fSjohnjiang lcp->c1 = id[0];
514418919fSjohnjiang lcp->c2 = id[1];
524418919fSjohnjiang return 0;
534418919fSjohnjiang }
544418919fSjohnjiang }
554418919fSjohnjiang }
564418919fSjohnjiang
574418919fSjohnjiang return 1;
584418919fSjohnjiang }
594418919fSjohnjiang
604418919fSjohnjiang static int
get_two_cores(struct lcore_pair * lcp)614418919fSjohnjiang get_two_cores(struct lcore_pair *lcp)
624418919fSjohnjiang {
634418919fSjohnjiang unsigned int socket[2];
644418919fSjohnjiang unsigned int core[2];
654418919fSjohnjiang unsigned int id[2];
664418919fSjohnjiang
674418919fSjohnjiang RTE_LCORE_FOREACH(id[0]) {
684418919fSjohnjiang RTE_LCORE_FOREACH(id[1]) {
694418919fSjohnjiang if (id[0] == id[1])
704418919fSjohnjiang continue;
714418919fSjohnjiang core[0] = rte_lcore_to_cpu_id(id[0]);
724418919fSjohnjiang core[1] = rte_lcore_to_cpu_id(id[1]);
734418919fSjohnjiang socket[0] = rte_lcore_to_socket_id(id[0]);
744418919fSjohnjiang socket[1] = rte_lcore_to_socket_id(id[1]);
754418919fSjohnjiang if ((core[0] != core[1]) && (socket[0] == socket[1])) {
764418919fSjohnjiang lcp->c1 = id[0];
774418919fSjohnjiang lcp->c2 = id[1];
784418919fSjohnjiang return 0;
794418919fSjohnjiang }
804418919fSjohnjiang }
814418919fSjohnjiang }
824418919fSjohnjiang
834418919fSjohnjiang return 1;
844418919fSjohnjiang }
854418919fSjohnjiang
864418919fSjohnjiang static int
get_two_sockets(struct lcore_pair * lcp)874418919fSjohnjiang get_two_sockets(struct lcore_pair *lcp)
884418919fSjohnjiang {
894418919fSjohnjiang unsigned int socket[2];
904418919fSjohnjiang unsigned int id[2];
914418919fSjohnjiang
924418919fSjohnjiang RTE_LCORE_FOREACH(id[0]) {
934418919fSjohnjiang RTE_LCORE_FOREACH(id[1]) {
944418919fSjohnjiang if (id[0] == id[1])
954418919fSjohnjiang continue;
964418919fSjohnjiang socket[0] = rte_lcore_to_socket_id(id[0]);
974418919fSjohnjiang socket[1] = rte_lcore_to_socket_id(id[1]);
984418919fSjohnjiang if (socket[0] != socket[1]) {
994418919fSjohnjiang lcp->c1 = id[0];
1004418919fSjohnjiang lcp->c2 = id[1];
1014418919fSjohnjiang return 0;
1024418919fSjohnjiang }
1034418919fSjohnjiang }
1044418919fSjohnjiang }
1054418919fSjohnjiang
1064418919fSjohnjiang return 1;
1074418919fSjohnjiang }
1084418919fSjohnjiang
1094418919fSjohnjiang /* Measure the cycle cost of popping an empty stack. */
1104418919fSjohnjiang static void
test_empty_pop(struct rte_stack * s)1114418919fSjohnjiang test_empty_pop(struct rte_stack *s)
1124418919fSjohnjiang {
1134418919fSjohnjiang unsigned int iterations = 100000000;
1144418919fSjohnjiang void *objs[MAX_BURST];
1154418919fSjohnjiang unsigned int i;
1164418919fSjohnjiang
1174418919fSjohnjiang uint64_t start = rte_rdtsc();
1184418919fSjohnjiang
1194418919fSjohnjiang for (i = 0; i < iterations; i++)
1204418919fSjohnjiang rte_stack_pop(s, objs, bulk_sizes[0]);
1214418919fSjohnjiang
1224418919fSjohnjiang uint64_t end = rte_rdtsc();
1234418919fSjohnjiang
1244418919fSjohnjiang printf("Stack empty pop: %.2F\n",
1254418919fSjohnjiang (double)(end - start) / iterations);
1264418919fSjohnjiang }
1274418919fSjohnjiang
/* Per-lcore benchmark arguments; avg is the output written by the worker. */
struct thread_args {
	struct rte_stack *s;	/* stack under test */
	unsigned int sz;	/* push/pop bulk size */
	double avg;		/* result: avg cycles per object */
};
1334418919fSjohnjiang
1344418919fSjohnjiang /* Measure the average per-pointer cycle cost of stack push and pop */
1354418919fSjohnjiang static int
bulk_push_pop(void * p)1364418919fSjohnjiang bulk_push_pop(void *p)
1374418919fSjohnjiang {
1384418919fSjohnjiang unsigned int iterations = 1000000;
1394418919fSjohnjiang struct thread_args *args = p;
1404418919fSjohnjiang void *objs[MAX_BURST] = {0};
1414418919fSjohnjiang unsigned int size, i;
1424418919fSjohnjiang struct rte_stack *s;
1434418919fSjohnjiang
1444418919fSjohnjiang s = args->s;
1454418919fSjohnjiang size = args->sz;
1464418919fSjohnjiang
1474418919fSjohnjiang rte_atomic32_sub(&lcore_barrier, 1);
1484418919fSjohnjiang while (rte_atomic32_read(&lcore_barrier) != 0)
1494418919fSjohnjiang rte_pause();
1504418919fSjohnjiang
1514418919fSjohnjiang uint64_t start = rte_rdtsc();
1524418919fSjohnjiang
1534418919fSjohnjiang for (i = 0; i < iterations; i++) {
1544418919fSjohnjiang rte_stack_push(s, objs, size);
1554418919fSjohnjiang rte_stack_pop(s, objs, size);
1564418919fSjohnjiang }
1574418919fSjohnjiang
1584418919fSjohnjiang uint64_t end = rte_rdtsc();
1594418919fSjohnjiang
1604418919fSjohnjiang args->avg = ((double)(end - start))/(iterations * size);
1614418919fSjohnjiang
1624418919fSjohnjiang return 0;
1634418919fSjohnjiang }
1644418919fSjohnjiang
1654418919fSjohnjiang /*
1664418919fSjohnjiang * Run bulk_push_pop() simultaneously on pairs of cores, to measure stack
1674418919fSjohnjiang * perf when between hyperthread siblings, cores on the same socket, and cores
1684418919fSjohnjiang * on different sockets.
1694418919fSjohnjiang */
1704418919fSjohnjiang static void
run_on_core_pair(struct lcore_pair * cores,struct rte_stack * s,lcore_function_t fn)1714418919fSjohnjiang run_on_core_pair(struct lcore_pair *cores, struct rte_stack *s,
1724418919fSjohnjiang lcore_function_t fn)
1734418919fSjohnjiang {
1744418919fSjohnjiang struct thread_args args[2];
1754418919fSjohnjiang unsigned int i;
1764418919fSjohnjiang
1774418919fSjohnjiang for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
1784418919fSjohnjiang rte_atomic32_set(&lcore_barrier, 2);
1794418919fSjohnjiang
1804418919fSjohnjiang args[0].sz = args[1].sz = bulk_sizes[i];
1814418919fSjohnjiang args[0].s = args[1].s = s;
1824418919fSjohnjiang
183*2d9fd380Sjfb8856606 if (cores->c1 == rte_get_main_lcore()) {
1844418919fSjohnjiang rte_eal_remote_launch(fn, &args[1], cores->c2);
1854418919fSjohnjiang fn(&args[0]);
1864418919fSjohnjiang rte_eal_wait_lcore(cores->c2);
1874418919fSjohnjiang } else {
1884418919fSjohnjiang rte_eal_remote_launch(fn, &args[0], cores->c1);
1894418919fSjohnjiang rte_eal_remote_launch(fn, &args[1], cores->c2);
1904418919fSjohnjiang rte_eal_wait_lcore(cores->c1);
1914418919fSjohnjiang rte_eal_wait_lcore(cores->c2);
1924418919fSjohnjiang }
1934418919fSjohnjiang
1944418919fSjohnjiang printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
1954418919fSjohnjiang bulk_sizes[i], (args[0].avg + args[1].avg) / 2);
1964418919fSjohnjiang }
1974418919fSjohnjiang }
1984418919fSjohnjiang
1994418919fSjohnjiang /* Run bulk_push_pop() simultaneously on 1+ cores. */
2004418919fSjohnjiang static void
run_on_n_cores(struct rte_stack * s,lcore_function_t fn,int n)2014418919fSjohnjiang run_on_n_cores(struct rte_stack *s, lcore_function_t fn, int n)
2024418919fSjohnjiang {
2034418919fSjohnjiang struct thread_args args[RTE_MAX_LCORE];
2044418919fSjohnjiang unsigned int i;
2054418919fSjohnjiang
2064418919fSjohnjiang for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
2074418919fSjohnjiang unsigned int lcore_id;
2084418919fSjohnjiang int cnt = 0;
2094418919fSjohnjiang double avg;
2104418919fSjohnjiang
2114418919fSjohnjiang rte_atomic32_set(&lcore_barrier, n);
2124418919fSjohnjiang
213*2d9fd380Sjfb8856606 RTE_LCORE_FOREACH_WORKER(lcore_id) {
2144418919fSjohnjiang if (++cnt >= n)
2154418919fSjohnjiang break;
2164418919fSjohnjiang
2174418919fSjohnjiang args[lcore_id].s = s;
2184418919fSjohnjiang args[lcore_id].sz = bulk_sizes[i];
2194418919fSjohnjiang
2204418919fSjohnjiang if (rte_eal_remote_launch(fn, &args[lcore_id],
2214418919fSjohnjiang lcore_id))
2224418919fSjohnjiang rte_panic("Failed to launch lcore %d\n",
2234418919fSjohnjiang lcore_id);
2244418919fSjohnjiang }
2254418919fSjohnjiang
2264418919fSjohnjiang lcore_id = rte_lcore_id();
2274418919fSjohnjiang
2284418919fSjohnjiang args[lcore_id].s = s;
2294418919fSjohnjiang args[lcore_id].sz = bulk_sizes[i];
2304418919fSjohnjiang
2314418919fSjohnjiang fn(&args[lcore_id]);
2324418919fSjohnjiang
2334418919fSjohnjiang rte_eal_mp_wait_lcore();
2344418919fSjohnjiang
2354418919fSjohnjiang avg = args[rte_lcore_id()].avg;
2364418919fSjohnjiang
2374418919fSjohnjiang cnt = 0;
238*2d9fd380Sjfb8856606 RTE_LCORE_FOREACH_WORKER(lcore_id) {
2394418919fSjohnjiang if (++cnt >= n)
2404418919fSjohnjiang break;
2414418919fSjohnjiang avg += args[lcore_id].avg;
2424418919fSjohnjiang }
2434418919fSjohnjiang
2444418919fSjohnjiang printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
2454418919fSjohnjiang bulk_sizes[i], avg / n);
2464418919fSjohnjiang }
2474418919fSjohnjiang }
2484418919fSjohnjiang
/*
 * Measure the cycle cost of pushing and popping a single pointer on a single
 * lcore.
 */
static void
test_single_push_pop(struct rte_stack *s)
{
	const unsigned int iterations = 16000000;
	void *obj = NULL;
	unsigned int n;
	uint64_t begin, finish;

	begin = rte_rdtsc();

	for (n = 0; n < iterations; n++) {
		rte_stack_push(s, &obj, 1);
		rte_stack_pop(s, &obj, 1);
	}

	finish = rte_rdtsc();

	printf("Average cycles per single object push/pop: %.2F\n",
	       (double)(finish - begin) / iterations);
}
2724418919fSjohnjiang
2734418919fSjohnjiang /* Measure the cycle cost of bulk pushing and popping on a single lcore. */
2744418919fSjohnjiang static void
test_bulk_push_pop(struct rte_stack * s)2754418919fSjohnjiang test_bulk_push_pop(struct rte_stack *s)
2764418919fSjohnjiang {
2774418919fSjohnjiang unsigned int iterations = 8000000;
2784418919fSjohnjiang void *objs[MAX_BURST];
2794418919fSjohnjiang unsigned int sz, i;
2804418919fSjohnjiang
2814418919fSjohnjiang for (sz = 0; sz < RTE_DIM(bulk_sizes); sz++) {
2824418919fSjohnjiang uint64_t start = rte_rdtsc();
2834418919fSjohnjiang
2844418919fSjohnjiang for (i = 0; i < iterations; i++) {
2854418919fSjohnjiang rte_stack_push(s, objs, bulk_sizes[sz]);
2864418919fSjohnjiang rte_stack_pop(s, objs, bulk_sizes[sz]);
2874418919fSjohnjiang }
2884418919fSjohnjiang
2894418919fSjohnjiang uint64_t end = rte_rdtsc();
2904418919fSjohnjiang
2914418919fSjohnjiang double avg = ((double)(end - start) /
2924418919fSjohnjiang (iterations * bulk_sizes[sz]));
2934418919fSjohnjiang
2944418919fSjohnjiang printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
2954418919fSjohnjiang bulk_sizes[sz], avg);
2964418919fSjohnjiang }
2974418919fSjohnjiang }
2984418919fSjohnjiang
2994418919fSjohnjiang static int
__test_stack_perf(uint32_t flags)3004418919fSjohnjiang __test_stack_perf(uint32_t flags)
3014418919fSjohnjiang {
3024418919fSjohnjiang struct lcore_pair cores;
3034418919fSjohnjiang struct rte_stack *s;
3044418919fSjohnjiang
3054418919fSjohnjiang rte_atomic32_init(&lcore_barrier);
3064418919fSjohnjiang
3074418919fSjohnjiang s = rte_stack_create(STACK_NAME, STACK_SIZE, rte_socket_id(), flags);
3084418919fSjohnjiang if (s == NULL) {
3094418919fSjohnjiang printf("[%s():%u] failed to create a stack\n",
3104418919fSjohnjiang __func__, __LINE__);
3114418919fSjohnjiang return -1;
3124418919fSjohnjiang }
3134418919fSjohnjiang
3144418919fSjohnjiang printf("### Testing single element push/pop ###\n");
3154418919fSjohnjiang test_single_push_pop(s);
3164418919fSjohnjiang
3174418919fSjohnjiang printf("\n### Testing empty pop ###\n");
3184418919fSjohnjiang test_empty_pop(s);
3194418919fSjohnjiang
3204418919fSjohnjiang printf("\n### Testing using a single lcore ###\n");
3214418919fSjohnjiang test_bulk_push_pop(s);
3224418919fSjohnjiang
3234418919fSjohnjiang if (get_two_hyperthreads(&cores) == 0) {
3244418919fSjohnjiang printf("\n### Testing using two hyperthreads ###\n");
3254418919fSjohnjiang run_on_core_pair(&cores, s, bulk_push_pop);
3264418919fSjohnjiang }
3274418919fSjohnjiang if (get_two_cores(&cores) == 0) {
3284418919fSjohnjiang printf("\n### Testing using two physical cores ###\n");
3294418919fSjohnjiang run_on_core_pair(&cores, s, bulk_push_pop);
3304418919fSjohnjiang }
3314418919fSjohnjiang if (get_two_sockets(&cores) == 0) {
3324418919fSjohnjiang printf("\n### Testing using two NUMA nodes ###\n");
3334418919fSjohnjiang run_on_core_pair(&cores, s, bulk_push_pop);
3344418919fSjohnjiang }
3354418919fSjohnjiang
3364418919fSjohnjiang printf("\n### Testing on all %u lcores ###\n", rte_lcore_count());
3374418919fSjohnjiang run_on_n_cores(s, bulk_push_pop, rte_lcore_count());
3384418919fSjohnjiang
3394418919fSjohnjiang rte_stack_free(s);
3404418919fSjohnjiang return 0;
3414418919fSjohnjiang }
3424418919fSjohnjiang
/* Autotest entry point: standard (lock-based) stack. */
static int
test_stack_perf(void)
{
	return __test_stack_perf(0);
}
3484418919fSjohnjiang
3494418919fSjohnjiang static int
test_lf_stack_perf(void)3504418919fSjohnjiang test_lf_stack_perf(void)
3514418919fSjohnjiang {
3524418919fSjohnjiang return __test_stack_perf(RTE_STACK_F_LF);
3534418919fSjohnjiang }
3544418919fSjohnjiang
/* Register both variants with the test framework. */
REGISTER_TEST_COMMAND(stack_perf_autotest, test_stack_perf);
REGISTER_TEST_COMMAND(stack_lf_perf_autotest, test_lf_stack_perf);
357