/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */

#include <stdio.h>
#include <inttypes.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_launch.h>
#include <rte_pause.h>
#include <rte_stack.h>

#include "test.h"

#define STACK_NAME "STACK_PERF"
#define MAX_BURST 32
#define STACK_SIZE (RTE_MAX_LCORE * MAX_BURST)

/*
 * Push/pop bulk sizes, marked volatile so they aren't treated as compile-time
 * constants.
 */
static volatile unsigned int bulk_sizes[] = {8, MAX_BURST};

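/* Barrier used to release all measurement lcores at the same instant. */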
static rte_atomic32_t lcore_barrier;

struct lcore_pair {
	unsigned int c1;
	unsigned int c2;
};

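/* Find two lcores that are hyperthread siblings of the same physical core. */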
static int
get_two_hyperthreads(struct lcore_pair *lcp)
{
	unsigned int socket[2];
	unsigned int core[2];
	unsigned int id[2];

	RTE_LCORE_FOREACH(id[0]) {
		RTE_LCORE_FOREACH(id[1]) {
			if (id[0] == id[1])
				continue;
			core[0] = rte_lcore_to_cpu_id(id[0]);
			core[1] = rte_lcore_to_cpu_id(id[1]);
			socket[0] = rte_lcore_to_socket_id(id[0]);
			socket[1] = rte_lcore_to_socket_id(id[1]);
			if ((core[0] == core[1]) && (socket[0] == socket[1])) {
				lcp->c1 = id[0];
				lcp->c2 = id[1];
				return 0;
			}
		}
	}

	return 1;
}

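/* Find two lcores on distinct physical cores of the same socket. */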
static int
get_two_cores(struct lcore_pair *lcp)
{
	unsigned int socket[2];
	unsigned int core[2];
	unsigned int id[2];

	RTE_LCORE_FOREACH(id[0]) {
		RTE_LCORE_FOREACH(id[1]) {
			if (id[0] == id[1])
				continue;
			core[0] = rte_lcore_to_cpu_id(id[0]);
			core[1] = rte_lcore_to_cpu_id(id[1]);
			socket[0] = rte_lcore_to_socket_id(id[0]);
			socket[1] = rte_lcore_to_socket_id(id[1]);
			if ((core[0] != core[1]) && (socket[0] == socket[1])) {
				lcp->c1 = id[0];
				lcp->c2 = id[1];
				return 0;
			}
		}
	}

	return 1;
}

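/* Find two lcores that reside on different NUMA sockets. */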
static int
get_two_sockets(struct lcore_pair *lcp)
{
	unsigned int socket[2];
	unsigned int id[2];

	RTE_LCORE_FOREACH(id[0]) {
		RTE_LCORE_FOREACH(id[1]) {
			if (id[0] == id[1])
				continue;
			socket[0] = rte_lcore_to_socket_id(id[0]);
			socket[1] = rte_lcore_to_socket_id(id[1]);
			if (socket[0] != socket[1]) {
				lcp->c1 = id[0];
				lcp->c2 = id[1];
				return 0;
			}
		}
	}

	return 1;
}

/* Measure the cycle cost of popping an empty stack. */
static void
test_empty_pop(struct rte_stack *s)
{
	unsigned int iterations = 100000000;
	void *objs[MAX_BURST];
	unsigned int i;

	uint64_t start = rte_rdtsc();

	for (i = 0; i < iterations; i++)
		rte_stack_pop(s, objs, bulk_sizes[0]);

	uint64_t end = rte_rdtsc();

	printf("Stack empty pop: %.2F\n",
	       (double)(end - start) / iterations);
}

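/* Per-lcore arguments and result slot for bulk_push_pop(). */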
struct thread_args {
	struct rte_stack *s;
	unsigned int sz;
	double avg;
};

/* Measure the average per-pointer cycle cost of stack push and pop */
static int
bulk_push_pop(void *p)
{
	unsigned int iterations = 1000000;
	struct thread_args *args = p;
	void *objs[MAX_BURST] = {0};
	unsigned int size, i;
	struct rte_stack *s;

	s = args->s;
	size = args->sz;

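	/*
	 * Signal readiness, then spin until every participating lcore has
	 * done the same, so all cores start measuring at the same time.
	 */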
	rte_atomic32_sub(&lcore_barrier, 1);
	while (rte_atomic32_read(&lcore_barrier) != 0)
		rte_pause();

	uint64_t start = rte_rdtsc();

	for (i = 0; i < iterations; i++) {
		rte_stack_push(s, objs, size);
		rte_stack_pop(s, objs, size);
	}

	uint64_t end = rte_rdtsc();

	args->avg = ((double)(end - start)) / (iterations * size);

	return 0;
}

/*
 * Run bulk_push_pop() simultaneously on pairs of cores, to measure stack
 * performance between hyperthread siblings, between cores on the same socket,
 * and between cores on different sockets.
 */
static void
run_on_core_pair(struct lcore_pair *cores, struct rte_stack *s,
		 lcore_function_t fn)
{
	struct thread_args args[2];
	unsigned int i;

	for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
		rte_atomic32_set(&lcore_barrier, 2);

		args[0].sz = args[1].sz = bulk_sizes[i];
		args[0].s = args[1].s = s;

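		/*
		 * If one of the pair is the main lcore, run fn on it directly;
		 * otherwise launch fn on both cores and wait for them.
		 */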
		if (cores->c1 == rte_get_main_lcore()) {
			rte_eal_remote_launch(fn, &args[1], cores->c2);
			fn(&args[0]);
			rte_eal_wait_lcore(cores->c2);
		} else {
			rte_eal_remote_launch(fn, &args[0], cores->c1);
			rte_eal_remote_launch(fn, &args[1], cores->c2);
			rte_eal_wait_lcore(cores->c1);
			rte_eal_wait_lcore(cores->c2);
		}

		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
		       bulk_sizes[i], (args[0].avg + args[1].avg) / 2);
	}
}

/* Run bulk_push_pop() simultaneously on 1+ cores. */
static void
run_on_n_cores(struct rte_stack *s, lcore_function_t fn, int n)
{
	struct thread_args args[RTE_MAX_LCORE];
	unsigned int i;

	for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
		unsigned int lcore_id;
		int cnt = 0;
		double avg;

		rte_atomic32_set(&lcore_barrier, n);

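		/* Launch fn on n - 1 workers; the main lcore runs fn itself. */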
		RTE_LCORE_FOREACH_WORKER(lcore_id) {
			if (++cnt >= n)
				break;

			args[lcore_id].s = s;
			args[lcore_id].sz = bulk_sizes[i];

			if (rte_eal_remote_launch(fn, &args[lcore_id],
						  lcore_id))
				rte_panic("Failed to launch lcore %d\n",
					  lcore_id);
		}

		lcore_id = rte_lcore_id();

		args[lcore_id].s = s;
		args[lcore_id].sz = bulk_sizes[i];

		fn(&args[lcore_id]);

		rte_eal_mp_wait_lcore();

		avg = args[rte_lcore_id()].avg;

		cnt = 0;
		RTE_LCORE_FOREACH_WORKER(lcore_id) {
			if (++cnt >= n)
				break;
			avg += args[lcore_id].avg;
		}

		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
		       bulk_sizes[i], avg / n);
	}
}

/*
 * Measure the cycle cost of pushing and popping a single pointer on a single
 * lcore.
 */
static void
test_single_push_pop(struct rte_stack *s)
{
	unsigned int iterations = 16000000;
	void *obj = NULL;
	unsigned int i;

	uint64_t start = rte_rdtsc();

	for (i = 0; i < iterations; i++) {
		rte_stack_push(s, &obj, 1);
		rte_stack_pop(s, &obj, 1);
	}

	uint64_t end = rte_rdtsc();

	printf("Average cycles per single object push/pop: %.2F\n",
	       ((double)(end - start)) / iterations);
}

/* Measure the cycle cost of bulk pushing and popping on a single lcore. */
static void
test_bulk_push_pop(struct rte_stack *s)
{
	unsigned int iterations = 8000000;
	void *objs[MAX_BURST];
	unsigned int sz, i;

	for (sz = 0; sz < RTE_DIM(bulk_sizes); sz++) {
		uint64_t start = rte_rdtsc();

		for (i = 0; i < iterations; i++) {
			rte_stack_push(s, objs, bulk_sizes[sz]);
			rte_stack_pop(s, objs, bulk_sizes[sz]);
		}

		uint64_t end = rte_rdtsc();

		double avg = ((double)(end - start) /
			      (iterations * bulk_sizes[sz]));

		printf("Average cycles per object push/pop (bulk size: %u): %.2F\n",
		       bulk_sizes[sz], avg);
	}
}

static int
__test_stack_perf(uint32_t flags)
{
	struct lcore_pair cores;
	struct rte_stack *s;

	rte_atomic32_init(&lcore_barrier);

	s = rte_stack_create(STACK_NAME, STACK_SIZE, rte_socket_id(), flags);
	if (s == NULL) {
		printf("[%s():%u] failed to create a stack\n",
		       __func__, __LINE__);
		return -1;
	}

	printf("### Testing single element push/pop ###\n");
	test_single_push_pop(s);

	printf("\n### Testing empty pop ###\n");
	test_empty_pop(s);

	printf("\n### Testing using a single lcore ###\n");
	test_bulk_push_pop(s);

	if (get_two_hyperthreads(&cores) == 0) {
		printf("\n### Testing using two hyperthreads ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}
	if (get_two_cores(&cores) == 0) {
		printf("\n### Testing using two physical cores ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}
	if (get_two_sockets(&cores) == 0) {
		printf("\n### Testing using two NUMA nodes ###\n");
		run_on_core_pair(&cores, s, bulk_push_pop);
	}

	printf("\n### Testing on all %u lcores ###\n", rte_lcore_count());
	run_on_n_cores(s, bulk_push_pop, rte_lcore_count());

	rte_stack_free(s);
	return 0;
}

static int
test_stack_perf(void)
{
	return __test_stack_perf(0);
}

static int
test_lf_stack_perf(void)
{
	return __test_stack_perf(RTE_STACK_F_LF);
}

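/*
 * Register the tests with the dpdk-test harness; they can be run with the
 * commands "stack_perf_autotest" and "stack_lf_perf_autotest".
 */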
REGISTER_TEST_COMMAND(stack_perf_autotest, test_stack_perf);
REGISTER_TEST_COMMAND(stack_lf_perf_autotest, test_lf_stack_perf);