1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2018 Intel Corporation
3  */
4 
5 #include <stdio.h>
6 #include <sys/types.h>
7 #include <sys/stat.h>
8 #include <fcntl.h>
9 #include <stdlib.h>
10 #include <string.h>
11 #include <unistd.h>
12 #include <signal.h>
13 #include <limits.h>
14 #include <errno.h>
15 #include <inttypes.h>
16 
17 #include <rte_memcpy.h>
18 #include <rte_memory.h>
19 #include <rte_string_fns.h>
20 
21 #include "power_pstate_cpufreq.h"
22 #include "power_common.h"
23 
24 
/*
 * Debug trace helper: logs the message prefixed with the name of the calling
 * function. Compiles to nothing unless RTE_LIBRTE_POWER_DEBUG is defined.
 */
#ifdef RTE_LIBRTE_POWER_DEBUG
#define POWER_DEBUG_TRACE(fmt, args...) \
	do { \
		RTE_LOG(ERR, POWER, "%s: " fmt, __func__, ## args); \
	} while (0)
#else
#define POWER_DEBUG_TRACE(fmt, args...)
#endif

/* Return 'retval' from the enclosing function when stream 'f' is NULL. */
#define FOPEN_OR_ERR_RET(f, retval) \
	do { \
		if (!(f)) { \
			RTE_LOG(ERR, POWER, "File not opened\n"); \
			return retval; \
		} \
	} while (0)

/* Jump to 'label' when 'ret' (typically an fgets() result) is NULL. */
#define FOPS_OR_NULL_GOTO(ret, label) \
	do { \
		if (!(ret)) { \
			RTE_LOG(ERR, POWER, "fgets returns nothing\n"); \
			goto label; \
		} \
	} while (0)

/* Jump to 'label' when 'ret' (a stdio status code) is negative. */
#define FOPS_OR_ERR_GOTO(ret, label) \
	do { \
		if ((ret) < 0) { \
			RTE_LOG(ERR, POWER, "File operations failed\n"); \
			goto label; \
		} \
	} while (0)
53 
/* Constants used to round a frequency to the nearest multiple of 100000 */
#define FREQ_ROUNDING_DELTA     50000
#define ROUND_FREQ_TO_N_100000  100000

/* Numeric base handed to strtoul() when parsing sysfs values */
#define POWER_CONVERT_TO_DECIMAL 10
/* Frequency step between two adjacent ratio values */
#define BUS_FREQ                 100000

#define POWER_GOVERNOR_PERF "performance"

/* Per-core cpufreq sysfs directory; %u is replaced by the lcore id */
#define POWER_SYSFILE_CPUFREQ_DIR "/sys/devices/system/cpu/cpu%u/cpufreq/"

#define POWER_SYSFILE_GOVERNOR      POWER_SYSFILE_CPUFREQ_DIR "scaling_governor"
#define POWER_SYSFILE_MAX_FREQ      POWER_SYSFILE_CPUFREQ_DIR "scaling_max_freq"
#define POWER_SYSFILE_MIN_FREQ      POWER_SYSFILE_CPUFREQ_DIR "scaling_min_freq"
#define POWER_SYSFILE_CUR_FREQ      POWER_SYSFILE_CPUFREQ_DIR "scaling_cur_freq"
#define POWER_SYSFILE_BASE_MAX_FREQ POWER_SYSFILE_CPUFREQ_DIR "cpuinfo_max_freq"
#define POWER_SYSFILE_BASE_MIN_FREQ POWER_SYSFILE_CPUFREQ_DIR "cpuinfo_min_freq"
#define POWER_SYSFILE_BASE_FREQ     POWER_SYSFILE_CPUFREQ_DIR "base_frequency"

#define POWER_PSTATE_DRIVER "intel_pstate"
#define POWER_MSR_PATH      "/dev/cpu/%u/msr"

/*
 * MSR related: register address plus the mask/shift used to extract the
 * maximum non-turbo ratio from its value.
 */
#define PLATFORM_INFO     0x0CE
#define NON_TURBO_MASK    0xFF00
#define NON_TURBO_OFFSET  0x8
85 
86 
/*
 * Per-lcore life-cycle state. Init moves IDLE -> ONGOING -> USED, exit moves
 * USED -> ONGOING -> IDLE; a failure in either leaves the lcore in UNKNOWN.
 */
enum power_state {
	POWER_IDLE    = 0,
	POWER_ONGOING = 1,
	POWER_USED    = 2,
	POWER_UNKNOWN = 3
};
93 
94 struct pstate_power_info {
95 	unsigned int lcore_id;               /**< Logical core id */
96 	uint32_t freqs[RTE_MAX_LCORE_FREQS]; /**< Frequency array */
97 	uint32_t nb_freqs;                   /**< number of available freqs */
98 	FILE *f_cur_min;                     /**< FD of scaling_min */
99 	FILE *f_cur_max;                     /**< FD of scaling_max */
100 	char governor_ori[32];               /**< Original governor name */
101 	uint32_t curr_idx;                   /**< Freq index in freqs array */
102 	uint32_t non_turbo_max_ratio;        /**< Non Turbo Max ratio  */
103 	uint32_t sys_max_freq;               /**< system wide max freq  */
104 	uint32_t core_base_freq;             /**< core base freq  */
105 	uint32_t state;                      /**< Power in use state */
106 	uint16_t turbo_available;            /**< Turbo Boost available */
107 	uint16_t turbo_enable;               /**< Turbo Boost enable/disable */
108 	uint16_t priority_core;              /**< High Performance core */
109 } __rte_cache_aligned;
110 
111 
112 static struct pstate_power_info lcore_power_info[RTE_MAX_LCORE];
113 
114 /**
115  * It is to read the specific MSR.
116  */
117 
118 static int32_t
power_rdmsr(int msr,uint64_t * val,unsigned int lcore_id)119 power_rdmsr(int msr, uint64_t *val, unsigned int lcore_id)
120 {
121 	int fd, ret;
122 	char fullpath[PATH_MAX];
123 
124 	snprintf(fullpath, sizeof(fullpath), POWER_MSR_PATH, lcore_id);
125 
126 	fd = open(fullpath, O_RDONLY);
127 
128 	if (fd < 0) {
129 		RTE_LOG(ERR, POWER, "Error opening '%s': %s\n", fullpath,
130 				 strerror(errno));
131 		return fd;
132 	}
133 
134 	ret = pread(fd, val, sizeof(uint64_t), msr);
135 
136 	if (ret < 0) {
137 		RTE_LOG(ERR, POWER, "Error reading '%s': %s\n", fullpath,
138 				 strerror(errno));
139 		goto out;
140 	}
141 
142 	POWER_DEBUG_TRACE("MSR Path %s, offset 0x%X for lcore %u\n",
143 			fullpath, msr, lcore_id);
144 
145 	POWER_DEBUG_TRACE("Ret value %d, content is 0x%"PRIx64"\n", ret, *val);
146 
147 out:	close(fd);
148 	return ret;
149 }
150 
151 /**
152  * It is to fopen the sys file for the future setting the lcore frequency.
153  */
154 static int
power_init_for_setting_freq(struct pstate_power_info * pi)155 power_init_for_setting_freq(struct pstate_power_info *pi)
156 {
157 	FILE *f_min, *f_max, *f_base;
158 	char fullpath_min[PATH_MAX];
159 	char fullpath_max[PATH_MAX];
160 	char fullpath_base[PATH_MAX];
161 	char buf_base[BUFSIZ];
162 	char *s_base;
163 	uint32_t base_ratio = 0;
164 	uint64_t max_non_turbo = 0;
165 	int  ret_val = 0;
166 
167 	snprintf(fullpath_min, sizeof(fullpath_min), POWER_SYSFILE_MIN_FREQ,
168 			pi->lcore_id);
169 
170 	f_min = fopen(fullpath_min, "rw+");
171 	FOPEN_OR_ERR_RET(f_min, -1);
172 
173 	snprintf(fullpath_max, sizeof(fullpath_max), POWER_SYSFILE_MAX_FREQ,
174 			pi->lcore_id);
175 
176 	f_max = fopen(fullpath_max, "rw+");
177 	if (f_max == NULL)
178 		fclose(f_min);
179 
180 	FOPEN_OR_ERR_RET(f_max, -1);
181 
182 	pi->f_cur_min = f_min;
183 	pi->f_cur_max = f_max;
184 
185 	snprintf(fullpath_base, sizeof(fullpath_base), POWER_SYSFILE_BASE_FREQ,
186 			pi->lcore_id);
187 
188 	f_base = fopen(fullpath_base, "r");
189 	if (f_base == NULL) {
190 		/* No sysfs base_frequency, that's OK, continue without */
191 		base_ratio = 0;
192 	} else {
193 		s_base = fgets(buf_base, sizeof(buf_base), f_base);
194 		FOPS_OR_NULL_GOTO(s_base, out);
195 
196 		buf_base[BUFSIZ-1] = '\0';
197 		if (strlen(buf_base))
198 			/* Strip off terminating '\n' */
199 			strtok(buf_base, "\n");
200 
201 		base_ratio = strtoul(buf_base, NULL, POWER_CONVERT_TO_DECIMAL)
202 				/ BUS_FREQ;
203 	}
204 
205 	/* Add MSR read to detect turbo status */
206 
207 	if (power_rdmsr(PLATFORM_INFO, &max_non_turbo, pi->lcore_id) < 0) {
208 		ret_val = -1;
209 		goto out;
210 	}
211 
212 	max_non_turbo = (max_non_turbo&NON_TURBO_MASK)>>NON_TURBO_OFFSET;
213 
214 	POWER_DEBUG_TRACE("no turbo perf %"PRIu64"\n", max_non_turbo);
215 
216 	pi->non_turbo_max_ratio = max_non_turbo;
217 
218 	/*
219 	 * If base_frequency is reported as greater than the maximum
220 	 * non-turbo frequency, then mark it as a high priority core.
221 	 */
222 	if (base_ratio > max_non_turbo)
223 		pi->priority_core = 1;
224 	else
225 		pi->priority_core = 0;
226 	pi->core_base_freq = base_ratio * BUS_FREQ;
227 
228 out:
229 	if (f_base != NULL)
230 		fclose(f_base);
231 	return ret_val;
232 }
233 
234 static int
set_freq_internal(struct pstate_power_info * pi,uint32_t idx)235 set_freq_internal(struct pstate_power_info *pi, uint32_t idx)
236 {
237 	uint32_t target_freq = 0;
238 
239 	if (idx >= RTE_MAX_LCORE_FREQS || idx >= pi->nb_freqs) {
240 		RTE_LOG(ERR, POWER, "Invalid frequency index %u, which "
241 				"should be less than %u\n", idx, pi->nb_freqs);
242 		return -1;
243 	}
244 
245 	/* Check if it is the same as current */
246 	if (idx == pi->curr_idx)
247 		return 0;
248 
249 	/* Because Intel Pstate Driver only allow user change min/max hint
250 	 * User need change the min/max as same value.
251 	 */
252 	if (fseek(pi->f_cur_min, 0, SEEK_SET) < 0) {
253 		RTE_LOG(ERR, POWER, "Fail to set file position indicator to 0 "
254 				"for setting frequency for lcore %u\n",
255 				pi->lcore_id);
256 		return -1;
257 	}
258 
259 	if (fseek(pi->f_cur_max, 0, SEEK_SET) < 0) {
260 		RTE_LOG(ERR, POWER, "Fail to set file position indicator to 0 "
261 				"for setting frequency for lcore %u\n",
262 				pi->lcore_id);
263 		return -1;
264 	}
265 
266 	/* Turbo is available and enabled, first freq bucket is sys max freq */
267 	if (pi->turbo_available && idx == 0) {
268 		if (pi->turbo_enable)
269 			target_freq = pi->sys_max_freq;
270 		else {
271 			RTE_LOG(ERR, POWER, "Turbo is off, frequency can't be scaled up more %u\n",
272 					pi->lcore_id);
273 			return -1;
274 		}
275 	} else
276 		target_freq = pi->freqs[idx];
277 
278 	/* Decrease freq, the min freq should be updated first */
279 	if (idx  >  pi->curr_idx) {
280 
281 		if (fprintf(pi->f_cur_min, "%u", target_freq) < 0) {
282 			RTE_LOG(ERR, POWER, "Fail to write new frequency for "
283 					"lcore %u\n", pi->lcore_id);
284 			return -1;
285 		}
286 
287 		if (fprintf(pi->f_cur_max, "%u", target_freq) < 0) {
288 			RTE_LOG(ERR, POWER, "Fail to write new frequency for "
289 					"lcore %u\n", pi->lcore_id);
290 			return -1;
291 		}
292 
293 		POWER_DEBUG_TRACE("Frequency '%u' to be set for lcore %u\n",
294 				  target_freq, pi->lcore_id);
295 
296 		fflush(pi->f_cur_min);
297 		fflush(pi->f_cur_max);
298 
299 	}
300 
301 	/* Increase freq, the max freq should be updated first */
302 	if (idx  <  pi->curr_idx) {
303 
304 		if (fprintf(pi->f_cur_max, "%u", target_freq) < 0) {
305 			RTE_LOG(ERR, POWER, "Fail to write new frequency for "
306 					"lcore %u\n", pi->lcore_id);
307 			return -1;
308 		}
309 
310 		if (fprintf(pi->f_cur_min, "%u", target_freq) < 0) {
311 			RTE_LOG(ERR, POWER, "Fail to write new frequency for "
312 					"lcore %u\n", pi->lcore_id);
313 			return -1;
314 		}
315 
316 		POWER_DEBUG_TRACE("Frequency '%u' to be set for lcore %u\n",
317 				  target_freq, pi->lcore_id);
318 
319 		fflush(pi->f_cur_max);
320 		fflush(pi->f_cur_min);
321 	}
322 
323 	pi->curr_idx = idx;
324 
325 	return 1;
326 }
327 
328 /**
329  * It is to check the current scaling governor by reading sys file, and then
330  * set it into 'performance' if it is not by writing the sys file. The original
331  * governor will be saved for rolling back.
332  */
333 static int
power_set_governor_performance(struct pstate_power_info * pi)334 power_set_governor_performance(struct pstate_power_info *pi)
335 {
336 	FILE *f;
337 	int ret = -1;
338 	char buf[BUFSIZ];
339 	char fullpath[PATH_MAX];
340 	char *s;
341 	int val;
342 
343 	snprintf(fullpath, sizeof(fullpath), POWER_SYSFILE_GOVERNOR,
344 			pi->lcore_id);
345 	f = fopen(fullpath, "rw+");
346 	FOPEN_OR_ERR_RET(f, ret);
347 
348 	s = fgets(buf, sizeof(buf), f);
349 	FOPS_OR_NULL_GOTO(s, out);
350 	/* Strip off terminating '\n' */
351 	strtok(buf, "\n");
352 
353 	/* Check if current governor is performance */
354 	if (strncmp(buf, POWER_GOVERNOR_PERF,
355 			sizeof(POWER_GOVERNOR_PERF)) == 0) {
356 		ret = 0;
357 		POWER_DEBUG_TRACE("Power management governor of lcore %u is "
358 				"already performance\n", pi->lcore_id);
359 		goto out;
360 	}
361 	/* Save the original governor */
362 	strlcpy(pi->governor_ori, buf, sizeof(pi->governor_ori));
363 
364 	/* Write 'performance' to the governor */
365 	val = fseek(f, 0, SEEK_SET);
366 	FOPS_OR_ERR_GOTO(val, out);
367 
368 	val = fputs(POWER_GOVERNOR_PERF, f);
369 	FOPS_OR_ERR_GOTO(val, out);
370 
371 	/* We need to flush to see if the fputs succeeds */
372 	val = fflush(f);
373 	FOPS_OR_ERR_GOTO(val, out);
374 
375 	ret = 0;
376 	RTE_LOG(INFO, POWER, "Power management governor of lcore %u has been "
377 			"set to performance successfully\n", pi->lcore_id);
378 out:
379 	fclose(f);
380 
381 	return ret;
382 }
383 
384 /**
385  * It is to check the governor and then set the original governor back if
386  * needed by writing the sys file.
387  */
388 static int
power_set_governor_original(struct pstate_power_info * pi)389 power_set_governor_original(struct pstate_power_info *pi)
390 {
391 	FILE *f;
392 	int ret = -1;
393 	char buf[BUFSIZ];
394 	char fullpath[PATH_MAX];
395 	char *s;
396 	int val;
397 
398 	snprintf(fullpath, sizeof(fullpath), POWER_SYSFILE_GOVERNOR,
399 			pi->lcore_id);
400 	f = fopen(fullpath, "rw+");
401 	FOPEN_OR_ERR_RET(f, ret);
402 
403 	s = fgets(buf, sizeof(buf), f);
404 	FOPS_OR_NULL_GOTO(s, out);
405 
406 	/* Check if the governor to be set is the same as current */
407 	if (strncmp(buf, pi->governor_ori, sizeof(pi->governor_ori)) == 0) {
408 		ret = 0;
409 		POWER_DEBUG_TRACE("Power management governor of lcore %u "
410 				"has already been set to %s\n",
411 				pi->lcore_id, pi->governor_ori);
412 		goto out;
413 	}
414 
415 	/* Write back the original governor */
416 	val = fseek(f, 0, SEEK_SET);
417 	FOPS_OR_ERR_GOTO(val, out);
418 
419 	val = fputs(pi->governor_ori, f);
420 	FOPS_OR_ERR_GOTO(val, out);
421 
422 	ret = 0;
423 	RTE_LOG(INFO, POWER, "Power management governor of lcore %u "
424 			"has been set back to %s successfully\n",
425 			pi->lcore_id, pi->governor_ori);
426 out:
427 	fclose(f);
428 
429 	return ret;
430 }
431 
432 /**
433  * It is to get the available frequencies of the specific lcore by reading the
434  * sys file.
435  */
436 static int
power_get_available_freqs(struct pstate_power_info * pi)437 power_get_available_freqs(struct pstate_power_info *pi)
438 {
439 	FILE *f_min, *f_max;
440 	int ret = -1;
441 	char *p_min, *p_max;
442 	char buf_min[BUFSIZ];
443 	char buf_max[BUFSIZ];
444 	char fullpath_min[PATH_MAX];
445 	char fullpath_max[PATH_MAX];
446 	char *s_min, *s_max;
447 	uint32_t sys_min_freq = 0, sys_max_freq = 0, base_max_freq = 0;
448 	uint32_t i, num_freqs = 0;
449 
450 	snprintf(fullpath_max, sizeof(fullpath_max),
451 			POWER_SYSFILE_BASE_MAX_FREQ,
452 			pi->lcore_id);
453 	snprintf(fullpath_min, sizeof(fullpath_min),
454 			POWER_SYSFILE_BASE_MIN_FREQ,
455 			pi->lcore_id);
456 
457 	f_min = fopen(fullpath_min, "r");
458 	FOPEN_OR_ERR_RET(f_min, ret);
459 
460 	f_max = fopen(fullpath_max, "r");
461 	if (f_max == NULL)
462 		fclose(f_min);
463 
464 	FOPEN_OR_ERR_RET(f_max, ret);
465 
466 	s_min = fgets(buf_min, sizeof(buf_min), f_min);
467 	FOPS_OR_NULL_GOTO(s_min, out);
468 
469 	s_max = fgets(buf_max, sizeof(buf_max), f_max);
470 	FOPS_OR_NULL_GOTO(s_max, out);
471 
472 
473 	/* Strip the line break if there is */
474 	p_min = strchr(buf_min, '\n');
475 	if (p_min != NULL)
476 		*p_min = 0;
477 
478 	p_max = strchr(buf_max, '\n');
479 	if (p_max != NULL)
480 		*p_max = 0;
481 
482 	sys_min_freq = strtoul(buf_min, &p_min, POWER_CONVERT_TO_DECIMAL);
483 	sys_max_freq = strtoul(buf_max, &p_max, POWER_CONVERT_TO_DECIMAL);
484 
485 	if (sys_max_freq < sys_min_freq)
486 		goto out;
487 
488 	pi->sys_max_freq = sys_max_freq;
489 
490 	if (pi->priority_core == 1)
491 		base_max_freq = pi->core_base_freq;
492 	else
493 		base_max_freq = pi->non_turbo_max_ratio * BUS_FREQ;
494 
495 	POWER_DEBUG_TRACE("sys min %u, sys max %u, base_max %u\n",
496 			sys_min_freq,
497 			sys_max_freq,
498 			base_max_freq);
499 
500 	if (base_max_freq < sys_max_freq)
501 		pi->turbo_available = 1;
502 	else
503 		pi->turbo_available = 0;
504 
505 	/* If turbo is available then there is one extra freq bucket
506 	 * to store the sys max freq which value is base_max +1
507 	 */
508 	num_freqs = (base_max_freq - sys_min_freq) / BUS_FREQ + 1 +
509 		pi->turbo_available;
510 
511 	/* Generate the freq bucket array.
512 	 * If turbo is available the freq bucket[0] value is base_max +1
513 	 * the bucket[1] is base_max, bucket[2] is base_max - BUS_FREQ
514 	 * and so on.
515 	 * If turbo is not available bucket[0] is base_max and so on
516 	 */
517 	for (i = 0, pi->nb_freqs = 0; i < num_freqs; i++) {
518 		if ((i == 0) && pi->turbo_available)
519 			pi->freqs[pi->nb_freqs++] = base_max_freq + 1;
520 		else
521 			pi->freqs[pi->nb_freqs++] =
522 			base_max_freq - (i - pi->turbo_available) * BUS_FREQ;
523 	}
524 
525 	ret = 0;
526 
527 	POWER_DEBUG_TRACE("%d frequency(s) of lcore %u are available\n",
528 			num_freqs, pi->lcore_id);
529 
530 out:
531 	fclose(f_min);
532 	fclose(f_max);
533 
534 	return ret;
535 }
536 
537 static int
power_get_cur_idx(struct pstate_power_info * pi)538 power_get_cur_idx(struct pstate_power_info *pi)
539 {
540 	FILE *f_cur;
541 	int ret = -1;
542 	char *p_cur;
543 	char buf_cur[BUFSIZ];
544 	char fullpath_cur[PATH_MAX];
545 	char *s_cur;
546 	uint32_t sys_cur_freq = 0;
547 	unsigned int i;
548 
549 	snprintf(fullpath_cur, sizeof(fullpath_cur),
550 			POWER_SYSFILE_CUR_FREQ,
551 			pi->lcore_id);
552 	f_cur = fopen(fullpath_cur, "r");
553 	FOPEN_OR_ERR_RET(f_cur, ret);
554 
555 	/* initialize the cur_idx to matching current frequency freq index */
556 	s_cur = fgets(buf_cur, sizeof(buf_cur), f_cur);
557 	FOPS_OR_NULL_GOTO(s_cur, fail);
558 
559 	p_cur = strchr(buf_cur, '\n');
560 	if (p_cur != NULL)
561 		*p_cur = 0;
562 	sys_cur_freq = strtoul(buf_cur, &p_cur, POWER_CONVERT_TO_DECIMAL);
563 
564 	/* convert the frequency to nearest 100000 value
565 	 * Ex: if sys_cur_freq=1396789 then freq_conv=1400000
566 	 * Ex: if sys_cur_freq=800030 then freq_conv=800000
567 	 * Ex: if sys_cur_freq=800030 then freq_conv=800000
568 	 */
569 	unsigned int freq_conv = 0;
570 	freq_conv = (sys_cur_freq + FREQ_ROUNDING_DELTA)
571 				/ ROUND_FREQ_TO_N_100000;
572 	freq_conv = freq_conv * ROUND_FREQ_TO_N_100000;
573 
574 	for (i = 0; i < pi->nb_freqs; i++) {
575 		if (freq_conv == pi->freqs[i]) {
576 			pi->curr_idx = i;
577 			break;
578 		}
579 	}
580 
581 	fclose(f_cur);
582 	return 0;
583 fail:
584 	fclose(f_cur);
585 	return ret;
586 }
587 
588 int
power_pstate_cpufreq_check_supported(void)589 power_pstate_cpufreq_check_supported(void)
590 {
591 	return cpufreq_check_scaling_driver(POWER_PSTATE_DRIVER);
592 }
593 
594 int
power_pstate_cpufreq_init(unsigned int lcore_id)595 power_pstate_cpufreq_init(unsigned int lcore_id)
596 {
597 	struct pstate_power_info *pi;
598 	uint32_t exp_state;
599 
600 	if (lcore_id >= RTE_MAX_LCORE) {
601 		RTE_LOG(ERR, POWER, "Lcore id %u can not exceed %u\n",
602 				lcore_id, RTE_MAX_LCORE - 1U);
603 		return -1;
604 	}
605 
606 	pi = &lcore_power_info[lcore_id];
607 	exp_state = POWER_IDLE;
608 	/* The power in use state works as a guard variable between
609 	 * the CPU frequency control initialization and exit process.
610 	 * The ACQUIRE memory ordering here pairs with the RELEASE
611 	 * ordering below as lock to make sure the frequency operations
612 	 * in the critical section are done under the correct state.
613 	 */
614 	if (!__atomic_compare_exchange_n(&(pi->state), &exp_state,
615 					POWER_ONGOING, 0,
616 					__ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
617 		RTE_LOG(INFO, POWER, "Power management of lcore %u is "
618 				"in use\n", lcore_id);
619 		return -1;
620 	}
621 
622 	pi->lcore_id = lcore_id;
623 	/* Check and set the governor */
624 	if (power_set_governor_performance(pi) < 0) {
625 		RTE_LOG(ERR, POWER, "Cannot set governor of lcore %u to "
626 				"performance\n", lcore_id);
627 		goto fail;
628 	}
629 	/* Init for setting lcore frequency */
630 	if (power_init_for_setting_freq(pi) < 0) {
631 		RTE_LOG(ERR, POWER, "Cannot init for setting frequency for "
632 				"lcore %u\n", lcore_id);
633 		goto fail;
634 	}
635 
636 	/* Get the available frequencies */
637 	if (power_get_available_freqs(pi) < 0) {
638 		RTE_LOG(ERR, POWER, "Cannot get available frequencies of "
639 				"lcore %u\n", lcore_id);
640 		goto fail;
641 	}
642 
643 	if (power_get_cur_idx(pi) < 0) {
644 		RTE_LOG(ERR, POWER, "Cannot get current frequency "
645 				"index of lcore %u\n", lcore_id);
646 		goto fail;
647 	}
648 
649 	/* Set freq to max by default */
650 	if (power_pstate_cpufreq_freq_max(lcore_id) < 0) {
651 		RTE_LOG(ERR, POWER, "Cannot set frequency of lcore %u "
652 				"to max\n", lcore_id);
653 		goto fail;
654 	}
655 
656 	RTE_LOG(INFO, POWER, "Initialized successfully for lcore %u "
657 			"power management\n", lcore_id);
658 	exp_state = POWER_ONGOING;
659 	__atomic_compare_exchange_n(&(pi->state), &exp_state, POWER_USED,
660 				    0, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
661 
662 	return 0;
663 
664 fail:
665 	exp_state = POWER_ONGOING;
666 	__atomic_compare_exchange_n(&(pi->state), &exp_state, POWER_UNKNOWN,
667 				    0, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
668 
669 	return -1;
670 }
671 
672 int
power_pstate_cpufreq_exit(unsigned int lcore_id)673 power_pstate_cpufreq_exit(unsigned int lcore_id)
674 {
675 	struct pstate_power_info *pi;
676 	uint32_t exp_state;
677 
678 	if (lcore_id >= RTE_MAX_LCORE) {
679 		RTE_LOG(ERR, POWER, "Lcore id %u can not exceeds %u\n",
680 				lcore_id, RTE_MAX_LCORE - 1U);
681 		return -1;
682 	}
683 	pi = &lcore_power_info[lcore_id];
684 
685 	exp_state = POWER_USED;
686 	/* The power in use state works as a guard variable between
687 	 * the CPU frequency control initialization and exit process.
688 	 * The ACQUIRE memory ordering here pairs with the RELEASE
689 	 * ordering below as lock to make sure the frequency operations
690 	 * in the critical section are under done the correct state.
691 	 */
692 	if (!__atomic_compare_exchange_n(&(pi->state), &exp_state,
693 					POWER_ONGOING, 0,
694 					__ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
695 		RTE_LOG(INFO, POWER, "Power management of lcore %u is "
696 				"not used\n", lcore_id);
697 		return -1;
698 	}
699 
700 	/* Close FD of setting freq */
701 	fclose(pi->f_cur_min);
702 	fclose(pi->f_cur_max);
703 	pi->f_cur_min = NULL;
704 	pi->f_cur_max = NULL;
705 
706 	/* Set the governor back to the original */
707 	if (power_set_governor_original(pi) < 0) {
708 		RTE_LOG(ERR, POWER, "Cannot set the governor of %u back "
709 				"to the original\n", lcore_id);
710 		goto fail;
711 	}
712 
713 	RTE_LOG(INFO, POWER, "Power management of lcore %u has exited from "
714 			"'performance' mode and been set back to the "
715 			"original\n", lcore_id);
716 	exp_state = POWER_ONGOING;
717 	__atomic_compare_exchange_n(&(pi->state), &exp_state, POWER_IDLE,
718 				    0, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
719 
720 	return 0;
721 
722 fail:
723 	exp_state = POWER_ONGOING;
724 	__atomic_compare_exchange_n(&(pi->state), &exp_state, POWER_UNKNOWN,
725 				    0, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
726 
727 	return -1;
728 }
729 
730 
731 uint32_t
power_pstate_cpufreq_freqs(unsigned int lcore_id,uint32_t * freqs,uint32_t num)732 power_pstate_cpufreq_freqs(unsigned int lcore_id, uint32_t *freqs, uint32_t num)
733 {
734 	struct pstate_power_info *pi;
735 
736 	if (lcore_id >= RTE_MAX_LCORE) {
737 		RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
738 		return 0;
739 	}
740 
741 	if (freqs == NULL) {
742 		RTE_LOG(ERR, POWER, "NULL buffer supplied\n");
743 		return 0;
744 	}
745 
746 	pi = &lcore_power_info[lcore_id];
747 	if (num < pi->nb_freqs) {
748 		RTE_LOG(ERR, POWER, "Buffer size is not enough\n");
749 		return 0;
750 	}
751 	rte_memcpy(freqs, pi->freqs, pi->nb_freqs * sizeof(uint32_t));
752 
753 	return pi->nb_freqs;
754 }
755 
756 uint32_t
power_pstate_cpufreq_get_freq(unsigned int lcore_id)757 power_pstate_cpufreq_get_freq(unsigned int lcore_id)
758 {
759 	if (lcore_id >= RTE_MAX_LCORE) {
760 		RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
761 		return RTE_POWER_INVALID_FREQ_INDEX;
762 	}
763 
764 	return lcore_power_info[lcore_id].curr_idx;
765 }
766 
767 
768 int
power_pstate_cpufreq_set_freq(unsigned int lcore_id,uint32_t index)769 power_pstate_cpufreq_set_freq(unsigned int lcore_id, uint32_t index)
770 {
771 	if (lcore_id >= RTE_MAX_LCORE) {
772 		RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
773 		return -1;
774 	}
775 
776 	return set_freq_internal(&(lcore_power_info[lcore_id]), index);
777 }
778 
779 int
power_pstate_cpufreq_freq_up(unsigned int lcore_id)780 power_pstate_cpufreq_freq_up(unsigned int lcore_id)
781 {
782 	struct pstate_power_info *pi;
783 
784 	if (lcore_id >= RTE_MAX_LCORE) {
785 		RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
786 		return -1;
787 	}
788 
789 	pi = &lcore_power_info[lcore_id];
790 	if (pi->curr_idx == 0 ||
791 	    (pi->curr_idx == 1 && pi->turbo_available && !pi->turbo_enable))
792 		return 0;
793 
794 	/* Frequencies in the array are from high to low. */
795 	return set_freq_internal(pi, pi->curr_idx - 1);
796 }
797 
798 int
power_pstate_cpufreq_freq_down(unsigned int lcore_id)799 power_pstate_cpufreq_freq_down(unsigned int lcore_id)
800 {
801 	struct pstate_power_info *pi;
802 
803 	if (lcore_id >= RTE_MAX_LCORE) {
804 		RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
805 		return -1;
806 	}
807 
808 	pi = &lcore_power_info[lcore_id];
809 	if (pi->curr_idx + 1 == pi->nb_freqs)
810 		return 0;
811 
812 	/* Frequencies in the array are from high to low. */
813 	return set_freq_internal(pi, pi->curr_idx + 1);
814 }
815 
816 int
power_pstate_cpufreq_freq_max(unsigned int lcore_id)817 power_pstate_cpufreq_freq_max(unsigned int lcore_id)
818 {
819 	if (lcore_id >= RTE_MAX_LCORE) {
820 		RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
821 		return -1;
822 	}
823 
824 	/* Frequencies in the array are from high to low. */
825 	if (lcore_power_info[lcore_id].turbo_available) {
826 		if (lcore_power_info[lcore_id].turbo_enable)
827 			/* Set to Turbo */
828 			return set_freq_internal(
829 					&lcore_power_info[lcore_id], 0);
830 		else
831 			/* Set to max non-turbo */
832 			return set_freq_internal(
833 					&lcore_power_info[lcore_id], 1);
834 	} else
835 		return set_freq_internal(&lcore_power_info[lcore_id], 0);
836 }
837 
838 
839 int
power_pstate_cpufreq_freq_min(unsigned int lcore_id)840 power_pstate_cpufreq_freq_min(unsigned int lcore_id)
841 {
842 	struct pstate_power_info *pi;
843 
844 	if (lcore_id >= RTE_MAX_LCORE) {
845 		RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
846 		return -1;
847 	}
848 
849 	pi = &lcore_power_info[lcore_id];
850 
851 	/* Frequencies in the array are from high to low. */
852 	return set_freq_internal(pi, pi->nb_freqs - 1);
853 }
854 
855 
856 int
power_pstate_turbo_status(unsigned int lcore_id)857 power_pstate_turbo_status(unsigned int lcore_id)
858 {
859 	struct pstate_power_info *pi;
860 
861 	if (lcore_id >= RTE_MAX_LCORE) {
862 		RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
863 		return -1;
864 	}
865 
866 	pi = &lcore_power_info[lcore_id];
867 
868 	return pi->turbo_enable;
869 }
870 
871 int
power_pstate_enable_turbo(unsigned int lcore_id)872 power_pstate_enable_turbo(unsigned int lcore_id)
873 {
874 	struct pstate_power_info *pi;
875 
876 	if (lcore_id >= RTE_MAX_LCORE) {
877 		RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
878 		return -1;
879 	}
880 
881 	pi = &lcore_power_info[lcore_id];
882 
883 	if (pi->turbo_available)
884 		pi->turbo_enable = 1;
885 	else {
886 		pi->turbo_enable = 0;
887 		RTE_LOG(ERR, POWER,
888 			"Failed to enable turbo on lcore %u\n",
889 			lcore_id);
890 			return -1;
891 	}
892 
893 	return 0;
894 }
895 
896 
897 int
power_pstate_disable_turbo(unsigned int lcore_id)898 power_pstate_disable_turbo(unsigned int lcore_id)
899 {
900 	struct pstate_power_info *pi;
901 
902 	if (lcore_id >= RTE_MAX_LCORE) {
903 		RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
904 		return -1;
905 	}
906 
907 	pi = &lcore_power_info[lcore_id];
908 
909 	pi->turbo_enable = 0;
910 
911 	if (pi->turbo_available && pi->curr_idx <= 1) {
912 		/* Try to set freq to max by default coming out of turbo */
913 		if (power_pstate_cpufreq_freq_max(lcore_id) < 0) {
914 			RTE_LOG(ERR, POWER,
915 				"Failed to set frequency of lcore %u to max\n",
916 				lcore_id);
917 			return -1;
918 		}
919 	}
920 
921 	return 0;
922 }
923 
924 
power_pstate_get_capabilities(unsigned int lcore_id,struct rte_power_core_capabilities * caps)925 int power_pstate_get_capabilities(unsigned int lcore_id,
926 		struct rte_power_core_capabilities *caps)
927 {
928 	struct pstate_power_info *pi;
929 
930 	if (lcore_id >= RTE_MAX_LCORE) {
931 		RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
932 		return -1;
933 	}
934 	if (caps == NULL) {
935 		RTE_LOG(ERR, POWER, "Invalid argument\n");
936 		return -1;
937 	}
938 
939 	pi = &lcore_power_info[lcore_id];
940 	caps->capabilities = 0;
941 	caps->turbo = !!(pi->turbo_available);
942 	caps->priority = pi->priority_core;
943 
944 	return 0;
945 }
946