1*22ce4affSfengbojiang /*
2*22ce4affSfengbojiang * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
3*22ce4affSfengbojiang * All rights reserved.
4*22ce4affSfengbojiang *
5*22ce4affSfengbojiang * This source code is licensed under both the BSD-style license (found in the
6*22ce4affSfengbojiang * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7*22ce4affSfengbojiang * in the COPYING file in the root directory of this source tree).
8*22ce4affSfengbojiang * You may select, at your option, one of the above-listed licenses.
9*22ce4affSfengbojiang */
10*22ce4affSfengbojiang
11*22ce4affSfengbojiang
12*22ce4affSfengbojiang
13*22ce4affSfengbojiang /* *************************************
14*22ce4affSfengbojiang * Includes
15*22ce4affSfengbojiang ***************************************/
16*22ce4affSfengbojiang #include <stdlib.h> /* malloc, free */
17*22ce4affSfengbojiang #include <string.h> /* memset */
18*22ce4affSfengbojiang #include <assert.h> /* assert */
19*22ce4affSfengbojiang
20*22ce4affSfengbojiang #include "timefn.h" /* UTIL_time_t, UTIL_getTime */
21*22ce4affSfengbojiang #include "benchfn.h"
22*22ce4affSfengbojiang
23*22ce4affSfengbojiang
24*22ce4affSfengbojiang /* *************************************
25*22ce4affSfengbojiang * Constants
26*22ce4affSfengbojiang ***************************************/
/* Duration of one second, expressed in the two time units used by this file. */
#define TIMELOOP_MICROSEC   SEC_TO_MICRO       /* 1 second, in microseconds */
#define TIMELOOP_NANOSEC    (1000000000ULL)    /* 1 second, in nanoseconds */

/* Postfix size multipliers : written after a count, as in `4 KB` => 4 *(1 << 10).
 * KB and MB multiply by an int, GB multiplies by an unsigned. */
#define KB   *(1 << 10)
#define MB   *(1 << 20)
#define GB   *(1U << 30)
33*22ce4affSfengbojiang
34*22ce4affSfengbojiang
35*22ce4affSfengbojiang /* *************************************
36*22ce4affSfengbojiang * Debug errors
37*22ce4affSfengbojiang ***************************************/
#if defined(DEBUG) && (DEBUG >= 1)
#  include <stdio.h>   /* fprintf */
/* DISPLAY() : printf-style output sent to stderr */
#  define DISPLAY(...)       fprintf(stderr, __VA_ARGS__)
/* DEBUGOUTPUT() : trace emitted only in debug builds.
 * Wrapped in do { } while (0) so that it expands to exactly one statement :
 * a bare `{ ... }` followed by the caller's `;` would break
 * `if (x) DEBUGOUTPUT(...); else ...`. */
#  define DEBUGOUTPUT(...)   do { if (DEBUG) DISPLAY(__VA_ARGS__); } while (0)
#else
/* DEBUG unset or < 1 : traces are compiled out entirely */
#  define DEBUGOUTPUT(...)
#endif
45*22ce4affSfengbojiang
46*22ce4affSfengbojiang
/* RETURN_QUIET_ERROR() :
 * Makes the enclosing function return `retValue`.
 * The location and the printf-style message are only traced through
 * DEBUGOUTPUT(), so nothing is displayed when debug traces are disabled.
 * Wrapped in do { } while (0) so that the macro behaves as a single statement
 * and can be used safely as the body of an unbraced if / else. */
#define RETURN_QUIET_ERROR(retValue, ...) do {        \
    DEBUGOUTPUT("%s: %i: \n", __FILE__, __LINE__);    \
    DEBUGOUTPUT("Error : ");                          \
    DEBUGOUTPUT(__VA_ARGS__);                         \
    DEBUGOUTPUT(" \n");                               \
    return retValue;                                  \
} while (0)
55*22ce4affSfengbojiang
/* CONTROL() :
 * Aborts execution (abort()) when condition `c` is false.
 * The failed condition is traced through DEBUGOUTPUT() first.
 * Unlike assert(), the check itself does not depend on NDEBUG.
 * Wrapped in do { } while (0) so that `if (x) CONTROL(c); else ...` parses as intended. */
#define CONTROL(c)  do { if (!(c)) { DEBUGOUTPUT("error: %s \n", #c); abort(); } } while (0)
58*22ce4affSfengbojiang
59*22ce4affSfengbojiang
60*22ce4affSfengbojiang /* *************************************
61*22ce4affSfengbojiang * Benchmarking an arbitrary function
62*22ce4affSfengbojiang ***************************************/
63*22ce4affSfengbojiang
/* BMK_isSuccessful_runOutcome() :
 * @return : 1 when `outcome` carries no error tag, 0 otherwise. */
int BMK_isSuccessful_runOutcome(BMK_runOutcome_t outcome)
{
    return (outcome.error_tag_never_ever_use_directly != 0) ? 0 : 1;
}
68*22ce4affSfengbojiang
69*22ce4affSfengbojiang /* warning : this function will stop program execution if outcome is invalid !
70*22ce4affSfengbojiang * check outcome validity first, using BMK_isValid_runResult() */
BMK_extract_runTime(BMK_runOutcome_t outcome)71*22ce4affSfengbojiang BMK_runTime_t BMK_extract_runTime(BMK_runOutcome_t outcome)
72*22ce4affSfengbojiang {
73*22ce4affSfengbojiang CONTROL(outcome.error_tag_never_ever_use_directly == 0);
74*22ce4affSfengbojiang return outcome.internal_never_ever_use_directly;
75*22ce4affSfengbojiang }
76*22ce4affSfengbojiang
/* BMK_extract_errorResult() :
 * @return : the error value stored in `outcome`.
 * warning : aborts the program (through CONTROL()) when `outcome` is NOT tagged as an error. */
size_t BMK_extract_errorResult(BMK_runOutcome_t outcome)
{
    CONTROL(outcome.error_tag_never_ever_use_directly != 0);
    {   size_t const errorValue = outcome.error_result_never_ever_use_directly;
        return errorValue;
    }
}
82*22ce4affSfengbojiang
BMK_runOutcome_error(size_t errorResult)83*22ce4affSfengbojiang static BMK_runOutcome_t BMK_runOutcome_error(size_t errorResult)
84*22ce4affSfengbojiang {
85*22ce4affSfengbojiang BMK_runOutcome_t b;
86*22ce4affSfengbojiang memset(&b, 0, sizeof(b));
87*22ce4affSfengbojiang b.error_tag_never_ever_use_directly = 1;
88*22ce4affSfengbojiang b.error_result_never_ever_use_directly = errorResult;
89*22ce4affSfengbojiang return b;
90*22ce4affSfengbojiang }
91*22ce4affSfengbojiang
BMK_setValid_runTime(BMK_runTime_t runTime)92*22ce4affSfengbojiang static BMK_runOutcome_t BMK_setValid_runTime(BMK_runTime_t runTime)
93*22ce4affSfengbojiang {
94*22ce4affSfengbojiang BMK_runOutcome_t outcome;
95*22ce4affSfengbojiang outcome.error_tag_never_ever_use_directly = 0;
96*22ce4affSfengbojiang outcome.internal_never_ever_use_directly = runTime;
97*22ce4affSfengbojiang return outcome;
98*22ce4affSfengbojiang }
99*22ce4affSfengbojiang
100*22ce4affSfengbojiang
101*22ce4affSfengbojiang /* initFn will be measured once, benchFn will be measured `nbLoops` times */
102*22ce4affSfengbojiang /* initFn is optional, provide NULL if none */
103*22ce4affSfengbojiang /* benchFn must return a size_t value that errorFn can interpret */
104*22ce4affSfengbojiang /* takes # of blocks and list of size & stuff for each. */
105*22ce4affSfengbojiang /* can report result of benchFn for each block into blockResult. */
106*22ce4affSfengbojiang /* blockResult is optional, provide NULL if this information is not required */
107*22ce4affSfengbojiang /* note : time per loop can be reported as zero if run time < timer resolution */
BMK_benchFunction(BMK_benchParams_t p,unsigned nbLoops)108*22ce4affSfengbojiang BMK_runOutcome_t BMK_benchFunction(BMK_benchParams_t p,
109*22ce4affSfengbojiang unsigned nbLoops)
110*22ce4affSfengbojiang {
111*22ce4affSfengbojiang size_t dstSize = 0;
112*22ce4affSfengbojiang nbLoops += !nbLoops; /* minimum nbLoops is 1 */
113*22ce4affSfengbojiang
114*22ce4affSfengbojiang /* init */
115*22ce4affSfengbojiang { size_t i;
116*22ce4affSfengbojiang for(i = 0; i < p.blockCount; i++) {
117*22ce4affSfengbojiang memset(p.dstBuffers[i], 0xE5, p.dstCapacities[i]); /* warm up and erase result buffer */
118*22ce4affSfengbojiang } }
119*22ce4affSfengbojiang
120*22ce4affSfengbojiang /* benchmark */
121*22ce4affSfengbojiang { UTIL_time_t const clockStart = UTIL_getTime();
122*22ce4affSfengbojiang unsigned loopNb, blockNb;
123*22ce4affSfengbojiang if (p.initFn != NULL) p.initFn(p.initPayload);
124*22ce4affSfengbojiang for (loopNb = 0; loopNb < nbLoops; loopNb++) {
125*22ce4affSfengbojiang for (blockNb = 0; blockNb < p.blockCount; blockNb++) {
126*22ce4affSfengbojiang size_t const res = p.benchFn(p.srcBuffers[blockNb], p.srcSizes[blockNb],
127*22ce4affSfengbojiang p.dstBuffers[blockNb], p.dstCapacities[blockNb],
128*22ce4affSfengbojiang p.benchPayload);
129*22ce4affSfengbojiang if (loopNb == 0) {
130*22ce4affSfengbojiang if (p.blockResults != NULL) p.blockResults[blockNb] = res;
131*22ce4affSfengbojiang if ((p.errorFn != NULL) && (p.errorFn(res))) {
132*22ce4affSfengbojiang RETURN_QUIET_ERROR(BMK_runOutcome_error(res),
133*22ce4affSfengbojiang "Function benchmark failed on block %u (of size %u) with error %i",
134*22ce4affSfengbojiang blockNb, (unsigned)p.srcSizes[blockNb], (int)res);
135*22ce4affSfengbojiang }
136*22ce4affSfengbojiang dstSize += res;
137*22ce4affSfengbojiang } }
138*22ce4affSfengbojiang } /* for (loopNb = 0; loopNb < nbLoops; loopNb++) */
139*22ce4affSfengbojiang
140*22ce4affSfengbojiang { PTime const totalTime = UTIL_clockSpanNano(clockStart);
141*22ce4affSfengbojiang BMK_runTime_t rt;
142*22ce4affSfengbojiang rt.nanoSecPerRun = (double)totalTime / nbLoops;
143*22ce4affSfengbojiang rt.sumOfReturn = dstSize;
144*22ce4affSfengbojiang return BMK_setValid_runTime(rt);
145*22ce4affSfengbojiang } }
146*22ce4affSfengbojiang }
147*22ce4affSfengbojiang
148*22ce4affSfengbojiang
149*22ce4affSfengbojiang /* ==== Benchmarking any function, providing intermediate results ==== */
150*22ce4affSfengbojiang
151*22ce4affSfengbojiang struct BMK_timedFnState_s {
152*22ce4affSfengbojiang PTime timeSpent_ns;
153*22ce4affSfengbojiang PTime timeBudget_ns;
154*22ce4affSfengbojiang PTime runBudget_ns;
155*22ce4affSfengbojiang BMK_runTime_t fastestRun;
156*22ce4affSfengbojiang unsigned nbLoops;
157*22ce4affSfengbojiang UTIL_time_t coolTime;
158*22ce4affSfengbojiang }; /* typedef'd to BMK_timedFnState_t within bench.h */
159*22ce4affSfengbojiang
BMK_createTimedFnState(unsigned total_ms,unsigned run_ms)160*22ce4affSfengbojiang BMK_timedFnState_t* BMK_createTimedFnState(unsigned total_ms, unsigned run_ms)
161*22ce4affSfengbojiang {
162*22ce4affSfengbojiang BMK_timedFnState_t* const r = (BMK_timedFnState_t*)malloc(sizeof(*r));
163*22ce4affSfengbojiang if (r == NULL) return NULL; /* malloc() error */
164*22ce4affSfengbojiang BMK_resetTimedFnState(r, total_ms, run_ms);
165*22ce4affSfengbojiang return r;
166*22ce4affSfengbojiang }
167*22ce4affSfengbojiang
/* BMK_freeTimedFnState() :
 * Releases `state` with free() ; a NULL pointer is accepted, since free(NULL) is a no-op. */
void BMK_freeTimedFnState(BMK_timedFnState_t* state)
{
    free(state);
}
169*22ce4affSfengbojiang
170*22ce4affSfengbojiang BMK_timedFnState_t*
BMK_initStatic_timedFnState(void * buffer,size_t size,unsigned total_ms,unsigned run_ms)171*22ce4affSfengbojiang BMK_initStatic_timedFnState(void* buffer, size_t size, unsigned total_ms, unsigned run_ms)
172*22ce4affSfengbojiang {
173*22ce4affSfengbojiang typedef char check_size[ 2 * (sizeof(BMK_timedFnState_shell) >= sizeof(struct BMK_timedFnState_s)) - 1]; /* static assert : a compilation failure indicates that BMK_timedFnState_shell is not large enough */
174*22ce4affSfengbojiang typedef struct { check_size c; BMK_timedFnState_t tfs; } tfs_align; /* force tfs to be aligned at its next best position */
175*22ce4affSfengbojiang size_t const tfs_alignment = offsetof(tfs_align, tfs); /* provides the minimal alignment restriction for BMK_timedFnState_t */
176*22ce4affSfengbojiang BMK_timedFnState_t* const r = (BMK_timedFnState_t*)buffer;
177*22ce4affSfengbojiang if (buffer == NULL) return NULL;
178*22ce4affSfengbojiang if (size < sizeof(struct BMK_timedFnState_s)) return NULL;
179*22ce4affSfengbojiang if ((size_t)buffer % tfs_alignment) return NULL; /* buffer must be properly aligned */
180*22ce4affSfengbojiang BMK_resetTimedFnState(r, total_ms, run_ms);
181*22ce4affSfengbojiang return r;
182*22ce4affSfengbojiang }
183*22ce4affSfengbojiang
/* BMK_resetTimedFnState() :
 * (Re)initializes `timedFnState` for a new benchmark session.
 *  - total_ms : total time budget ; 0 is raised to 1.
 *  - run_ms   : target duration of one measurement ; 0 is raised to 1,
 *               and the value is capped at total_ms.
 * Time spent goes back to 0, nbLoops to 1, and the fastest run to a "worse than anything" sentinel. */
void BMK_resetTimedFnState(BMK_timedFnState_t* timedFnState, unsigned total_ms, unsigned run_ms)
{
    unsigned const budget_ms = total_ms ? total_ms : 1;
    unsigned perRun_ms = run_ms ? run_ms : 1;
    if (perRun_ms > budget_ms) perRun_ms = budget_ms;

    /* budgets : milliseconds converted to nanoseconds */
    timedFnState->timeBudget_ns = (PTime)budget_ms * TIMELOOP_NANOSEC / 1000;
    timedFnState->runBudget_ns = (PTime)perRun_ms * TIMELOOP_NANOSEC / 1000;
    timedFnState->timeSpent_ns = 0;

    /* sentinel : hopefully large enough, must be larger than any potential measurement */
    timedFnState->fastestRun.nanoSecPerRun = (double)TIMELOOP_NANOSEC * 2000000000;
    timedFnState->fastestRun.sumOfReturn = (size_t)(-1LL);

    timedFnState->nbLoops = 1;
    timedFnState->coolTime = UTIL_getTime();
}
197*22ce4affSfengbojiang
198*22ce4affSfengbojiang /* Tells if nb of seconds set in timedFnState for all runs is spent.
199*22ce4affSfengbojiang * note : this function will return 1 if BMK_benchFunctionTimed() has actually errored. */
BMK_isCompleted_TimedFn(const BMK_timedFnState_t * timedFnState)200*22ce4affSfengbojiang int BMK_isCompleted_TimedFn(const BMK_timedFnState_t* timedFnState)
201*22ce4affSfengbojiang {
202*22ce4affSfengbojiang return (timedFnState->timeSpent_ns >= timedFnState->timeBudget_ns);
203*22ce4affSfengbojiang }
204*22ce4affSfengbojiang
205*22ce4affSfengbojiang
/* MIN() : smaller of two values.
 * Arguments can be evaluated twice : do not pass expressions with side effects. */
#undef MIN
#define MIN(a,b)   ( ((a) < (b)) ? (a) : (b) )

/* Half a second, in nanoseconds */
#define MINUSABLETIME   (TIMELOOP_NANOSEC / 2)
210*22ce4affSfengbojiang
BMK_benchTimedFn(BMK_timedFnState_t * cont,BMK_benchParams_t p)211*22ce4affSfengbojiang BMK_runOutcome_t BMK_benchTimedFn(BMK_timedFnState_t* cont,
212*22ce4affSfengbojiang BMK_benchParams_t p)
213*22ce4affSfengbojiang {
214*22ce4affSfengbojiang PTime const runBudget_ns = cont->runBudget_ns;
215*22ce4affSfengbojiang PTime const runTimeMin_ns = runBudget_ns / 2;
216*22ce4affSfengbojiang int completed = 0;
217*22ce4affSfengbojiang BMK_runTime_t bestRunTime = cont->fastestRun;
218*22ce4affSfengbojiang
219*22ce4affSfengbojiang while (!completed) {
220*22ce4affSfengbojiang BMK_runOutcome_t const runResult = BMK_benchFunction(p, cont->nbLoops);
221*22ce4affSfengbojiang
222*22ce4affSfengbojiang if(!BMK_isSuccessful_runOutcome(runResult)) { /* error : move out */
223*22ce4affSfengbojiang return runResult;
224*22ce4affSfengbojiang }
225*22ce4affSfengbojiang
226*22ce4affSfengbojiang { BMK_runTime_t const newRunTime = BMK_extract_runTime(runResult);
227*22ce4affSfengbojiang double const loopDuration_ns = newRunTime.nanoSecPerRun * cont->nbLoops;
228*22ce4affSfengbojiang
229*22ce4affSfengbojiang cont->timeSpent_ns += (unsigned long long)loopDuration_ns;
230*22ce4affSfengbojiang
231*22ce4affSfengbojiang /* estimate nbLoops for next run to last approximately 1 second */
232*22ce4affSfengbojiang if (loopDuration_ns > (runBudget_ns / 50)) {
233*22ce4affSfengbojiang double const fastestRun_ns = MIN(bestRunTime.nanoSecPerRun, newRunTime.nanoSecPerRun);
234*22ce4affSfengbojiang cont->nbLoops = (unsigned)(runBudget_ns / fastestRun_ns) + 1;
235*22ce4affSfengbojiang } else {
236*22ce4affSfengbojiang /* previous run was too short : blindly increase workload by x multiplier */
237*22ce4affSfengbojiang const unsigned multiplier = 10;
238*22ce4affSfengbojiang assert(cont->nbLoops < ((unsigned)-1) / multiplier); /* avoid overflow */
239*22ce4affSfengbojiang cont->nbLoops *= multiplier;
240*22ce4affSfengbojiang }
241*22ce4affSfengbojiang
242*22ce4affSfengbojiang if(loopDuration_ns < runTimeMin_ns) {
243*22ce4affSfengbojiang /* don't report results for which benchmark run time was too small : increased risks of rounding errors */
244*22ce4affSfengbojiang assert(completed == 0);
245*22ce4affSfengbojiang continue;
246*22ce4affSfengbojiang } else {
247*22ce4affSfengbojiang if(newRunTime.nanoSecPerRun < bestRunTime.nanoSecPerRun) {
248*22ce4affSfengbojiang bestRunTime = newRunTime;
249*22ce4affSfengbojiang }
250*22ce4affSfengbojiang completed = 1;
251*22ce4affSfengbojiang }
252*22ce4affSfengbojiang }
253*22ce4affSfengbojiang } /* while (!completed) */
254*22ce4affSfengbojiang
255*22ce4affSfengbojiang return BMK_setValid_runTime(bestRunTime);
256*22ce4affSfengbojiang }
257