/* The latency monitor makes it easy to observe the sources of latency
 * in a Redis instance using the LATENCY command. Different latency
 * sources are monitored, like disk I/O, execution of commands, the fork
 * system call, and so forth.
 *
 * ----------------------------------------------------------------------------
 *
 * Copyright (c) 2014, Salvatore Sanfilippo <antirez at gmail dot com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "server.h"

/* Dictionary type for latency events. */
int dictStringKeyCompare(void *privdata, const void *key1, const void *key2) {
    UNUSED(privdata);
    return strcmp(key1,key2) == 0;
}

uint64_t dictStringHash(const void *key) {
    return dictGenHashFunction(key, strlen(key));
}

void dictVanillaFree(void *privdata, void *val);

dictType latencyTimeSeriesDictType = {
    dictStringHash,             /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictStringKeyCompare,       /* key compare */
    dictVanillaFree,            /* key destructor */
    dictVanillaFree             /* val destructor */
};

/* ------------------------- Utility functions ------------------------------ */

#ifdef __linux__
/* Returns 1 if Transparent Huge Pages support is enabled in the kernel.
 * Otherwise (or if we are unable to check) 0 is returned. */
int THPIsEnabled(void) {
    char buf[1024];

    FILE *fp = fopen("/sys/kernel/mm/transparent_hugepage/enabled","r");
    if (!fp) return 0;
    if (fgets(buf,sizeof(buf),fp) == NULL) {
        fclose(fp);
        return 0;
    }
    fclose(fp);
    return (strstr(buf,"[never]") == NULL) ? 1 : 0;
}
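
/* For reference, the sysfs file read above typically contains a single line
 * like:
 *
 *     always madvise [never]
 *
 * with the currently active THP mode between brackets, which is why the check
 * above only considers THP disabled when "[never]" is the selected mode. */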
#endif

/* Report the amount of AnonHugePages in smaps, in bytes. If the return
 * value of the function is non-zero, the process is being targeted by
 * THP support, and is likely to have memory usage / latency issues. */
int THPGetAnonHugePagesSize(void) {
    return zmalloc_get_smap_bytes_by_field("AnonHugePages:",-1);
}
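
/* Illustrative note: the field parsed above appears in /proc/<pid>/smaps as
 * lines like
 *
 *     AnonHugePages:      2048 kB
 *
 * and zmalloc_get_smap_bytes_by_field() sums these values (reported in kB,
 * converted to bytes) across the mappings of the process; passing -1 as the
 * pid means the current process. */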

/* ---------------------------- Latency API --------------------------------- */

/* Latency monitor initialization. We just need to create the dictionary
 * of time series: each time series is created on demand in order to avoid
 * having a fixed list to maintain. */
void latencyMonitorInit(void) {
    server.latency_events = dictCreate(&latencyTimeSeriesDictType,NULL);
}

/* Add the specified sample to the specified time series "event".
 * This function is usually called via latencyAddSampleIfNeeded(), a macro
 * that only adds the sample if the latency is higher than
 * server.latency_monitor_threshold. */
void latencyAddSample(char *event, mstime_t latency) {
    struct latencyTimeSeries *ts = dictFetchValue(server.latency_events,event);
    time_t now = time(NULL);
    int prev;

    /* Create the time series if it does not exist. */
    if (ts == NULL) {
        ts = zmalloc(sizeof(*ts));
        ts->idx = 0;
        ts->max = 0;
        memset(ts->samples,0,sizeof(ts->samples));
        dictAdd(server.latency_events,zstrdup(event),ts);
    }

    if (latency > ts->max) ts->max = latency;

    /* If the previous sample is in the same second, we update our old sample
     * if this latency is greater than the old one, or just return. */
    prev = (ts->idx + LATENCY_TS_LEN - 1) % LATENCY_TS_LEN;
    if (ts->samples[prev].time == now) {
        if (latency > ts->samples[prev].latency)
            ts->samples[prev].latency = latency;
        return;
    }

    ts->samples[ts->idx].time = time(NULL);
    ts->samples[ts->idx].latency = latency;

    ts->idx++;
    if (ts->idx == LATENCY_TS_LEN) ts->idx = 0;
}
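
/* Usage sketch (illustrative, not a function used by the code): callers
 * normally time a code path with the latencyStartMonitor() and
 * latencyEndMonitor() macros defined in latency.h, and then call
 * latencyAddSampleIfNeeded(), which forwards to latencyAddSample() only when
 * the measured latency reaches server.latency_monitor_threshold (and the
 * threshold is non-zero):
 *
 *     mstime_t latency;
 *     latencyStartMonitor(latency);
 *     slowOperation();                     // hypothetical slow code path
 *     latencyEndMonitor(latency);
 *     latencyAddSampleIfNeeded("some-event",latency);
 */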

/* Reset the data for the specified event, or the data for all the events if
 * 'event_to_reset' is NULL.
 *
 * Note: this is O(N) even when event_to_reset is not NULL because it makes
 * the code simpler and we have a small fixed max number of events. */
int latencyResetEvent(char *event_to_reset) {
    dictIterator *di;
    dictEntry *de;
    int resets = 0;

    di = dictGetSafeIterator(server.latency_events);
    while((de = dictNext(di)) != NULL) {
        char *event = dictGetKey(de);

        if (event_to_reset == NULL || strcasecmp(event,event_to_reset) == 0) {
            dictDelete(server.latency_events, event);
            resets++;
        }
    }
    dictReleaseIterator(di);
    return resets;
}

/* ------------------------ Latency reporting (doctor) ---------------------- */

/* Analyze the samples available for a given event and return a structure
 * populated with different metrics: average, MAD, min, max, and so forth.
 * Check the latency.h definition of struct latencyStats for more info.
 * If the specified event has no elements the structure is populated with
 * zero values. */
void analyzeLatencyForEvent(char *event, struct latencyStats *ls) {
    struct latencyTimeSeries *ts = dictFetchValue(server.latency_events,event);
    int j;
    uint64_t sum;

    ls->all_time_high = ts ? ts->max : 0;
    ls->avg = 0;
    ls->min = 0;
    ls->max = 0;
    ls->mad = 0;
    ls->samples = 0;
    ls->period = 0;
    if (!ts) return;

    /* First pass, populate everything but the MAD. */
    sum = 0;
    for (j = 0; j < LATENCY_TS_LEN; j++) {
        if (ts->samples[j].time == 0) continue;
        ls->samples++;
        if (ls->samples == 1) {
            ls->min = ls->max = ts->samples[j].latency;
        } else {
            if (ls->min > ts->samples[j].latency)
                ls->min = ts->samples[j].latency;
            if (ls->max < ts->samples[j].latency)
                ls->max = ts->samples[j].latency;
        }
        sum += ts->samples[j].latency;

        /* Track the oldest event time in ls->period. */
        if (ls->period == 0 || ts->samples[j].time < ls->period)
            ls->period = ts->samples[j].time;
    }

    /* So far avg is actually the sum of the latencies, and period is
     * the oldest event time. We need to make the first an average and
     * the second a range of seconds. */
    if (ls->samples) {
        ls->avg = sum / ls->samples;
        ls->period = time(NULL) - ls->period;
        if (ls->period == 0) ls->period = 1;
    }

    /* Second pass, compute MAD. */
    sum = 0;
    for (j = 0; j < LATENCY_TS_LEN; j++) {
        int64_t delta;

        if (ts->samples[j].time == 0) continue;
        delta = (int64_t)ls->avg - ts->samples[j].latency;
        if (delta < 0) delta = -delta;
        sum += delta;
    }
    if (ls->samples) ls->mad = sum / ls->samples;
}
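
/* Worked example of the statistics computed above: with three samples of
 * 10, 20 and 90 milliseconds, min = 10, max = 90 and avg = 120/3 = 40.
 * The absolute deviations from the average are 30, 20 and 50, so the MAD
 * (mean absolute deviation) is (30+20+50)/3 = 33 ms, since the integer
 * division truncates the fractional part. */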

/* Create a human readable report of latency events for this Redis instance. */
sds createLatencyReport(void) {
    sds report = sdsempty();
    int advise_better_vm = 0;           /* Better virtual machines. */
    int advise_slowlog_enabled = 0;     /* Enable slowlog. */
    int advise_slowlog_tuning = 0;      /* Reconfigure slowlog. */
    int advise_slowlog_inspect = 0;     /* Check your slowlog. */
    int advise_disk_contention = 0;     /* Try to lower disk contention. */
    int advise_scheduler = 0;           /* Intrinsic latency. */
    int advise_data_writeback = 0;      /* data=writeback. */
    int advise_no_appendfsync = 0;      /* don't fsync during rewrites. */
    int advise_local_disk = 0;          /* Avoid remote disks. */
    int advise_ssd = 0;                 /* Use an SSD drive. */
    int advise_write_load_info = 0;     /* Print info about AOF and write load. */
    int advise_hz = 0;                  /* Use higher HZ. */
    int advise_large_objects = 0;       /* Deletion of large objects. */
    int advise_mass_eviction = 0;       /* Avoid mass eviction of keys. */
    int advise_relax_fsync_policy = 0;  /* appendfsync always is slow. */
    int advise_disable_thp = 0;         /* AnonHugePages detected. */
    int advices = 0;

    /* Return ASAP if the latency engine is disabled and it looks like it
     * was never enabled so far. */
    if (dictSize(server.latency_events) == 0 &&
        server.latency_monitor_threshold == 0)
    {
        report = sdscat(report,"I'm sorry, Dave, I can't do that. Latency monitoring is disabled in this Redis instance. You may use \"CONFIG SET latency-monitor-threshold <milliseconds>\" in order to enable it. If we weren't on a deep space mission I'd suggest taking a look at http://redis.io/topics/latency-monitor.\n");
        return report;
    }

    /* Show the stats for all the events and add, for each event, some
     * event-related comment depending on the values. */
    dictIterator *di;
    dictEntry *de;
    int eventnum = 0;

    di = dictGetSafeIterator(server.latency_events);
    while((de = dictNext(di)) != NULL) {
        char *event = dictGetKey(de);
        struct latencyTimeSeries *ts = dictGetVal(de);
        struct latencyStats ls;

        if (ts == NULL) continue;
        eventnum++;
        if (eventnum == 1) {
            report = sdscat(report,"Dave, I have observed latency spikes in this Redis instance. You don't mind talking about it, do you Dave?\n\n");
        }
        analyzeLatencyForEvent(event,&ls);

        report = sdscatprintf(report,
            "%d. %s: %d latency spikes (average %lums, mean deviation %lums, period %.2f sec). Worst all time event %lums.",
            eventnum, event,
            ls.samples,
            (unsigned long) ls.avg,
            (unsigned long) ls.mad,
            (double) ls.period/ls.samples,
            (unsigned long) ts->max);

        /* Fork */
        if (!strcasecmp(event,"fork")) {
            char *fork_quality;
            if (server.stat_fork_rate < 10) {
                fork_quality = "terrible";
                advise_better_vm = 1;
                advices++;
            } else if (server.stat_fork_rate < 25) {
                fork_quality = "poor";
                advise_better_vm = 1;
                advices++;
            } else if (server.stat_fork_rate < 100) {
                fork_quality = "good";
            } else {
                fork_quality = "excellent";
            }
            report = sdscatprintf(report,
                " Fork rate is %.2f GB/sec (%s).", server.stat_fork_rate,
                fork_quality);
        }

        /* Potentially slow commands. */
        if (!strcasecmp(event,"command")) {
            if (server.slowlog_log_slower_than < 0) {
                advise_slowlog_enabled = 1;
                advices++;
            } else if (server.slowlog_log_slower_than/1000 >
                       server.latency_monitor_threshold)
            {
                advise_slowlog_tuning = 1;
                advices++;
            }
            advise_slowlog_inspect = 1;
            advise_large_objects = 1;
            advices += 2;
        }

        /* fast-command. */
        if (!strcasecmp(event,"fast-command")) {
            advise_scheduler = 1;
            advices++;
        }

        /* AOF and I/O. */
        if (!strcasecmp(event,"aof-write-pending-fsync")) {
            advise_local_disk = 1;
            advise_disk_contention = 1;
            advise_ssd = 1;
            advise_data_writeback = 1;
            advices += 4;
        }

        if (!strcasecmp(event,"aof-write-active-child")) {
            advise_no_appendfsync = 1;
            advise_data_writeback = 1;
            advise_ssd = 1;
            advices += 3;
        }

        if (!strcasecmp(event,"aof-write-alone")) {
            advise_local_disk = 1;
            advise_data_writeback = 1;
            advise_ssd = 1;
            advices += 3;
        }

        if (!strcasecmp(event,"aof-fsync-always")) {
            advise_relax_fsync_policy = 1;
            advices++;
        }

        if (!strcasecmp(event,"aof-fstat") ||
            !strcasecmp(event,"rdb-unlink-temp-file")) {
            advise_disk_contention = 1;
            advise_local_disk = 1;
            advices += 2;
        }

        if (!strcasecmp(event,"aof-rewrite-diff-write") ||
            !strcasecmp(event,"aof-rename")) {
            advise_write_load_info = 1;
            advise_data_writeback = 1;
            advise_ssd = 1;
            advise_local_disk = 1;
            advices += 4;
        }

        /* Expire cycle. */
        if (!strcasecmp(event,"expire-cycle")) {
            advise_hz = 1;
            advise_large_objects = 1;
            advices += 2;
        }

        /* Eviction cycle. */
        if (!strcasecmp(event,"eviction-del")) {
            advise_large_objects = 1;
            advices++;
        }

        if (!strcasecmp(event,"eviction-cycle")) {
            advise_mass_eviction = 1;
            advices++;
        }

        report = sdscatlen(report,"\n",1);
    }
    dictReleaseIterator(di);

    /* Add non-event-based advice. */
    if (THPGetAnonHugePagesSize() > 0) {
        advise_disable_thp = 1;
        advices++;
    }

    if (eventnum == 0 && advices == 0) {
        report = sdscat(report,"Dave, no latency spike was observed during the lifetime of this Redis instance, not in the slightest bit. I honestly think you ought to sit down calmly, take a stress pill, and think things over.\n");
    } else if (eventnum > 0 && advices == 0) {
        report = sdscat(report,"\nWhile there are latency events logged, I'm not able to suggest any easy fix. Please use the Redis community to get some help, providing this report in your help request.\n");
    } else {
        /* Add all the suggestions accumulated so far. */

        /* Better VM. */
        report = sdscat(report,"\nI have a few pieces of advice for you:\n\n");
        if (advise_better_vm) {
            report = sdscat(report,"- If you are using a virtual machine, consider upgrading to a faster one using a hypervisor that provides less latency during fork() calls. Xen is known to have poor fork() performance. Even in the context of the same VM provider, certain kinds of instances can execute fork faster than others.\n");
        }

        /* Slow log. */
        if (advise_slowlog_enabled) {
            report = sdscatprintf(report,"- There are latency issues with potentially slow commands you are using. Try to enable the Slow Log Redis feature using the command 'CONFIG SET slowlog-log-slower-than %llu'. If the Slow Log is disabled, Redis is not able to log slow command executions for you.\n", (unsigned long long)server.latency_monitor_threshold*1000);
        }

        if (advise_slowlog_tuning) {
            report = sdscatprintf(report,"- Your current Slow Log configuration only logs events that are slower than your configured latency monitor threshold. Please use 'CONFIG SET slowlog-log-slower-than %llu'.\n", (unsigned long long)server.latency_monitor_threshold*1000);
        }

        if (advise_slowlog_inspect) {
            report = sdscat(report,"- Check your Slow Log to understand which of the commands you are running are too slow to execute. Please check http://redis.io/commands/slowlog for more information.\n");
        }

        /* Intrinsic latency. */
        if (advise_scheduler) {
            report = sdscat(report,"- The system is slow to execute Redis code paths not containing system calls. This usually means the system does not provide Redis CPU time to run for long periods. You should try to:\n"
                " 1) Lower the system load.\n"
                " 2) Use a computer / VM just for Redis if you are running other software in the same system.\n"
                " 3) Check if you have a \"noisy neighbour\" problem.\n"
                " 4) Check with 'redis-cli --intrinsic-latency 100' what is the intrinsic latency in your system.\n"
                " 5) Check if the problem is allocator-related by recompiling Redis with MALLOC=libc, if you are using Jemalloc. However this may create fragmentation problems.\n");
        }

        /* AOF / Disk latency. */
        if (advise_local_disk) {
            report = sdscat(report,"- It is strongly advised to use local disks for persistence, especially if you are using AOF. Remote disks provided by platform-as-a-service providers are known to be slow.\n");
        }

        if (advise_ssd) {
            report = sdscat(report,"- SSD disks are able to reduce fsync latency, and total time needed for snapshotting and AOF log rewriting (resulting in smaller memory usage and smaller final AOF rewrite buffer flushes). With extremely high write load SSD disks can be a good option. However Redis should perform reasonably with high load using normal disks. Use this advice as a last resort.\n");
        }

        if (advise_data_writeback) {
            report = sdscat(report,"- Mounting ext3/4 filesystems with data=writeback can provide a performance boost compared to data=ordered, however this mode of operation provides fewer guarantees, and sometimes it can happen that after a hard crash the AOF file will have a half-written command at the end and will need to be repaired before Redis restarts.\n");
        }

        if (advise_disk_contention) {
            report = sdscat(report,"- Try to lower the disk contention. This is often caused by other disk-intensive processes running on the same computer (including other Redis instances).\n");
        }

        if (advise_no_appendfsync) {
            report = sdscat(report,"- Assuming this is viable in your environment from the point of view of data safety, you could try to enable the 'no-appendfsync-on-rewrite' option, so that fsync will not be performed while there is a child rewriting the AOF file or producing an RDB file (the moment when there is high disk contention).\n");
        }

        if (advise_relax_fsync_policy && server.aof_fsync == AOF_FSYNC_ALWAYS) {
            report = sdscat(report,"- Your fsync policy is set to 'always'. It is very hard to get good performance with such a setup; if possible try to relax the fsync policy to 'everysec'.\n");
        }

        if (advise_write_load_info) {
            report = sdscat(report,"- Latency during the AOF atomic rename operation, or when the final difference is flushed to the AOF file at the end of the rewrite, is sometimes caused by a very high write load that makes the AOF buffer grow very large. If possible try to send fewer commands to accomplish the same work, or use Lua scripts to group multiple operations into a single EVALSHA call.\n");
        }

        if (advise_hz && server.hz < 100) {
            report = sdscat(report,"- In order to make the Redis keys expiring process more incremental, try to set the 'hz' configuration parameter to 100 using 'CONFIG SET hz 100'.\n");
        }

        if (advise_large_objects) {
            report = sdscat(report,"- Deleting, expiring or evicting (because of maxmemory policy) large objects is a blocking operation. If you have very large objects that are often deleted, expired, or evicted, try to fragment those objects into multiple smaller objects.\n");
        }

        if (advise_mass_eviction) {
            report = sdscat(report,"- Sudden changes to the 'maxmemory' setting via 'CONFIG SET', or allocation of large objects via set or sorted set intersections, the STORE option of SORT, or Redis Cluster large key migrations (RESTORE command), may create sudden memory pressure forcing the server to block while trying to evict keys.\n");
        }

        if (advise_disable_thp) {
            report = sdscat(report,"- I detected a non-zero amount of anonymous huge pages used by your process. This creates very serious latency events in different conditions, especially when Redis is persisting on disk. To disable THP support use the command 'echo never > /sys/kernel/mm/transparent_hugepage/enabled', and make sure to also add it to /etc/rc.local so that the command will be executed again after a reboot. Note that even if you have already disabled THP, you still need to restart the Redis process to get rid of the huge pages already created.\n");
        }
    }

    return report;
}

/* ---------------------- Latency command implementation -------------------- */

/* latencyCommand() helper to produce a time-delay reply for all the samples
 * in memory for the specified time series. */
void latencyCommandReplyWithSamples(client *c, struct latencyTimeSeries *ts) {
    void *replylen = addDeferredMultiBulkLength(c);
    int samples = 0, j;

    for (j = 0; j < LATENCY_TS_LEN; j++) {
        int i = (ts->idx + j) % LATENCY_TS_LEN;

        if (ts->samples[i].time == 0) continue;
        addReplyMultiBulkLen(c,2);
        addReplyLongLong(c,ts->samples[i].time);
        addReplyLongLong(c,ts->samples[i].latency);
        samples++;
    }
    setDeferredMultiBulkLength(c,replylen,samples);
}
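
/* For illustration, a LATENCY HISTORY reply built by the helper above is an
 * array of [unix time, latency in ms] pairs, one per recorded second, e.g. as
 * seen from redis-cli (timestamps and values are made up):
 *
 *     127.0.0.1:6379> LATENCY HISTORY command
 *     1) 1) (integer) 1405067976
 *        2) (integer) 251
 *     2) 1) (integer) 1405068031
 *        2) (integer) 1001
 */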

/* latencyCommand() helper to produce the reply for the LATEST subcommand,
 * listing the last latency sample for every event type registered so far. */
void latencyCommandReplyWithLatestEvents(client *c) {
    dictIterator *di;
    dictEntry *de;

    addReplyMultiBulkLen(c,dictSize(server.latency_events));
    di = dictGetIterator(server.latency_events);
    while((de = dictNext(di)) != NULL) {
        char *event = dictGetKey(de);
        struct latencyTimeSeries *ts = dictGetVal(de);
        int last = (ts->idx + LATENCY_TS_LEN - 1) % LATENCY_TS_LEN;

        addReplyMultiBulkLen(c,4);
        addReplyBulkCString(c,event);
        addReplyLongLong(c,ts->samples[last].time);
        addReplyLongLong(c,ts->samples[last].latency);
        addReplyLongLong(c,ts->max);
    }
    dictReleaseIterator(di);
}
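
/* For illustration, LATENCY LATEST replies with a four element array per
 * event: name, unix time of the latest spike, its latency in milliseconds,
 * and the all-time maximum latency, e.g. (made up values):
 *
 *     127.0.0.1:6379> LATENCY LATEST
 *     1) 1) "command"
 *        2) (integer) 1405067976
 *        3) (integer) 251
 *        4) (integer) 1001
 */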

#define LATENCY_GRAPH_COLS 80
sds latencyCommandGenSparkeline(char *event, struct latencyTimeSeries *ts) {
    int j;
    struct sequence *seq = createSparklineSequence();
    sds graph = sdsempty();
    uint32_t min = 0, max = 0;

    for (j = 0; j < LATENCY_TS_LEN; j++) {
        int i = (ts->idx + j) % LATENCY_TS_LEN;
        int elapsed;
        char buf[64];

        if (ts->samples[i].time == 0) continue;
        /* Update min and max. */
        if (seq->length == 0) {
            min = max = ts->samples[i].latency;
        } else {
            if (ts->samples[i].latency > max) max = ts->samples[i].latency;
            if (ts->samples[i].latency < min) min = ts->samples[i].latency;
        }
        /* Use as label the number of seconds / minutes / hours / days
         * ago the event happened. */
        elapsed = time(NULL) - ts->samples[i].time;
        if (elapsed < 60)
            snprintf(buf,sizeof(buf),"%ds",elapsed);
        else if (elapsed < 3600)
            snprintf(buf,sizeof(buf),"%dm",elapsed/60);
        else if (elapsed < 3600*24)
            snprintf(buf,sizeof(buf),"%dh",elapsed/3600);
        else
            snprintf(buf,sizeof(buf),"%dd",elapsed/(3600*24));
        sparklineSequenceAddSample(seq,ts->samples[i].latency,buf);
    }

    graph = sdscatprintf(graph,
        "%s - high %lu ms, low %lu ms (all time high %lu ms)\n", event,
        (unsigned long) max, (unsigned long) min, (unsigned long) ts->max);
    for (j = 0; j < LATENCY_GRAPH_COLS; j++)
        graph = sdscatlen(graph,"-",1);
    graph = sdscatlen(graph,"\n",1);
    graph = sparklineRender(graph,seq,LATENCY_GRAPH_COLS,4,SPARKLINE_FILL);
    freeSparklineSequence(seq);
    return graph;
}
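
/* For illustration, the graph returned to the client is a small block of
 * text: the header and dashed separator generated above, followed by the
 * sparkline rendered by sparklineRender() (four rows high, LATENCY_GRAPH_COLS
 * wide) with the per-sample "how long ago" labels. For example, the header
 * looks like (made up values):
 *
 *     command - high 500 ms, low 101 ms (all time high 500 ms)
 *     -----------------------------------------------------------------------
 *     <sparkline rows and sample labels follow>
 */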

/* LATENCY command implementations.
 *
 * LATENCY HISTORY: return time-latency samples for the specified event.
 * LATENCY LATEST: return the latest latency for all the event classes.
 * LATENCY DOCTOR: returns a human readable analysis of instance latency.
 * LATENCY GRAPH: provide an ASCII graph of the latency of the specified event.
 * LATENCY RESET: reset data of a specified event or all the data if no event provided.
 */
void latencyCommand(client *c) {
    const char *help[] = {
        "DOCTOR -- Returns a human readable latency analysis report.",
        "GRAPH <event> -- Returns an ASCII latency graph for the event class.",
        "HISTORY <event> -- Returns time-latency samples for the event class.",
        "LATEST -- Returns the latest latency samples for all events.",
        "RESET [event ...] -- Resets latency data of one or more event classes.",
        "                     (default: reset all data for all event classes)",
        "HELP -- Prints this help.",
        NULL
    };
    struct latencyTimeSeries *ts;

    if (!strcasecmp(c->argv[1]->ptr,"history") && c->argc == 3) {
        /* LATENCY HISTORY <event> */
        ts = dictFetchValue(server.latency_events,c->argv[2]->ptr);
        if (ts == NULL) {
            addReplyMultiBulkLen(c,0);
        } else {
            latencyCommandReplyWithSamples(c,ts);
        }
    } else if (!strcasecmp(c->argv[1]->ptr,"graph") && c->argc == 3) {
        /* LATENCY GRAPH <event> */
        sds graph;
        dictEntry *de;
        char *event;

        de = dictFind(server.latency_events,c->argv[2]->ptr);
        if (de == NULL) goto nodataerr;
        ts = dictGetVal(de);
        event = dictGetKey(de);

        graph = latencyCommandGenSparkeline(event,ts);
        addReplyBulkCString(c,graph);
        sdsfree(graph);
    } else if (!strcasecmp(c->argv[1]->ptr,"latest") && c->argc == 2) {
        /* LATENCY LATEST */
        latencyCommandReplyWithLatestEvents(c);
    } else if (!strcasecmp(c->argv[1]->ptr,"doctor") && c->argc == 2) {
        /* LATENCY DOCTOR */
        sds report = createLatencyReport();

        addReplyBulkCBuffer(c,report,sdslen(report));
        sdsfree(report);
    } else if (!strcasecmp(c->argv[1]->ptr,"reset") && c->argc >= 2) {
        /* LATENCY RESET */
        if (c->argc == 2) {
            addReplyLongLong(c,latencyResetEvent(NULL));
        } else {
            int j, resets = 0;

            for (j = 2; j < c->argc; j++)
                resets += latencyResetEvent(c->argv[j]->ptr);
            addReplyLongLong(c,resets);
        }
    } else if (!strcasecmp(c->argv[1]->ptr,"help") && c->argc >= 2) {
        addReplyHelp(c, help);
    } else {
        addReplySubcommandSyntaxError(c);
    }
    return;

nodataerr:
    /* Common error when the user asks for an event we have no latency
     * information about. */
    addReplyErrorFormat(c,
        "No samples available for event '%s'", (char*) c->argv[2]->ptr);
}
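
/* A short, illustrative interaction with the command implemented above,
 * assuming the "command" and "fork" events both recorded spikes (values are
 * made up):
 *
 *     127.0.0.1:6379> LATENCY RESET command fork
 *     (integer) 2
 *     127.0.0.1:6379> LATENCY HISTORY command
 *     (empty array)
 */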