1 #include "memcached.h"
2
3 #include "restart.h"
4
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <sys/mman.h>
8 #include <sys/types.h>
9 #include <sys/stat.h>
10 #include <fcntl.h>
11 #include <string.h>
12
13 typedef struct _restart_data_cb restart_data_cb;
14
15 struct _restart_data_cb {
16 void *data; // user supplied opaque data.
17 struct _restart_data_cb *next; // callbacks are ordered stack
18 restart_check_cb ccb;
19 restart_save_cb scb;
20 char tag[RESTART_TAG_MAXLEN];
21 };
22
// Module-wide state describing the single mmap'ed restart region.
// TODO: struct to hand back to caller.

// Descriptor of the backing file. -1 means "not open": 0 is a valid
// descriptor (stdin), so it must not double as the unset sentinel.
static int mmap_fd = -1;
// Base address of the mapping created by restart_mmap_open().
static void *mmap_base = NULL;
// Length in bytes of the mapping.
static size_t slabmem_limit = 0;
// Heap-allocated copy of the backing file path; set by restart_mmap_open()
// and released by restart_mmap_close(). Not static, so it has external
// linkage.
char *memory_file = NULL;
28
29 static restart_data_cb *cb_stack = NULL;
30
31 // Allows submodules and engines to have independent check and save metadata
32 // routines for the restart code.
restart_register(const char * tag,restart_check_cb ccb,restart_save_cb scb,void * data)33 void restart_register(const char *tag, restart_check_cb ccb, restart_save_cb scb, void *data) {
34 restart_data_cb *cb = calloc(1, sizeof(restart_data_cb));
35 if (cb == NULL) {
36 fprintf(stderr, "[restart] failed to allocate callback register\n");
37 abort();
38 }
39
40 // Handle first time call initialization inline so we don't need separate
41 // API call.
42 if (cb_stack == NULL) {
43 cb_stack = cb;
44 } else {
45 // Ensure we fire the callbacks in registration order.
46 // Someday I'll get a queue.h overhaul.
47 restart_data_cb *finder = cb_stack;
48 while (finder->next != NULL) {
49 finder = finder->next;
50 }
51 finder->next = cb;
52 }
53
54 safe_strcpy(cb->tag, tag, RESTART_TAG_MAXLEN);
55 cb->data = data;
56 cb->ccb = *ccb;
57 cb->scb = *scb;
58 }
59
60 typedef struct {
61 FILE *f;
62 restart_data_cb *cb;
63 char *line;
64 bool done;
65 } restart_cb_ctx;
66
67 // TODO: error string from cb?
68 // - look for final line with checksum
69 // - checksum entire file (up until final line)
70 // - seek to start
71
restart_check(const char * file)72 static int restart_check(const char *file) {
73 // metadata is kept in a separate file.
74 size_t flen = strlen(file);
75 const char *ext = ".meta";
76 char *metafile = calloc(1, flen + strlen(ext) + 1);
77 if (metafile == NULL) {
78 // probably in a really bad position if we hit here, so don't start.
79 fprintf(stderr, "[restart] failed to allocate memory for restart check\n");
80 abort();
81 }
82 memcpy(metafile, file, flen);
83 memcpy(metafile+flen, ext, strlen(ext));
84
85 FILE *f = fopen(metafile, "r");
86 if (f == NULL) {
87 fprintf(stderr, "[restart] no metadata save file, starting with a clean cache\n");
88 free(metafile);
89 return -1;
90 }
91
92 restart_cb_ctx ctx;
93
94 ctx.f = f;
95 ctx.cb = NULL;
96 ctx.line = NULL;
97 ctx.done = false;
98 if (restart_get_kv(&ctx, NULL, NULL) != RESTART_DONE) {
99 // First line must be a tag, so read it in and set up the proper
100 // callback here.
101 fprintf(stderr, "[restart] corrupt metadata file\n");
102 // TODO: this should probably just return -1 and skip the reuse.
103 abort();
104 }
105 if (ctx.cb == NULL) {
106 fprintf(stderr, "[restart] Failed to read a tag from metadata file\n");
107 abort();
108 }
109
110 // loop call the callback, check result code.
111 bool failed = false;
112 while (!ctx.done) {
113 restart_data_cb *cb = ctx.cb;
114 if (cb->ccb(cb->tag, &ctx, cb->data) != 0) {
115 failed = true;
116 break;
117 }
118 }
119
120 if (ctx.line)
121 free(ctx.line);
122
123 fclose(f);
124
125 unlink(metafile);
126 free(metafile);
127
128 if (failed) {
129 fprintf(stderr, "[restart] failed to validate metadata, starting with a clean cache\n");
130 return -1;
131 } else {
132 return 0;
133 }
134 }
135
136 // This function advances the file read while being called directly from the
137 // callback.
138 // The control inversion here (callback calling in which might change the next
139 // callback) allows the callbacks to set up proper loops or sequences for
140 // reading data back, avoiding an event model.
restart_get_kv(void * ctx,char ** key,char ** val)141 enum restart_get_kv_ret restart_get_kv(void *ctx, char **key, char **val) {
142 char *line = NULL;
143 size_t len = 0;
144 restart_data_cb *cb = NULL;
145 restart_cb_ctx *c = (restart_cb_ctx *) ctx;
146 // free previous line.
147 // we could just pass it into getline, but it can randomly realloc so we'd
148 // have to re-assign it into the structure anyway.
149 if (c->line != NULL) {
150 free(c->line);
151 c->line = NULL;
152 }
153
154 if (getline(&line, &len, c->f) != -1) {
155 // First char is an indicator:
156 // T for TAG, changing the callback we use.
157 // K for key/value, to ship to the active callback.
158 char *p = line;
159 while (*p != '\n') {
160 p++;
161 }
162 *p = '\0';
163
164 if (line[0] == 'T') {
165 cb = cb_stack;
166 while (cb != NULL) {
167 // NOTE: len is allocated size, not line len. need to chomp \n
168 if (strcmp(cb->tag, line+1) == 0) {
169 break;
170 }
171 cb = cb->next;
172 }
173 if (cb == NULL) {
174 fprintf(stderr, "[restart] internal handler for metadata tag not found: %s:\n", line+1);
175 return RESTART_NOTAG;
176 }
177 c->cb = cb;
178 } else if (line[0] == 'K') {
179 char *p = line+1; // start just ahead of the token.
180 // tokenize the string and return the pointers?
181 if (key != NULL) {
182 *key = p;
183 }
184
185 // turn key into a normal NULL terminated string.
186 while (*p != ' ' && (p - line < len)) {
187 p++;
188 }
189 *p = '\0';
190 p++;
191
192 // value _should_ run until where the newline was, which is \0 now
193 if (val != NULL) {
194 *val = p;
195 }
196 c->line = line;
197
198 return RESTART_OK;
199 } else {
200 // FIXME: proper error chain.
201 fprintf(stderr, "[restart] invalid metadata line:\n\n%s\n", line);
202 free(line);
203 return RESTART_BADLINE;
204 }
205 } else {
206 // EOF or error in read.
207 c->done = true;
208 }
209
210 return RESTART_DONE;
211 }
212
213 // TODO:
214 // - rolling checksum along with the writes.
215 // - write final line + checksum + byte count or w/e.
216
restart_save(const char * file)217 static int restart_save(const char *file) {
218 // metadata is kept in a separate file.
219 // FIXME: function.
220 size_t flen = strlen(file);
221 const char *ext = ".meta";
222 size_t extlen = strlen(ext);
223 char *metafile = calloc(1, flen + extlen + 1);
224 if (metafile == NULL) {
225 fprintf(stderr, "[restart] failed to allocate memory during metadata save\n");
226 return -1;
227 }
228 memcpy(metafile, file, flen);
229 memcpy(metafile+flen, ext, extlen);
230
231 // restrictive permissions for the metadata file.
232 // TODO: also for the mmap file eh? :P
233 mode_t oldmask = umask(~(S_IRUSR | S_IWUSR));
234 FILE *f = fopen(metafile, "w");
235 umask(oldmask);
236 if (f == NULL) {
237 // FIXME: correct error handling.
238 free(metafile);
239 perror("failed to write metadata file");
240 return -1;
241 }
242
243 restart_data_cb *cb = cb_stack;
244 restart_cb_ctx ctx;
245 ctx.f = f;
246 while (cb != NULL) {
247 // Plugins/engines in the metadata file are separated by tag lines.
248 fprintf(f, "T%s\n", cb->tag);
249 if (cb->scb(cb->tag, &ctx, cb->data) != 0) {
250 fclose(f);
251 free(metafile);
252 return -1;
253 }
254
255 cb = cb->next;
256 }
257
258 fclose(f);
259 free(metafile);
260
261 return 0;
262 }
263
264 // Keys and values must not contain spaces or newlines.
265 // Could offer an interface that uriencodes values for the caller, however
266 // nothing currently would use it, so add when necessary.
267 #define SET_VAL_MAX 4096
restart_set_kv(void * ctx,const char * key,const char * fmt,...)268 void restart_set_kv(void *ctx, const char *key, const char *fmt, ...) {
269 va_list ap;
270 restart_cb_ctx *c = (restart_cb_ctx *) ctx;
271 char valbuf[SET_VAL_MAX];
272
273 va_start(ap, fmt);
274 int vlen = vsnprintf(valbuf, SET_VAL_MAX-1, fmt, ap);
275 va_end(ap);
276 // This is heavy handed. We need to protect against corrupt data as much
277 // as possible. The buffer is large and these values are currently small,
278 // it will take a significant mistake to land here.
279 if (vlen >= SET_VAL_MAX) {
280 fprintf(stderr, "[restart] fatal error while saving metadata state, value too long for: %s %s",
281 key, valbuf);
282 abort();
283 }
284
285 fprintf(c->f, "K%s %s\n", key, valbuf);
286 // TODO: update crc32c
287 }
288
// Returns the system page size in bytes. Uses sysconf(_SC_PAGESIZE) when the
// build provides it and the call yields a positive value; otherwise falls
// back to 4096. The result is always positive, so callers can safely use it
// as a divisor.
static long _find_pagesize(void) {
#if defined(HAVE_SYSCONF) && defined(_SC_PAGESIZE)
    long pagesize = sysconf(_SC_PAGESIZE);
    // sysconf() reports failure as -1; don't pass that on.
    if (pagesize > 0) {
        return pagesize;
    }
#endif
    // A good guess.
    return 4096;
}
297
restart_mmap_open(const size_t limit,const char * file,void ** mem_base)298 bool restart_mmap_open(const size_t limit, const char *file, void **mem_base) {
299 bool reuse_mmap = true;
300
301 long pagesize = _find_pagesize();
302 memory_file = strdup(file);
303 mmap_fd = open(file, O_RDWR|O_CREAT, S_IRWXU);
304 if (mmap_fd == -1) {
305 perror("failed to open file for mmap");
306 abort();
307 }
308 if (ftruncate(mmap_fd, limit) != 0) {
309 perror("ftruncate failed");
310 abort();
311 }
312 /* Allocate everything in a big chunk with malloc */
313 if (limit % pagesize) {
314 // This is a sanity check; shouldn't ever be possible since we
315 // increase memory by whole megabytes.
316 fprintf(stderr, "[restart] memory limit not divisible evenly by pagesize (please report bug)\n");
317 abort();
318 }
319 mmap_base = mmap(NULL, limit, PROT_READ|PROT_WRITE, MAP_SHARED, mmap_fd, 0);
320 if (mmap_base == MAP_FAILED) {
321 perror("failed to mmap, aborting");
322 abort();
323 }
324 // Set the limit before calling check_mmap, so we can find the meta page..
325 slabmem_limit = limit;
326 if (restart_check(file) != 0) {
327 reuse_mmap = false;
328 }
329 *mem_base = mmap_base;
330
331 return reuse_mmap;
332 }
333
334 /* Gracefully stop/close the shared memory segment */
restart_mmap_close(void)335 void restart_mmap_close(void) {
336 msync(mmap_base, slabmem_limit, MS_SYNC);
337
338 if (restart_save(memory_file) != 0) {
339 fprintf(stderr, "[restart] failed to save metadata");
340 }
341
342 if (munmap(mmap_base, slabmem_limit) != 0) {
343 perror("[restart] failed to munmap shared memory");
344 } else if (close(mmap_fd) != 0) {
345 perror("[restart] failed to close shared memory fd");
346 }
347
348 free(memory_file);
349 }
350
351 // given memory base, quickly walk memory and do pointer fixup.
352 // do this once on startup to avoid having to do pointer fixup on every
353 // reference from hash table or LRU.
restart_fixup(void * orig_addr)354 unsigned int restart_fixup(void *orig_addr) {
355 struct timeval tv;
356 uint64_t checked = 0;
357 const unsigned int page_size = settings.slab_page_size;
358 unsigned int page_remain = page_size;
359
360 gettimeofday(&tv, NULL);
361 if (settings.verbose > 0) {
362 fprintf(stderr, "[restart] original memory base: [%p] new base: [%p]\n", orig_addr, mmap_base);
363 fprintf(stderr, "[restart] recovery start [%d.%d]\n", (int)tv.tv_sec, (int)tv.tv_usec);
364 }
365
366 // since chunks don't align with pages, we have to also track page size.
367 while (checked < slabmem_limit) {
368 //fprintf(stderr, "checked: %lu\n", checked);
369 item *it = (item *)((char *)mmap_base + checked);
370
371 int size = slabs_fixup((char *)mmap_base + checked,
372 checked % settings.slab_page_size);
373 //fprintf(stderr, "id: %d, size: %d remain: %u\n", it->slabs_clsid, size, page_remain);
374 // slabber gobbled an entire page, skip and move on.
375 if (size == -1) {
376 assert(page_remain % page_size == 0);
377 assert(page_remain == page_size);
378 checked += page_remain;
379 page_remain = page_size;
380 continue;
381 }
382
383 if (it->it_flags & ITEM_LINKED) {
384 // fixup next/prev links while on LRU.
385 if (it->next) {
386 it->next = (item *)((mc_ptr_t)it->next - (mc_ptr_t)orig_addr);
387 it->next = (item *)((mc_ptr_t)it->next + (mc_ptr_t)mmap_base);
388 }
389 if (it->prev) {
390 it->prev = (item *)((mc_ptr_t)it->prev - (mc_ptr_t)orig_addr);
391 it->prev = (item *)((mc_ptr_t)it->prev + (mc_ptr_t)mmap_base);
392 }
393
394 //fprintf(stderr, "item was linked\n");
395 do_item_link_fixup(it);
396 }
397
398 if (it->it_flags & (ITEM_CHUNKED|ITEM_CHUNK)) {
399 item_chunk *ch;
400 if (it->it_flags & ITEM_CHUNKED) {
401 ch = (item_chunk *) ITEM_schunk(it);
402 // Sigh. Chunked items are a hack; the clsid is the clsid of
403 // the full object (always the largest slab class) rather than
404 // the actual chunk.
405 // I bet this is fixable :(
406 size = slabs_size(ch->orig_clsid);
407 //fprintf(stderr, "fixing chunked item header [%d]\n", size);
408 } else {
409 //fprintf(stderr, "fixing item chunk [%d]\n", size);
410 ch = (item_chunk *) it;
411 }
412 if (ch->next) {
413 ch->next = (item_chunk *)((mc_ptr_t)ch->next - (mc_ptr_t)orig_addr);
414 ch->next = (item_chunk *)((mc_ptr_t)ch->next + (mc_ptr_t)mmap_base);
415 }
416 if (ch->prev) {
417 ch->prev = (item_chunk *)((mc_ptr_t)ch->prev - (mc_ptr_t)orig_addr);
418 ch->prev = (item_chunk *)((mc_ptr_t)ch->prev + (mc_ptr_t)mmap_base);
419 }
420 if (ch->head) {
421 ch->head = (item *)((mc_ptr_t)ch->head - (mc_ptr_t)orig_addr);
422 ch->head = (item *)((mc_ptr_t)ch->head + (mc_ptr_t)mmap_base);
423 }
424 }
425
426 // next chunk
427 checked += size;
428 page_remain -= size;
429 if (size > page_remain) {
430 //fprintf(stderr, "doot %d\n", page_remain);
431 checked += page_remain;
432 page_remain = settings.slab_page_size;
433 }
434 //assert(checked != 3145728);
435 }
436
437 if (settings.verbose > 0) {
438 gettimeofday(&tv, NULL);
439 fprintf(stderr, "[restart] recovery end [%d.%d]\n", (int)tv.tv_sec, (int)tv.tv_usec);
440 }
441
442 return 0;
443 }
444