1 // SPDX-License-Identifier: GPL-2.0 2 #include "bcachefs.h" 3 #include "bbpos.h" 4 #include "alloc_background.h" 5 #include "backpointers.h" 6 #include "bkey_buf.h" 7 #include "btree_cache.h" 8 #include "btree_update.h" 9 #include "btree_update_interior.h" 10 #include "btree_write_buffer.h" 11 #include "checksum.h" 12 #include "disk_accounting.h" 13 #include "error.h" 14 #include "progress.h" 15 16 #include <linux/mm.h> 17 18 int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, 19 struct bkey_validate_context from) 20 { 21 struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); 22 int ret = 0; 23 24 bkey_fsck_err_on(bp.v->level > BTREE_MAX_DEPTH, 25 c, backpointer_level_bad, 26 "backpointer level bad: %u >= %u", 27 bp.v->level, BTREE_MAX_DEPTH); 28 29 bkey_fsck_err_on(bp.k->p.inode == BCH_SB_MEMBER_INVALID, 30 c, backpointer_dev_bad, 31 "backpointer for BCH_SB_MEMBER_INVALID"); 32 fsck_err: 33 return ret; 34 } 35 36 void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) 37 { 38 struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); 39 40 rcu_read_lock(); 41 struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); 42 if (ca) { 43 u32 bucket_offset; 44 struct bpos bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset); 45 rcu_read_unlock(); 46 prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset, bucket_offset); 47 } else { 48 rcu_read_unlock(); 49 prt_printf(out, "sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT); 50 } 51 52 bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); 53 prt_str(out, " data_type="); 54 bch2_prt_data_type(out, bp.v->data_type); 55 prt_printf(out, " suboffset=%u len=%u gen=%u pos=", 56 (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), 57 bp.v->bucket_len, 58 bp.v->bucket_gen); 59 bch2_bpos_to_text(out, bp.v->pos); 60 } 61 62 void bch2_backpointer_swab(struct bkey_s k) 63 { 64 struct bkey_s_backpointer bp = bkey_s_to_backpointer(k); 65 66 bp.v->bucket_len = swab32(bp.v->bucket_len); 67 bch2_bpos_swab(&bp.v->pos); 68 } 69 70 static bool extent_matches_bp(struct bch_fs *c, 71 enum btree_id btree_id, unsigned level, 72 struct bkey_s_c k, 73 struct bkey_s_c_backpointer bp) 74 { 75 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 76 const union bch_extent_entry *entry; 77 struct extent_ptr_decoded p; 78 79 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { 80 struct bkey_i_backpointer bp2; 81 bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp2); 82 83 if (bpos_eq(bp.k->p, bp2.k.p) && 84 !memcmp(bp.v, &bp2.v, sizeof(bp2.v))) 85 return true; 86 } 87 88 return false; 89 } 90 91 static noinline int backpointer_mod_err(struct btree_trans *trans, 92 struct bkey_s_c orig_k, 93 struct bkey_i_backpointer *new_bp, 94 struct bkey_s_c found_bp, 95 bool insert) 96 { 97 struct bch_fs *c = trans->c; 98 struct printbuf buf = PRINTBUF; 99 int ret = 0; 100 101 if (insert) { 102 prt_printf(&buf, "existing backpointer found when inserting "); 103 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i)); 104 prt_newline(&buf); 105 printbuf_indent_add(&buf, 2); 106 107 prt_printf(&buf, "found "); 108 bch2_bkey_val_to_text(&buf, c, found_bp); 109 prt_newline(&buf); 110 111 prt_printf(&buf, "for "); 112 bch2_bkey_val_to_text(&buf, c, orig_k); 113 114 bch_err(c, "%s", buf.buf); 115 } else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { 116 prt_printf(&buf, "backpointer not found when deleting\n"); 117 printbuf_indent_add(&buf, 2); 118 119 prt_printf(&buf, "searching for "); 120 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i)); 121 prt_newline(&buf); 122 123 prt_printf(&buf, "got "); 124 bch2_bkey_val_to_text(&buf, c, found_bp); 125 prt_newline(&buf); 126 127 prt_printf(&buf, "for "); 128 bch2_bkey_val_to_text(&buf, c, orig_k); 129 } 130 131 if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers && 132 __bch2_inconsistent_error(c, &buf)) 133 ret = -BCH_ERR_erofs_unfixed_errors; 134 135 bch_err(c, "%s", buf.buf); 136 printbuf_exit(&buf); 137 return ret; 138 } 139 140 int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, 141 struct bkey_s_c orig_k, 142 struct bkey_i_backpointer *bp, 143 bool insert) 144 { 145 struct btree_iter bp_iter; 146 struct bkey_s_c k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, 147 bp->k.p, 148 BTREE_ITER_intent| 149 BTREE_ITER_slots| 150 BTREE_ITER_with_updates); 151 int ret = bkey_err(k); 152 if (ret) 153 return ret; 154 155 if (insert 156 ? k.k->type 157 : (k.k->type != KEY_TYPE_backpointer || 158 memcmp(bkey_s_c_to_backpointer(k).v, &bp->v, sizeof(bp->v)))) { 159 ret = backpointer_mod_err(trans, orig_k, bp, k, insert); 160 if (ret) 161 goto err; 162 } 163 164 if (!insert) { 165 bp->k.type = KEY_TYPE_deleted; 166 set_bkey_val_u64s(&bp->k, 0); 167 } 168 169 ret = bch2_trans_update(trans, &bp_iter, &bp->k_i, 0); 170 err: 171 bch2_trans_iter_exit(trans, &bp_iter); 172 return ret; 173 } 174 175 static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos) 176 { 177 return (likely(!bch2_backpointers_no_use_write_buffer) 178 ? bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos) 179 : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0)) ?: 180 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); 181 } 182 183 static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans, 184 struct bkey_s_c visiting_k, 185 struct bkey_buf *last_flushed) 186 { 187 return likely(!bch2_backpointers_no_use_write_buffer) 188 ? bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed) 189 : 0; 190 } 191 192 static int backpointer_target_not_found(struct btree_trans *trans, 193 struct bkey_s_c_backpointer bp, 194 struct bkey_s_c target_k, 195 struct bkey_buf *last_flushed) 196 { 197 struct bch_fs *c = trans->c; 198 struct printbuf buf = PRINTBUF; 199 int ret = 0; 200 201 /* 202 * If we're using the btree write buffer, the backpointer we were 203 * looking at may have already been deleted - failure to find what it 204 * pointed to is not an error: 205 */ 206 ret = last_flushed 207 ? bch2_backpointers_maybe_flush(trans, bp.s_c, last_flushed) 208 : 0; 209 if (ret) 210 return ret; 211 212 prt_printf(&buf, "backpointer doesn't match %s it points to:\n", 213 bp.v->level ? "btree node" : "extent"); 214 bch2_bkey_val_to_text(&buf, c, bp.s_c); 215 216 prt_newline(&buf); 217 bch2_bkey_val_to_text(&buf, c, target_k); 218 219 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(target_k); 220 const union bch_extent_entry *entry; 221 struct extent_ptr_decoded p; 222 bkey_for_each_ptr_decode(target_k.k, ptrs, p, entry) 223 if (p.ptr.dev == bp.k->p.inode) { 224 prt_newline(&buf); 225 struct bkey_i_backpointer bp2; 226 bch2_extent_ptr_to_bp(c, bp.v->btree_id, bp.v->level, target_k, p, entry, &bp2); 227 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp2.k_i)); 228 } 229 230 if (fsck_err(trans, backpointer_to_missing_ptr, 231 "%s", buf.buf)) 232 ret = bch2_backpointer_del(trans, bp.k->p); 233 fsck_err: 234 printbuf_exit(&buf); 235 return ret; 236 } 237 238 struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, 239 struct bkey_s_c_backpointer bp, 240 struct btree_iter *iter, 241 unsigned iter_flags, 242 struct bkey_buf *last_flushed) 243 { 244 struct bch_fs *c = trans->c; 245 246 if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c))) 247 return bkey_s_c_null; 248 249 bch2_trans_node_iter_init(trans, iter, 250 bp.v->btree_id, 251 bp.v->pos, 252 0, 253 bp.v->level, 254 iter_flags); 255 struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); 256 if (bkey_err(k)) { 257 bch2_trans_iter_exit(trans, iter); 258 return k; 259 } 260 261 if (k.k && 262 extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) 263 return k; 264 265 bch2_trans_iter_exit(trans, iter); 266 267 if (!bp.v->level) { 268 int ret = backpointer_target_not_found(trans, bp, k, last_flushed); 269 return ret ? bkey_s_c_err(ret) : bkey_s_c_null; 270 } else { 271 struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed); 272 if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node)) 273 return bkey_s_c_null; 274 if (IS_ERR_OR_NULL(b)) 275 return ((struct bkey_s_c) { .k = ERR_CAST(b) }); 276 277 return bkey_i_to_s_c(&b->key); 278 } 279 } 280 281 struct btree *bch2_backpointer_get_node(struct btree_trans *trans, 282 struct bkey_s_c_backpointer bp, 283 struct btree_iter *iter, 284 struct bkey_buf *last_flushed) 285 { 286 struct bch_fs *c = trans->c; 287 288 BUG_ON(!bp.v->level); 289 290 bch2_trans_node_iter_init(trans, iter, 291 bp.v->btree_id, 292 bp.v->pos, 293 0, 294 bp.v->level - 1, 295 0); 296 struct btree *b = bch2_btree_iter_peek_node(iter); 297 if (IS_ERR_OR_NULL(b)) 298 goto err; 299 300 BUG_ON(b->c.level != bp.v->level - 1); 301 302 if (extent_matches_bp(c, bp.v->btree_id, bp.v->level, 303 bkey_i_to_s_c(&b->key), bp)) 304 return b; 305 306 if (btree_node_will_make_reachable(b)) { 307 b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); 308 } else { 309 int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), last_flushed); 310 b = ret ? ERR_PTR(ret) : NULL; 311 } 312 err: 313 bch2_trans_iter_exit(trans, iter); 314 return b; 315 } 316 317 static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k, 318 struct bkey_buf *last_flushed) 319 { 320 if (k.k->type != KEY_TYPE_backpointer) 321 return 0; 322 323 struct bch_fs *c = trans->c; 324 struct btree_iter alloc_iter = { NULL }; 325 struct bkey_s_c alloc_k; 326 struct printbuf buf = PRINTBUF; 327 int ret = 0; 328 329 struct bpos bucket; 330 if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) { 331 ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); 332 if (ret) 333 goto out; 334 335 if (fsck_err(trans, backpointer_to_missing_device, 336 "backpointer for missing device:\n%s", 337 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) 338 ret = bch2_backpointer_del(trans, k.k->p); 339 goto out; 340 } 341 342 alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bucket, 0); 343 ret = bkey_err(alloc_k); 344 if (ret) 345 goto out; 346 347 if (alloc_k.k->type != KEY_TYPE_alloc_v4) { 348 ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); 349 if (ret) 350 goto out; 351 352 if (fsck_err(trans, backpointer_to_missing_alloc, 353 "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", 354 alloc_iter.pos.inode, alloc_iter.pos.offset, 355 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) 356 ret = bch2_backpointer_del(trans, k.k->p); 357 } 358 out: 359 fsck_err: 360 bch2_trans_iter_exit(trans, &alloc_iter); 361 printbuf_exit(&buf); 362 return ret; 363 } 364 365 /* verify that every backpointer has a corresponding alloc key */ 366 int bch2_check_btree_backpointers(struct bch_fs *c) 367 { 368 struct bkey_buf last_flushed; 369 bch2_bkey_buf_init(&last_flushed); 370 bkey_init(&last_flushed.k->k); 371 372 int ret = bch2_trans_run(c, 373 for_each_btree_key_commit(trans, iter, 374 BTREE_ID_backpointers, POS_MIN, 0, k, 375 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 376 bch2_check_backpointer_has_valid_bucket(trans, k, &last_flushed))); 377 378 bch2_bkey_buf_exit(&last_flushed, c); 379 bch_err_fn(c, ret); 380 return ret; 381 } 382 383 struct extents_to_bp_state { 384 struct bpos bp_start; 385 struct bpos bp_end; 386 struct bkey_buf last_flushed; 387 }; 388 389 static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree, 390 struct bkey_s_c extent, unsigned dev) 391 { 392 struct bkey_i *n = bch2_bkey_make_mut_noupdate(trans, extent); 393 int ret = PTR_ERR_OR_ZERO(n); 394 if (ret) 395 return ret; 396 397 bch2_bkey_drop_device(bkey_i_to_s(n), dev); 398 return bch2_btree_insert_trans(trans, btree, n, 0); 399 } 400 401 static int check_extent_checksum(struct btree_trans *trans, 402 enum btree_id btree, struct bkey_s_c extent, 403 enum btree_id o_btree, struct bkey_s_c extent2, unsigned dev) 404 { 405 struct bch_fs *c = trans->c; 406 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent); 407 const union bch_extent_entry *entry; 408 struct extent_ptr_decoded p; 409 struct printbuf buf = PRINTBUF; 410 void *data_buf = NULL; 411 struct bio *bio = NULL; 412 size_t bytes; 413 int ret = 0; 414 415 if (bkey_is_btree_ptr(extent.k)) 416 return false; 417 418 bkey_for_each_ptr_decode(extent.k, ptrs, p, entry) 419 if (p.ptr.dev == dev) 420 goto found; 421 BUG(); 422 found: 423 if (!p.crc.csum_type) 424 return false; 425 426 bytes = p.crc.compressed_size << 9; 427 428 struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ); 429 if (!ca) 430 return false; 431 432 data_buf = kvmalloc(bytes, GFP_KERNEL); 433 if (!data_buf) { 434 ret = -ENOMEM; 435 goto err; 436 } 437 438 bio = bio_alloc(ca->disk_sb.bdev, buf_pages(data_buf, bytes), REQ_OP_READ, GFP_KERNEL); 439 bio->bi_iter.bi_sector = p.ptr.offset; 440 bch2_bio_map(bio, data_buf, bytes); 441 ret = submit_bio_wait(bio); 442 if (ret) 443 goto err; 444 445 prt_printf(&buf, "extents pointing to same space, but first extent checksum bad:\n"); 446 bch2_btree_id_to_text(&buf, btree); 447 prt_str(&buf, " "); 448 bch2_bkey_val_to_text(&buf, c, extent); 449 prt_newline(&buf); 450 bch2_btree_id_to_text(&buf, o_btree); 451 prt_str(&buf, " "); 452 bch2_bkey_val_to_text(&buf, c, extent2); 453 454 struct nonce nonce = extent_nonce(extent.k->bversion, p.crc); 455 struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes); 456 if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum), 457 trans, dup_backpointer_to_bad_csum_extent, 458 "%s", buf.buf)) 459 ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1; 460 fsck_err: 461 err: 462 if (bio) 463 bio_put(bio); 464 kvfree(data_buf); 465 percpu_ref_put(&ca->io_ref); 466 printbuf_exit(&buf); 467 return ret; 468 } 469 470 static int check_bp_exists(struct btree_trans *trans, 471 struct extents_to_bp_state *s, 472 struct bkey_i_backpointer *bp, 473 struct bkey_s_c orig_k) 474 { 475 struct bch_fs *c = trans->c; 476 struct btree_iter other_extent_iter = {}; 477 struct printbuf buf = PRINTBUF; 478 479 if (bpos_lt(bp->k.p, s->bp_start) || 480 bpos_gt(bp->k.p, s->bp_end)) 481 return 0; 482 483 struct btree_iter bp_iter; 484 struct bkey_s_c bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp->k.p, 0); 485 int ret = bkey_err(bp_k); 486 if (ret) 487 goto err; 488 489 if (bp_k.k->type != KEY_TYPE_backpointer || 490 memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp->v, sizeof(bp->v))) { 491 ret = bch2_btree_write_buffer_maybe_flush(trans, orig_k, &s->last_flushed); 492 if (ret) 493 goto err; 494 495 goto check_existing_bp; 496 } 497 out: 498 err: 499 fsck_err: 500 bch2_trans_iter_exit(trans, &other_extent_iter); 501 bch2_trans_iter_exit(trans, &bp_iter); 502 printbuf_exit(&buf); 503 return ret; 504 check_existing_bp: 505 /* Do we have a backpointer for a different extent? */ 506 if (bp_k.k->type != KEY_TYPE_backpointer) 507 goto missing; 508 509 struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k); 510 511 struct bkey_s_c other_extent = 512 bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL); 513 ret = bkey_err(other_extent); 514 if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) 515 ret = 0; 516 if (ret) 517 goto err; 518 519 if (!other_extent.k) 520 goto missing; 521 522 rcu_read_lock(); 523 struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp->k.p.inode); 524 if (ca) { 525 struct bkey_ptrs_c other_extent_ptrs = bch2_bkey_ptrs_c(other_extent); 526 bkey_for_each_ptr(other_extent_ptrs, ptr) 527 if (ptr->dev == bp->k.p.inode && 528 dev_ptr_stale_rcu(ca, ptr)) { 529 ret = drop_dev_and_update(trans, other_bp.v->btree_id, 530 other_extent, bp->k.p.inode); 531 if (ret) 532 goto err; 533 goto out; 534 } 535 } 536 rcu_read_unlock(); 537 538 if (bch2_extents_match(orig_k, other_extent)) { 539 printbuf_reset(&buf); 540 prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n"); 541 bch2_bkey_val_to_text(&buf, c, orig_k); 542 prt_newline(&buf); 543 bch2_bkey_val_to_text(&buf, c, other_extent); 544 bch_err(c, "%s", buf.buf); 545 546 if (other_extent.k->size <= orig_k.k->size) { 547 ret = drop_dev_and_update(trans, other_bp.v->btree_id, 548 other_extent, bp->k.p.inode); 549 if (ret) 550 goto err; 551 goto out; 552 } else { 553 ret = drop_dev_and_update(trans, bp->v.btree_id, orig_k, bp->k.p.inode); 554 if (ret) 555 goto err; 556 goto missing; 557 } 558 } 559 560 ret = check_extent_checksum(trans, 561 other_bp.v->btree_id, other_extent, 562 bp->v.btree_id, orig_k, 563 bp->k.p.inode); 564 if (ret < 0) 565 goto err; 566 if (ret) { 567 ret = 0; 568 goto missing; 569 } 570 571 ret = check_extent_checksum(trans, bp->v.btree_id, orig_k, 572 other_bp.v->btree_id, other_extent, bp->k.p.inode); 573 if (ret < 0) 574 goto err; 575 if (ret) { 576 ret = 0; 577 goto out; 578 } 579 580 printbuf_reset(&buf); 581 prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n", bp->k.p.inode); 582 bch2_bkey_val_to_text(&buf, c, orig_k); 583 prt_newline(&buf); 584 bch2_bkey_val_to_text(&buf, c, other_extent); 585 bch_err(c, "%s", buf.buf); 586 ret = -BCH_ERR_fsck_repair_unimplemented; 587 goto err; 588 missing: 589 printbuf_reset(&buf); 590 prt_str(&buf, "missing backpointer\nfor: "); 591 bch2_bkey_val_to_text(&buf, c, orig_k); 592 prt_printf(&buf, "\nwant: "); 593 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i)); 594 prt_printf(&buf, "\ngot: "); 595 bch2_bkey_val_to_text(&buf, c, bp_k); 596 597 if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf)) 598 ret = bch2_bucket_backpointer_mod(trans, orig_k, bp, true); 599 600 goto out; 601 } 602 603 static int check_extent_to_backpointers(struct btree_trans *trans, 604 struct extents_to_bp_state *s, 605 enum btree_id btree, unsigned level, 606 struct bkey_s_c k) 607 { 608 struct bch_fs *c = trans->c; 609 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); 610 const union bch_extent_entry *entry; 611 struct extent_ptr_decoded p; 612 613 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { 614 if (p.ptr.dev == BCH_SB_MEMBER_INVALID) 615 continue; 616 617 rcu_read_lock(); 618 struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); 619 bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches); 620 bool empty = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty); 621 622 bool stale = p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)); 623 rcu_read_unlock(); 624 625 if ((check || empty) && !stale) { 626 struct bkey_i_backpointer bp; 627 bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); 628 629 int ret = check 630 ? check_bp_exists(trans, s, &bp, k) 631 : bch2_bucket_backpointer_mod(trans, k, &bp, true); 632 if (ret) 633 return ret; 634 } 635 } 636 637 return 0; 638 } 639 640 static int check_btree_root_to_backpointers(struct btree_trans *trans, 641 struct extents_to_bp_state *s, 642 enum btree_id btree_id, 643 int *level) 644 { 645 struct bch_fs *c = trans->c; 646 struct btree_iter iter; 647 struct btree *b; 648 struct bkey_s_c k; 649 int ret; 650 retry: 651 bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 652 0, bch2_btree_id_root(c, btree_id)->b->c.level, 0); 653 b = bch2_btree_iter_peek_node(&iter); 654 ret = PTR_ERR_OR_ZERO(b); 655 if (ret) 656 goto err; 657 658 if (b != btree_node_root(c, b)) { 659 bch2_trans_iter_exit(trans, &iter); 660 goto retry; 661 } 662 663 *level = b->c.level; 664 665 k = bkey_i_to_s_c(&b->key); 666 ret = check_extent_to_backpointers(trans, s, btree_id, b->c.level + 1, k); 667 err: 668 bch2_trans_iter_exit(trans, &iter); 669 return ret; 670 } 671 672 static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) 673 { 674 return (struct bbpos) { 675 .btree = bp.btree_id, 676 .pos = bp.pos, 677 }; 678 } 679 680 static u64 mem_may_pin_bytes(struct bch_fs *c) 681 { 682 struct sysinfo i; 683 si_meminfo(&i); 684 685 u64 mem_bytes = i.totalram * i.mem_unit; 686 return div_u64(mem_bytes * c->opts.fsck_memory_usage_percent, 100); 687 } 688 689 static size_t btree_nodes_fit_in_ram(struct bch_fs *c) 690 { 691 return div_u64(mem_may_pin_bytes(c), c->opts.btree_node_size); 692 } 693 694 static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, 695 u64 btree_leaf_mask, 696 u64 btree_interior_mask, 697 struct bbpos start, struct bbpos *end) 698 { 699 struct bch_fs *c = trans->c; 700 s64 mem_may_pin = mem_may_pin_bytes(c); 701 int ret = 0; 702 703 bch2_btree_cache_unpin(c); 704 705 btree_interior_mask |= btree_leaf_mask; 706 707 c->btree_cache.pinned_nodes_mask[0] = btree_leaf_mask; 708 c->btree_cache.pinned_nodes_mask[1] = btree_interior_mask; 709 c->btree_cache.pinned_nodes_start = start; 710 c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX; 711 712 for (enum btree_id btree = start.btree; 713 btree < BTREE_ID_NR && !ret; 714 btree++) { 715 unsigned depth = (BIT_ULL(btree) & btree_leaf_mask) ? 0 : 1; 716 717 if (!(BIT_ULL(btree) & btree_leaf_mask) && 718 !(BIT_ULL(btree) & btree_interior_mask)) 719 continue; 720 721 ret = __for_each_btree_node(trans, iter, btree, 722 btree == start.btree ? start.pos : POS_MIN, 723 0, depth, BTREE_ITER_prefetch, b, ({ 724 mem_may_pin -= btree_buf_bytes(b); 725 if (mem_may_pin <= 0) { 726 c->btree_cache.pinned_nodes_end = *end = 727 BBPOS(btree, b->key.k.p); 728 break; 729 } 730 bch2_node_pin(c, b); 731 0; 732 })); 733 } 734 735 return ret; 736 } 737 738 static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, 739 struct extents_to_bp_state *s) 740 { 741 struct bch_fs *c = trans->c; 742 struct progress_indicator_state progress; 743 int ret = 0; 744 745 bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink)); 746 747 for (enum btree_id btree_id = 0; 748 btree_id < btree_id_nr_alive(c); 749 btree_id++) { 750 int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1; 751 752 ret = commit_do(trans, NULL, NULL, 753 BCH_TRANS_COMMIT_no_enospc, 754 check_btree_root_to_backpointers(trans, s, btree_id, &level)); 755 if (ret) 756 return ret; 757 758 while (level >= depth) { 759 struct btree_iter iter; 760 bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, level, 761 BTREE_ITER_prefetch); 762 763 ret = for_each_btree_key_continue(trans, iter, 0, k, ({ 764 bch2_progress_update_iter(trans, &progress, &iter, "extents_to_backpointers"); 765 check_extent_to_backpointers(trans, s, btree_id, level, k) ?: 766 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); 767 })); 768 if (ret) 769 return ret; 770 771 --level; 772 } 773 } 774 775 return 0; 776 } 777 778 enum alloc_sector_counter { 779 ALLOC_dirty, 780 ALLOC_cached, 781 ALLOC_stripe, 782 ALLOC_SECTORS_NR 783 }; 784 785 static int data_type_to_alloc_counter(enum bch_data_type t) 786 { 787 switch (t) { 788 case BCH_DATA_btree: 789 case BCH_DATA_user: 790 return ALLOC_dirty; 791 case BCH_DATA_cached: 792 return ALLOC_cached; 793 case BCH_DATA_stripe: 794 case BCH_DATA_parity: 795 return ALLOC_stripe; 796 default: 797 return -1; 798 } 799 } 800 801 static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos); 802 803 static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k, 804 struct bkey_buf *last_flushed) 805 { 806 struct bch_fs *c = trans->c; 807 struct bch_alloc_v4 a_convert; 808 const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); 809 bool need_commit = false; 810 811 if (a->data_type == BCH_DATA_sb || 812 a->data_type == BCH_DATA_journal || 813 a->data_type == BCH_DATA_parity) 814 return 0; 815 816 u32 sectors[ALLOC_SECTORS_NR]; 817 memset(sectors, 0, sizeof(sectors)); 818 819 struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(trans->c, alloc_k.k->p); 820 if (!ca) 821 return 0; 822 823 struct btree_iter iter; 824 struct bkey_s_c bp_k; 825 int ret = 0; 826 for_each_btree_key_max_norestart(trans, iter, BTREE_ID_backpointers, 827 bucket_pos_to_bp_start(ca, alloc_k.k->p), 828 bucket_pos_to_bp_end(ca, alloc_k.k->p), 0, bp_k, ret) { 829 if (bp_k.k->type != KEY_TYPE_backpointer) 830 continue; 831 832 struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); 833 834 if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen && 835 (bp.v->bucket_gen != a->gen || 836 bp.v->pad)) { 837 ret = bch2_backpointer_del(trans, bp_k.k->p); 838 if (ret) 839 break; 840 841 need_commit = true; 842 continue; 843 } 844 845 if (bp.v->bucket_gen != a->gen) 846 continue; 847 848 int alloc_counter = data_type_to_alloc_counter(bp.v->data_type); 849 if (alloc_counter < 0) 850 continue; 851 852 sectors[alloc_counter] += bp.v->bucket_len; 853 }; 854 bch2_trans_iter_exit(trans, &iter); 855 if (ret) 856 goto err; 857 858 if (need_commit) { 859 ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); 860 if (ret) 861 goto err; 862 } 863 864 if (sectors[ALLOC_dirty] != a->dirty_sectors || 865 sectors[ALLOC_cached] != a->cached_sectors || 866 sectors[ALLOC_stripe] != a->stripe_sectors) { 867 if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) { 868 ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed); 869 if (ret) 870 goto err; 871 } 872 873 if (sectors[ALLOC_dirty] > a->dirty_sectors || 874 sectors[ALLOC_cached] > a->cached_sectors || 875 sectors[ALLOC_stripe] > a->stripe_sectors) { 876 ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?: 877 -BCH_ERR_transaction_restart_nested; 878 goto err; 879 } 880 881 if (!sectors[ALLOC_dirty] && 882 !sectors[ALLOC_stripe] && 883 !sectors[ALLOC_cached]) 884 __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty); 885 else 886 __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches); 887 } 888 err: 889 bch2_dev_put(ca); 890 return ret; 891 } 892 893 static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) 894 { 895 switch (k.k->type) { 896 case KEY_TYPE_btree_ptr_v2: { 897 bool ret = false; 898 899 rcu_read_lock(); 900 struct bpos pos = bkey_s_c_to_btree_ptr_v2(k).v->min_key; 901 while (pos.inode <= k.k->p.inode) { 902 if (pos.inode >= c->sb.nr_devices) 903 break; 904 905 struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode); 906 if (!ca) 907 goto next; 908 909 struct bpos bucket = bp_pos_to_bucket(ca, pos); 910 bucket.offset = find_next_bit(ca->bucket_backpointer_mismatches, 911 ca->mi.nbuckets, bucket.offset); 912 if (bucket.offset == ca->mi.nbuckets) 913 goto next; 914 915 ret = bpos_le(bucket_pos_to_bp_end(ca, bucket), k.k->p); 916 if (ret) 917 break; 918 next: 919 pos = SPOS(pos.inode + 1, 0, 0); 920 } 921 rcu_read_unlock(); 922 923 return ret; 924 } 925 case KEY_TYPE_btree_ptr: 926 return true; 927 default: 928 return false; 929 } 930 } 931 932 static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k, 933 enum btree_id btree, unsigned level) 934 { 935 struct btree_iter iter; 936 bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, 0, level, 0); 937 struct btree *b = bch2_btree_iter_peek_node(&iter); 938 int ret = PTR_ERR_OR_ZERO(b); 939 if (ret) 940 goto err; 941 942 if (b) 943 bch2_node_pin(trans->c, b); 944 err: 945 bch2_trans_iter_exit(trans, &iter); 946 return ret; 947 } 948 949 static int bch2_pin_backpointer_nodes_with_missing(struct btree_trans *trans, 950 struct bpos start, struct bpos *end) 951 { 952 struct bch_fs *c = trans->c; 953 int ret = 0; 954 955 struct bkey_buf tmp; 956 bch2_bkey_buf_init(&tmp); 957 958 bch2_btree_cache_unpin(c); 959 960 *end = SPOS_MAX; 961 962 s64 mem_may_pin = mem_may_pin_bytes(c); 963 struct btree_iter iter; 964 bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start, 965 0, 1, BTREE_ITER_prefetch); 966 ret = for_each_btree_key_continue(trans, iter, 0, k, ({ 967 if (!backpointer_node_has_missing(c, k)) 968 continue; 969 970 mem_may_pin -= c->opts.btree_node_size; 971 if (mem_may_pin <= 0) 972 break; 973 974 bch2_bkey_buf_reassemble(&tmp, c, k); 975 struct btree_path *path = btree_iter_path(trans, &iter); 976 977 BUG_ON(path->level != 1); 978 979 bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, path->level - 1); 980 })); 981 if (ret) 982 return ret; 983 984 struct bpos pinned = SPOS_MAX; 985 mem_may_pin = mem_may_pin_bytes(c); 986 bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start, 987 0, 1, BTREE_ITER_prefetch); 988 ret = for_each_btree_key_continue(trans, iter, 0, k, ({ 989 if (!backpointer_node_has_missing(c, k)) 990 continue; 991 992 mem_may_pin -= c->opts.btree_node_size; 993 if (mem_may_pin <= 0) { 994 *end = pinned; 995 break; 996 } 997 998 bch2_bkey_buf_reassemble(&tmp, c, k); 999 struct btree_path *path = btree_iter_path(trans, &iter); 1000 1001 BUG_ON(path->level != 1); 1002 1003 int ret2 = btree_node_get_and_pin(trans, tmp.k, path->btree_id, path->level - 1); 1004 1005 if (!ret2) 1006 pinned = tmp.k->k.p; 1007 1008 ret; 1009 })); 1010 if (ret) 1011 return ret; 1012 1013 return ret; 1014 } 1015 1016 int bch2_check_extents_to_backpointers(struct bch_fs *c) 1017 { 1018 int ret = 0; 1019 1020 /* 1021 * Can't allow devices to come/go/resize while we have bucket bitmaps 1022 * allocated 1023 */ 1024 down_read(&c->state_lock); 1025 1026 for_each_member_device(c, ca) { 1027 BUG_ON(ca->bucket_backpointer_mismatches); 1028 ca->bucket_backpointer_mismatches = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), 1029 sizeof(unsigned long), 1030 GFP_KERNEL); 1031 ca->bucket_backpointer_empty = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), 1032 sizeof(unsigned long), 1033 GFP_KERNEL); 1034 if (!ca->bucket_backpointer_mismatches || 1035 !ca->bucket_backpointer_empty) { 1036 bch2_dev_put(ca); 1037 ret = -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; 1038 goto err_free_bitmaps; 1039 } 1040 } 1041 1042 struct btree_trans *trans = bch2_trans_get(c); 1043 struct extents_to_bp_state s = { .bp_start = POS_MIN }; 1044 1045 bch2_bkey_buf_init(&s.last_flushed); 1046 bkey_init(&s.last_flushed.k->k); 1047 1048 ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, 1049 POS_MIN, BTREE_ITER_prefetch, k, ({ 1050 check_bucket_backpointer_mismatch(trans, k, &s.last_flushed); 1051 })); 1052 if (ret) 1053 goto err; 1054 1055 u64 nr_buckets = 0, nr_mismatches = 0, nr_empty = 0; 1056 for_each_member_device(c, ca) { 1057 nr_buckets += ca->mi.nbuckets; 1058 nr_mismatches += bitmap_weight(ca->bucket_backpointer_mismatches, ca->mi.nbuckets); 1059 nr_empty += bitmap_weight(ca->bucket_backpointer_empty, ca->mi.nbuckets); 1060 } 1061 1062 if (!nr_mismatches && !nr_empty) 1063 goto err; 1064 1065 bch_info(c, "scanning for missing backpointers in %llu/%llu buckets", 1066 nr_mismatches + nr_empty, nr_buckets); 1067 1068 while (1) { 1069 ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end); 1070 if (ret) 1071 break; 1072 1073 if ( bpos_eq(s.bp_start, POS_MIN) && 1074 !bpos_eq(s.bp_end, SPOS_MAX)) 1075 bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", 1076 __func__, btree_nodes_fit_in_ram(c)); 1077 1078 if (!bpos_eq(s.bp_start, POS_MIN) || 1079 !bpos_eq(s.bp_end, SPOS_MAX)) { 1080 struct printbuf buf = PRINTBUF; 1081 1082 prt_str(&buf, "check_extents_to_backpointers(): "); 1083 bch2_bpos_to_text(&buf, s.bp_start); 1084 prt_str(&buf, "-"); 1085 bch2_bpos_to_text(&buf, s.bp_end); 1086 1087 bch_verbose(c, "%s", buf.buf); 1088 printbuf_exit(&buf); 1089 } 1090 1091 ret = bch2_check_extents_to_backpointers_pass(trans, &s); 1092 if (ret || bpos_eq(s.bp_end, SPOS_MAX)) 1093 break; 1094 1095 s.bp_start = bpos_successor(s.bp_end); 1096 } 1097 err: 1098 bch2_trans_put(trans); 1099 bch2_bkey_buf_exit(&s.last_flushed, c); 1100 bch2_btree_cache_unpin(c); 1101 err_free_bitmaps: 1102 for_each_member_device(c, ca) { 1103 kvfree(ca->bucket_backpointer_empty); 1104 ca->bucket_backpointer_empty = NULL; 1105 kvfree(ca->bucket_backpointer_mismatches); 1106 ca->bucket_backpointer_mismatches = NULL; 1107 } 1108 1109 up_read(&c->state_lock); 1110 bch_err_fn(c, ret); 1111 return ret; 1112 } 1113 1114 static int check_one_backpointer(struct btree_trans *trans, 1115 struct bbpos start, 1116 struct bbpos end, 1117 struct bkey_s_c bp_k, 1118 struct bkey_buf *last_flushed) 1119 { 1120 if (bp_k.k->type != KEY_TYPE_backpointer) 1121 return 0; 1122 1123 struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); 1124 struct bbpos pos = bp_to_bbpos(*bp.v); 1125 1126 if (bbpos_cmp(pos, start) < 0 || 1127 bbpos_cmp(pos, end) > 0) 1128 return 0; 1129 1130 struct btree_iter iter; 1131 struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, 0, last_flushed); 1132 int ret = bkey_err(k); 1133 if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) 1134 return 0; 1135 if (ret) 1136 return ret; 1137 1138 bch2_trans_iter_exit(trans, &iter); 1139 return ret; 1140 } 1141 1142 static int check_bucket_backpointers_to_extents(struct btree_trans *trans, 1143 struct bch_dev *ca, struct bpos bucket) 1144 { 1145 u32 restart_count = trans->restart_count; 1146 struct bkey_buf last_flushed; 1147 bch2_bkey_buf_init(&last_flushed); 1148 bkey_init(&last_flushed.k->k); 1149 1150 int ret = for_each_btree_key_max(trans, iter, BTREE_ID_backpointers, 1151 bucket_pos_to_bp_start(ca, bucket), 1152 bucket_pos_to_bp_end(ca, bucket), 1153 0, k, 1154 check_one_backpointer(trans, BBPOS_MIN, BBPOS_MAX, k, &last_flushed) 1155 ); 1156 1157 bch2_bkey_buf_exit(&last_flushed, trans->c); 1158 return ret ?: trans_was_restarted(trans, restart_count); 1159 } 1160 1161 static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, 1162 struct bbpos start, 1163 struct bbpos end) 1164 { 1165 struct bch_fs *c = trans->c; 1166 struct bkey_buf last_flushed; 1167 struct progress_indicator_state progress; 1168 1169 bch2_bkey_buf_init(&last_flushed); 1170 bkey_init(&last_flushed.k->k); 1171 bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers)); 1172 1173 int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers, 1174 POS_MIN, BTREE_ITER_prefetch, k, ({ 1175 bch2_progress_update_iter(trans, &progress, &iter, "backpointers_to_extents"); 1176 check_one_backpointer(trans, start, end, k, &last_flushed); 1177 })); 1178 1179 bch2_bkey_buf_exit(&last_flushed, c); 1180 return ret; 1181 } 1182 1183 int bch2_check_backpointers_to_extents(struct bch_fs *c) 1184 { 1185 struct btree_trans *trans = bch2_trans_get(c); 1186 struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end; 1187 int ret; 1188 1189 while (1) { 1190 ret = bch2_get_btree_in_memory_pos(trans, 1191 BIT_ULL(BTREE_ID_extents)| 1192 BIT_ULL(BTREE_ID_reflink), 1193 ~0, 1194 start, &end); 1195 if (ret) 1196 break; 1197 1198 if (!bbpos_cmp(start, BBPOS_MIN) && 1199 bbpos_cmp(end, BBPOS_MAX)) 1200 bch_verbose(c, "%s(): extents do not fit in ram, running in multiple passes with %zu nodes per pass", 1201 __func__, btree_nodes_fit_in_ram(c)); 1202 1203 if (bbpos_cmp(start, BBPOS_MIN) || 1204 bbpos_cmp(end, BBPOS_MAX)) { 1205 struct printbuf buf = PRINTBUF; 1206 1207 prt_str(&buf, "check_backpointers_to_extents(): "); 1208 bch2_bbpos_to_text(&buf, start); 1209 prt_str(&buf, "-"); 1210 bch2_bbpos_to_text(&buf, end); 1211 1212 bch_verbose(c, "%s", buf.buf); 1213 printbuf_exit(&buf); 1214 } 1215 1216 ret = bch2_check_backpointers_to_extents_pass(trans, start, end); 1217 if (ret || !bbpos_cmp(end, BBPOS_MAX)) 1218 break; 1219 1220 start = bbpos_successor(end); 1221 } 1222 bch2_trans_put(trans); 1223 1224 bch2_btree_cache_unpin(c); 1225 1226 bch_err_fn(c, ret); 1227 return ret; 1228 } 1229