// SPDX-License-Identifier: GPL-2.0+
/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <[email protected]>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bitops.h>
#include <trace/events/jbd2.h>

/*
 * IO end handler for temporary buffer_heads handling writes to the journal.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	struct buffer_head *orig_bh = bh->b_private;

	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
	if (orig_bh) {
		clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
		smp_mb__after_atomic();
		wake_up_bit(&orig_bh->b_state, BH_Shadow);
	}
	unlock_buffer(bh);
}

/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct folio *folio;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	folio = bh->b_folio;
	if (folio->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (!folio_trylock(folio))
		goto nope;

	folio_get(folio);
	__brelse(bh);
	try_to_free_buffers(folio);
	folio_unlock(folio);
	folio_put(folio);
	return;

nope:
	__brelse(bh);
}

static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
{
	struct commit_header *h;
	__u32 csum;

	if (!jbd2_journal_has_csum_v2or3(j))
		return;

	h = (struct commit_header *)(bh->b_data);
	h->h_chksum_type = 0;
	h->h_chksum_size = 0;
	h->h_chksum[0] = 0;
	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
	h->h_chksum[0] = cpu_to_be32(csum);
}

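/*
 * Verification sketch (an assumption about the recovery side, which lives
 * outside this file; names below are illustrative): a reader checking a
 * commit block recomputes the checksum the same way the setter above
 * produces it, with h_chksum[0] zeroed first:
 *
 *	provided = h->h_chksum[0];
 *	h->h_chksum[0] = 0;
 *	calculated = jbd2_chksum(j, j->j_csum_seed, bh->b_data,
 *				 j->j_blocksize);
 *	if (provided != cpu_to_be32(calculated))
 *		... treat the commit block as corrupt or torn ...
 */
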
/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct commit_header *tmp;
	struct buffer_head *bh;
	struct timespec64 now;
	blk_opf_t write_flags = REQ_OP_WRITE | REQ_SYNC;

	*cbh = NULL;

	if (is_journal_aborted(journal))
		return 0;

	bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
						JBD2_COMMIT_BLOCK);
	if (!bh)
		return 1;

	tmp = (struct commit_header *)bh->b_data;
	ktime_get_coarse_real_ts64(&now);
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	if (jbd2_has_feature_checksum(journal)) {
		tmp->h_chksum_type = JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
	}
	jbd2_commit_block_csum_set(journal, bh);

	BUFFER_TRACE(bh, "submit commit block");
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

	if (journal->j_flags & JBD2_BARRIER &&
	    !jbd2_has_feature_async_commit(journal))
		write_flags |= REQ_PREFLUSH | REQ_FUA;

	submit_bh(write_flags, bh);
	*cbh = bh;
	return 0;
}

/*
 * This function along with journal_submit_commit_record
 * allows writing the commit record asynchronously.
 */
static int journal_wait_on_commit_record(journal_t *journal,
					 struct buffer_head *bh)
{
	int ret = 0;

	clear_buffer_dirty(bh);
	wait_on_buffer(bh);

	if (unlikely(!buffer_uptodate(bh)))
		ret = -EIO;
	put_bh(bh);		/* One for getblk() */

	return ret;
}

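/*
 * Usage sketch (mirroring the call sites in jbd2_journal_commit_transaction()
 * below): with async commit the record is submitted while the rest of the
 * log IO is still in flight, and only waited on afterwards:
 *
 *	err = journal_submit_commit_record(journal, commit_transaction,
 *					   &cbh, crc32_sum);
 *	... submit remaining log blocks ...
 *	if (cbh)
 *		err = journal_wait_on_commit_record(journal, cbh);
 */
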
/*
 * Write the filemap data using the writepage() address_space_operation.
 * We don't do block allocation here, even for delalloc.  We don't use
 * writepages() because with delayed allocation we may be doing block
 * allocation in writepages().
 */
int jbd2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
{
	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
	struct writeback_control wbc = {
		.sync_mode =  WB_SYNC_ALL,
		.nr_to_write = mapping->nrpages * 2,
		.range_start = jinode->i_dirty_start,
		.range_end = jinode->i_dirty_end,
	};

	/*
	 * Submit the inode data buffers.  We use writepage
	 * instead of writepages, because writepages can do
	 * block allocation with delalloc and we need to write
	 * only already-allocated blocks here.
	 */
	return generic_writepages(mapping, &wbc);
}

/* Send all the data buffers related to an inode */
int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode)
{
	if (!jinode || !(jinode->i_flags & JI_WRITE_DATA))
		return 0;

	trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
	return journal->j_submit_inode_data_buffers(jinode);
}
EXPORT_SYMBOL(jbd2_submit_inode_data);

int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode)
{
	if (!jinode || !(jinode->i_flags & JI_WAIT_DATA) ||
	    !jinode->i_vfs_inode || !jinode->i_vfs_inode->i_mapping)
		return 0;
	return filemap_fdatawait_range_keep_errors(
		jinode->i_vfs_inode->i_mapping, jinode->i_dirty_start,
		jinode->i_dirty_end);
}
EXPORT_SYMBOL(jbd2_wait_inode_data);

/*
 * Submit all the data buffers of the inodes associated with the transaction
 * to disk.
 *
 * We are in a committing transaction.  Therefore no new inode can be added
 * to our inode list.  We use the JI_COMMIT_RUNNING flag to protect the inode
 * we currently operate on from being released while we write out pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		if (!(jinode->i_flags & JI_WRITE_DATA))
			continue;
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		/* submit the inode data buffers. */
		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
		if (journal->j_submit_inode_data_buffers) {
			err = journal->j_submit_inode_data_buffers(jinode);
			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		smp_mb();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}

int jbd2_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
{
	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;

	return filemap_fdatawait_range_keep_errors(mapping,
						   jinode->i_dirty_start,
						   jinode->i_dirty_end);
}

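/*
 * Wiring note (sketch of the contract as this file relies on it):
 * jbd2_journal_submit_inode_data_buffers() and
 * jbd2_journal_finish_inode_data_buffers() are understood to be the stock
 * implementations behind the per-journal j_submit_inode_data_buffers /
 * j_finish_inode_data_buffers callbacks invoked below.  A client filesystem
 * may install its own, but commit assumes the same pairing: submit during
 * phase 2a, then wait before the commit record goes out.
 */
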
/*
 * Wait for data submitted for writeout, and refile inodes to the proper
 * transaction if needed.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		if (!(jinode->i_flags & JI_WAIT_DATA))
			continue;
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		/* wait for the inode data buffers writeout. */
		if (journal->j_finish_inode_data_buffers) {
			err = journal->j_finish_inode_data_buffers(jinode);
			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		smp_mb();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inodes to proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				&jinode->i_transaction->t_inode_list);
		} else {
			jinode->i_transaction = NULL;
			jinode->i_dirty_start = 0;
			jinode->i_dirty_end = 0;
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}

static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
	struct page *page = bh->b_page;
	char *addr;
	__u32 checksum;

	addr = kmap_atomic(page);
	checksum = crc32_be(crc32_sum,
		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
	kunmap_atomic(addr);

	return checksum;
}

static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
			    unsigned long long block)
{
	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
	if (jbd2_has_feature_64bit(j))
		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}

static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
				    struct buffer_head *bh, __u32 sequence)
{
	journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
	struct page *page = bh->b_page;
	__u8 *addr;
	__u32 csum32;
	__be32 seq;

	if (!jbd2_journal_has_csum_v2or3(j))
		return;

	seq = cpu_to_be32(sequence);
	addr = kmap_atomic(page);
	csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
	csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
			     bh->b_size);
	kunmap_atomic(addr);

	if (jbd2_has_feature_csum3(j))
		tag3->t_checksum = cpu_to_be32(csum32);
	else
		tag->t_checksum = cpu_to_be16(csum32);
}

/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
	struct transaction_stats_s stats;
	transaction_t *commit_transaction;
	struct journal_head *jh;
	struct buffer_head *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned long long blocknr;
	ktime_t start_time;
	u64 commit_time;
	char *tagp = NULL;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i;
	int tag_bytes = journal_tag_bytes(journal);
	struct buffer_head *cbh = NULL; /* For transactional checksums */
	__u32 crc32_sum = ~0;
	struct blk_plug plug;
	/* Tail of the journal */
	unsigned long first_block;
	tid_t first_tid;
	int update_tail;
	int csum_size = 0;
	LIST_HEAD(io_bufs);
	LIST_HEAD(log_bufs);

	if (jbd2_journal_has_csum_v2or3(journal))
		csum_size = sizeof(struct jbd2_journal_block_tail);

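	/*
	 * Road map for the jbd2_debug() phase markers below: phase 1 ends
	 * once the transaction is locked down and drained, phase 2a submits
	 * the data buffers, phase 2b writes the revoke records, the main
	 * loop then logs the metadata and descriptor blocks, phase 3 waits
	 * for the metadata IO, phase 4 waits for the control blocks, phase 5
	 * writes and waits on the commit record, phase 6 does checkpoint
	 * processing of the forget list, and phase 7 files the statistics
	 * and finishes the transaction.
	 */
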
	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
	if (journal->j_flags & JBD2_FLUSHED) {
		jbd2_debug(3, "super block updated\n");
		mutex_lock_io(&journal->j_checkpoint_mutex);
		/*
		 * We hold j_checkpoint_mutex so tail cannot change under us.
		 * We don't need any special data guarantees for writing sb
		 * since journal is empty and it is ok for write to be
		 * flushed only with transaction commit.
		 */
		jbd2_journal_update_sb_log_tail(journal,
						journal->j_tail_sequence,
						journal->j_tail,
						REQ_SYNC);
		mutex_unlock(&journal->j_checkpoint_mutex);
	} else {
		jbd2_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	write_lock(&journal->j_state_lock);
	journal->j_flags |= JBD2_FULL_COMMIT_ONGOING;
	while (journal->j_flags & JBD2_FAST_COMMIT_ONGOING) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_fc_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		write_unlock(&journal->j_state_lock);
		schedule();
		write_lock(&journal->j_state_lock);
		finish_wait(&journal->j_fc_wait, &wait);
		/*
		 * TODO: by blocking fast commits here, we are increasing
		 * fsync() latency slightly. Strictly speaking, we don't need
		 * to block fast commits until the transaction enters T_FLUSH
		 * state. So an optimization is possible where we block new
		 * fast commits here and wait for existing ones to complete
		 * just before we enter T_FLUSH. That way, the existing fast
		 * commits and this full commit can proceed in parallel.
		 */
	}
	write_unlock(&journal->j_state_lock);

	commit_transaction = journal->j_running_transaction;

	trace_jbd2_start_commit(journal, commit_transaction);
	jbd2_debug(1, "JBD2: starting commit of transaction %d\n",
		   commit_transaction->t_tid);

	write_lock(&journal->j_state_lock);
	journal->j_fc_off = 0;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);
	commit_transaction->t_state = T_LOCKED;

	trace_jbd2_commit_locking(journal, commit_transaction);
	stats.run.rs_wait = commit_transaction->t_max_wait;
	stats.run.rs_request_delay = 0;
	stats.run.rs_locked = jiffies;
	if (commit_transaction->t_requested)
		stats.run.rs_request_delay =
			jbd2_time_diff(commit_transaction->t_requested,
				       stats.run.rs_locked);
	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
					      stats.run.rs_locked);

	/* Wait for any outstanding t_updates to finish */
	jbd2_journal_wait_updates(journal);

	commit_transaction->t_state = T_SWITCH;

	J_ASSERT(atomic_read(&commit_transaction->t_outstanding_credits) <=
		 journal->j_max_transaction_buffers);

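	/*
	 * From here on the transaction walks the commit state machine that
	 * the asserts below check: T_RUNNING -> T_LOCKED -> T_SWITCH ->
	 * T_FLUSH -> T_COMMIT -> T_COMMIT_DFLUSH -> T_COMMIT_JFLUSH ->
	 * T_COMMIT_CALLBACK -> T_FINISHED.
	 */
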
	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a jbd2_journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple jbd2_journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 * We use journal->j_state_lock here to serialize processing of
	 * t_reserved_list with eviction of buffers from journal_unmap_buffer().
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer()
		 * may leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			spin_lock(&jh->b_state_lock);
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			spin_unlock(&jh->b_state_lock);
		}
		jbd2_journal_refile_buffer(journal, jh);
	}

	write_unlock(&journal->j_state_lock);
	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it
	 * potentially frees some memory.
	 */
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_clean_checkpoint_list(journal, false);
	spin_unlock(&journal->j_list_lock);

	jbd2_debug(3, "JBD2: commit phase 1\n");

	/*
	 * Clear the revoked flag to reflect that there are no revoked
	 * buffers in the next transaction which is going to be started.
	 */
	jbd2_clear_buffer_revoked_flags(journal);

	/*
	 * Switch to a new revoke table.
	 */
	jbd2_journal_switch_revoke_table(journal);

	write_lock(&journal->j_state_lock);
	/*
	 * Reserved credits cannot be claimed anymore, free them
	 */
	atomic_sub(atomic_read(&journal->j_reserved_credits),
		   &commit_transaction->t_outstanding_credits);

	trace_jbd2_commit_flushing(journal, commit_transaction);
	stats.run.rs_flushing = jiffies;
	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
					     stats.run.rs_flushing);

	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	start_time = ktime_get();
	commit_transaction->t_log_start = journal->j_head;
	wake_up_all(&journal->j_wait_transaction_locked);
	write_unlock(&journal->j_state_lock);

	jbd2_debug(3, "JBD2: commit phase 2a\n");

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	err = journal_submit_data_buffers(journal, commit_transaction);
	if (err)
		jbd2_journal_abort(journal, err);

	blk_start_plug(&plug);
	jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);

	jbd2_debug(3, "JBD2: commit phase 2b\n");

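	/*
	 * On-disk layout of the descriptor blocks filled in below (a sketch,
	 * following the tagp/space_left arithmetic): a journal_header_t,
	 * then a packed array of journal_block_tag_t entries, with the
	 * 16-byte journal UUID spliced in after the first tag only.  The
	 * last tag carries JBD2_FLAG_LAST_TAG, and csum_size bytes stay in
	 * reserve at the end for the descriptor block tail checksum.
	 */
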
	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	write_unlock(&journal->j_state_lock);

	trace_jbd2_commit_logging(journal, commit_transaction);
	stats.run.rs_logging = jiffies;
	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
					       stats.run.rs_logging);
	stats.run.rs_blocks = commit_transaction->t_nr_buffers;
	stats.run.rs_blocks_logged = 0;

	J_ASSERT(commit_transaction->t_nr_buffers <=
		 atomic_read(&commit_transaction->t_outstanding_credits));

	err = 0;
	bufs = 0;
	descriptor = NULL;
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it. */

		if (is_journal_aborted(journal)) {
			clear_buffer_jbddirty(jh2bh(jh));
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			jbd2_buffer_abort_trigger(jh,
						  jh->b_frozen_data ?
						  jh->b_frozen_triggers :
						  jh->b_triggers);
			jbd2_journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			J_ASSERT(bufs == 0);

			jbd2_debug(4, "JBD2: get descriptor\n");

			descriptor = jbd2_journal_get_descriptor_buffer(
							commit_transaction,
							JBD2_DESCRIPTOR_BLOCK);
			if (!descriptor) {
				jbd2_journal_abort(journal, -EIO);
				continue;
			}

			jbd2_debug(4, "JBD2: got buffer %llu (%p)\n",
				(unsigned long long)descriptor->b_blocknr,
				descriptor->b_data);
			tagp = &descriptor->b_data[sizeof(journal_header_t)];
			space_left = descriptor->b_size -
						sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(descriptor);
			set_buffer_dirty(descriptor);
			wbuf[bufs++] = descriptor;

			/* Record it so that we can wait for IO
			   completion later */
			BUFFER_TRACE(descriptor, "ph3: file as descriptor");
			jbd2_file_log_bh(&log_bufs, descriptor);
		}

		/* Where is the buffer to be written? */

		err = jbd2_journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			jbd2_journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log.
		 */
		atomic_dec(&commit_transaction->t_outstanding_credits);

		/* Bump b_count to prevent truncate from stumbling over
		   the shadowed buffer!  @@@ This can go if we ever get
		   rid of the shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

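		/*
		 * Note on the flags returned by the call below (a sketch of
		 * the escaping convention as we understand it): the low bit
		 * reports that the log copy had to be "escaped", i.e. the
		 * block happened to begin with the JBD2 magic number, so
		 * that word is zeroed in the copy to keep recovery from
		 * mistaking it for a journal control block; the tag then
		 * gets JBD2_FLAG_ESCAPE so the word is restored on replay.
		 */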
		/*
		 * Make a temporary IO buffer with which to write it out
		 * (this will requeue the metadata buffer to BJ_Shadow).
		 */
		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
						jh, &wbuf[bufs], blocknr);
		if (flags < 0) {
			jbd2_journal_abort(journal, flags);
			continue;
		}
		jbd2_file_log_bh(&io_bufs, wbuf[bufs]);

		/* Record the new block's tag in the current descriptor
		   buffer */

		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JBD2_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JBD2_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be16(tag_flag);
		jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
					commit_transaction->t_tid);
		tagp += tag_bytes;
		space_left -= tag_bytes;
		bufs++;

		if (first_tag) {
			memcpy(tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < tag_bytes + 16 + csum_size) {

			jbd2_debug(4, "JBD2: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
			   submitting the IOs.  "tag" still points to
			   the last tag we set up. */

			tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
start_journal_io:
			if (descriptor)
				jbd2_descriptor_block_csum_set(journal,
							       descriptor);

			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];

				/*
				 * Compute checksum.
				 */
				if (jbd2_has_feature_checksum(journal)) {
					crc32_sum =
					    jbd2_checksum_data(crc32_sum, bh);
				}

				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(REQ_OP_WRITE | REQ_SYNC, bh);
			}
			cond_resched();

			/* Force a new descriptor to be generated next
			   time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	err = journal_finish_inode_data_buffers(journal, commit_transaction);
	if (err) {
		printk(KERN_WARNING
			"JBD2: Detected IO errors while flushing file data "
		       "on %s\n", journal->j_devname);
		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
			jbd2_journal_abort(journal, err);
		err = 0;
	}

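	/*
	 * Worked example for the tail-advance check below (made-up numbers):
	 * with j_first == 1, j_last == 1000, j_tail == 900 and
	 * first_block == 100, freed starts at 100 - 900 == -800 and the
	 * wrap correction adds j_last - j_first == 999, so 199 blocks would
	 * be reclaimed; the tail is only updated once that figure reaches
	 * jbd2_journal_get_max_txn_bufs().
	 */
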
	/*
	 * Get current oldest transaction in the log before we issue flush
	 * to the filesystem device. After the flush we can be sure that
	 * blocks of all older transactions are checkpointed to persistent
	 * storage and we will be safe to update journal start in the
	 * superblock with the numbers we get here.
	 */
	update_tail =
		jbd2_journal_get_log_tail(journal, &first_tid, &first_block);

	write_lock(&journal->j_state_lock);
	if (update_tail) {
		long freed = first_block - journal->j_tail;

		if (first_block < journal->j_tail)
			freed += journal->j_last - journal->j_first;
		/* Update tail only if we free significant amount of space */
		if (freed < jbd2_journal_get_max_txn_bufs(journal))
			update_tail = 0;
	}
	J_ASSERT(commit_transaction->t_state == T_COMMIT);
	commit_transaction->t_state = T_COMMIT_DFLUSH;
	write_unlock(&journal->j_state_lock);

	/*
	 * If the journal is not located on the file system device,
	 * then we must flush the file system device before we issue
	 * the commit record
	 */
	if (commit_transaction->t_need_data_flush &&
	    (journal->j_fs_dev != journal->j_dev) &&
	    (journal->j_flags & JBD2_BARRIER))
		blkdev_issue_flush(journal->j_fs_dev);

	/* Done it all: now write the commit record asynchronously. */
	if (jbd2_has_feature_async_commit(journal)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						   &cbh, crc32_sum);
		if (err)
			jbd2_journal_abort(journal, err);
	}

	blk_finish_plug(&plug);

	/* Lo and behold: we have just managed to send a transaction to
	   the log.  Before we can commit it, wait for the IO so far to
	   complete.  Control buffers being written are on the
	   transaction's t_log_list queue, and metadata buffers are on
	   the io_bufs list.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	 */

	jbd2_debug(3, "JBD2: commit phase 3\n");

	while (!list_empty(&io_bufs)) {
		struct buffer_head *bh = list_entry(io_bufs.prev,
						    struct buffer_head,
						    b_assoc_buffers);

		wait_on_buffer(bh);
		cond_resched();

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;
		jbd2_unfile_log_bh(bh);
		stats.run.rs_blocks_logged++;

		/*
		 * The list contains temporary buffer heads created by
		 * jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to refile the corresponding shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_buffer_jwrite(bh);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));
		J_ASSERT_BH(bh, !buffer_shadow(bh));

		/* The metadata is now released for reuse, but we need
		   to remember it against this transaction so that when
		   we finally commit, we can do any checkpointing
		   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT(commit_transaction->t_shadow_list == NULL);

	jbd2_debug(3, "JBD2: commit phase 4\n");

	/* Here we wait for the revoke record and descriptor record buffers */
	while (!list_empty(&log_bufs)) {
		struct buffer_head *bh;

		bh = list_entry(log_bufs.prev, struct buffer_head,
				b_assoc_buffers);
		wait_on_buffer(bh);
		cond_resched();

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_unfile_log_bh(bh);
		stats.run.rs_blocks_logged++;
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		jbd2_journal_abort(journal, err);

	jbd2_debug(3, "JBD2: commit phase 5\n");
	write_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
	commit_transaction->t_state = T_COMMIT_JFLUSH;
	write_unlock(&journal->j_state_lock);

	if (!jbd2_has_feature_async_commit(journal)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						   &cbh, crc32_sum);
		if (err)
			jbd2_journal_abort(journal, err);
	}
	if (cbh)
		err = journal_wait_on_commit_record(journal, cbh);
	stats.run.rs_blocks_logged++;
	if (jbd2_has_feature_async_commit(journal) &&
	    journal->j_flags & JBD2_BARRIER) {
		blkdev_issue_flush(journal->j_dev);
	}

	if (err)
		jbd2_journal_abort(journal, err);

	WARN_ON_ONCE(
		atomic_read(&commit_transaction->t_outstanding_credits) < 0);

	/*
	 * Now disk caches for filesystem device are flushed so we are safe to
	 * erase checkpointed transactions from the log by updating journal
	 * superblock.
	 */
	if (update_tail)
		jbd2_update_log_tail(journal, first_tid, first_block);

	/* End of a transaction!  Finally, we can do checkpoint
	   processing: any buffers committed as a result of this
	   transaction can be removed from any checkpoint list it was on
	   before. */

	jbd2_debug(3, "JBD2: commit phase 6\n");

	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);

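	/*
	 * The loop below drops and retakes j_list_lock for every buffer on
	 * t_forget, so journal_unmap_buffer() can add new entries behind our
	 * back; the recheck further down jumps back to restart_loop until
	 * the list is seen empty under both j_state_lock and j_list_lock.
	 */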
restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;
		int try_to_free = 0;
		bool drop_ref;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		/*
		 * Get a reference so that bh cannot be freed before we are
		 * done with it.
		 */
		get_bh(bh);
		spin_lock(&jh->b_state_lock);
		J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 *
		 * We also know that the frozen data has already fired
		 * its triggers if they exist, so we can clear that too.
		 */
		if (jh->b_committed_data) {
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
				jh->b_frozen_triggers = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
			jh->b_frozen_triggers = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			cp_transaction->t_chp_stats.cs_dropped++;
			__jbd2_journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/*
		 * A buffer which has been freed while still being journaled
		 * by a previous transaction is refiled to BJ_Forget of the
		 * running transaction.  If the just-committed transaction
		 * contains an "add to orphan" operation, we can completely
		 * invalidate the buffer now.  We are rather thorough in that
		 * since the buffer may still be accessible when blocksize <
		 * pagesize and it is attached to the last partial page.
		 */
		if (buffer_freed(bh) && !jh->b_next_transaction) {
			struct address_space *mapping;

			clear_buffer_freed(bh);
			clear_buffer_jbddirty(bh);

			/*
			 * Block device buffers need to stay mapped all the
			 * time, so it is enough to clear buffer_jbddirty and
			 * buffer_freed bits.  For the file mapping buffers
			 * (i.e. journalled data) we need to unmap buffer and
			 * clear more bits.  We also need to be careful about
			 * the check because the data page mapping can get
			 * cleared under our hands.  Note that if mapping ==
			 * NULL, we don't need to make buffer unmapped because
			 * the page is already detached from the mapping and
			 * buffers cannot get reused.
			 */
			mapping = READ_ONCE(bh->b_folio->mapping);
			if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) {
				clear_buffer_mapped(bh);
				clear_buffer_new(bh);
				clear_buffer_req(bh);
				bh->b_bdev = NULL;
			}
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/*
			 * The buffer on BJ_Forget list and not jbddirty means
			 * it has been freed by this transaction and hence it
			 * could not have been reallocated until this
			 * transaction has committed.  *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on BJ_Forget
			 * list.
			 */
			if (!jh->b_next_transaction)
				try_to_free = 1;
		}
		JBUFFER_TRACE(jh, "refile or unfile buffer");
		drop_ref = __jbd2_journal_refile_buffer(jh);
		spin_unlock(&jh->b_state_lock);
		if (drop_ref)
			jbd2_journal_put_journal_head(jh);
		if (try_to_free)
			release_buffer_page(bh);	/* Drops bh reference */
		else
			__brelse(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	write_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		write_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Add the transaction to the checkpoint list
	 * __journal_remove_checkpoint() can not destroy transaction
	 * under us because it is not marked as T_FINISHED yet */
	if (journal->j_checkpoint_transactions == NULL) {
		journal->j_checkpoint_transactions = commit_transaction;
		commit_transaction->t_cpnext = commit_transaction;
		commit_transaction->t_cpprev = commit_transaction;
	} else {
		commit_transaction->t_cpnext =
			journal->j_checkpoint_transactions;
		commit_transaction->t_cpprev =
			commit_transaction->t_cpnext->t_cpprev;
		commit_transaction->t_cpnext->t_cpprev =
			commit_transaction;
		commit_transaction->t_cpprev->t_cpnext =
			commit_transaction;
	}
	spin_unlock(&journal->j_list_lock);

	/* Done with this transaction! */

	jbd2_debug(3, "JBD2: commit phase 7\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);

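	/*
	 * Note on the commit-time averaging further below: it is an
	 * exponentially weighted moving average with the new sample weighted
	 * at one quarter, i.e. avg' = (commit_time + 3 * avg) / 4, so a
	 * single unusually slow or fast commit does not swing
	 * j_average_commit_time.
	 */
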
	commit_transaction->t_start = jiffies;
	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
					      commit_transaction->t_start);

	/*
	 * File the transaction statistics
	 */
	stats.ts_tid = commit_transaction->t_tid;
	stats.run.rs_handle_count =
		atomic_read(&commit_transaction->t_handle_count);
	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
			     commit_transaction->t_tid, &stats.run);
	stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;

	commit_transaction->t_state = T_COMMIT_CALLBACK;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

	/*
	 * Weight the existing average higher than the new commit time so we
	 * don't react too strongly to vast changes in the commit time.
	 */
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time +
				journal->j_average_commit_time * 3) / 4;
	else
		journal->j_average_commit_time = commit_time;

	write_unlock(&journal->j_state_lock);

	if (journal->j_commit_callback)
		journal->j_commit_callback(journal, commit_transaction);
	if (journal->j_fc_cleanup_callback)
		journal->j_fc_cleanup_callback(journal, 1,
					       commit_transaction->t_tid);

	trace_jbd2_end_commit(journal, commit_transaction);
	jbd2_debug(1, "JBD2: commit %d complete, head %d\n",
		   journal->j_commit_sequence, journal->j_tail_sequence);

	write_lock(&journal->j_state_lock);
	journal->j_flags &= ~JBD2_FULL_COMMIT_ONGOING;
	journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
	spin_lock(&journal->j_list_lock);
	commit_transaction->t_state = T_FINISHED;
	/* Check if the transaction can be dropped now that we are finished */
	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
		jbd2_journal_free_transaction(commit_transaction);
	}
	spin_unlock(&journal->j_list_lock);
	write_unlock(&journal->j_state_lock);
	wake_up(&journal->j_wait_done_commit);
	wake_up(&journal->j_fc_wait);

	/*
	 * Calculate overall stats
	 */
	spin_lock(&journal->j_history_lock);
	journal->j_stats.ts_tid++;
	journal->j_stats.ts_requested += stats.ts_requested;
	journal->j_stats.run.rs_wait += stats.run.rs_wait;
	journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
	journal->j_stats.run.rs_running += stats.run.rs_running;
	journal->j_stats.run.rs_locked += stats.run.rs_locked;
	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
	journal->j_stats.run.rs_logging += stats.run.rs_logging;
	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
	spin_unlock(&journal->j_history_lock);
}