/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <[email protected]>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bitops.h>
#include <trace/events/jbd2.h>
#include <asm/system.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
	unlock_buffer(bh);
}

/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct page *page;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page)
		goto nope;
	if (page->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (!trylock_page(page))
		goto nope;

	page_cache_get(page);
	__brelse(bh);
	try_to_free_buffers(page);
	unlock_page(page);
	page_cache_release(page);
	return;

nope:
	__brelse(bh);
}

/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct journal_head *descriptor;
	struct commit_header *tmp;
	struct buffer_head *bh;
	int ret;
	struct timespec now = current_kernel_time();

	if (is_journal_aborted(journal))
		return 0;

	descriptor = jbd2_journal_get_descriptor_buffer(journal);
	if (!descriptor)
		return 1;

	bh = jh2bh(descriptor);

	tmp = (struct commit_header *)bh->b_data;
	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	if (JBD2_HAS_COMPAT_FEATURE(journal,
				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
		tmp->h_chksum_type	= JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size	= JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0]	= cpu_to_be32(crc32_sum);
	}

	JBUFFER_TRACE(descriptor, "submit commit block");
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

	if (journal->j_flags & JBD2_BARRIER &&
	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
		ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
	else
		ret = submit_bh(WRITE_SYNC, bh);

	*cbh = bh;
	return ret;
}

/*
 * This function, together with journal_submit_commit_record(), allows the
 * commit record to be written asynchronously.
 */
static int journal_wait_on_commit_record(journal_t *journal,
					 struct buffer_head *bh)
{
	int ret = 0;

	clear_buffer_dirty(bh);
	wait_on_buffer(bh);

	if (unlikely(!buffer_uptodate(bh)))
		ret = -EIO;
	put_bh(bh);            /* One for getblk() */
	jbd2_journal_put_journal_head(bh2jh(bh));

	return ret;
}
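
/*
 * Taken together, the two helpers above split the commit record into a
 * "submit" half and a "wait" half.  A minimal sketch of the two orderings
 * used by jbd2_journal_commit_transaction() below (error handling elided;
 * tx stands for the committing transaction, and "async" for the journal
 * having JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT set):
 *
 *	struct buffer_head *cbh = NULL;
 *
 *	if (async)
 *		journal_submit_commit_record(journal, tx, &cbh, crc32_sum);
 *	... wait for the metadata and control block IO to complete ...
 *	if (!async)
 *		journal_submit_commit_record(journal, tx, &cbh, crc32_sum);
 *	err = journal_wait_on_commit_record(journal, cbh);
 *
 * In the async case the commit block goes to disk concurrently with the
 * metadata; the checksum stored in the commit header is what lets recovery
 * detect a commit block that reached disk before all of its metadata did.
 */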

/*
 * Write the filemap data using the writepage() address_space_operation.
 * We don't do block allocation here, even for delalloc. We don't use
 * writepages() because with delayed allocation we may be doing block
 * allocation in writepages().
 */
static int journal_submit_inode_data_buffers(struct address_space *mapping)
{
	int ret;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = mapping->nrpages * 2,
		.range_start = 0,
		.range_end = i_size_read(mapping->host),
	};

	ret = generic_writepages(mapping, &wbc);
	return ret;
}
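
/*
 * For contrast, a sketch of what the function above deliberately avoids:
 * going through ->writepages(), e.g. via
 *
 *	filemap_fdatawrite_range(mapping, 0, i_size_read(mapping->host));
 *
 * could enter the filesystem's delalloc writepages path and allocate
 * blocks while we are committing.  generic_writepages() invokes
 * ->writepage() one page at a time instead, so only already-allocated
 * blocks get written.
 */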

/*
 * Submit all the data buffers of inode associated with the transaction to
 * disk.
 *
 * We are in a committing transaction. Therefore no new inode can be added to
 * our inode list. We use the JI_COMMIT_RUNNING flag to protect the inode we
 * currently operate on from being released while we write out pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;
	struct address_space *mapping;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		mapping = jinode->i_vfs_inode->i_mapping;
		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		spin_unlock(&journal->j_list_lock);
		/*
		 * Submit the inode data buffers. We use writepage
		 * instead of writepages because writepages can do
		 * block allocation with delalloc; we need to write
		 * only allocated blocks here.
		 */
		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
		err = journal_submit_inode_data_buffers(mapping);
		if (!ret)
			ret = err;
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		commit_transaction->t_flushed_data_blocks = 1;
		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		smp_mb__after_clear_bit();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}
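
/*
 * The other side of the __JI_COMMIT_RUNNING handshake: a task that wants to
 * release a jbd2_inode must wait until commit has finished writing it out.
 * Roughly (a sketch of the pattern used by jbd2_journal_release_jbd_inode(),
 * not a verbatim copy of it):
 *
 *	spin_lock(&journal->j_list_lock);
 *	while (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) {
 *		DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
 *		wait_queue_head_t *wq = bit_waitqueue(&jinode->i_flags,
 *						      __JI_COMMIT_RUNNING);
 *
 *		prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
 *		spin_unlock(&journal->j_list_lock);
 *		schedule();
 *		finish_wait(wq, &wait.wait);
 *		spin_lock(&journal->j_list_lock);
 *	}
 *	spin_unlock(&journal->j_list_lock);
 *
 * The smp_mb__after_clear_bit() + wake_up_bit() pair in the loop above is
 * what guarantees such a waiter eventually sees the bit clear.
 */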

/*
 * Wait for data submitted for writeout, refile inodes to proper
 * transaction if needed.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		spin_unlock(&journal->j_list_lock);
		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
		if (err) {
			/*
			 * Because AS_EIO is cleared by
			 * filemap_fdatawait_range(), set it again so
			 * that user process can get -EIO from fsync().
			 */
			set_bit(AS_EIO,
				&jinode->i_vfs_inode->i_mapping->flags);

			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		smp_mb__after_clear_bit();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inode to proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				&jinode->i_transaction->t_inode_list);
		} else {
			jinode->i_transaction = NULL;
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}

static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
	struct page *page = bh->b_page;
	char *addr;
	__u32 checksum;

	addr = kmap_atomic(page, KM_USER0);
	checksum = crc32_be(crc32_sum,
		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
	kunmap_atomic(addr, KM_USER0);

	return checksum;
}
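
/*
 * The transaction checksum is chained across every block written to the
 * log: the commit path seeds crc32_sum with ~0 and folds in each buffer in
 * submission order, e.g. (simplified from the descriptor loop below):
 *
 *	__u32 crc32_sum = ~0;
 *
 *	for (i = 0; i < bufs; i++)
 *		crc32_sum = jbd2_checksum_data(crc32_sum, wbuf[i]);
 *
 * The final value is what journal_submit_commit_record() stores in
 * h_chksum[0], so recovery can recompute the same running CRC over the
 * same blocks and compare.
 */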

static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
			    unsigned long long block)
{
	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
	if (tag_bytes > JBD2_TAG_SIZE32)
		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
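
/*
 * Worked example: with 64-bit tags (JBD2_FEATURE_INCOMPAT_64BIT, so
 * tag_bytes > JBD2_TAG_SIZE32) and block == 0x123456789ULL, the tag ends
 * up holding:
 *
 *	t_blocknr      = cpu_to_be32(0x23456789);	(low 32 bits)
 *	t_blocknr_high = cpu_to_be32(0x00000001);	(bits 32..63)
 *
 * The split shift ((block >> 31) >> 1) computes block >> 32 without ever
 * shifting by the full width of a 32-bit type -- the usual reason for this
 * idiom is to sidestep undefined behaviour and compiler warnings on
 * configurations where block numbers are only 32 bits wide.
 */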

/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
	struct transaction_stats_s stats;
	transaction_t *commit_transaction;
	struct journal_head *jh, *new_jh, *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned long long blocknr;
	ktime_t start_time;
	u64 commit_time;
	char *tagp = NULL;
	journal_header_t *header;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i, to_free = 0;
	int tag_bytes = journal_tag_bytes(journal);
	struct buffer_head *cbh = NULL; /* For transactional checksums */
	__u32 crc32_sum = ~0;
	struct blk_plug plug;

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

#ifdef COMMIT_STATS
	spin_lock(&journal->j_list_lock);
	summarise_journal_usage(journal);
	spin_unlock(&journal->j_list_lock);
#endif

	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
	if (journal->j_flags & JBD2_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		jbd2_journal_update_superblock(journal, 1);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	commit_transaction = journal->j_running_transaction;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);

	trace_jbd2_start_commit(journal, commit_transaction);
	jbd_debug(1, "JBD: starting commit of transaction %d\n",
			commit_transaction->t_tid);

	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_LOCKED;

	trace_jbd2_commit_locking(journal, commit_transaction);
	stats.run.rs_wait = commit_transaction->t_max_wait;
	stats.run.rs_locked = jiffies;
	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
					      stats.run.rs_locked);

	spin_lock(&commit_transaction->t_handle_lock);
	while (atomic_read(&commit_transaction->t_updates)) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
					TASK_UNINTERRUPTIBLE);
		if (atomic_read(&commit_transaction->t_updates)) {
			spin_unlock(&commit_transaction->t_handle_lock);
			write_unlock(&journal->j_state_lock);
			schedule();
			write_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);

	J_ASSERT(atomic_read(&commit_transaction->t_outstanding_credits) <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a jbd2_journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple jbd2_journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			jbd_lock_bh_state(bh);
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			jbd_unlock_bh_state(bh);
		}
		jbd2_journal_refile_buffer(journal, jh);
	}

	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory.
	 */
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_clean_checkpoint_list(journal);
	spin_unlock(&journal->j_list_lock);

	jbd_debug(3, "JBD: commit phase 1\n");

	/*
	 * Switch to a new revoke table.
	 */
	jbd2_journal_switch_revoke_table(journal);

	trace_jbd2_commit_flushing(journal, commit_transaction);
	stats.run.rs_flushing = jiffies;
	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
					     stats.run.rs_flushing);

	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	start_time = ktime_get();
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	write_unlock(&journal->j_state_lock);

	jbd_debug(3, "JBD: commit phase 2\n");

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	err = journal_submit_data_buffers(journal, commit_transaction);
	if (err)
		jbd2_journal_abort(journal, err);

	blk_start_plug(&plug);
	jbd2_journal_write_revoke_records(journal, commit_transaction,
					  WRITE_SYNC);
	blk_finish_plug(&plug);
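
/*
 * Block-layer plugging batches the bios generated while writing the revoke
 * records so they can be merged and dispatched as one burst.  The pattern,
 * in miniature:
 *
 *	struct blk_plug plug;
 *
 *	blk_start_plug(&plug);
 *	for (i = 0; i < nr; i++)
 *		submit_bh(WRITE_SYNC, bh[i]);	(queued on the plug)
 *	blk_finish_plug(&plug);			(dispatched here)
 *
 * The same start/finish pair brackets the metadata writes below.
 */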

	jbd_debug(3, "JBD: commit phase 2\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	write_unlock(&journal->j_state_lock);

	trace_jbd2_commit_logging(journal, commit_transaction);
	stats.run.rs_logging = jiffies;
	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
					       stats.run.rs_logging);
	stats.run.rs_blocks =
		atomic_read(&commit_transaction->t_outstanding_credits);
	stats.run.rs_blocks_logged = 0;

	J_ASSERT(commit_transaction->t_nr_buffers <=
		 atomic_read(&commit_transaction->t_outstanding_credits));

	err = 0;
	descriptor = NULL;
	bufs = 0;
	blk_start_plug(&plug);
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it. */

		if (is_journal_aborted(journal)) {
			clear_buffer_jbddirty(jh2bh(jh));
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			jbd2_buffer_abort_trigger(jh,
						  jh->b_frozen_data ?
						  jh->b_frozen_triggers :
						  jh->b_triggers);
			jbd2_journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			struct buffer_head *bh;

			J_ASSERT(bufs == 0);

			jbd_debug(4, "JBD: get descriptor\n");

			descriptor = jbd2_journal_get_descriptor_buffer(journal);
			if (!descriptor) {
				jbd2_journal_abort(journal, -EIO);
				continue;
			}

			bh = jh2bh(descriptor);
			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
				(unsigned long long)bh->b_blocknr, bh->b_data);
			header = (journal_header_t *)&bh->b_data[0];
			header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);

			tagp = &bh->b_data[sizeof(journal_header_t)];
			space_left = bh->b_size - sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(bh);
			set_buffer_dirty(bh);
			wbuf[bufs++] = bh;

			/* Record it so that we can wait for IO
			   completion later */
			BUFFER_TRACE(bh, "ph3: file as descriptor");
			jbd2_journal_file_buffer(descriptor, commit_transaction,
					BJ_LogCtl);
		}

		/* Where is the buffer to be written? */

		err = jbd2_journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			jbd2_journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log, but this counter is changed
		 * by jbd2_journal_next_log_block() also.
		 */
		atomic_dec(&commit_transaction->t_outstanding_credits);

		/* Bump b_count to prevent truncate from stumbling over
		   the shadowed buffer!  @@@ This can go if we ever get
		   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/* Make a temporary IO buffer with which to write it out
		   (this will requeue both the metadata buffer and the
		   temporary IO buffer). new_bh goes on BJ_IO. */

		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		/*
		 * akpm: jbd2_journal_write_metadata_buffer() sets
		 * new_bh->b_transaction to commit_transaction.
		 * We need to clean this up before we release new_bh
		 * (which is of type BJ_IO)
		 */
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
						      jh, &new_jh, blocknr);
		if (flags < 0) {
			jbd2_journal_abort(journal, flags);
			continue;
		}
		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
		wbuf[bufs++] = jh2bh(new_jh);

		/* Record the new block's tag in the current descriptor
		   buffer */

		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JBD2_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JBD2_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be32(tag_flag);
		tagp += tag_bytes;
		space_left -= tag_bytes;

		if (first_tag) {
			memcpy(tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < tag_bytes + 16) {

			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
			   submitting the IOs.  "tag" still points to
			   the last tag we set up. */

			tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);

start_journal_io:
			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				/*
				 * Compute checksum.
				 */
				if (JBD2_HAS_COMPAT_FEATURE(journal,
					JBD2_FEATURE_COMPAT_CHECKSUM)) {
					crc32_sum =
					    jbd2_checksum_data(crc32_sum, bh);
				}

				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(WRITE_SYNC, bh);
			}
			cond_resched();
			stats.run.rs_blocks_logged += bufs;

			/* Force a new descriptor to be generated next
			   time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	err = journal_finish_inode_data_buffers(journal, commit_transaction);
	if (err) {
		printk(KERN_WARNING
		       "JBD2: Detected IO errors while flushing file data "
		       "on %s\n", journal->j_devname);
		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
			jbd2_journal_abort(journal, err);
		err = 0;
	}

	/*
	 * If the journal is not located on the file system device,
	 * then we must flush the file system device before we issue
	 * the commit record.
	 */
	if (commit_transaction->t_flushed_data_blocks &&
	    (journal->j_fs_dev != journal->j_dev) &&
	    (journal->j_flags & JBD2_BARRIER))
		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);

	/* Done it all: now write the commit record asynchronously. */
	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						   &cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}

	blk_finish_plug(&plug);

	/* Lo and behold: we have just managed to send a transaction to
	   the log.  Before we can commit it, wait for the IO so far to
	   complete.  Control buffers being written are on the
	   transaction's t_log_list queue, and metadata buffers are on
	   the t_iobuf_list queue.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	*/

	jbd_debug(3, "JBD: commit phase 3\n");

	/*
	 * akpm: these are BJ_IO, and j_list_lock is not needed.
	 * See __journal_try_to_free_buffer.
	 */
wait_for_iobuf:
	while (commit_transaction->t_iobuf_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_iobuf_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_iobuf;
		}
		if (cond_resched())
			goto wait_for_iobuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		clear_buffer_jwrite(bh);

		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
		jbd2_journal_unfile_buffer(journal, jh);

		/*
		 * ->t_iobuf_list should contain only dummy buffer_heads
		 * which were created by jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to unlock and free the corresponding
		   shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_bit(BH_JWrite, &bh->b_state);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));

		/* The metadata is now released for reuse, but we need
		   to remember it against this transaction so that when
		   we finally commit, we can do any checkpointing
		   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		/* Wake up any transactions which were waiting for this
		   IO to complete */
		wake_up_bit(&bh->b_state, BH_Unshadow);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT(commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD: commit phase 4\n");

	/* Here we wait for the revoke record and descriptor record buffers */
wait_for_ctlbuf:
	while (commit_transaction->t_log_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_log_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_ctlbuf;
		}
		if (cond_resched())
			goto wait_for_ctlbuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_journal_unfile_buffer(journal, jh);
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		jbd2_journal_abort(journal, err);

	jbd_debug(3, "JBD: commit phase 5\n");

	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						   &cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}
	if (!err && !is_journal_aborted(journal))
		err = journal_wait_on_commit_record(journal, cbh);
	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
	    journal->j_flags & JBD2_BARRIER) {
		blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
	}

	if (err)
		jbd2_journal_abort(journal, err);

	/* End of a transaction!  Finally, we can do checkpoint
	   processing: any buffers committed as a result of this
	   transaction can be removed from any checkpoint list they were
	   on before. */

	jbd_debug(3, "JBD: commit phase 6\n");

	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);
	J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 *
		 * We also know that the frozen data has already fired
		 * its triggers if they exist, so we can clear that too.
		 */
		if (jh->b_committed_data) {
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
				jh->b_frozen_triggers = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
			jh->b_frozen_triggers = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			cp_transaction->t_chp_stats.cs_dropped++;
			__jbd2_journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/* A buffer which has been freed while still being
		 * journaled by a previous transaction may end up still
		 * being dirty here, but we want to avoid writing back
		 * that buffer in the future after the "add to orphan"
		 * operation has been committed.  That's not only a performance
		 * gain, it also stops aliasing problems if the buffer is
		 * left behind for writeback and gets reallocated for another
		 * use in a different page. */
		if (buffer_freed(bh) && !jh->b_next_transaction) {
			clear_buffer_freed(bh);
			clear_buffer_jbddirty(bh);
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
			__jbd2_journal_refile_buffer(jh);
			jbd_unlock_bh_state(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/* A buffer on the BJ_Forget list that is not jbddirty
			 * has been freed by this transaction and hence it
			 * could not have been reallocated until this
			 * transaction has committed.  *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on the
			 * BJ_Forget list. */
			JBUFFER_TRACE(jh, "refile or unfile freed buffer");
			__jbd2_journal_refile_buffer(jh);
			if (!jh->b_transaction) {
				jbd_unlock_bh_state(bh);
				/* needs a brelse */
				jbd2_journal_remove_journal_head(bh);
				release_buffer_page(bh);
			} else
				jbd_unlock_bh_state(bh);
		}
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect the transition
	 * of a transaction into T_FINISHED state and the calling of
	 * __jbd2_journal_drop_transaction().  Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	write_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		write_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Done with this transaction! */

	jbd_debug(3, "JBD: commit phase 7\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT);

	commit_transaction->t_start = jiffies;
	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
					      commit_transaction->t_start);

	/*
	 * File the transaction statistics
	 */
	stats.ts_tid = commit_transaction->t_tid;
	stats.run.rs_handle_count =
		atomic_read(&commit_transaction->t_handle_count);
	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
			     commit_transaction->t_tid, &stats.run);

	/*
	 * Calculate overall stats
	 */
	spin_lock(&journal->j_history_lock);
	journal->j_stats.ts_tid++;
	journal->j_stats.run.rs_wait += stats.run.rs_wait;
	journal->j_stats.run.rs_running += stats.run.rs_running;
	journal->j_stats.run.rs_locked += stats.run.rs_locked;
	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
	journal->j_stats.run.rs_logging += stats.run.rs_logging;
	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
	spin_unlock(&journal->j_history_lock);

	commit_transaction->t_state = T_FINISHED;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

	/*
	 * Weight the existing average more heavily than the new commit
	 * time so we don't react too strongly to vast changes in the
	 * commit time.
	 */
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time +
				journal->j_average_commit_time*3) / 4;
	else
		journal->j_average_commit_time = commit_time;
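
	/*
	 * That is an exponential moving average with weight 1/4 on the
	 * new sample.  For example, with a running average of 8 ms and a
	 * new commit_time of 16 ms:
	 *
	 *	(16 + 8 * 3) / 4 = 10 ms
	 *
	 * so a single slow commit moves the average only a quarter of the
	 * way toward it.
	 */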
	write_unlock(&journal->j_state_lock);

	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
		to_free = 1;
	} else {
		if (journal->j_checkpoint_transactions == NULL) {
			journal->j_checkpoint_transactions = commit_transaction;
			commit_transaction->t_cpnext = commit_transaction;
			commit_transaction->t_cpprev = commit_transaction;
		} else {
			commit_transaction->t_cpnext =
				journal->j_checkpoint_transactions;
			commit_transaction->t_cpprev =
				commit_transaction->t_cpnext->t_cpprev;
			commit_transaction->t_cpnext->t_cpprev =
				commit_transaction;
			commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
		}
	}
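
	/*
	 * The checkpoint transactions form a circular doubly-linked list
	 * through t_cpnext/t_cpprev, and the insertion above places the
	 * committing transaction at the tail.  E.g. with two transactions
	 * already queued (T1 at the head):
	 *
	 *	before:  T1 <-> T2 <-> (wraps to T1)
	 *	after:   T1 <-> T2 <-> Tnew <-> (wraps to T1)
	 *
	 * so checkpointing always walks transactions oldest-first.
	 */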
	spin_unlock(&journal->j_list_lock);

	if (journal->j_commit_callback)
		journal->j_commit_callback(journal, commit_transaction);

	trace_jbd2_end_commit(journal, commit_transaction);
	jbd_debug(1, "JBD: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);
	if (to_free)
		kfree(commit_transaction);

	wake_up(&journal->j_wait_done_commit);
}