xref: /linux-6.15/fs/jbd2/commit.c (revision 7ec7fb39)
1 /*
2  * linux/fs/jbd2/commit.c
3  *
4  * Written by Stephen C. Tweedie <[email protected]>, 1998
5  *
6  * Copyright 1998 Red Hat corp --- All Rights Reserved
7  *
8  * This file is part of the Linux kernel and is made available under
9  * the terms of the GNU General Public License, version 2, or at your
10  * option, any later version, incorporated herein by reference.
11  *
12  * Journal commit routines for the generic filesystem journaling code;
13  * part of the ext2fs journaling system.
14  */
15 
16 #include <linux/time.h>
17 #include <linux/fs.h>
18 #include <linux/jbd2.h>
19 #include <linux/marker.h>
20 #include <linux/errno.h>
21 #include <linux/slab.h>
22 #include <linux/mm.h>
23 #include <linux/pagemap.h>
24 #include <linux/jiffies.h>
25 #include <linux/crc32.h>
26 #include <linux/writeback.h>
27 #include <linux/backing-dev.h>
28 
29 /*
30  * Default IO end handler for temporary BJ_IO buffer_heads.
31  */
32 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
33 {
34 	BUFFER_TRACE(bh, "");
35 	if (uptodate)
36 		set_buffer_uptodate(bh);
37 	else
38 		clear_buffer_uptodate(bh);
39 	unlock_buffer(bh);
40 }
41 
42 /*
43  * When an ext4 file is truncated, it is possible that some pages are not
44  * successfully freed, because they are attached to a committing transaction.
45  * After the transaction commits, these pages are left on the LRU, with no
46  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
47  * by the VM, but their apparent absence upsets the VM accounting, and it makes
48  * the numbers in /proc/meminfo look odd.
49  *
50  * So here, we have a buffer which has just come off the forget list.  Look to
51  * see if we can strip all buffers from the backing page.
52  *
53  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
54  * caller provided us with a ref against the buffer, and we drop that here.
55  */
56 static void release_buffer_page(struct buffer_head *bh)
57 {
58 	struct page *page;
59 
60 	if (buffer_dirty(bh))
61 		goto nope;
62 	if (atomic_read(&bh->b_count) != 1)
63 		goto nope;
64 	page = bh->b_page;
65 	if (!page)
66 		goto nope;
67 	if (page->mapping)
68 		goto nope;
69 
70 	/* OK, it's a truncated page */
71 	if (!trylock_page(page))
72 		goto nope;
73 
74 	page_cache_get(page);
75 	__brelse(bh);
76 	try_to_free_buffers(page);
77 	unlock_page(page);
78 	page_cache_release(page);
79 	return;
80 
81 nope:
82 	__brelse(bh);
83 }
84 
85 /*
86  * Done it all: now submit the commit record.  We should have
87  * cleaned up our previous buffers by now, so if we are in abort
88  * mode we can now just skip the rest of the journal write
89  * entirely.
90  *
91  * Returns 1 if the journal needs to be aborted or 0 on success
92  */
93 static int journal_submit_commit_record(journal_t *journal,
94 					transaction_t *commit_transaction,
95 					struct buffer_head **cbh,
96 					__u32 crc32_sum)
97 {
98 	struct journal_head *descriptor;
99 	struct commit_header *tmp;
100 	struct buffer_head *bh;
101 	int ret;
102 	int barrier_done = 0;
103 	struct timespec now = current_kernel_time();
104 
105 	if (is_journal_aborted(journal))
106 		return 0;
107 
108 	descriptor = jbd2_journal_get_descriptor_buffer(journal);
109 	if (!descriptor)
110 		return 1;
111 
112 	bh = jh2bh(descriptor);
113 
114 	tmp = (struct commit_header *)bh->b_data;
115 	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
116 	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
117 	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
118 	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
119 	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
120 
121 	if (JBD2_HAS_COMPAT_FEATURE(journal,
122 				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
123 		tmp->h_chksum_type 	= JBD2_CRC32_CHKSUM;
124 		tmp->h_chksum_size 	= JBD2_CRC32_CHKSUM_SIZE;
125 		tmp->h_chksum[0] 	= cpu_to_be32(crc32_sum);
126 	}
127 
128 	JBUFFER_TRACE(descriptor, "submit commit block");
129 	lock_buffer(bh);
130 	clear_buffer_dirty(bh);
131 	set_buffer_uptodate(bh);
132 	bh->b_end_io = journal_end_buffer_io_sync;
133 
134 	if (journal->j_flags & JBD2_BARRIER &&
135 		!JBD2_HAS_INCOMPAT_FEATURE(journal,
136 					 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
137 		set_buffer_ordered(bh);
138 		barrier_done = 1;
139 	}
140 	ret = submit_bh(WRITE, bh);
141 	if (barrier_done)
142 		clear_buffer_ordered(bh);
143 
144 	/* is it possible for another commit to fail at roughly
145 	 * the same time as this one?  If so, we don't want to
146 	 * trust the barrier flag in the super, but instead want
147 	 * to remember if we sent a barrier request
148 	 */
149 	if (ret == -EOPNOTSUPP && barrier_done) {
150 		printk(KERN_WARNING
151 		       "JBD: barrier-based sync failed on %s - "
152 		       "disabling barriers\n", journal->j_devname);
153 		spin_lock(&journal->j_state_lock);
154 		journal->j_flags &= ~JBD2_BARRIER;
155 		spin_unlock(&journal->j_state_lock);
156 
157 		/* And try again, without the barrier */
158 		lock_buffer(bh);
159 		set_buffer_uptodate(bh);
160 		clear_buffer_dirty(bh);
161 		ret = submit_bh(WRITE, bh);
162 	}
163 	*cbh = bh;
164 	return ret;
165 }
166 
167 /*
168  * This function along with journal_submit_commit_record
169  * allows to write the commit record asynchronously.
170  */
171 static int journal_wait_on_commit_record(struct buffer_head *bh)
172 {
173 	int ret = 0;
174 
175 	clear_buffer_dirty(bh);
176 	wait_on_buffer(bh);
177 
178 	if (unlikely(!buffer_uptodate(bh)))
179 		ret = -EIO;
180 	put_bh(bh);            /* One for getblk() */
181 	jbd2_journal_put_journal_head(bh2jh(bh));
182 
183 	return ret;
184 }
185 
186 /*
187  * write the filemap data using writepage() address_space_operations.
188  * We don't do block allocation here even for delalloc. We don't
189  * use writepages() because with dealyed allocation we may be doing
190  * block allocation in writepages().
191  */
192 static int journal_submit_inode_data_buffers(struct address_space *mapping)
193 {
194 	int ret;
195 	struct writeback_control wbc = {
196 		.sync_mode =  WB_SYNC_ALL,
197 		.nr_to_write = mapping->nrpages * 2,
198 		.range_start = 0,
199 		.range_end = i_size_read(mapping->host),
200 		.for_writepages = 1,
201 	};
202 
203 	ret = generic_writepages(mapping, &wbc);
204 	return ret;
205 }
206 
207 /*
208  * Submit all the data buffers of inode associated with the transaction to
209  * disk.
210  *
211  * We are in a committing transaction. Therefore no new inode can be added to
212  * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
213  * operate on from being released while we write out pages.
214  */
215 static int journal_submit_data_buffers(journal_t *journal,
216 		transaction_t *commit_transaction)
217 {
218 	struct jbd2_inode *jinode;
219 	int err, ret = 0;
220 	struct address_space *mapping;
221 
222 	spin_lock(&journal->j_list_lock);
223 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
224 		mapping = jinode->i_vfs_inode->i_mapping;
225 		jinode->i_flags |= JI_COMMIT_RUNNING;
226 		spin_unlock(&journal->j_list_lock);
227 		/*
228 		 * submit the inode data buffers. We use writepage
229 		 * instead of writepages. Because writepages can do
230 		 * block allocation  with delalloc. We need to write
231 		 * only allocated blocks here.
232 		 */
233 		err = journal_submit_inode_data_buffers(mapping);
234 		if (!ret)
235 			ret = err;
236 		spin_lock(&journal->j_list_lock);
237 		J_ASSERT(jinode->i_transaction == commit_transaction);
238 		jinode->i_flags &= ~JI_COMMIT_RUNNING;
239 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
240 	}
241 	spin_unlock(&journal->j_list_lock);
242 	return ret;
243 }
244 
245 /*
246  * Wait for data submitted for writeout, refile inodes to proper
247  * transaction if needed.
248  *
249  */
250 static int journal_finish_inode_data_buffers(journal_t *journal,
251 		transaction_t *commit_transaction)
252 {
253 	struct jbd2_inode *jinode, *next_i;
254 	int err, ret = 0;
255 
256 	/* For locking, see the comment in journal_submit_data_buffers() */
257 	spin_lock(&journal->j_list_lock);
258 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
259 		jinode->i_flags |= JI_COMMIT_RUNNING;
260 		spin_unlock(&journal->j_list_lock);
261 		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
262 		if (err) {
263 			/*
264 			 * Because AS_EIO is cleared by
265 			 * wait_on_page_writeback_range(), set it again so
266 			 * that user process can get -EIO from fsync().
267 			 */
268 			set_bit(AS_EIO,
269 				&jinode->i_vfs_inode->i_mapping->flags);
270 
271 			if (!ret)
272 				ret = err;
273 		}
274 		spin_lock(&journal->j_list_lock);
275 		jinode->i_flags &= ~JI_COMMIT_RUNNING;
276 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
277 	}
278 
279 	/* Now refile inode to proper lists */
280 	list_for_each_entry_safe(jinode, next_i,
281 				 &commit_transaction->t_inode_list, i_list) {
282 		list_del(&jinode->i_list);
283 		if (jinode->i_next_transaction) {
284 			jinode->i_transaction = jinode->i_next_transaction;
285 			jinode->i_next_transaction = NULL;
286 			list_add(&jinode->i_list,
287 				&jinode->i_transaction->t_inode_list);
288 		} else {
289 			jinode->i_transaction = NULL;
290 		}
291 	}
292 	spin_unlock(&journal->j_list_lock);
293 
294 	return ret;
295 }
296 
297 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
298 {
299 	struct page *page = bh->b_page;
300 	char *addr;
301 	__u32 checksum;
302 
303 	addr = kmap_atomic(page, KM_USER0);
304 	checksum = crc32_be(crc32_sum,
305 		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
306 	kunmap_atomic(addr, KM_USER0);
307 
308 	return checksum;
309 }
310 
311 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
312 				   unsigned long long block)
313 {
314 	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
315 	if (tag_bytes > JBD2_TAG_SIZE32)
316 		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
317 }
318 
319 /*
320  * jbd2_journal_commit_transaction
321  *
322  * The primary function for committing a transaction to the log.  This
323  * function is called by the journal thread to begin a complete commit.
324  */
325 void jbd2_journal_commit_transaction(journal_t *journal)
326 {
327 	struct transaction_stats_s stats;
328 	transaction_t *commit_transaction;
329 	struct journal_head *jh, *new_jh, *descriptor;
330 	struct buffer_head **wbuf = journal->j_wbuf;
331 	int bufs;
332 	int flags;
333 	int err;
334 	unsigned long long blocknr;
335 	char *tagp = NULL;
336 	journal_header_t *header;
337 	journal_block_tag_t *tag = NULL;
338 	int space_left = 0;
339 	int first_tag = 0;
340 	int tag_flag;
341 	int i;
342 	int tag_bytes = journal_tag_bytes(journal);
343 	struct buffer_head *cbh = NULL; /* For transactional checksums */
344 	__u32 crc32_sum = ~0;
345 
346 	/*
347 	 * First job: lock down the current transaction and wait for
348 	 * all outstanding updates to complete.
349 	 */
350 
351 #ifdef COMMIT_STATS
352 	spin_lock(&journal->j_list_lock);
353 	summarise_journal_usage(journal);
354 	spin_unlock(&journal->j_list_lock);
355 #endif
356 
357 	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
358 	if (journal->j_flags & JBD2_FLUSHED) {
359 		jbd_debug(3, "super block updated\n");
360 		jbd2_journal_update_superblock(journal, 1);
361 	} else {
362 		jbd_debug(3, "superblock not updated\n");
363 	}
364 
365 	J_ASSERT(journal->j_running_transaction != NULL);
366 	J_ASSERT(journal->j_committing_transaction == NULL);
367 
368 	commit_transaction = journal->j_running_transaction;
369 	J_ASSERT(commit_transaction->t_state == T_RUNNING);
370 
371 	trace_mark(jbd2_start_commit, "dev %s transaction %d",
372 		   journal->j_devname, commit_transaction->t_tid);
373 	jbd_debug(1, "JBD: starting commit of transaction %d\n",
374 			commit_transaction->t_tid);
375 
376 	spin_lock(&journal->j_state_lock);
377 	commit_transaction->t_state = T_LOCKED;
378 
379 	stats.u.run.rs_wait = commit_transaction->t_max_wait;
380 	stats.u.run.rs_locked = jiffies;
381 	stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
382 						stats.u.run.rs_locked);
383 
384 	spin_lock(&commit_transaction->t_handle_lock);
385 	while (commit_transaction->t_updates) {
386 		DEFINE_WAIT(wait);
387 
388 		prepare_to_wait(&journal->j_wait_updates, &wait,
389 					TASK_UNINTERRUPTIBLE);
390 		if (commit_transaction->t_updates) {
391 			spin_unlock(&commit_transaction->t_handle_lock);
392 			spin_unlock(&journal->j_state_lock);
393 			schedule();
394 			spin_lock(&journal->j_state_lock);
395 			spin_lock(&commit_transaction->t_handle_lock);
396 		}
397 		finish_wait(&journal->j_wait_updates, &wait);
398 	}
399 	spin_unlock(&commit_transaction->t_handle_lock);
400 
401 	J_ASSERT (commit_transaction->t_outstanding_credits <=
402 			journal->j_max_transaction_buffers);
403 
404 	/*
405 	 * First thing we are allowed to do is to discard any remaining
406 	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
407 	 * that there are no such buffers: if a large filesystem
408 	 * operation like a truncate needs to split itself over multiple
409 	 * transactions, then it may try to do a jbd2_journal_restart() while
410 	 * there are still BJ_Reserved buffers outstanding.  These must
411 	 * be released cleanly from the current transaction.
412 	 *
413 	 * In this case, the filesystem must still reserve write access
414 	 * again before modifying the buffer in the new transaction, but
415 	 * we do not require it to remember exactly which old buffers it
416 	 * has reserved.  This is consistent with the existing behaviour
417 	 * that multiple jbd2_journal_get_write_access() calls to the same
418 	 * buffer are perfectly permissable.
419 	 */
420 	while (commit_transaction->t_reserved_list) {
421 		jh = commit_transaction->t_reserved_list;
422 		JBUFFER_TRACE(jh, "reserved, unused: refile");
423 		/*
424 		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
425 		 * leave undo-committed data.
426 		 */
427 		if (jh->b_committed_data) {
428 			struct buffer_head *bh = jh2bh(jh);
429 
430 			jbd_lock_bh_state(bh);
431 			jbd2_free(jh->b_committed_data, bh->b_size);
432 			jh->b_committed_data = NULL;
433 			jbd_unlock_bh_state(bh);
434 		}
435 		jbd2_journal_refile_buffer(journal, jh);
436 	}
437 
438 	/*
439 	 * Now try to drop any written-back buffers from the journal's
440 	 * checkpoint lists.  We do this *before* commit because it potentially
441 	 * frees some memory
442 	 */
443 	spin_lock(&journal->j_list_lock);
444 	__jbd2_journal_clean_checkpoint_list(journal);
445 	spin_unlock(&journal->j_list_lock);
446 
447 	jbd_debug (3, "JBD: commit phase 1\n");
448 
449 	/*
450 	 * Switch to a new revoke table.
451 	 */
452 	jbd2_journal_switch_revoke_table(journal);
453 
454 	stats.u.run.rs_flushing = jiffies;
455 	stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
456 					       stats.u.run.rs_flushing);
457 
458 	commit_transaction->t_state = T_FLUSH;
459 	journal->j_committing_transaction = commit_transaction;
460 	journal->j_running_transaction = NULL;
461 	commit_transaction->t_log_start = journal->j_head;
462 	wake_up(&journal->j_wait_transaction_locked);
463 	spin_unlock(&journal->j_state_lock);
464 
465 	jbd_debug (3, "JBD: commit phase 2\n");
466 
467 	/*
468 	 * Now start flushing things to disk, in the order they appear
469 	 * on the transaction lists.  Data blocks go first.
470 	 */
471 	err = journal_submit_data_buffers(journal, commit_transaction);
472 	if (err)
473 		jbd2_journal_abort(journal, err);
474 
475 	jbd2_journal_write_revoke_records(journal, commit_transaction);
476 
477 	jbd_debug(3, "JBD: commit phase 2\n");
478 
479 	/*
480 	 * Way to go: we have now written out all of the data for a
481 	 * transaction!  Now comes the tricky part: we need to write out
482 	 * metadata.  Loop over the transaction's entire buffer list:
483 	 */
484 	spin_lock(&journal->j_state_lock);
485 	commit_transaction->t_state = T_COMMIT;
486 	spin_unlock(&journal->j_state_lock);
487 
488 	stats.u.run.rs_logging = jiffies;
489 	stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
490 						 stats.u.run.rs_logging);
491 	stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
492 	stats.u.run.rs_blocks_logged = 0;
493 
494 	J_ASSERT(commit_transaction->t_nr_buffers <=
495 		 commit_transaction->t_outstanding_credits);
496 
497 	err = 0;
498 	descriptor = NULL;
499 	bufs = 0;
500 	while (commit_transaction->t_buffers) {
501 
502 		/* Find the next buffer to be journaled... */
503 
504 		jh = commit_transaction->t_buffers;
505 
506 		/* If we're in abort mode, we just un-journal the buffer and
507 		   release it. */
508 
509 		if (is_journal_aborted(journal)) {
510 			clear_buffer_jbddirty(jh2bh(jh));
511 			JBUFFER_TRACE(jh, "journal is aborting: refile");
512 			jbd2_buffer_abort_trigger(jh,
513 						  jh->b_frozen_data ?
514 						  jh->b_frozen_triggers :
515 						  jh->b_triggers);
516 			jbd2_journal_refile_buffer(journal, jh);
517 			/* If that was the last one, we need to clean up
518 			 * any descriptor buffers which may have been
519 			 * already allocated, even if we are now
520 			 * aborting. */
521 			if (!commit_transaction->t_buffers)
522 				goto start_journal_io;
523 			continue;
524 		}
525 
526 		/* Make sure we have a descriptor block in which to
527 		   record the metadata buffer. */
528 
529 		if (!descriptor) {
530 			struct buffer_head *bh;
531 
532 			J_ASSERT (bufs == 0);
533 
534 			jbd_debug(4, "JBD: get descriptor\n");
535 
536 			descriptor = jbd2_journal_get_descriptor_buffer(journal);
537 			if (!descriptor) {
538 				jbd2_journal_abort(journal, -EIO);
539 				continue;
540 			}
541 
542 			bh = jh2bh(descriptor);
543 			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
544 				(unsigned long long)bh->b_blocknr, bh->b_data);
545 			header = (journal_header_t *)&bh->b_data[0];
546 			header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
547 			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
548 			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
549 
550 			tagp = &bh->b_data[sizeof(journal_header_t)];
551 			space_left = bh->b_size - sizeof(journal_header_t);
552 			first_tag = 1;
553 			set_buffer_jwrite(bh);
554 			set_buffer_dirty(bh);
555 			wbuf[bufs++] = bh;
556 
557 			/* Record it so that we can wait for IO
558                            completion later */
559 			BUFFER_TRACE(bh, "ph3: file as descriptor");
560 			jbd2_journal_file_buffer(descriptor, commit_transaction,
561 					BJ_LogCtl);
562 		}
563 
564 		/* Where is the buffer to be written? */
565 
566 		err = jbd2_journal_next_log_block(journal, &blocknr);
567 		/* If the block mapping failed, just abandon the buffer
568 		   and repeat this loop: we'll fall into the
569 		   refile-on-abort condition above. */
570 		if (err) {
571 			jbd2_journal_abort(journal, err);
572 			continue;
573 		}
574 
575 		/*
576 		 * start_this_handle() uses t_outstanding_credits to determine
577 		 * the free space in the log, but this counter is changed
578 		 * by jbd2_journal_next_log_block() also.
579 		 */
580 		commit_transaction->t_outstanding_credits--;
581 
582 		/* Bump b_count to prevent truncate from stumbling over
583                    the shadowed buffer!  @@@ This can go if we ever get
584                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
585 		atomic_inc(&jh2bh(jh)->b_count);
586 
587 		/* Make a temporary IO buffer with which to write it out
588                    (this will requeue both the metadata buffer and the
589                    temporary IO buffer). new_bh goes on BJ_IO*/
590 
591 		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
592 		/*
593 		 * akpm: jbd2_journal_write_metadata_buffer() sets
594 		 * new_bh->b_transaction to commit_transaction.
595 		 * We need to clean this up before we release new_bh
596 		 * (which is of type BJ_IO)
597 		 */
598 		JBUFFER_TRACE(jh, "ph3: write metadata");
599 		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
600 						      jh, &new_jh, blocknr);
601 		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
602 		wbuf[bufs++] = jh2bh(new_jh);
603 
604 		/* Record the new block's tag in the current descriptor
605                    buffer */
606 
607 		tag_flag = 0;
608 		if (flags & 1)
609 			tag_flag |= JBD2_FLAG_ESCAPE;
610 		if (!first_tag)
611 			tag_flag |= JBD2_FLAG_SAME_UUID;
612 
613 		tag = (journal_block_tag_t *) tagp;
614 		write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
615 		tag->t_flags = cpu_to_be32(tag_flag);
616 		tagp += tag_bytes;
617 		space_left -= tag_bytes;
618 
619 		if (first_tag) {
620 			memcpy (tagp, journal->j_uuid, 16);
621 			tagp += 16;
622 			space_left -= 16;
623 			first_tag = 0;
624 		}
625 
626 		/* If there's no more to do, or if the descriptor is full,
627 		   let the IO rip! */
628 
629 		if (bufs == journal->j_wbufsize ||
630 		    commit_transaction->t_buffers == NULL ||
631 		    space_left < tag_bytes + 16) {
632 
633 			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
634 
635 			/* Write an end-of-descriptor marker before
636                            submitting the IOs.  "tag" still points to
637                            the last tag we set up. */
638 
639 			tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
640 
641 start_journal_io:
642 			for (i = 0; i < bufs; i++) {
643 				struct buffer_head *bh = wbuf[i];
644 				/*
645 				 * Compute checksum.
646 				 */
647 				if (JBD2_HAS_COMPAT_FEATURE(journal,
648 					JBD2_FEATURE_COMPAT_CHECKSUM)) {
649 					crc32_sum =
650 					    jbd2_checksum_data(crc32_sum, bh);
651 				}
652 
653 				lock_buffer(bh);
654 				clear_buffer_dirty(bh);
655 				set_buffer_uptodate(bh);
656 				bh->b_end_io = journal_end_buffer_io_sync;
657 				submit_bh(WRITE, bh);
658 			}
659 			cond_resched();
660 			stats.u.run.rs_blocks_logged += bufs;
661 
662 			/* Force a new descriptor to be generated next
663                            time round the loop. */
664 			descriptor = NULL;
665 			bufs = 0;
666 		}
667 	}
668 
669 	/* Done it all: now write the commit record asynchronously. */
670 
671 	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
672 		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
673 		err = journal_submit_commit_record(journal, commit_transaction,
674 						 &cbh, crc32_sum);
675 		if (err)
676 			__jbd2_journal_abort_hard(journal);
677 	}
678 
679 	/*
680 	 * This is the right place to wait for data buffers both for ASYNC
681 	 * and !ASYNC commit. If commit is ASYNC, we need to wait only after
682 	 * the commit block went to disk (which happens above). If commit is
683 	 * SYNC, we need to wait for data buffers before we start writing
684 	 * commit block, which happens below in such setting.
685 	 */
686 	err = journal_finish_inode_data_buffers(journal, commit_transaction);
687 	if (err) {
688 		printk(KERN_WARNING
689 			"JBD2: Detected IO errors while flushing file data "
690 		       "on %s\n", journal->j_devname);
691 		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
692 			jbd2_journal_abort(journal, err);
693 		err = 0;
694 	}
695 
696 	/* Lo and behold: we have just managed to send a transaction to
697            the log.  Before we can commit it, wait for the IO so far to
698            complete.  Control buffers being written are on the
699            transaction's t_log_list queue, and metadata buffers are on
700            the t_iobuf_list queue.
701 
702 	   Wait for the buffers in reverse order.  That way we are
703 	   less likely to be woken up until all IOs have completed, and
704 	   so we incur less scheduling load.
705 	*/
706 
707 	jbd_debug(3, "JBD: commit phase 3\n");
708 
709 	/*
710 	 * akpm: these are BJ_IO, and j_list_lock is not needed.
711 	 * See __journal_try_to_free_buffer.
712 	 */
713 wait_for_iobuf:
714 	while (commit_transaction->t_iobuf_list != NULL) {
715 		struct buffer_head *bh;
716 
717 		jh = commit_transaction->t_iobuf_list->b_tprev;
718 		bh = jh2bh(jh);
719 		if (buffer_locked(bh)) {
720 			wait_on_buffer(bh);
721 			goto wait_for_iobuf;
722 		}
723 		if (cond_resched())
724 			goto wait_for_iobuf;
725 
726 		if (unlikely(!buffer_uptodate(bh)))
727 			err = -EIO;
728 
729 		clear_buffer_jwrite(bh);
730 
731 		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
732 		jbd2_journal_unfile_buffer(journal, jh);
733 
734 		/*
735 		 * ->t_iobuf_list should contain only dummy buffer_heads
736 		 * which were created by jbd2_journal_write_metadata_buffer().
737 		 */
738 		BUFFER_TRACE(bh, "dumping temporary bh");
739 		jbd2_journal_put_journal_head(jh);
740 		__brelse(bh);
741 		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
742 		free_buffer_head(bh);
743 
744 		/* We also have to unlock and free the corresponding
745                    shadowed buffer */
746 		jh = commit_transaction->t_shadow_list->b_tprev;
747 		bh = jh2bh(jh);
748 		clear_bit(BH_JWrite, &bh->b_state);
749 		J_ASSERT_BH(bh, buffer_jbddirty(bh));
750 
751 		/* The metadata is now released for reuse, but we need
752                    to remember it against this transaction so that when
753                    we finally commit, we can do any checkpointing
754                    required. */
755 		JBUFFER_TRACE(jh, "file as BJ_Forget");
756 		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
757 		/* Wake up any transactions which were waiting for this
758 		   IO to complete */
759 		wake_up_bit(&bh->b_state, BH_Unshadow);
760 		JBUFFER_TRACE(jh, "brelse shadowed buffer");
761 		__brelse(bh);
762 	}
763 
764 	J_ASSERT (commit_transaction->t_shadow_list == NULL);
765 
766 	jbd_debug(3, "JBD: commit phase 4\n");
767 
768 	/* Here we wait for the revoke record and descriptor record buffers */
769  wait_for_ctlbuf:
770 	while (commit_transaction->t_log_list != NULL) {
771 		struct buffer_head *bh;
772 
773 		jh = commit_transaction->t_log_list->b_tprev;
774 		bh = jh2bh(jh);
775 		if (buffer_locked(bh)) {
776 			wait_on_buffer(bh);
777 			goto wait_for_ctlbuf;
778 		}
779 		if (cond_resched())
780 			goto wait_for_ctlbuf;
781 
782 		if (unlikely(!buffer_uptodate(bh)))
783 			err = -EIO;
784 
785 		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
786 		clear_buffer_jwrite(bh);
787 		jbd2_journal_unfile_buffer(journal, jh);
788 		jbd2_journal_put_journal_head(jh);
789 		__brelse(bh);		/* One for getblk */
790 		/* AKPM: bforget here */
791 	}
792 
793 	if (err)
794 		jbd2_journal_abort(journal, err);
795 
796 	jbd_debug(3, "JBD: commit phase 5\n");
797 
798 	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
799 		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
800 		err = journal_submit_commit_record(journal, commit_transaction,
801 						&cbh, crc32_sum);
802 		if (err)
803 			__jbd2_journal_abort_hard(journal);
804 	}
805 	if (!err && !is_journal_aborted(journal))
806 		err = journal_wait_on_commit_record(cbh);
807 
808 	if (err)
809 		jbd2_journal_abort(journal, err);
810 
811 	/* End of a transaction!  Finally, we can do checkpoint
812            processing: any buffers committed as a result of this
813            transaction can be removed from any checkpoint list it was on
814            before. */
815 
816 	jbd_debug(3, "JBD: commit phase 6\n");
817 
818 	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
819 	J_ASSERT(commit_transaction->t_buffers == NULL);
820 	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
821 	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
822 	J_ASSERT(commit_transaction->t_shadow_list == NULL);
823 	J_ASSERT(commit_transaction->t_log_list == NULL);
824 
825 restart_loop:
826 	/*
827 	 * As there are other places (journal_unmap_buffer()) adding buffers
828 	 * to this list we have to be careful and hold the j_list_lock.
829 	 */
830 	spin_lock(&journal->j_list_lock);
831 	while (commit_transaction->t_forget) {
832 		transaction_t *cp_transaction;
833 		struct buffer_head *bh;
834 
835 		jh = commit_transaction->t_forget;
836 		spin_unlock(&journal->j_list_lock);
837 		bh = jh2bh(jh);
838 		jbd_lock_bh_state(bh);
839 		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction ||
840 			jh->b_transaction == journal->j_running_transaction);
841 
842 		/*
843 		 * If there is undo-protected committed data against
844 		 * this buffer, then we can remove it now.  If it is a
845 		 * buffer needing such protection, the old frozen_data
846 		 * field now points to a committed version of the
847 		 * buffer, so rotate that field to the new committed
848 		 * data.
849 		 *
850 		 * Otherwise, we can just throw away the frozen data now.
851 		 *
852 		 * We also know that the frozen data has already fired
853 		 * its triggers if they exist, so we can clear that too.
854 		 */
855 		if (jh->b_committed_data) {
856 			jbd2_free(jh->b_committed_data, bh->b_size);
857 			jh->b_committed_data = NULL;
858 			if (jh->b_frozen_data) {
859 				jh->b_committed_data = jh->b_frozen_data;
860 				jh->b_frozen_data = NULL;
861 				jh->b_frozen_triggers = NULL;
862 			}
863 		} else if (jh->b_frozen_data) {
864 			jbd2_free(jh->b_frozen_data, bh->b_size);
865 			jh->b_frozen_data = NULL;
866 			jh->b_frozen_triggers = NULL;
867 		}
868 
869 		spin_lock(&journal->j_list_lock);
870 		cp_transaction = jh->b_cp_transaction;
871 		if (cp_transaction) {
872 			JBUFFER_TRACE(jh, "remove from old cp transaction");
873 			cp_transaction->t_chp_stats.cs_dropped++;
874 			__jbd2_journal_remove_checkpoint(jh);
875 		}
876 
877 		/* Only re-checkpoint the buffer_head if it is marked
878 		 * dirty.  If the buffer was added to the BJ_Forget list
879 		 * by jbd2_journal_forget, it may no longer be dirty and
880 		 * there's no point in keeping a checkpoint record for
881 		 * it. */
882 
883 		/* A buffer which has been freed while still being
884 		 * journaled by a previous transaction may end up still
885 		 * being dirty here, but we want to avoid writing back
886 		 * that buffer in the future now that the last use has
887 		 * been committed.  That's not only a performance gain,
888 		 * it also stops aliasing problems if the buffer is left
889 		 * behind for writeback and gets reallocated for another
890 		 * use in a different page. */
891 		if (buffer_freed(bh)) {
892 			clear_buffer_freed(bh);
893 			clear_buffer_jbddirty(bh);
894 		}
895 
896 		if (buffer_jbddirty(bh)) {
897 			JBUFFER_TRACE(jh, "add to new checkpointing trans");
898 			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
899 			if (is_journal_aborted(journal))
900 				clear_buffer_jbddirty(bh);
901 			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
902 			__jbd2_journal_refile_buffer(jh);
903 			jbd_unlock_bh_state(bh);
904 		} else {
905 			J_ASSERT_BH(bh, !buffer_dirty(bh));
906 			/* The buffer on BJ_Forget list and not jbddirty means
907 			 * it has been freed by this transaction and hence it
908 			 * could not have been reallocated until this
909 			 * transaction has committed. *BUT* it could be
910 			 * reallocated once we have written all the data to
911 			 * disk and before we process the buffer on BJ_Forget
912 			 * list. */
913 			JBUFFER_TRACE(jh, "refile or unfile freed buffer");
914 			__jbd2_journal_refile_buffer(jh);
915 			if (!jh->b_transaction) {
916 				jbd_unlock_bh_state(bh);
917 				 /* needs a brelse */
918 				jbd2_journal_remove_journal_head(bh);
919 				release_buffer_page(bh);
920 			} else
921 				jbd_unlock_bh_state(bh);
922 		}
923 		cond_resched_lock(&journal->j_list_lock);
924 	}
925 	spin_unlock(&journal->j_list_lock);
926 	/*
927 	 * This is a bit sleazy.  We use j_list_lock to protect transition
928 	 * of a transaction into T_FINISHED state and calling
929 	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
930 	 * other checkpointing code processing the transaction...
931 	 */
932 	spin_lock(&journal->j_state_lock);
933 	spin_lock(&journal->j_list_lock);
934 	/*
935 	 * Now recheck if some buffers did not get attached to the transaction
936 	 * while the lock was dropped...
937 	 */
938 	if (commit_transaction->t_forget) {
939 		spin_unlock(&journal->j_list_lock);
940 		spin_unlock(&journal->j_state_lock);
941 		goto restart_loop;
942 	}
943 
944 	/* Done with this transaction! */
945 
946 	jbd_debug(3, "JBD: commit phase 7\n");
947 
948 	J_ASSERT(commit_transaction->t_state == T_COMMIT);
949 
950 	commit_transaction->t_start = jiffies;
951 	stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
952 						commit_transaction->t_start);
953 
954 	/*
955 	 * File the transaction for history
956 	 */
957 	stats.ts_type = JBD2_STATS_RUN;
958 	stats.ts_tid = commit_transaction->t_tid;
959 	stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
960 	spin_lock(&journal->j_history_lock);
961 	memcpy(journal->j_history + journal->j_history_cur, &stats,
962 			sizeof(stats));
963 	if (++journal->j_history_cur == journal->j_history_max)
964 		journal->j_history_cur = 0;
965 
966 	/*
967 	 * Calculate overall stats
968 	 */
969 	journal->j_stats.ts_tid++;
970 	journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
971 	journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
972 	journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
973 	journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
974 	journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
975 	journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
976 	journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
977 	journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
978 	spin_unlock(&journal->j_history_lock);
979 
980 	commit_transaction->t_state = T_FINISHED;
981 	J_ASSERT(commit_transaction == journal->j_committing_transaction);
982 	journal->j_commit_sequence = commit_transaction->t_tid;
983 	journal->j_committing_transaction = NULL;
984 	spin_unlock(&journal->j_state_lock);
985 
986 	if (journal->j_commit_callback)
987 		journal->j_commit_callback(journal, commit_transaction);
988 
989 	if (commit_transaction->t_checkpoint_list == NULL &&
990 	    commit_transaction->t_checkpoint_io_list == NULL) {
991 		__jbd2_journal_drop_transaction(journal, commit_transaction);
992 	} else {
993 		if (journal->j_checkpoint_transactions == NULL) {
994 			journal->j_checkpoint_transactions = commit_transaction;
995 			commit_transaction->t_cpnext = commit_transaction;
996 			commit_transaction->t_cpprev = commit_transaction;
997 		} else {
998 			commit_transaction->t_cpnext =
999 				journal->j_checkpoint_transactions;
1000 			commit_transaction->t_cpprev =
1001 				commit_transaction->t_cpnext->t_cpprev;
1002 			commit_transaction->t_cpnext->t_cpprev =
1003 				commit_transaction;
1004 			commit_transaction->t_cpprev->t_cpnext =
1005 				commit_transaction;
1006 		}
1007 	}
1008 	spin_unlock(&journal->j_list_lock);
1009 
1010 	trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
1011 		   journal->j_devname, journal->j_commit_sequence,
1012 		   journal->j_tail_sequence);
1013 	jbd_debug(1, "JBD: commit %d complete, head %d\n",
1014 		  journal->j_commit_sequence, journal->j_tail_sequence);
1015 
1016 	wake_up(&journal->j_wait_done_commit);
1017 }
1018