/*
 * linux/fs/jbd/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/bio.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        BUFFER_TRACE(bh, "");
        if (uptodate)
                set_buffer_uptodate(bh);
        else
                clear_buffer_uptodate(bh);
        unlock_buffer(bh);
}
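
/*
 * The commit path below waits for these temporary buffers with
 * wait_on_buffer(), so the completion handler only needs to record the
 * IO status and drop the buffer lock; unlock_buffer() wakes the waiter.
 */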

/*
 * When an ext3-ordered file is truncated, it is possible that many pages
 * are not successfully freed, because they are attached to a committing
 * transaction.  After the transaction commits, these pages are left on
 * the LRU, with no ->mapping, and with attached buffers.  These pages are
 * trivially reclaimable by the VM, but their apparent absence upsets the
 * VM accounting, and it makes the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look
 * to see if we can strip all buffers from the backing page.
 *
 * Called under journal->j_list_lock.  The caller provided us with a ref
 * against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
        struct page *page;

        if (buffer_dirty(bh))
                goto nope;
        if (atomic_read(&bh->b_count) != 1)
                goto nope;
        page = bh->b_page;
        if (!page)
                goto nope;
        if (page->mapping)
                goto nope;

        /* OK, it's a truncated page */
        if (!trylock_page(page))
                goto nope;

        page_cache_get(page);
        __brelse(bh);
        try_to_free_buffers(page);
        unlock_page(page);
        page_cache_release(page);
        return;

nope:
        __brelse(bh);
}
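
/*
 * The checks above are deliberately conservative: a buffer that is still
 * dirty, still shared, or still attached to a live mapping is simply
 * released and left for the VM to reclaim later.  Only an obviously
 * truncated page (no ->mapping, single buffer reference) is stripped here.
 */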

/*
 * Decrement reference counter for data buffer.  If it has been marked
 * 'BH_Freed', release it and the page to which it belongs if possible.
 */
static void release_data_buffer(struct buffer_head *bh)
{
        if (buffer_freed(bh)) {
                clear_buffer_freed(bh);
                release_buffer_page(bh);
        } else
                put_bh(bh);
}

/*
 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock
 * is held.  For ranking reasons we must trylock.  If we lose, schedule away
 * and return 0.  j_list_lock is dropped in this case.
 */
static int inverted_lock(journal_t *journal, struct buffer_head *bh)
{
        if (!jbd_trylock_bh_state(bh)) {
                spin_unlock(&journal->j_list_lock);
                schedule();
                return 0;
        }
        return 1;
}
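
/*
 * The lock ranking is jbd_lock_bh_state() before j_list_lock, so a caller
 * that already holds j_list_lock can only trylock the bh_state lock.  The
 * calling pattern used below is:
 *
 *      if (!inverted_lock(journal, bh)) {
 *              jbd_lock_bh_state(bh);
 *              spin_lock(&journal->j_list_lock);
 *      }
 *
 * i.e. on failure both locks are retaken in the correct order, and the
 * caller must then revalidate the journal head, since the lists may have
 * changed while no locks were held.
 */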

/* Done it all: now write the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_write_commit_record(journal_t *journal,
                                       transaction_t *commit_transaction)
{
        struct journal_head *descriptor;
        struct buffer_head *bh;
        journal_header_t *header;
        int ret;
        int barrier_done = 0;

        if (is_journal_aborted(journal))
                return 0;

        descriptor = journal_get_descriptor_buffer(journal);
        if (!descriptor)
                return 1;

        bh = jh2bh(descriptor);

        header = (journal_header_t *)(bh->b_data);
        header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
        header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
        header->h_sequence = cpu_to_be32(commit_transaction->t_tid);

        JBUFFER_TRACE(descriptor, "write commit block");
        set_buffer_dirty(bh);
        if (journal->j_flags & JFS_BARRIER) {
                set_buffer_ordered(bh);
                barrier_done = 1;
        }
        ret = sync_dirty_buffer(bh);
        if (barrier_done)
                clear_buffer_ordered(bh);
        /* is it possible for another commit to fail at roughly
         * the same time as this one?  If so, we don't want to
         * trust the barrier flag in the super, but instead want
         * to remember if we sent a barrier request
         */
        if (ret == -EOPNOTSUPP && barrier_done) {
                char b[BDEVNAME_SIZE];

                printk(KERN_WARNING
                       "JBD: barrier-based sync failed on %s - "
                       "disabling barriers\n",
                       bdevname(journal->j_dev, b));
                spin_lock(&journal->j_state_lock);
                journal->j_flags &= ~JFS_BARRIER;
                spin_unlock(&journal->j_state_lock);

                /* And try again, without the barrier */
                set_buffer_uptodate(bh);
                set_buffer_dirty(bh);
                ret = sync_dirty_buffer(bh);
        }
        put_bh(bh);             /* One for getblk() */
        journal_put_journal_head(descriptor);

        return (ret == -EIO);
}
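
/*
 * Note the asymmetric error handling above: -EOPNOTSUPP from a barrier
 * write is recovered by clearing JFS_BARRIER and resubmitting as a plain
 * write, so only a genuine -EIO makes the caller abort the journal.
 */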

static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
                                   int write_op)
{
        int i;

        for (i = 0; i < bufs; i++) {
                wbuf[i]->b_end_io = end_buffer_write_sync;
                /* We use up our safety reference in submit_bh() */
                submit_bh(write_op, wbuf[i]);
        }
}

/*
 * Submit all the data buffers to disk
 */
static int journal_submit_data_buffers(journal_t *journal,
                                       transaction_t *commit_transaction,
                                       int write_op)
{
        struct journal_head *jh;
        struct buffer_head *bh;
        int locked;
        int bufs = 0;
        struct buffer_head **wbuf = journal->j_wbuf;
        int err = 0;

        /*
         * Whenever we unlock the journal and sleep, things can get added
         * onto ->t_sync_datalist, so we have to keep looping back to
         * write_out_data until we *know* that the list is empty.
         *
         * Cleanup any flushed data buffers from the data list.  Even in
         * abort mode, we want to flush this out as soon as possible.
         */
write_out_data:
        cond_resched();
        spin_lock(&journal->j_list_lock);

        while (commit_transaction->t_sync_datalist) {
                jh = commit_transaction->t_sync_datalist;
                bh = jh2bh(jh);
                locked = 0;

                /* Get reference just to make sure buffer does not disappear
                 * when we are forced to drop various locks */
                get_bh(bh);
                /* If the buffer is dirty, we need to submit IO and hence
                 * we need the buffer lock.  We try to lock the buffer without
                 * blocking.  If we fail, we need to drop j_list_lock and do
                 * blocking lock_buffer().
                 */
                if (buffer_dirty(bh)) {
                        if (!trylock_buffer(bh)) {
                                BUFFER_TRACE(bh, "needs blocking lock");
                                spin_unlock(&journal->j_list_lock);
                                /* Write out all data to prevent deadlocks */
                                journal_do_submit_data(wbuf, bufs, write_op);
                                bufs = 0;
                                lock_buffer(bh);
                                spin_lock(&journal->j_list_lock);
                        }
                        locked = 1;
                }
                /* We have to get bh_state lock. Again out of order, sigh. */
                if (!inverted_lock(journal, bh)) {
                        jbd_lock_bh_state(bh);
                        spin_lock(&journal->j_list_lock);
                }
                /* Someone already cleaned up the buffer? */
                if (!buffer_jbd(bh) || bh2jh(bh) != jh
                        || jh->b_transaction != commit_transaction
                        || jh->b_jlist != BJ_SyncData) {
                        jbd_unlock_bh_state(bh);
                        if (locked)
                                unlock_buffer(bh);
                        BUFFER_TRACE(bh, "already cleaned up");
                        release_data_buffer(bh);
                        continue;
                }
                if (locked && test_clear_buffer_dirty(bh)) {
                        BUFFER_TRACE(bh, "needs writeout, adding to array");
                        wbuf[bufs++] = bh;
                        __journal_file_buffer(jh, commit_transaction,
                                              BJ_Locked);
                        jbd_unlock_bh_state(bh);
                        if (bufs == journal->j_wbufsize) {
                                spin_unlock(&journal->j_list_lock);
                                journal_do_submit_data(wbuf, bufs, write_op);
                                bufs = 0;
                                goto write_out_data;
                        }
                } else if (!locked && buffer_locked(bh)) {
                        __journal_file_buffer(jh, commit_transaction,
                                              BJ_Locked);
                        jbd_unlock_bh_state(bh);
                        put_bh(bh);
                } else {
                        BUFFER_TRACE(bh, "writeout complete: unfile");
                        if (unlikely(!buffer_uptodate(bh)))
                                err = -EIO;
                        __journal_unfile_buffer(jh);
                        jbd_unlock_bh_state(bh);
                        if (locked)
                                unlock_buffer(bh);
                        journal_remove_journal_head(bh);
                        /* One for our safety reference, other for
                         * journal_remove_journal_head() */
                        put_bh(bh);
                        release_data_buffer(bh);
                }

                if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
                        spin_unlock(&journal->j_list_lock);
                        goto write_out_data;
                }
        }
        spin_unlock(&journal->j_list_lock);
        journal_do_submit_data(wbuf, bufs, write_op);

        return err;
}
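
/*
 * To summarise the loop above: each buffer on t_sync_datalist ends up in
 * one of three states.  A dirty buffer is queued in wbuf[] and refiled to
 * BJ_Locked for the caller to wait on; a buffer already under IO elsewhere
 * is just refiled to BJ_Locked; a clean, unlocked buffer has completed
 * writeout and is unfiled and released immediately.  Whenever the locks
 * are dropped, the walk restarts from the head of the list.
 */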

/*
 * journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void journal_commit_transaction(journal_t *journal)
{
        transaction_t *commit_transaction;
        struct journal_head *jh, *new_jh, *descriptor;
        struct buffer_head **wbuf = journal->j_wbuf;
        int bufs;
        int flags;
        int err;
        unsigned int blocknr;
        ktime_t start_time;
        u64 commit_time;
        char *tagp = NULL;
        journal_header_t *header;
        journal_block_tag_t *tag = NULL;
        int space_left = 0;
        int first_tag = 0;
        int tag_flag;
        int i;
        int write_op = WRITE;

        /*
         * First job: lock down the current transaction and wait for
         * all outstanding updates to complete.
         */

#ifdef COMMIT_STATS
        spin_lock(&journal->j_list_lock);
        summarise_journal_usage(journal);
        spin_unlock(&journal->j_list_lock);
#endif

        /* Do we need to erase the effects of a prior journal_flush? */
        if (journal->j_flags & JFS_FLUSHED) {
                jbd_debug(3, "super block updated\n");
                journal_update_superblock(journal, 1);
        } else {
                jbd_debug(3, "superblock not updated\n");
        }

        J_ASSERT(journal->j_running_transaction != NULL);
        J_ASSERT(journal->j_committing_transaction == NULL);

        commit_transaction = journal->j_running_transaction;
        J_ASSERT(commit_transaction->t_state == T_RUNNING);

        jbd_debug(1, "JBD: starting commit of transaction %d\n",
                  commit_transaction->t_tid);

        spin_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_LOCKED;

        /*
         * Use plugged writes here, since we want to submit several before
         * we unplug the device.  We don't do explicit unplugging in here,
         * instead we rely on sync_buffer() doing the unplug for us.
         */
        if (commit_transaction->t_synchronous_commit)
                write_op = WRITE_SYNC_PLUG;
        spin_lock(&commit_transaction->t_handle_lock);
        while (commit_transaction->t_updates) {
                DEFINE_WAIT(wait);

                prepare_to_wait(&journal->j_wait_updates, &wait,
                                TASK_UNINTERRUPTIBLE);
                if (commit_transaction->t_updates) {
                        spin_unlock(&commit_transaction->t_handle_lock);
                        spin_unlock(&journal->j_state_lock);
                        schedule();
                        spin_lock(&journal->j_state_lock);
                        spin_lock(&commit_transaction->t_handle_lock);
                }
                finish_wait(&journal->j_wait_updates, &wait);
        }
        spin_unlock(&commit_transaction->t_handle_lock);
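
        /*
         * The prepare_to_wait()/finish_wait() pattern above avoids a lost
         * wakeup: the task is queued on j_wait_updates before t_updates is
         * re-checked, so a journal_stop() that drops the last update and
         * issues the wakeup between our check and the schedule() still
         * finds us on the waitqueue.
         */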

        J_ASSERT(commit_transaction->t_outstanding_credits <=
                 journal->j_max_transaction_buffers);

        /*
         * First thing we are allowed to do is to discard any remaining
         * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
         * that there are no such buffers: if a large filesystem
         * operation like a truncate needs to split itself over multiple
         * transactions, then it may try to do a journal_restart() while
         * there are still BJ_Reserved buffers outstanding.  These must
         * be released cleanly from the current transaction.
         *
         * In this case, the filesystem must still reserve write access
         * again before modifying the buffer in the new transaction, but
         * we do not require it to remember exactly which old buffers it
         * has reserved.  This is consistent with the existing behaviour
         * that multiple journal_get_write_access() calls to the same
         * buffer are perfectly permissible.
         */
        while (commit_transaction->t_reserved_list) {
                jh = commit_transaction->t_reserved_list;
                JBUFFER_TRACE(jh, "reserved, unused: refile");
                /*
                 * A journal_get_undo_access()+journal_release_buffer() may
                 * leave undo-committed data.
                 */
                if (jh->b_committed_data) {
                        struct buffer_head *bh = jh2bh(jh);

                        jbd_lock_bh_state(bh);
                        jbd_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
                        jbd_unlock_bh_state(bh);
                }
                journal_refile_buffer(journal, jh);
        }

        /*
         * Now try to drop any written-back buffers from the journal's
         * checkpoint lists.  We do this *before* commit because it
         * potentially frees some memory.
         */
        spin_lock(&journal->j_list_lock);
        __journal_clean_checkpoint_list(journal);
        spin_unlock(&journal->j_list_lock);

        jbd_debug(3, "JBD: commit phase 1\n");

        /*
         * Switch to a new revoke table.
         */
        journal_switch_revoke_table(journal);

        commit_transaction->t_state = T_FLUSH;
        journal->j_committing_transaction = commit_transaction;
        journal->j_running_transaction = NULL;
        start_time = ktime_get();
        commit_transaction->t_log_start = journal->j_head;
        wake_up(&journal->j_wait_transaction_locked);
        spin_unlock(&journal->j_state_lock);

        jbd_debug(3, "JBD: commit phase 2\n");

        /*
         * Now start flushing things to disk, in the order they appear
         * on the transaction lists.  Data blocks go first.
         */
        err = journal_submit_data_buffers(journal, commit_transaction,
                                          write_op);

        /*
         * Wait for all previously submitted IO to complete.
         */
        spin_lock(&journal->j_list_lock);
        while (commit_transaction->t_locked_list) {
                struct buffer_head *bh;

                jh = commit_transaction->t_locked_list->b_tprev;
                bh = jh2bh(jh);
                get_bh(bh);
                if (buffer_locked(bh)) {
                        spin_unlock(&journal->j_list_lock);
                        wait_on_buffer(bh);
                        spin_lock(&journal->j_list_lock);
                }
                if (unlikely(!buffer_uptodate(bh))) {
                        if (!trylock_page(bh->b_page)) {
                                spin_unlock(&journal->j_list_lock);
                                lock_page(bh->b_page);
                                spin_lock(&journal->j_list_lock);
                        }
                        if (bh->b_page->mapping)
                                set_bit(AS_EIO, &bh->b_page->mapping->flags);

                        unlock_page(bh->b_page);
                        SetPageError(bh->b_page);
                        err = -EIO;
                }
                if (!inverted_lock(journal, bh)) {
                        put_bh(bh);
                        spin_lock(&journal->j_list_lock);
                        continue;
                }
                if (buffer_jbd(bh) && bh2jh(bh) == jh &&
                    jh->b_transaction == commit_transaction &&
                    jh->b_jlist == BJ_Locked) {
                        __journal_unfile_buffer(jh);
                        jbd_unlock_bh_state(bh);
                        journal_remove_journal_head(bh);
                        put_bh(bh);
                } else {
                        jbd_unlock_bh_state(bh);
                }
                release_data_buffer(bh);
                cond_resched_lock(&journal->j_list_lock);
        }
        spin_unlock(&journal->j_list_lock);

        if (err) {
                char b[BDEVNAME_SIZE];

                printk(KERN_WARNING
                       "JBD: Detected IO errors while flushing file data "
                       "on %s\n", bdevname(journal->j_fs_dev, b));
                if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
                        journal_abort(journal, err);
                err = 0;
        }
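
        /*
         * At this point every ordered-mode data buffer for this transaction
         * has either reached disk or had its failure recorded.  In ordered
         * mode this ordering is what allows the metadata below to be
         * committed: replaying committed metadata after a crash should
         * never expose stale data block contents.
         */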

        journal_write_revoke_records(journal, commit_transaction, write_op);

        /*
         * If we found any dirty or locked buffers, then we should have
         * looped back up to the write_out_data label.  If there weren't
         * any then journal_clean_data_list should have wiped the list
         * clean by now, so check that it is in fact empty.
         */
        J_ASSERT(commit_transaction->t_sync_datalist == NULL);

        jbd_debug(3, "JBD: commit phase 3\n");

        /*
         * Way to go: we have now written out all of the data for a
         * transaction!  Now comes the tricky part: we need to write out
         * metadata.  Loop over the transaction's entire buffer list:
         */
        spin_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_COMMIT;
        spin_unlock(&journal->j_state_lock);

        J_ASSERT(commit_transaction->t_nr_buffers <=
                 commit_transaction->t_outstanding_credits);

        descriptor = NULL;
        bufs = 0;
        while (commit_transaction->t_buffers) {

                /* Find the next buffer to be journaled... */

                jh = commit_transaction->t_buffers;

                /* If we're in abort mode, we just un-journal the buffer and
                   release it. */

                if (is_journal_aborted(journal)) {
                        clear_buffer_jbddirty(jh2bh(jh));
                        JBUFFER_TRACE(jh, "journal is aborting: refile");
                        journal_refile_buffer(journal, jh);
                        /* If that was the last one, we need to clean up
                         * any descriptor buffers which may have been
                         * already allocated, even if we are now
                         * aborting. */
                        if (!commit_transaction->t_buffers)
                                goto start_journal_io;
                        continue;
                }

                /* Make sure we have a descriptor block in which to
                   record the metadata buffer. */

                if (!descriptor) {
                        struct buffer_head *bh;

                        J_ASSERT(bufs == 0);

                        jbd_debug(4, "JBD: get descriptor\n");

                        descriptor = journal_get_descriptor_buffer(journal);
                        if (!descriptor) {
                                journal_abort(journal, -EIO);
                                continue;
                        }

                        bh = jh2bh(descriptor);
                        jbd_debug(4, "JBD: got buffer %llu (%p)\n",
                                  (unsigned long long)bh->b_blocknr, bh->b_data);
                        header = (journal_header_t *)&bh->b_data[0];
                        header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
                        header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
                        header->h_sequence = cpu_to_be32(commit_transaction->t_tid);

                        tagp = &bh->b_data[sizeof(journal_header_t)];
                        space_left = bh->b_size - sizeof(journal_header_t);
                        first_tag = 1;
                        set_buffer_jwrite(bh);
                        set_buffer_dirty(bh);
                        wbuf[bufs++] = bh;

                        /* Record it so that we can wait for IO
                           completion later */
                        BUFFER_TRACE(bh, "ph3: file as descriptor");
                        journal_file_buffer(descriptor, commit_transaction,
                                            BJ_LogCtl);
                }

                /* Where is the buffer to be written? */

                err = journal_next_log_block(journal, &blocknr);
                /* If the block mapping failed, just abandon the buffer
                   and repeat this loop: we'll fall into the
                   refile-on-abort condition above. */
                if (err) {
                        journal_abort(journal, err);
                        continue;
                }

                /*
                 * start_this_handle() uses t_outstanding_credits to determine
                 * the free space in the log, but this counter is changed
                 * by journal_next_log_block() also.
                 */
                commit_transaction->t_outstanding_credits--;

                /* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
                atomic_inc(&jh2bh(jh)->b_count);

                /* Make a temporary IO buffer with which to write it out
                   (this will requeue both the metadata buffer and the
                   temporary IO buffer).  new_bh goes on BJ_IO. */

                set_bit(BH_JWrite, &jh2bh(jh)->b_state);
                /*
                 * akpm: journal_write_metadata_buffer() sets
                 * new_bh->b_transaction to commit_transaction.
                 * We need to clean this up before we release new_bh
                 * (which is of type BJ_IO)
                 */
                JBUFFER_TRACE(jh, "ph3: write metadata");
                flags = journal_write_metadata_buffer(commit_transaction,
                                                      jh, &new_jh, blocknr);
                set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
                wbuf[bufs++] = jh2bh(new_jh);

                /* Record the new block's tag in the current descriptor
                   buffer */

                tag_flag = 0;
                if (flags & 1)
                        tag_flag |= JFS_FLAG_ESCAPE;
                if (!first_tag)
                        tag_flag |= JFS_FLAG_SAME_UUID;

                tag = (journal_block_tag_t *) tagp;
                tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
                tag->t_flags = cpu_to_be32(tag_flag);
                tagp += sizeof(journal_block_tag_t);
                space_left -= sizeof(journal_block_tag_t);

                if (first_tag) {
                        memcpy(tagp, journal->j_uuid, 16);
                        tagp += 16;
                        space_left -= 16;
                        first_tag = 0;
                }
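
                /*
                 * On-disk layout of the descriptor block being filled in
                 * here (a journal_header_t followed by packed tags; only
                 * the first tag is followed by the 16-byte journal UUID,
                 * and the final tag gets JFS_FLAG_LAST_TAG just before
                 * submission below):
                 *
                 *      +------------------+
                 *      | journal_header_t |
                 *      +------------------+
                 *      | tag 0            |  t_blocknr, t_flags
                 *      +------------------+
                 *      | journal UUID     |  16 bytes, first tag only
                 *      +------------------+
                 *      | tag 1 .. tag n   |  JFS_FLAG_SAME_UUID set
                 *      +------------------+
                 */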

                /* If there's no more to do, or if the descriptor is full,
                   let the IO rip! */

                if (bufs == journal->j_wbufsize ||
                    commit_transaction->t_buffers == NULL ||
                    space_left < sizeof(journal_block_tag_t) + 16) {

                        jbd_debug(4, "JBD: Submit %d IOs\n", bufs);

                        /* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

                        tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);

start_journal_io:
                        for (i = 0; i < bufs; i++) {
                                struct buffer_head *bh = wbuf[i];
                                lock_buffer(bh);
                                clear_buffer_dirty(bh);
                                set_buffer_uptodate(bh);
                                bh->b_end_io = journal_end_buffer_io_sync;
                                submit_bh(write_op, bh);
                        }
                        cond_resched();

                        /* Force a new descriptor to be generated next
                           time round the loop. */
                        descriptor = NULL;
                        bufs = 0;
                }
        }

        /* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the t_iobuf_list queue.

           Wait for the buffers in reverse order.  That way we are
           less likely to be woken up until all IOs have completed, and
           so we incur less scheduling load.
         */

        jbd_debug(3, "JBD: commit phase 4\n");

        /*
         * akpm: these are BJ_IO, and j_list_lock is not needed.
         * See __journal_try_to_free_buffer.
         */
wait_for_iobuf:
        while (commit_transaction->t_iobuf_list != NULL) {
                struct buffer_head *bh;

                jh = commit_transaction->t_iobuf_list->b_tprev;
                bh = jh2bh(jh);
                if (buffer_locked(bh)) {
                        wait_on_buffer(bh);
                        goto wait_for_iobuf;
                }
                if (cond_resched())
                        goto wait_for_iobuf;

                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;

                clear_buffer_jwrite(bh);

                JBUFFER_TRACE(jh, "ph4: unfile after journal write");
                journal_unfile_buffer(journal, jh);

                /*
                 * ->t_iobuf_list should contain only dummy buffer_heads
                 * which were created by journal_write_metadata_buffer().
                 */
                BUFFER_TRACE(bh, "dumping temporary bh");
                journal_put_journal_head(jh);
                __brelse(bh);
                J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
                free_buffer_head(bh);

                /* We also have to unlock and free the corresponding
                   shadowed buffer */
                jh = commit_transaction->t_shadow_list->b_tprev;
                bh = jh2bh(jh);
                clear_bit(BH_JWrite, &bh->b_state);
                J_ASSERT_BH(bh, buffer_jbddirty(bh));

                /* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
                JBUFFER_TRACE(jh, "file as BJ_Forget");
                journal_file_buffer(jh, commit_transaction, BJ_Forget);
                /*
                 * Wake up any transactions which were waiting for this
                 * IO to complete.  The barrier must be here so that changes
                 * by journal_file_buffer() take effect before wake_up_bit()
                 * does the waitqueue check.
                 */
                smp_mb();
                wake_up_bit(&bh->b_state, BH_Unshadow);
                JBUFFER_TRACE(jh, "brelse shadowed buffer");
                __brelse(bh);
        }
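
        /*
         * The BJ_IO and BJ_Shadow lists are built pairwise by
         * journal_write_metadata_buffer(), so popping b_tprev from each
         * list in lockstep, as the loop above does, always matches a
         * temporary IO buffer with the shadowed metadata buffer it was
         * copied from.
         */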

        J_ASSERT(commit_transaction->t_shadow_list == NULL);

        jbd_debug(3, "JBD: commit phase 5\n");

        /* Here we wait for the revoke record and descriptor record buffers */
wait_for_ctlbuf:
        while (commit_transaction->t_log_list != NULL) {
                struct buffer_head *bh;

                jh = commit_transaction->t_log_list->b_tprev;
                bh = jh2bh(jh);
                if (buffer_locked(bh)) {
                        wait_on_buffer(bh);
                        goto wait_for_ctlbuf;
                }
                if (cond_resched())
                        goto wait_for_ctlbuf;

                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;

                BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
                clear_buffer_jwrite(bh);
                journal_unfile_buffer(journal, jh);
                journal_put_journal_head(jh);
                __brelse(bh);           /* One for getblk */
                /* AKPM: bforget here */
        }

        if (err)
                journal_abort(journal, err);

        jbd_debug(3, "JBD: commit phase 6\n");

        if (journal_write_commit_record(journal, commit_transaction))
                err = -EIO;

        if (err)
                journal_abort(journal, err);

        /* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint list it was on
           before. */

        jbd_debug(3, "JBD: commit phase 7\n");

        J_ASSERT(commit_transaction->t_sync_datalist == NULL);
        J_ASSERT(commit_transaction->t_buffers == NULL);
        J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
        J_ASSERT(commit_transaction->t_iobuf_list == NULL);
        J_ASSERT(commit_transaction->t_shadow_list == NULL);
        J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
        /*
         * As there are other places (journal_unmap_buffer()) adding buffers
         * to this list we have to be careful and hold the j_list_lock.
         */
        spin_lock(&journal->j_list_lock);
        while (commit_transaction->t_forget) {
                transaction_t *cp_transaction;
                struct buffer_head *bh;

                jh = commit_transaction->t_forget;
                spin_unlock(&journal->j_list_lock);
                bh = jh2bh(jh);
                jbd_lock_bh_state(bh);
                J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
                        jh->b_transaction == journal->j_running_transaction);

                /*
                 * If there is undo-protected committed data against
                 * this buffer, then we can remove it now.  If it is a
                 * buffer needing such protection, the old frozen_data
                 * field now points to a committed version of the
                 * buffer, so rotate that field to the new committed
                 * data.
                 *
                 * Otherwise, we can just throw away the frozen data now.
                 */
                if (jh->b_committed_data) {
                        jbd_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
                        if (jh->b_frozen_data) {
                                jh->b_committed_data = jh->b_frozen_data;
                                jh->b_frozen_data = NULL;
                        }
                } else if (jh->b_frozen_data) {
                        jbd_free(jh->b_frozen_data, bh->b_size);
                        jh->b_frozen_data = NULL;
                }

                spin_lock(&journal->j_list_lock);
                cp_transaction = jh->b_cp_transaction;
                if (cp_transaction) {
                        JBUFFER_TRACE(jh, "remove from old cp transaction");
                        __journal_remove_checkpoint(jh);
                }

                /* Only re-checkpoint the buffer_head if it is marked
                 * dirty.  If the buffer was added to the BJ_Forget list
                 * by journal_forget, it may no longer be dirty and
                 * there's no point in keeping a checkpoint record for
                 * it. */

                /* A buffer which has been freed while still being
                 * journaled by a previous transaction may end up still
                 * being dirty here, but we want to avoid writing back
                 * that buffer in the future now that the last use has
                 * been committed.  That's not only a performance gain,
                 * it also stops aliasing problems if the buffer is left
                 * behind for writeback and gets reallocated for another
                 * use in a different page. */
                if (buffer_freed(bh)) {
                        clear_buffer_freed(bh);
                        clear_buffer_jbddirty(bh);
                }

                if (buffer_jbddirty(bh)) {
                        JBUFFER_TRACE(jh, "add to new checkpointing trans");
                        __journal_insert_checkpoint(jh, commit_transaction);
                        if (is_journal_aborted(journal))
                                clear_buffer_jbddirty(bh);
                        JBUFFER_TRACE(jh, "refile for checkpoint writeback");
                        __journal_refile_buffer(jh);
                        jbd_unlock_bh_state(bh);
                } else {
                        J_ASSERT_BH(bh, !buffer_dirty(bh));
                        /* The buffer on BJ_Forget list and not jbddirty means
                         * it has been freed by this transaction and hence it
                         * could not have been reallocated until this
                         * transaction has committed.  *BUT* it could be
                         * reallocated once we have written all the data to
                         * disk and before we process the buffer on BJ_Forget
                         * list. */
                        JBUFFER_TRACE(jh, "refile or unfile freed buffer");
                        __journal_refile_buffer(jh);
                        if (!jh->b_transaction) {
                                jbd_unlock_bh_state(bh);
                                /* needs a brelse */
                                journal_remove_journal_head(bh);
                                release_buffer_page(bh);
                        } else
                                jbd_unlock_bh_state(bh);
                }
                cond_resched_lock(&journal->j_list_lock);
        }
        spin_unlock(&journal->j_list_lock);
        /*
         * This is a bit sleazy.  We use j_list_lock to protect transition
         * of a transaction into T_FINISHED state and calling
         * __journal_drop_transaction().  Otherwise we could race with
         * other checkpointing code processing the transaction...
         */
        spin_lock(&journal->j_state_lock);
        spin_lock(&journal->j_list_lock);
        /*
         * Now recheck if some buffers did not get attached to the transaction
         * while the lock was dropped...
         */
        if (commit_transaction->t_forget) {
                spin_unlock(&journal->j_list_lock);
                spin_unlock(&journal->j_state_lock);
                goto restart_loop;
        }

        /* Done with this transaction! */

        jbd_debug(3, "JBD: commit phase 8\n");

        J_ASSERT(commit_transaction->t_state == T_COMMIT);

        commit_transaction->t_state = T_FINISHED;
        J_ASSERT(commit_transaction == journal->j_committing_transaction);
        journal->j_commit_sequence = commit_transaction->t_tid;
        journal->j_committing_transaction = NULL;
        commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

        /*
         * weight the commit time higher than the average time so we don't
         * react too strongly to vast changes in commit time
         */
        if (likely(journal->j_average_commit_time))
                journal->j_average_commit_time = (commit_time*3 +
                                journal->j_average_commit_time) / 4;
        else
                journal->j_average_commit_time = commit_time;
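
        /*
         * Worked example with assumed figures: if the running average is
         * 8 ms and this commit took 4 ms, the new average is
         * (4*3 + 8) / 4 = 5 ms, i.e. the estimate moves three quarters
         * of the way toward the most recent commit time.
         */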

        spin_unlock(&journal->j_state_lock);

        if (commit_transaction->t_checkpoint_list == NULL &&
            commit_transaction->t_checkpoint_io_list == NULL) {
                __journal_drop_transaction(journal, commit_transaction);
        } else {
                if (journal->j_checkpoint_transactions == NULL) {
                        journal->j_checkpoint_transactions = commit_transaction;
                        commit_transaction->t_cpnext = commit_transaction;
                        commit_transaction->t_cpprev = commit_transaction;
                } else {
                        commit_transaction->t_cpnext =
                                journal->j_checkpoint_transactions;
                        commit_transaction->t_cpprev =
                                commit_transaction->t_cpnext->t_cpprev;
                        commit_transaction->t_cpnext->t_cpprev =
                                commit_transaction;
                        commit_transaction->t_cpprev->t_cpnext =
                                commit_transaction;
                }
        }
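
        /*
         * The checkpoint list threaded through t_cpnext/t_cpprev is
         * circular; the code above links the finished transaction in just
         * before the current head, i.e. at the tail, so transactions are
         * checkpointed in commit order.
         */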
        spin_unlock(&journal->j_list_lock);

        jbd_debug(1, "JBD: commit %d complete, head %d\n",
                  journal->j_commit_sequence, journal->j_tail_sequence);

        wake_up(&journal->j_wait_done_commit);
}