LCOV - code coverage report
Current view: top level - drivers/md - raid1.c (source / functions)
Test: coverage.info
Date: 2017-01-25
                         Hit      Total    Coverage
        Lines:           879       1411      62.3 %
        Functions:        32         49      65.3 %

          Line data    Source code
       1             : /*
       2             :  * raid1.c : Multiple Devices driver for Linux
       3             :  *
       4             :  * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
       5             :  *
       6             :  * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
       7             :  *
       8             :  * RAID-1 management functions.
       9             :  *
      10             :  * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
      11             :  *
       12             :  * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
      13             :  * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
      14             :  *
      15             :  * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
      16             :  * bitmapped intelligence in resync:
      17             :  *
      18             :  *      - bitmap marked during normal i/o
      19             :  *      - bitmap used to skip nondirty blocks during sync
      20             :  *
      21             :  * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
      22             :  * - persistent bitmap code
      23             :  *
      24             :  * This program is free software; you can redistribute it and/or modify
      25             :  * it under the terms of the GNU General Public License as published by
      26             :  * the Free Software Foundation; either version 2, or (at your option)
      27             :  * any later version.
      28             :  *
      29             :  * You should have received a copy of the GNU General Public License
      30             :  * (for example /usr/src/linux/COPYING); if not, write to the Free
      31             :  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
      32             :  */
      33             : 
      34             : #include <linux/delay.h>
      35             : #include <linux/blkdev.h>
      36             : #include <linux/seq_file.h>
      37             : #include "md.h"
      38             : #include "raid1.h"
      39             : #include "bitmap.h"
      40             : 
      41             : #define DEBUG 0
      42             : #if DEBUG
      43             : #define PRINTK(x...) printk(x)
      44             : #else
      45             : #define PRINTK(x...)
      46             : #endif
      47             : 
      48             : /*
      49             :  * Number of guaranteed r1bios in case of extreme VM load:
      50             :  */
      51             : #define NR_RAID1_BIOS 256
      52             : 
      53             : 
      54             : static void unplug_slaves(mddev_t *mddev);
      55             : 
      56             : static void allow_barrier(conf_t *conf);
      57             : static void lower_barrier(conf_t *conf);
      58             : 
      59             : static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
      60             : {
      61           0 :         struct pool_info *pi = data;
      62           0 :         r1bio_t *r1_bio;
      63           0 :         int size = offsetof(r1bio_t, bios[pi->raid_disks]);
      64           0 : 
      65             :         /* allocate a r1bio with room for raid_disks entries in the bios array */
      66           0 :         r1_bio = kzalloc(size, gfp_flags);
      67           0 :         if (!r1_bio && pi->mddev)
      68           0 :                 unplug_slaves(pi->mddev);
      69             : 
      70           0 :         return r1_bio;
      71             : }
      72             : 
      73             : static void r1bio_pool_free(void *r1_bio, void *data)
      74             : {
      75           0 :         kfree(r1_bio);
      76           0 : }
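                        /* Context (a sketch; the exact call sits in the setup
                         * code, outside this excerpt): these two callbacks back
                         * conf->r1bio_pool, which is created roughly as
                         *
                         *   conf->r1bio_pool = mempool_create(NR_RAID1_BIOS,
                         *                                     r1bio_pool_alloc,
                         *                                     r1bio_pool_free,
                         *                                     conf->poolinfo);
                         *
                         * so NR_RAID1_BIOS (256) r1bios stay allocatable even
                         * under extreme VM load, as the comment above notes.
                         */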
      77             : 
      78             : #define RESYNC_BLOCK_SIZE (64*1024)
      79             : //#define RESYNC_BLOCK_SIZE PAGE_SIZE
      80             : #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
      81             : #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
      82             : #define RESYNC_WINDOW (2048*1024)
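                        /* Worked numbers, assuming 4 KiB pages:
                         *   RESYNC_SECTORS = (64*1024) >> 9 = 128 sectors
                         *   RESYNC_PAGES   = 64 KiB / 4 KiB = 16 pages per resync bio
                         *   RESYNC_WINDOW  = 2048*1024      = 2 MiB in flight
                         */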
      83             : 
      84             : static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
      85             : {
      86           0 :         struct pool_info *pi = data;
      87           0 :         struct page *page;
      88           0 :         r1bio_t *r1_bio;
      89           0 :         struct bio *bio;
      90           0 :         int i, j;
      91           0 : 
      92           0 :         r1_bio = r1bio_pool_alloc(gfp_flags, pi);
      93           0 :         if (!r1_bio) {
      94           0 :                 unplug_slaves(pi->mddev);
      95           0 :                 return NULL;
      96           0 :         }
      97           0 : 
      98             :         /*
       99             :          * Allocate bios: 1 for reading, n-1 for writing
     100             :          */
     101           0 :         for (j = pi->raid_disks ; j-- ; ) {
     102           0 :                 bio = bio_alloc(gfp_flags, RESYNC_PAGES);
     103           0 :                 if (!bio)
     104           0 :                         goto out_free_bio;
     105           0 :                 r1_bio->bios[j] = bio;
     106           0 :         }
     107             :         /*
     108             :          * Allocate RESYNC_PAGES data pages and attach them to
     109             :          * the first bio.
     110             :          * If this is a user-requested check/repair, allocate
     111             :          * RESYNC_PAGES for each bio.
     112             :          */
     113           0 :         if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
     114           0 :                 j = pi->raid_disks;
     115             :         else
     116           0 :                 j = 1;
     117           0 :         while(j--) {
     118           0 :                 bio = r1_bio->bios[j];
     119           0 :                 for (i = 0; i < RESYNC_PAGES; i++) {
     120           0 :                         page = alloc_page(gfp_flags);
     121           0 :                         if (unlikely(!page))
     122           0 :                                 goto out_free_pages;
     123             : 
     124           0 :                         bio->bi_io_vec[i].bv_page = page;
     125           0 :                         bio->bi_vcnt = i+1;
     126           0 :                 }
     127             :         }
      128             :         /* If not a user-requested check/repair, copy the page pointers to all bios */
     129           0 :         if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
     130           0 :                 for (i=0; i<RESYNC_PAGES ; i++)
     131           0 :                         for (j=1; j<pi->raid_disks; j++)
     132           0 :                                 r1_bio->bios[j]->bi_io_vec[i].bv_page =
     133           0 :                                         r1_bio->bios[0]->bi_io_vec[i].bv_page;
     134             :         }
     135             : 
     136           0 :         r1_bio->master_bio = NULL;
     137             : 
     138           0 :         return r1_bio;
     139           0 : 
     140             : out_free_pages:
     141           0 :         for (j=0 ; j < pi->raid_disks; j++)
     142           0 :                 for (i=0; i < r1_bio->bios[j]->bi_vcnt ; i++)
     143           0 :                         put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
     144           0 :         j = -1;
     145           0 : out_free_bio:
     146           0 :         while ( ++j < pi->raid_disks )
     147           0 :                 bio_put(r1_bio->bios[j]);
     148           0 :         r1bio_pool_free(r1_bio, data);
     149           0 :         return NULL;
     150             : }
     151             : 
     152             : static void r1buf_pool_free(void *__r1_bio, void *data)
     153             : {
     154           0 :         struct pool_info *pi = data;
     155           0 :         int i,j;
     156           0 :         r1bio_t *r1bio = __r1_bio;
     157           0 : 
     158           0 :         for (i = 0; i < RESYNC_PAGES; i++)
     159           0 :                 for (j = pi->raid_disks; j-- ;) {
     160           0 :                         if (j == 0 ||
     161           0 :                             r1bio->bios[j]->bi_io_vec[i].bv_page !=
     162             :                             r1bio->bios[0]->bi_io_vec[i].bv_page)
     163           0 :                                 safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
     164             :                 }
     165           0 :         for (i=0 ; i < pi->raid_disks; i++)
     166           0 :                 bio_put(r1bio->bios[i]);
     167           0 : 
     168           0 :         r1bio_pool_free(r1bio, data);
     169           0 : }
     170             : 
     171             : static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
     172             : {
     173           2 :         int i;
     174           2 : 
     175          10 :         for (i = 0; i < conf->raid_disks; i++) {
     176           6 :                 struct bio **bio = r1_bio->bios + i;
     177           6 :                 if (*bio && *bio != IO_BLOCKED)
     178           2 :                         bio_put(*bio);
     179           2 :                 *bio = NULL;
     180             :         }
     181             : }
     182           2 : 
     183             : static void free_r1bio(r1bio_t *r1_bio)
     184             : {
     185           6 :         conf_t *conf = r1_bio->mddev->private;
     186             : 
     187             :         /*
     188             :          * Wake up any possible resync thread that waits for the device
     189             :          * to go idle.
     190             :          */
     191           4 :         allow_barrier(conf);
     192             : 
     193           4 :         put_all_bios(conf, r1_bio);
     194           2 :         mempool_free(r1_bio, conf->r1bio_pool);
     195           2 : }
     196             : 
     197             : static void put_buf(r1bio_t *r1_bio)
     198             : {
     199           3 :         conf_t *conf = r1_bio->mddev->private;
     200           1 :         int i;
     201           1 : 
     202           6 :         for (i=0; i<conf->raid_disks; i++) {
     203           4 :                 struct bio *bio = r1_bio->bios[i];
     204           4 :                 if (bio->bi_end_io)
     205           3 :                         rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
     206             :         }
     207             : 
     208           1 :         mempool_free(r1_bio, conf->r1buf_pool);
     209             : 
     210           2 :         lower_barrier(conf);
     211           1 : }
     212             : 
     213             : static void reschedule_retry(r1bio_t *r1_bio)
     214             : {
     215           0 :         unsigned long flags;
     216           0 :         mddev_t *mddev = r1_bio->mddev;
     217           0 :         conf_t *conf = mddev->private;
     218           0 : 
     219           0 :         spin_lock_irqsave(&conf->device_lock, flags);
     220           0 :         list_add(&r1_bio->retry_list, &conf->retry_list);
     221           0 :         conf->nr_queued ++;
     222           0 :         spin_unlock_irqrestore(&conf->device_lock, flags);
     223             : 
     224           0 :         wake_up(&conf->wait_barrier);
     225           0 :         md_wakeup_thread(mddev->thread);
     226           0 : }
     227             : 
     228             : /*
     229             :  * raid_end_bio_io() is called when we have finished servicing a mirrored
     230             :  * operation and are ready to return a success/failure code to the buffer
     231             :  * cache layer.
     232             :  */
     233             : static void raid_end_bio_io(r1bio_t *r1_bio)
     234             : {
     235           2 :         struct bio *bio = r1_bio->master_bio;
     236           1 : 
     237           1 :         /* if nobody has done the final endio yet, do it now */
     238           4 :         if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
     239             :                 PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n",
     240             :                         (bio_data_dir(bio) == WRITE) ? "write" : "read",
     241             :                         (unsigned long long) bio->bi_sector,
     242             :                         (unsigned long long) bio->bi_sector +
     243             :                                 (bio->bi_size >> 9) - 1);
     244             : 
     245           8 :                 bio_endio(bio,
     246             :                         test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
     247             :         }
     248           4 :         free_r1bio(r1_bio);
     249           1 : }
     250             : 
     251             : /*
     252             :  * Update disk head position estimator based on IRQ completion info.
     253             :  */
     254             : static inline void update_head_pos(int disk, r1bio_t *r1_bio)
     255             : {
     256           0 :         conf_t *conf = r1_bio->mddev->private;
     257             : 
     258           0 :         conf->mirrors[disk].head_position =
     259             :                 r1_bio->sector + (r1_bio->sectors);
     260           0 : }
     261             : 
     262             : static void raid1_end_read_request(struct bio *bio, int error)
     263             : {
     264           0 :         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
     265           0 :         r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
     266           0 :         int mirror;
     267           0 :         conf_t *conf = r1_bio->mddev->private;
     268           0 : 
     269           0 :         mirror = r1_bio->read_disk;
     270           0 :         /*
     271           0 :          * this branch is our 'one mirror IO has finished' event handler:
     272           0 :          */
     273           0 :         update_head_pos(mirror, r1_bio);
     274           0 : 
     275           0 :         if (uptodate)
     276           0 :                 set_bit(R1BIO_Uptodate, &r1_bio->state);
     277             :         else {
     278             :                 /* If all other devices have failed, we want to return
     279             :                  * the error upwards rather than fail the last device.
     280             :                  * Here we redefine "uptodate" to mean "Don't want to retry"
     281             :                  */
     282             :                 unsigned long flags;
     283           0 :                 spin_lock_irqsave(&conf->device_lock, flags);
     284           0 :                 if (r1_bio->mddev->degraded == conf->raid_disks ||
     285             :                     (r1_bio->mddev->degraded == conf->raid_disks-1 &&
     286             :                      !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags)))
     287           0 :                         uptodate = 1;
     288           0 :                 spin_unlock_irqrestore(&conf->device_lock, flags);
     289             :         }
     290             : 
     291           0 :         if (uptodate)
     292           0 :                 raid_end_bio_io(r1_bio);
     293             :         else {
     294             :                 /*
     295             :                  * oops, read error:
     296             :                  */
     297             :                 char b[BDEVNAME_SIZE];
     298           0 :                 if (printk_ratelimit())
     299           0 :                         printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n",
     300             :                                bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
     301           0 :                 reschedule_retry(r1_bio);
     302             :         }
     303             : 
     304           0 :         rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
     305           0 : }
     306             : 
     307             : static void raid1_end_write_request(struct bio *bio, int error)
     308             : {
     309           0 :         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
     310           0 :         r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
     311           0 :         int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
     312           0 :         conf_t *conf = r1_bio->mddev->private;
     313           0 :         struct bio *to_put = NULL;
     314           0 : 
     315           0 : 
     316           0 :         for (mirror = 0; mirror < conf->raid_disks; mirror++)
     317           0 :                 if (r1_bio->bios[mirror] == bio)
     318           0 :                         break;
     319           0 : 
     320           0 :         if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) {
     321           0 :                 set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags);
     322           0 :                 set_bit(R1BIO_BarrierRetry, &r1_bio->state);
     323           0 :                 r1_bio->mddev->barriers_work = 0;
     324           0 :                 /* Don't rdev_dec_pending in this branch - keep it for the retry */
     325           0 :         } else {
     326           0 :                 /*
     327           0 :                  * this branch is our 'one mirror IO has finished' event handler:
     328           0 :                  */
     329           0 :                 r1_bio->bios[mirror] = NULL;
     330           0 :                 to_put = bio;
     331           0 :                 if (!uptodate) {
     332           0 :                         md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
     333             :                         /* an I/O failed, we can't clear the bitmap */
     334           0 :                         set_bit(R1BIO_Degraded, &r1_bio->state);
     335             :                 } else
     336             :                         /*
     337             :                          * Set R1BIO_Uptodate in our master bio, so that
      338             :                          * we will return a good error code to the higher
     339             :                          * levels even if IO on some other mirrored buffer fails.
     340             :                          *
     341             :                          * The 'master' represents the composite IO operation to
      342             :                          * the user side. So if something waits for IO, then it will
     343             :                          * wait for the 'master' bio.
     344             :                          */
     345           0 :                         set_bit(R1BIO_Uptodate, &r1_bio->state);
     346             : 
     347           0 :                 update_head_pos(mirror, r1_bio);
     348             : 
     349           0 :                 if (behind) {
     350           0 :                         if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
     351           0 :                                 atomic_dec(&r1_bio->behind_remaining);
     352             : 
     353             :                         /* In behind mode, we ACK the master bio once the I/O has safely
     354             :                          * reached all non-writemostly disks. Setting the Returned bit
     355             :                          * ensures that this gets done only once -- we don't ever want to
     356             :                          * return -EIO here, instead we'll wait */
     357             : 
     358           0 :                         if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
     359             :                             test_bit(R1BIO_Uptodate, &r1_bio->state)) {
     360             :                                 /* Maybe we can return now */
     361           0 :                                 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
     362           0 :                                         struct bio *mbio = r1_bio->master_bio;
     363             :                                         PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
     364             :                                                (unsigned long long) mbio->bi_sector,
     365             :                                                (unsigned long long) mbio->bi_sector +
     366             :                                                (mbio->bi_size >> 9) - 1);
     367           0 :                                         bio_endio(mbio, 0);
     368             :                                 }
     369             :                         }
     370             :                 }
     371           0 :                 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
     372             :         }
     373             :         /*
     374             :          *
     375             :          * Let's see if all mirrored write operations have finished
     376             :          * already.
     377             :          */
     378           0 :         if (atomic_dec_and_test(&r1_bio->remaining)) {
     379           0 :                 if (test_bit(R1BIO_BarrierRetry, &r1_bio->state))
     380           0 :                         reschedule_retry(r1_bio);
     381             :                 else {
     382             :                         /* it really is the end of this request */
     383           0 :                         if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
     384             :                                 /* free extra copy of the data pages */
     385           0 :                                 int i = bio->bi_vcnt;
     386           0 :                                 while (i--)
     387           0 :                                         safe_put_page(bio->bi_io_vec[i].bv_page);
     388           0 :                         }
     389           0 :                         /* clear the bitmap if all writes complete successfully */
     390           0 :                         bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
     391             :                                         r1_bio->sectors,
     392             :                                         !test_bit(R1BIO_Degraded, &r1_bio->state),
     393             :                                         behind);
     394           0 :                         md_write_end(r1_bio->mddev);
     395           0 :                         raid_end_bio_io(r1_bio);
     396             :                 }
     397             :         }
     398             : 
     399           0 :         if (to_put)
     400           0 :                 bio_put(to_put);
     401           0 : }
     402             : 
     403             : 
     404             : /*
     405             :  * This routine returns the disk from which the requested read should
     406             :  * be done. There is a per-array 'next expected sequential IO' sector
     407             :  * number - if this matches on the next IO then we use the last disk.
      408             :  * There is also a per-disk 'last known head position' sector that is
     409             :  * maintained from IRQ contexts, both the normal and the resync IO
     410             :  * completion handlers update this position correctly. If there is no
     411             :  * perfect sequential match then we pick the disk whose head is closest.
     412             :  *
     413             :  * If there are 2 mirrors in the same 2 devices, performance degrades
     414             :  * because position is mirror, not device based.
     415             :  *
     416             :  * The rdev for the device selected will have nr_pending incremented.
     417             :  */
     418             : static int read_balance(conf_t *conf, r1bio_t *r1_bio)
     419             : {
     420           2 :         const sector_t this_sector = r1_bio->sector;
     421           3 :         int new_disk = conf->last_used, disk = new_disk;
     422           2 :         int wonly_disk = -1;
     423           2 :         const int sectors = r1_bio->sectors;
     424           1 :         sector_t new_distance, current_distance;
     425           1 :         mdk_rdev_t *rdev;
     426           1 : 
     427           3 :         rcu_read_lock();
     428           1 :         /*
     429           2 :          * Check if we can balance. We can balance on the whole
     430           1 :          * device if no resync is going on, or below the resync window.
     431           1 :          * We take the first readable disk when above the resync window.
     432           1 :          */
     433           1 :  retry:
     434           5 :         if (conf->mddev->recovery_cp < MaxSector &&
     435           1 :             (this_sector + sectors >= conf->next_resync)) {
      436           1 :                 /* Choose the first operational device, for consistency */
     437           2 :                 new_disk = 0;
     438           1 : 
     439          16 :                 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
     440           4 :                      r1_bio->bios[new_disk] == IO_BLOCKED ||
     441           1 :                      !rdev || !test_bit(In_sync, &rdev->flags)
     442           1 :                              || test_bit(WriteMostly, &rdev->flags);
     443          13 :                      rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) {
     444           8 : 
     445          17 :                         if (rdev && test_bit(In_sync, &rdev->flags) &&
     446             :                                 r1_bio->bios[new_disk] != IO_BLOCKED)
     447           2 :                                 wonly_disk = new_disk;
     448             : 
     449           8 :                         if (new_disk == conf->raid_disks - 1) {
     450           4 :                                 new_disk = wonly_disk;
     451           8 :                                 break;
     452             :                         }
     453             :                 }
     454           4 :                 goto rb_out;
     455             :         }
     456             : 
     457             : 
     458             :         /* make sure the disk is operational */
     459          15 :         for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
     460           3 :              r1_bio->bios[new_disk] == IO_BLOCKED ||
     461             :              !rdev || !test_bit(In_sync, &rdev->flags) ||
     462             :                      test_bit(WriteMostly, &rdev->flags);
     463          11 :              rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) {
     464           4 : 
     465          16 :                 if (rdev && test_bit(In_sync, &rdev->flags) &&
     466           1 :                     r1_bio->bios[new_disk] != IO_BLOCKED)
     467           1 :                         wonly_disk = new_disk;
     468             : 
     469           8 :                 if (new_disk <= 0)
     470           8 :                         new_disk = conf->raid_disks;
     471           4 :                 new_disk--;
     472           8 :                 if (new_disk == disk) {
     473           4 :                         new_disk = wonly_disk;
     474           4 :                         break;
     475             :                 }
     476             :         }
     477             : 
     478           8 :         if (new_disk < 0)
     479           4 :                 goto rb_out;
     480             : 
     481           4 :         disk = new_disk;
     482             :         /* now disk == new_disk == starting point for search */
     483             : 
     484             :         /*
     485             :          * Don't change to another disk for sequential reads:
     486             :          */
     487           8 :         if (conf->next_seq_sect == this_sector)
     488           4 :                 goto rb_out;
     489           8 :         if (this_sector == conf->mirrors[new_disk].head_position)
     490           4 :                 goto rb_out;
     491             : 
     492          28 :         current_distance = abs(this_sector - conf->mirrors[disk].head_position);
     493           4 : 
     494             :         /* Find the disk whose head is closest */
     495             : 
     496             :         do {
     497           2 :                 if (disk <= 0)
     498           1 :                         disk = conf->raid_disks;
     499           1 :                 disk--;
     500             : 
     501           2 :                 rdev = rcu_dereference(conf->mirrors[disk].rdev);
     502             : 
     503          12 :                 if (!rdev || r1_bio->bios[disk] == IO_BLOCKED ||
     504             :                     !test_bit(In_sync, &rdev->flags) ||
     505             :                     test_bit(WriteMostly, &rdev->flags))
     506           3 :                         continue;
     507             : 
     508           4 :                 if (!atomic_read(&rdev->nr_pending)) {
     509           1 :                         new_disk = disk;
     510           1 :                         break;
     511             :                 }
     512           7 :                 new_distance = abs(this_sector - conf->mirrors[disk].head_position);
     513           2 :                 if (new_distance < current_distance) {
     514           1 :                         current_distance = new_distance;
     515           1 :                         new_disk = disk;
     516             :                 }
     517           8 :         } while (disk != conf->last_used);
     518             : 
     519           5 :  rb_out:
     520             : 
     521             : 
     522          24 :         if (new_disk >= 0) {
     523          28 :                 rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
     524          28 :                 if (!rdev)
     525          12 :                         goto retry;
     526          24 :                 atomic_inc(&rdev->nr_pending);
     527           4 :                 if (!test_bit(In_sync, &rdev->flags)) {
     528             :                         /* cannot risk returning a device that failed
     529             :                          * before we inc'ed nr_pending
     530             :                          */
     531           3 :                         rdev_dec_pending(rdev, conf->mddev);
     532           1 :                         goto retry;
     533             :                 }
     534           1 :                 conf->next_seq_sect = this_sector + sectors;
     535           1 :                 conf->last_used = new_disk;
     536             :         }
     537          26 :         rcu_read_unlock();
     538             : 
     539           1 :         return new_disk;
     540             : }
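                        /* In short, the selection above is (a sketch of the logic):
                         *   keep conf->last_used for a strictly sequential read;
                         *   otherwise compute, per candidate disk,
                         *     distance = abs(this_sector - head_position)
                         *   taking an idle disk (nr_pending == 0) at once,
                         *   else the disk with the smallest distance.
                         */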
     541             : 
     542             : static void unplug_slaves(mddev_t *mddev)
     543             : {
     544          63 :         conf_t *conf = mddev->private;
     545          21 :         int i;
     546          21 : 
     547          63 :         rcu_read_lock();
     548         189 :         for (i=0; i<mddev->raid_disks; i++) {
     549         168 :                 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
     550         252 :                 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
     551          84 :                         struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
     552             : 
     553          42 :                         atomic_inc(&rdev->nr_pending);
     554          42 :                         rcu_read_unlock();
     555             : 
     556          21 :                         blk_unplug(r_queue);
     557             : 
     558          63 :                         rdev_dec_pending(rdev, mddev);
     559          42 :                         rcu_read_lock();
     560             :                 }
     561             :         }
     562          42 :         rcu_read_unlock();
     563          21 : }
     564             : 
     565             : static void raid1_unplug(struct request_queue *q)
     566             : {
     567          63 :         mddev_t *mddev = q->queuedata;
     568             : 
     569          42 :         unplug_slaves(mddev);
     570          21 :         md_wakeup_thread(mddev->thread);
     571          21 : }
     572             : 
     573             : static int raid1_congested(void *data, int bits)
     574             : {
     575           0 :         mddev_t *mddev = data;
     576           0 :         conf_t *conf = mddev->private;
     577           0 :         int i, ret = 0;
     578           0 : 
     579           0 :         if (mddev_congested(mddev, bits))
     580           0 :                 return 1;
     581           0 : 
     582           0 :         rcu_read_lock();
     583           0 :         for (i = 0; i < mddev->raid_disks; i++) {
     584           0 :                 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
     585           0 :                 if (rdev && !test_bit(Faulty, &rdev->flags)) {
     586           0 :                         struct request_queue *q = bdev_get_queue(rdev->bdev);
     587             : 
     588             :                         /* Note the '|| 1' - when read_balance prefers
     589             :                          * non-congested targets, it can be removed
     590             :                          */
     591             :                         if ((bits & (1<<BDI_async_congested)) || 1)
     592           0 :                                 ret |= bdi_congested(&q->backing_dev_info, bits);
     593             :                         else
     594             :                                 ret &= bdi_congested(&q->backing_dev_info, bits);
     595             :                 }
     596             :         }
     597           0 :         rcu_read_unlock();
     598           0 :         return ret;
     599             : }
     600             : 
     601             : 
     602             : static int flush_pending_writes(conf_t *conf)
     603             : {
     604           0 :         /* Any writes that have been queued but are awaiting
     605           0 :          * bitmap updates get flushed here.
     606           0 :          * We return 1 if any requests were actually submitted.
     607             :          */
     608           0 :         int rv = 0;
     609             : 
     610           0 :         spin_lock_irq(&conf->device_lock);
     611             : 
     612           0 :         if (conf->pending_bio_list.head) {
     613             :                 struct bio *bio;
     614           0 :                 bio = bio_list_get(&conf->pending_bio_list);
     615           0 :                 blk_remove_plug(conf->mddev->queue);
     616           0 :                 spin_unlock_irq(&conf->device_lock);
     617             :                 /* flush any pending bitmap writes to
     618             :                  * disk before proceeding w/ I/O */
     619           0 :                 bitmap_unplug(conf->mddev->bitmap);
     620             : 
     621           0 :                 while (bio) { /* submit pending writes */
     622           0 :                         struct bio *next = bio->bi_next;
     623           0 :                         bio->bi_next = NULL;
     624           0 :                         generic_make_request(bio);
     625           0 :                         bio = next;
     626             :                 }
     627           0 :                 rv = 1;
     628             :         } else
     629           0 :                 spin_unlock_irq(&conf->device_lock);
     630           0 :         return rv;
     631             : }
     632             : 
     633             : /* Barriers....
     634             :  * Sometimes we need to suspend IO while we do something else,
     635             :  * either some resync/recovery, or reconfigure the array.
     636             :  * To do this we raise a 'barrier'.
     637             :  * The 'barrier' is a counter that can be raised multiple times
     638             :  * to count how many activities are happening which preclude
     639             :  * normal IO.
     640             :  * We can only raise the barrier if there is no pending IO.
     641             :  * i.e. if nr_pending == 0.
     642             :  * We choose only to raise the barrier if no-one is waiting for the
     643             :  * barrier to go down.  This means that as soon as an IO request
     644             :  * is ready, no other operations which require a barrier will start
     645             :  * until the IO request has had a chance.
     646             :  *
      647             :  * So: regular IO calls 'wait_barrier'.  When that returns there
      648             :  *    is no background IO happening.  It must arrange to call
      649             :  *    allow_barrier when it has finished its IO.
      650             :  * Background IO calls must call raise_barrier.  Once that returns
      651             :  *    there is no normal IO happening.  It must arrange to call
      652             :  *    lower_barrier when the particular background IO completes.
     653             :  */
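                        /* A pairing sketch (illustrative; the resync-side callers
                         * sit outside this excerpt):
                         *
                         *   regular IO (make_request)      background IO (resync)
                         *   -------------------------      ----------------------
                         *   wait_barrier(conf);            raise_barrier(conf);
                         *   ...submit bios...              ...issue resync IO...
                         *   allow_barrier(conf);           lower_barrier(conf);
                         *     (via free_r1bio)               (via put_buf)
                         */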
     654             : #define RESYNC_DEPTH 32
     655             : 
     656             : static void raise_barrier(conf_t *conf)
     657             : {
     658          24 :         spin_lock_irq(&conf->resync_lock);
     659           8 : 
     660           8 :         /* Wait until no block IO is waiting */
     661         272 :         wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
     662          24 :                             conf->resync_lock,
     663          24 :                             raid1_unplug(conf->mddev->queue));
     664           8 : 
     665           8 :         /* block any new IO from starting */
     666          24 :         conf->barrier++;
     667           8 : 
      668           8 :         /* Now wait for all pending IO to complete */
     669         360 :         wait_event_lock_irq(conf->wait_barrier,
     670          24 :                             !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
     671          24 :                             conf->resync_lock,
     672           8 :                             raid1_unplug(conf->mddev->queue));
     673           8 : 
     674          48 :         spin_unlock_irq(&conf->resync_lock);
     675           8 : }
     676             : 
     677             : static void lower_barrier(conf_t *conf)
     678             : {
     679           4 :         unsigned long flags;
     680          28 :         BUG_ON(conf->barrier <= 0);
     681          16 :         spin_lock_irqsave(&conf->resync_lock, flags);
     682           4 :         conf->barrier--;
     683           8 :         spin_unlock_irqrestore(&conf->resync_lock, flags);
     684           4 :         wake_up(&conf->wait_barrier);
     685           4 : }
     686             : 
     687             : static void wait_barrier(conf_t *conf)
     688             : {
     689          15 :         spin_lock_irq(&conf->resync_lock);
     690          15 :         if (conf->barrier) {
     691          10 :                 conf->nr_waiting++;
     692         170 :                 wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
     693          15 :                                     conf->resync_lock,
     694          15 :                                     raid1_unplug(conf->mddev->queue));
     695          15 :                 conf->nr_waiting--;
     696           5 :         }
     697          10 :         conf->nr_pending++;
     698          20 :         spin_unlock_irq(&conf->resync_lock);
     699           5 : }
     700             : 
     701             : static void allow_barrier(conf_t *conf)
     702             : {
     703           5 :         unsigned long flags;
     704          20 :         spin_lock_irqsave(&conf->resync_lock, flags);
     705           5 :         conf->nr_pending--;
     706          10 :         spin_unlock_irqrestore(&conf->resync_lock, flags);
     707           5 :         wake_up(&conf->wait_barrier);
     708           5 : }
     709             : 
     710             : static void freeze_array(conf_t *conf)
     711             : {
     712           0 :         /* stop syncio and normal IO and wait for everything to
      713           0 :          * go quiet.
      714           0 :          * We increment barrier and nr_waiting, and then
      715           0 :          * wait until nr_pending matches nr_queued+1.
     716           0 :          * This is called in the context of one normal IO request
     717           0 :          * that has failed. Thus any sync request that might be pending
     718           0 :          * will be blocked by nr_pending, and we need to wait for
     719           0 :          * pending IO requests to complete or be queued for re-try.
     720             :          * Thus the number queued (nr_queued) plus this request (1)
     721             :          * must match the number of pending IOs (nr_pending) before
     722             :          * we continue.
     723             :          */
     724           0 :         spin_lock_irq(&conf->resync_lock);
     725           0 :         conf->barrier++;
     726           0 :         conf->nr_waiting++;
     727           0 :         wait_event_lock_irq(conf->wait_barrier,
     728           0 :                             conf->nr_pending == conf->nr_queued+1,
     729           0 :                             conf->resync_lock,
     730             :                             ({ flush_pending_writes(conf);
     731             :                                raid1_unplug(conf->mddev->queue); }));
     732           0 :         spin_unlock_irq(&conf->resync_lock);
     733           0 : }
     734             : static void unfreeze_array(conf_t *conf)
     735             : {
     736             :         /* reverse the effect of the freeze */
     737           0 :         spin_lock_irq(&conf->resync_lock);
     738           0 :         conf->barrier--;
     739           0 :         conf->nr_waiting--;
     740           0 :         wake_up(&conf->wait_barrier);
     741           0 :         spin_unlock_irq(&conf->resync_lock);
     742           0 : }
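                        /* Typical pairing (a sketch; the error-recovery code in
                         * the raid1d thread brackets its fix-up work like this):
                         *
                         *   freeze_array(conf);     waits for nr_pending == nr_queued+1
                         *   ...handle the failed request...
                         *   unfreeze_array(conf);
                         */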
     743             : 
     744             : 
     745             : /* duplicate the data pages for behind I/O */
     746             : static struct page **alloc_behind_pages(struct bio *bio)
     747             : {
     748           1 :         int i;
     749           1 :         struct bio_vec *bvec;
     750           5 :         struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page *),
     751           1 :                                         GFP_NOIO);
     752           5 :         if (unlikely(!pages))
     753           2 :                 goto do_sync_io;
     754           1 : 
     755          11 :         bio_for_each_segment(bvec, bio, i) {
     756           3 :                 pages[i] = alloc_page(GFP_NOIO);
     757           6 :                 if (unlikely(!pages[i]))
     758           1 :                         goto do_sync_io;
     759           8 :                 memcpy(kmap(pages[i]) + bvec->bv_offset,
     760             :                         kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
     761           2 :                 kunmap(pages[i]);
     762           2 :                 kunmap(bvec->bv_page);
     763             :         }
     764             : 
     765           1 :         return pages;
     766           2 : 
     767             : do_sync_io:
     768           4 :         if (pages)
     769          10 :                 for (i = 0; i < bio->bi_vcnt && pages[i]; i++)
     770           2 :                         put_page(pages[i]);
     771           5 :         kfree(pages);
     772             :         PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
     773           3 :         return NULL;
     774             : }
     775             : 
     776             : static int make_request(struct request_queue *q, struct bio * bio)
     777             : {
     778           3 :         mddev_t *mddev = q->queuedata;
     779           3 :         conf_t *conf = mddev->private;
     780           1 :         mirror_info_t *mirror;
     781           1 :         r1bio_t *r1_bio;
     782           1 :         struct bio *read_bio;
     783           2 :         int i, targets = 0, disks;
     784           1 :         struct bitmap *bitmap;
     785           1 :         unsigned long flags;
     786           1 :         struct bio_list bl;
     787           2 :         struct page **behind_pages = NULL;
     788           3 :         const int rw = bio_data_dir(bio);
     789           4 :         const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
     790           1 :         int cpu;
     791           1 :         bool do_barriers;
     792           1 :         mdk_rdev_t *blocked_rdev;
     793           1 : 
     794           1 :         /*
     795           1 :          * Register the new request and wait if the reconstruction
     796           1 :          * thread has put up a bar for new requests.
     797           1 :          * Continue immediately if no resync is active currently.
     798           1 :          * We test barriers_work *after* md_write_start as md_write_start
     799           1 :          * may cause the first superblock write, and that will check out
     800           1 :          * if barriers work.
     801           1 :          */
     802           1 : 
     803           2 :         md_write_start(mddev, bio); /* wait on superblock update early */
     804           1 : 
     805           8 :         if (bio_data_dir(bio) == WRITE &&
     806           1 :             bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo &&
     807           1 :             bio->bi_sector < mddev->suspend_hi) {
     808           1 :                 /* As the suspend_* range is controlled by
     809           1 :                  * userspace, we want an interruptible
     810           1 :                  * wait.
     811           1 :                  */
     812           8 :                 DEFINE_WAIT(w);
     813           1 :                 for (;;) {
     814           5 :                         flush_signals(current);
     815           2 :                         prepare_to_wait(&conf->wait_barrier,
     816           1 :                                         &w, TASK_INTERRUPTIBLE);
     817           5 :                         if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo ||
     818           1 :                             bio->bi_sector >= mddev->suspend_hi)
     819           2 :                                 break;
     820           2 :                         schedule();
     821           2 :                 }
     822           3 :                 finish_wait(&conf->wait_barrier, &w);
     823           1 :         }
     824          13 :         if (unlikely(!mddev->barriers_work &&
     825           1 :                      bio_rw_flagged(bio, BIO_RW_BARRIER))) {
     826           3 :                 if (rw == WRITE)
     827           2 :                         md_write_end(mddev);
     828           1 :                 bio_endio(bio, -EOPNOTSUPP);
     829           1 :                 return 0;
     830             :         }
     831             : 
     832           4 :         wait_barrier(conf);
     833             : 
     834           1 :         bitmap = mddev->bitmap;
     835             : 
     836           9 :         cpu = part_stat_lock();
     837           9 :         part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
     838          14 :         part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
     839             :                       bio_sectors(bio));
     840           6 :         part_stat_unlock();
     841             : 
     842             :         /*
     843             :          * make_request() can abort the operation when READA is being
     844             :          * used and no empty request is available.
     845             :          *
     846             :          */
     847           2 :         r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
     848             : 
     849           1 :         r1_bio->master_bio = bio;
     850           1 :         r1_bio->sectors = bio->bi_size >> 9;
     851           1 :         r1_bio->state = 0;
     852           1 :         r1_bio->mddev = mddev;
     853           1 :         r1_bio->sector = bio->bi_sector;
     854             : 
     855           2 :         if (rw == READ) {
     856             :                 /*
     857             :                  * read balancing logic:
     858             :                  */
     859           3 :                 int rdisk = read_balance(conf, r1_bio);
     860             : 
     861           2 :                 if (rdisk < 0) {
     862             :                         /* couldn't find anywhere to read from */
     863           2 :                         raid_end_bio_io(r1_bio);
     864           1 :                         return 0;
     865             :                 }
     866           1 :                 mirror = conf->mirrors + rdisk;
     867             : 
     868           1 :                 r1_bio->read_disk = rdisk;
     869             : 
     870           1 :                 read_bio = bio_clone(bio, GFP_NOIO);
     871             : 
     872           1 :                 r1_bio->bios[rdisk] = read_bio;
     873             : 
     874           1 :                 read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
     875           1 :                 read_bio->bi_bdev = mirror->rdev->bdev;
     876           1 :                 read_bio->bi_end_io = raid1_end_read_request;
     877           1 :                 read_bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO);
     878           1 :                 read_bio->bi_private = r1_bio;
     879             : 
     880           1 :                 generic_make_request(read_bio);
     881           1 :                 return 0;
     882             :         }
     883             : 
     884             :         /*
     885             :          * WRITE:
     886             :          */
     887             :         /* first select target devices under spinlock and
     888             :          * inc refcount on their rdev.  Record them by setting
     889             :          * bios[x] to bio
     890             :          */
     891           1 :         disks = conf->raid_disks;
     892           1 : #if 0
     893             :         { static int first=1;
     894             :         if (first) printk("First Write sector %llu disks %d\n",
     895             :                           (unsigned long long)r1_bio->sector, disks);
     896             :         first = 0;
     897             :         }
     898             : #endif
     899             :  retry_write:
     900           1 :         blocked_rdev = NULL;
     901           2 :         rcu_read_lock();
     902           9 :         for (i = 0;  i < disks; i++) {
     903           8 :                 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
     904           9 :                 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
     905           2 :                         atomic_inc(&rdev->nr_pending);
     906           1 :                         blocked_rdev = rdev;
     907           1 :                         break;
     908           1 :                 }
     909          10 :                 if (rdev && !test_bit(Faulty, &rdev->flags)) {
     910           2 :                         atomic_inc(&rdev->nr_pending);
     911           4 :                         if (test_bit(Faulty, &rdev->flags)) {
     912           3 :                                 rdev_dec_pending(rdev, mddev);
     913           1 :                                 r1_bio->bios[i] = NULL;
     914             :                         } else {
     915           1 :                                 r1_bio->bios[i] = bio;
     916           1 :                                 targets++;
     917             :                         }
     918             :                 } else
     919           3 :                         r1_bio->bios[i] = NULL;
     920             :         }
     921           4 :         rcu_read_unlock();
     922             : 
     923           4 :         if (unlikely(blocked_rdev)) {
     924             :                 /* Wait for this device to become unblocked */
     925             :                 int j;
     926             : 
     927           6 :                 for (j = 0; j < i; j++)
     928           5 :                         if (r1_bio->bios[j])
     929           4 :                                 rdev_dec_pending(conf->mirrors[j].rdev, mddev);
     930             : 
     931           2 :                 allow_barrier(conf);
     932           1 :                 md_wait_for_blocked_rdev(blocked_rdev, mddev);
     933           2 :                 wait_barrier(conf);
     934           1 :                 goto retry_write;
     935             :         }
     936             : 
     937           6 :         BUG_ON(targets == 0); /* we never fail the last device */
     938             : 
     939           2 :         if (targets < conf->raid_disks) {
     940             :                 /* array is degraded, we will not clear the bitmap
     941             :                  * on I/O completion (see raid1_end_write_request) */
     942           2 :                 set_bit(R1BIO_Degraded, &r1_bio->state);
     943             :         }
     944             : 
     945             :         /* do behind I/O ? */
     946          17 :         if (bitmap &&
     947             :             (atomic_read(&bitmap->behind_writes)
     948             :              < mddev->bitmap_info.max_write_behind) &&
     949             :             (behind_pages = alloc_behind_pages(bio)) != NULL)
     950           2 :                 set_bit(R1BIO_BehindIO, &r1_bio->state);
     951             : 
     952          10 :         atomic_set(&r1_bio->remaining, 0);
     953           2 :         atomic_set(&r1_bio->behind_remaining, 0);
     954             : 
     955           2 :         do_barriers = bio_rw_flagged(bio, BIO_RW_BARRIER);
     956           2 :         if (do_barriers)
     957           2 :                 set_bit(R1BIO_Barrier, &r1_bio->state);
     958             : 
     959           4 :         bio_list_init(&bl);
     960           6 :         for (i = 0; i < disks; i++) {
     961           3 :                 struct bio *mbio;
     962           3 :                 if (!r1_bio->bios[i])
     963           1 :                         continue;
     964             : 
     965           1 :                 mbio = bio_clone(bio, GFP_NOIO);
     966           1 :                 r1_bio->bios[i] = mbio;
     967             : 
     968           1 :                 mbio->bi_sector      = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
     969           1 :                 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
     970           1 :                 mbio->bi_end_io      = raid1_end_write_request;
     971           1 :                 mbio->bi_rw = WRITE | (do_barriers << BIO_RW_BARRIER) |
     972             :                         (do_sync << BIO_RW_SYNCIO);
     973           1 :                 mbio->bi_private = r1_bio;
     974             : 
     975           2 :                 if (behind_pages) {
     976             :                         struct bio_vec *bvec;
     977             :                         int j;
     978             : 
     979             :                         /* Yes, I really want the '__' version so that
     980             :                          * we clear any unused pointer in the io_vec, rather
     981             :                          * than leave them unchanged.  This is important
     982             :                          * because when we come to free the pages, we won't
      983             :                          * know the original bi_idx, so we just free
     984             :                          * them all
     985             :                          */
     986           8 :                         __bio_for_each_segment(bvec, mbio, j, 0)
     987           3 :                                 bvec->bv_page = behind_pages[j];
     988           5 :                         if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
     989           2 :                                 atomic_inc(&r1_bio->behind_remaining);
     990             :                 }
     991             : 
     992           6 :                 atomic_inc(&r1_bio->remaining);
     993             : 
     994           2 :                 bio_list_add(&bl, mbio);
     995             :         }
     996           2 :         kfree(behind_pages); /* the behind pages are attached to the bios now */
     997             : 
     998           4 :         bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
     999             :                                 test_bit(R1BIO_BehindIO, &r1_bio->state));
    1000           3 :         spin_lock_irqsave(&conf->device_lock, flags);
    1001           2 :         bio_list_merge(&conf->pending_bio_list, &bl);
    1002           2 :         bio_list_init(&bl);
    1003             : 
    1004           1 :         blk_plug_device(mddev->queue);
    1005           2 :         spin_unlock_irqrestore(&conf->device_lock, flags);
    1006             : 
    1007             :         /* In case raid1d snuck into freeze_array */
    1008           1 :         wake_up(&conf->wait_barrier);
    1009             : 
    1010           2 :         if (do_sync)
    1011           1 :                 md_wakeup_thread(mddev->thread);
    1012             : #if 0
    1013             :         while ((bio = bio_list_pop(&bl)) != NULL)
    1014             :                 generic_make_request(bio);
    1015             : #endif
    1016             : 
    1017           1 :         return 0;
    1018             : }
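                      : /*
                      :  * Note that the write path above never submits its clones directly;
                      :  * it only queues them on conf->pending_bio_list under device_lock,
                      :  * and raid1d later drains that list (see flush_pending_writes() used
                      :  * by raid1d below).  A minimal sketch of that drain step, using only
                      :  * the bio_list calls already seen above -- illustrative, not the
                      :  * driver's actual helper:
                      :  */
                      : #if 0
                      : static void drain_pending_sketch(conf_t *conf)
                      : {
                      :         struct bio_list tmp;
                      :         struct bio *b;
                      :         unsigned long flags;
                      : 
                      :         bio_list_init(&tmp);
                      :         spin_lock_irqsave(&conf->device_lock, flags);
                      :         bio_list_merge(&tmp, &conf->pending_bio_list);
                      :         bio_list_init(&conf->pending_bio_list);
                      :         spin_unlock_irqrestore(&conf->device_lock, flags);
                      : 
                      :         /* submit outside the lock, preserving queueing order */
                      :         while ((b = bio_list_pop(&tmp)) != NULL)
                      :                 generic_make_request(b);
                      : }
                      : #endif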
    1019             : 
    1020             : static void status(struct seq_file *seq, mddev_t *mddev)
    1021             : {
    1022           3 :         conf_t *conf = mddev->private;
    1023           1 :         int i;
    1024           1 : 
    1025           2 :         seq_printf(seq, " [%d/%d] [", conf->raid_disks,
    1026           1 :                    conf->raid_disks - mddev->degraded);
    1027           3 :         rcu_read_lock();
    1028           6 :         for (i = 0; i < conf->raid_disks; i++) {
    1029           3 :                 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
    1030          14 :                 seq_printf(seq, "%s",
    1031             :                            rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
    1032             :         }
    1033           2 :         rcu_read_unlock();
    1034           1 :         seq_printf(seq, "]");
    1035           1 : }
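                      : /*
                      :  * For example, a two-disk array whose second member has failed would
                      :  * be reported by status() above as " [2/1] [U_]": total/working disk
                      :  * counts, then one flag per mirror (U = In_sync, _ = missing or out
                      :  * of sync).
                      :  */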
    1036             : 
    1037             : 
    1038             : static void error(mddev_t *mddev, mdk_rdev_t *rdev)
    1039             : {
    1040           1 :         char b[BDEVNAME_SIZE];
    1041           3 :         conf_t *conf = mddev->private;
    1042           1 : 
    1043           1 :         /*
    1044           1 :          * If it is not operational, then we have already marked it as dead
     1045           1 :          * else if it is the last working disk, ignore the error, let the
    1046           1 :          * next level up know.
    1047             :          * else mark the drive as failed
    1048             :          */
    1049           6 :         if (test_bit(In_sync, &rdev->flags)
    1050             :             && (conf->raid_disks - mddev->degraded) == 1) {
    1051             :                 /*
    1052             :                  * Don't fail the drive, act as though we were just a
    1053             :                  * normal single drive.
    1054             :                  * However don't try a recovery from this drive as
    1055             :                  * it is very likely to fail.
    1056             :                  */
    1057           1 :                 mddev->recovery_disabled = 1;
    1058           1 :                 return;
    1059             :         }
    1060           4 :         if (test_and_clear_bit(In_sync, &rdev->flags)) {
    1061             :                 unsigned long flags;
    1062           3 :                 spin_lock_irqsave(&conf->device_lock, flags);
    1063           1 :                 mddev->degraded++;
    1064           2 :                 set_bit(Faulty, &rdev->flags);
    1065           2 :                 spin_unlock_irqrestore(&conf->device_lock, flags);
    1066             :                 /*
    1067             :                  * if recovery is running, make sure it aborts.
    1068             :                  */
    1069           2 :                 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
    1070             :         } else
    1071           2 :                 set_bit(Faulty, &rdev->flags);
    1072           4 :         set_bit(MD_CHANGE_DEVS, &mddev->flags);
    1073           2 :         printk(KERN_ALERT "raid1: Disk failure on %s, disabling device.\n"
    1074             :                 "raid1: Operation continuing on %d devices.\n",
    1075             :                 bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
    1076           1 : }
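                      : /*
                      :  * Worked example of the last-disk guard above: on a two-disk array
                      :  * that is already degraded, conf->raid_disks - mddev->degraded is
                      :  * 2 - 1 == 1, so an In_sync failure on the surviving disk only
                      :  * disables recovery instead of marking the last copy Faulty and
                      :  * losing the data.
                      :  */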
    1077             : 
    1078             : static void print_conf(conf_t *conf)
    1079             : {
    1080           9 :         int i;
    1081           9 : 
    1082          18 :         printk("RAID1 conf printout:\n");
    1083          27 :         if (!conf) {
    1084          18 :                 printk("(!conf)\n");
    1085          18 :                 return;
    1086           9 :         }
    1087           9 :         printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
    1088             :                 conf->raid_disks);
    1089             : 
    1090          18 :         rcu_read_lock();
    1091          54 :         for (i = 0; i < conf->raid_disks; i++) {
    1092          27 :                 char b[BDEVNAME_SIZE];
    1093          27 :                 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
    1094          18 :                 if (rdev)
    1095          54 :                         printk(" disk %d, wo:%d, o:%d, dev:%s\n",
    1096             :                                i, !test_bit(In_sync, &rdev->flags),
    1097             :                                !test_bit(Faulty, &rdev->flags),
    1098             :                                bdevname(rdev->bdev,b));
    1099             :         }
    1100          18 :         rcu_read_unlock();
    1101           9 : }
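                      : /*
                      :  * Sample print_conf() output for a healthy two-disk array, given the
                      :  * formats above (wo = write-only, i.e. not In_sync; o = operational,
                      :  * i.e. not Faulty); the device names are illustrative:
                      :  *
                      :  *   RAID1 conf printout:
                      :  *    --- wd:2 rd:2
                      :  *    disk 0, wo:0, o:1, dev:sda1
                      :  *    disk 1, wo:0, o:1, dev:sdb1
                      :  */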
    1102             : 
    1103             : static void close_sync(conf_t *conf)
    1104             : {
    1105           4 :         wait_barrier(conf);
    1106           4 :         allow_barrier(conf);
    1107             : 
    1108           2 :         mempool_destroy(conf->r1buf_pool);
    1109           2 :         conf->r1buf_pool = NULL;
    1110           2 : }
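                      : /*
                      :  * The wait_barrier()/allow_barrier() pair above acts as a flush:
                      :  * wait_barrier() cannot return while a resync barrier is still
                      :  * raised, so by the time allow_barrier() drops the reference again,
                      :  * no resync request can still be holding buffers from r1buf_pool
                      :  * and the mempool is safe to destroy.
                      :  */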
    1111             : 
    1112             : static int raid1_spare_active(mddev_t *mddev)
    1113             : {
    1114           1 :         int i;
    1115           3 :         conf_t *conf = mddev->private;
    1116           1 : 
    1117           1 :         /*
     1118           1 :          * Find all recovered disks within the RAID1 configuration
     1119           1 :          * and mark them In_sync.
    1120           1 :          * Called under mddev lock, so rcu protection not needed.
    1121             :          */
    1122           8 :         for (i = 0; i < conf->raid_disks; i++) {
    1123           6 :                 mdk_rdev_t *rdev = conf->mirrors[i].rdev;
    1124          11 :                 if (rdev
    1125             :                     && !test_bit(Faulty, &rdev->flags)
    1126             :                     && !test_and_set_bit(In_sync, &rdev->flags)) {
    1127             :                         unsigned long flags;
    1128           3 :                         spin_lock_irqsave(&conf->device_lock, flags);
    1129           1 :                         mddev->degraded--;
    1130           2 :                         spin_unlock_irqrestore(&conf->device_lock, flags);
    1131             :                 }
    1132             :         }
    1133             : 
    1134           3 :         print_conf(conf);
    1135           1 :         return 0;
    1136             : }
    1137             : 
    1138             : 
    1139             : static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
    1140             : {
    1141           3 :         conf_t *conf = mddev->private;
    1142           2 :         int err = -EEXIST;
    1143           2 :         int mirror = 0;
    1144           1 :         mirror_info_t *p;
    1145           2 :         int first = 0;
    1146           2 :         int last = mddev->raid_disks - 1;
    1147           1 : 
    1148           2 :         if (rdev->raid_disk >= 0)
    1149           2 :                 first = last = rdev->raid_disk;
    1150             : 
    1151           5 :         for (mirror = first; mirror <= last; mirror++)
    1152           6 :                 if ( !(p=conf->mirrors+mirror)->rdev) {
    1153           1 : 
    1154           1 :                         disk_stack_limits(mddev->gendisk, rdev->bdev,
    1155             :                                           rdev->data_offset << 9);
    1156             :                         /* as we don't honour merge_bvec_fn, we must never risk
    1157           1 :                          * violating it, so limit ->max_sector to one PAGE, as
    1158             :                          * a one page request is never in violation.
    1159             :                          */
    1160           7 :                         if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
    1161             :                             queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
    1162           1 :                                 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
    1163             : 
    1164           2 :                         p->head_position = 0;
    1165           2 :                         rdev->raid_disk = mirror;
    1166           2 :                         err = 0;
    1167             :                         /* As all devices are equivalent, we don't need a full recovery
     1168             :                          * if this device was recently part of the array
    1169             :                          */
    1170           4 :                         if (rdev->saved_raid_disk < 0)
    1171           2 :                                 conf->fullsync = 1;
    1172           4 :                         rcu_assign_pointer(p->rdev, rdev);
    1173           2 :                         break;
    1174             :                 }
    1175           2 :         md_integrity_add_rdev(rdev, mddev);
    1176           6 :         print_conf(conf);
    1177           1 :         return err;
    1178             : }
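                      : /*
                      :  * Note on the merge_bvec_fn clamp in raid1_add_disk(): sectors are
                      :  * 512 bytes, so PAGE_SIZE >> 9 converts one page to sectors -- with
                      :  * 4096-byte pages that is 8 sectors, i.e. requests are capped at a
                      :  * single page, which can never violate the member's merge_bvec_fn.
                      :  */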
    1179             : 
    1180             : static int raid1_remove_disk(mddev_t *mddev, int number)
    1181             : {
    1182           3 :         conf_t *conf = mddev->private;
    1183           2 :         int err = 0;
    1184           1 :         mdk_rdev_t *rdev;
    1185           2 :         mirror_info_t *p = conf->mirrors+ number;
    1186           1 : 
    1187           4 :         print_conf(conf);
    1188           2 :         rdev = p->rdev;
    1189           3 :         if (rdev) {
    1190           8 :                 if (test_bit(In_sync, &rdev->flags) ||
    1191             :                     atomic_read(&rdev->nr_pending)) {
    1192           2 :                         err = -EBUSY;
    1193           2 :                         goto abort;
    1194             :                 }
     1195             :                 /* Only remove non-faulty devices if recovery
    1196             :                  * is not possible.
    1197             :                  */
    1198           8 :                 if (!test_bit(Faulty, &rdev->flags) &&
    1199             :                     !mddev->recovery_disabled &&
    1200             :                     mddev->degraded < conf->raid_disks) {
    1201           1 :                         err = -EBUSY;
    1202           1 :                         goto abort;
    1203             :                 }
    1204           1 :                 p->rdev = NULL;
    1205           1 :                 synchronize_rcu();
    1206           4 :                 if (atomic_read(&rdev->nr_pending)) {
    1207             :                         /* lost the race, try later */
    1208           1 :                         err = -EBUSY;
    1209           1 :                         p->rdev = rdev;
    1210           1 :                         goto abort;
    1211             :                 }
    1212           1 :                 md_integrity_register(mddev);
    1213             :         }
    1214             : abort:
    1215             : 
    1216          15 :         print_conf(conf);
    1217           3 :         return err;
    1218             : }
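                      : /*
                      :  * raid1_remove_disk() above is the unpublish side of an RCU pattern;
                      :  * it only works because readers follow the discipline sketched below
                      :  * (cf. the write path earlier in this file):
                      :  */
                      : #if 0
                      : rcu_read_lock();
                      : rdev = rcu_dereference(conf->mirrors[i].rdev);
                      : if (rdev)
                      :         atomic_inc(&rdev->nr_pending);  /* pin beyond the unlock */
                      : rcu_read_unlock();
                      : /* ...so after p->rdev = NULL and synchronize_rcu(), any reader that
                      :  * saw the old pointer is visible via rdev->nr_pending. */
                      : #endif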
    1219             : 
    1220             : 
    1221             : static void end_sync_read(struct bio *bio, int error)
    1222             : {
    1223           0 :         r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
    1224           0 :         int i;
    1225           0 : 
    1226           0 :         for (i=r1_bio->mddev->raid_disks; i--; )
    1227           0 :                 if (r1_bio->bios[i] == bio)
    1228           0 :                         break;
    1229           0 :         BUG_ON(i < 0);
    1230           0 :         update_head_pos(i, r1_bio);
    1231             :         /*
    1232           0 :          * we have read a block, now it needs to be re-written,
    1233             :          * or re-read if the read failed.
    1234             :          * We don't do much here, just schedule handling by raid1d
    1235             :          */
    1236           0 :         if (test_bit(BIO_UPTODATE, &bio->bi_flags))
    1237           0 :                 set_bit(R1BIO_Uptodate, &r1_bio->state);
    1238             : 
    1239           0 :         if (atomic_dec_and_test(&r1_bio->remaining))
    1240           0 :                 reschedule_retry(r1_bio);
    1241           0 : }
    1242             : 
    1243             : static void end_sync_write(struct bio *bio, int error)
    1244             : {
    1245           0 :         int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
    1246           0 :         r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
    1247           0 :         mddev_t *mddev = r1_bio->mddev;
    1248           0 :         conf_t *conf = mddev->private;
    1249           0 :         int i;
    1250           0 :         int mirror=0;
    1251           0 : 
    1252           0 :         for (i = 0; i < conf->raid_disks; i++)
    1253           0 :                 if (r1_bio->bios[i] == bio) {
    1254           0 :                         mirror = i;
    1255           0 :                         break;
    1256           0 :                 }
    1257           0 :         if (!uptodate) {
    1258           0 :                 int sync_blocks = 0;
    1259           0 :                 sector_t s = r1_bio->sector;
    1260           0 :                 long sectors_to_go = r1_bio->sectors;
     1261           0 :                 /* make sure these bits don't get cleared. */
    1262             :                 do {
    1263           0 :                         bitmap_end_sync(mddev->bitmap, s,
    1264             :                                         &sync_blocks, 1);
    1265           0 :                         s += sync_blocks;
    1266           0 :                         sectors_to_go -= sync_blocks;
    1267           0 :                 } while (sectors_to_go > 0);
    1268           0 :                 md_error(mddev, conf->mirrors[mirror].rdev);
    1269           0 :         }
    1270             : 
    1271           0 :         update_head_pos(mirror, r1_bio);
    1272             : 
    1273           0 :         if (atomic_dec_and_test(&r1_bio->remaining)) {
    1274           0 :                 sector_t s = r1_bio->sectors;
    1275           0 :                 put_buf(r1_bio);
    1276           0 :                 md_done_sync(mddev, s, uptodate);
    1277             :         }
    1278           0 : }
    1279             : 
    1280             : static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
    1281             : {
    1282           0 :         conf_t *conf = mddev->private;
    1283           0 :         int i;
    1284           0 :         int disks = conf->raid_disks;
    1285           0 :         struct bio *bio, *wbio;
    1286           0 : 
    1287           0 :         bio = r1_bio->bios[r1_bio->read_disk];
    1288           0 : 
    1289           0 : 
    1290           0 :         if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
    1291           0 :                 /* We have read all readable devices.  If we haven't
    1292           0 :                  * got the block, then there is no hope left.
    1293           0 :                  * If we have, then we want to do a comparison
    1294           0 :                  * and skip the write if everything is the same.
    1295           0 :                  * If any blocks failed to read, then we need to
    1296           0 :                  * attempt an over-write
    1297           0 :                  */
    1298           0 :                 int primary;
    1299           0 :                 if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
    1300           0 :                         for (i=0; i<mddev->raid_disks; i++)
    1301           0 :                                 if (r1_bio->bios[i]->bi_end_io == end_sync_read)
    1302           0 :                                         md_error(mddev, conf->mirrors[i].rdev);
    1303           0 : 
    1304           0 :                         md_done_sync(mddev, r1_bio->sectors, 1);
    1305           0 :                         put_buf(r1_bio);
    1306           0 :                         return;
    1307           0 :                 }
    1308           0 :                 for (primary=0; primary<mddev->raid_disks; primary++)
    1309           0 :                         if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
    1310           0 :                             test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
    1311           0 :                                 r1_bio->bios[primary]->bi_end_io = NULL;
    1312           0 :                                 rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
    1313           0 :                                 break;
    1314           0 :                         }
    1315           0 :                 r1_bio->read_disk = primary;
    1316           0 :                 for (i=0; i<mddev->raid_disks; i++)
    1317           0 :                         if (r1_bio->bios[i]->bi_end_io == end_sync_read) {
    1318           0 :                                 int j;
    1319           0 :                                 int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
    1320           0 :                                 struct bio *pbio = r1_bio->bios[primary];
    1321           0 :                                 struct bio *sbio = r1_bio->bios[i];
    1322           0 : 
    1323           0 :                                 if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
    1324           0 :                                         for (j = vcnt; j-- ; ) {
    1325           0 :                                                 struct page *p, *s;
    1326           0 :                                                 p = pbio->bi_io_vec[j].bv_page;
    1327           0 :                                                 s = sbio->bi_io_vec[j].bv_page;
    1328           0 :                                                 if (memcmp(page_address(p),
    1329             :                                                            page_address(s),
    1330           0 :                                                            PAGE_SIZE))
    1331           0 :                                                         break;
    1332             :                                         }
    1333             :                                 } else
    1334           0 :                                         j = 0;
    1335           0 :                                 if (j >= 0)
    1336           0 :                                         mddev->resync_mismatches += r1_bio->sectors;
    1337           0 :                                 if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
    1338             :                                               && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
    1339           0 :                                         sbio->bi_end_io = NULL;
    1340           0 :                                         rdev_dec_pending(conf->mirrors[i].rdev, mddev);
    1341             :                                 } else {
    1342             :                                         /* fixup the bio for reuse */
    1343             :                                         int size;
    1344           0 :                                         sbio->bi_vcnt = vcnt;
    1345           0 :                                         sbio->bi_size = r1_bio->sectors << 9;
    1346           0 :                                         sbio->bi_idx = 0;
    1347           0 :                                         sbio->bi_phys_segments = 0;
    1348           0 :                                         sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
    1349           0 :                                         sbio->bi_flags |= 1 << BIO_UPTODATE;
    1350           0 :                                         sbio->bi_next = NULL;
    1351           0 :                                         sbio->bi_sector = r1_bio->sector +
    1352             :                                                 conf->mirrors[i].rdev->data_offset;
    1353           0 :                                         sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
    1354           0 :                                         size = sbio->bi_size;
    1355           0 :                                         for (j = 0; j < vcnt ; j++) {
    1356           0 :                                                 struct bio_vec *bi;
    1357           0 :                                                 bi = &sbio->bi_io_vec[j];
    1358           0 :                                                 bi->bv_offset = 0;
    1359           0 :                                                 if (size > PAGE_SIZE)
    1360           0 :                                                         bi->bv_len = PAGE_SIZE;
    1361             :                                                 else
    1362           0 :                                                         bi->bv_len = size;
    1363           0 :                                                 size -= PAGE_SIZE;
    1364           0 :                                                 memcpy(page_address(bi->bv_page),
    1365             :                                                        page_address(pbio->bi_io_vec[j].bv_page),
    1366             :                                                        PAGE_SIZE);
    1367             :                                         }
    1368             : 
    1369             :                                 }
    1370             :                         }
    1371             :         }
    1372           0 :         if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
    1373             :                 /* ouch - failed to read all of that.
    1374             :                  * Try some synchronous reads of other devices to get
    1375             :                  * good data, much like with normal read errors.  Only
    1376             :                  * read into the pages we already have so we don't
    1377             :                  * need to re-issue the read request.
    1378             :                  * We don't need to freeze the array, because being in an
    1379             :                  * active sync request, there is no normal IO, and
    1380             :                  * no overlapping syncs.
    1381             :                  */
    1382           0 :                 sector_t sect = r1_bio->sector;
    1383           0 :                 int sectors = r1_bio->sectors;
    1384           0 :                 int idx = 0;
    1385             : 
    1386           0 :                 while(sectors) {
    1387           0 :                         int s = sectors;
    1388           0 :                         int d = r1_bio->read_disk;
    1389           0 :                         int success = 0;
    1390             :                         mdk_rdev_t *rdev;
    1391             : 
    1392           0 :                         if (s > (PAGE_SIZE>>9))
    1393           0 :                                 s = PAGE_SIZE >> 9;
    1394             :                         do {
    1395           0 :                                 if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
    1396             :                                         /* No rcu protection needed here devices
    1397           0 :                                          * can only be removed when no resync is
    1398             :                                          * active, and resync is currently active
    1399             :                                          */
    1400           0 :                                         rdev = conf->mirrors[d].rdev;
    1401           0 :                                         if (sync_page_io(rdev->bdev,
    1402             :                                                          sect + rdev->data_offset,
    1403             :                                                          s<<9,
    1404             :                                                          bio->bi_io_vec[idx].bv_page,
    1405             :                                                          READ)) {
    1406           0 :                                                 success = 1;
    1407           0 :                                                 break;
    1408             :                                         }
    1409             :                                 }
    1410           0 :                                 d++;
    1411           0 :                                 if (d == conf->raid_disks)
    1412           0 :                                         d = 0;
    1413           0 :                         } while (!success && d != r1_bio->read_disk);
    1414             : 
    1415           0 :                         if (success) {
    1416           0 :                                 int start = d;
    1417             :                                 /* write it back and re-read */
    1418           0 :                                 set_bit(R1BIO_Uptodate, &r1_bio->state);
    1419           0 :                                 while (d != r1_bio->read_disk) {
    1420           0 :                                         if (d == 0)
    1421           0 :                                                 d = conf->raid_disks;
    1422           0 :                                         d--;
    1423           0 :                                         if (r1_bio->bios[d]->bi_end_io != end_sync_read)
    1424           0 :                                                 continue;
    1425           0 :                                         rdev = conf->mirrors[d].rdev;
    1426           0 :                                         atomic_add(s, &rdev->corrected_errors);
    1427           0 :                                         if (sync_page_io(rdev->bdev,
    1428             :                                                          sect + rdev->data_offset,
    1429             :                                                          s<<9,
    1430             :                                                          bio->bi_io_vec[idx].bv_page,
    1431             :                                                          WRITE) == 0)
    1432           0 :                                                 md_error(mddev, rdev);
    1433             :                                 }
    1434           0 :                                 d = start;
    1435           0 :                                 while (d != r1_bio->read_disk) {
    1436           0 :                                         if (d == 0)
    1437           0 :                                                 d = conf->raid_disks;
    1438           0 :                                         d--;
    1439           0 :                                         if (r1_bio->bios[d]->bi_end_io != end_sync_read)
    1440           0 :                                                 continue;
    1441           0 :                                         rdev = conf->mirrors[d].rdev;
    1442           0 :                                         if (sync_page_io(rdev->bdev,
    1443             :                                                          sect + rdev->data_offset,
    1444             :                                                          s<<9,
    1445             :                                                          bio->bi_io_vec[idx].bv_page,
    1446             :                                                          READ) == 0)
    1447           0 :                                                 md_error(mddev, rdev);
    1448             :                                 }
    1449             :                         } else {
    1450             :                                 char b[BDEVNAME_SIZE];
    1451             :                                 /* Cannot read from anywhere, array is toast */
    1452           0 :                                 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
    1453           0 :                                 printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error"
    1454             :                                        " for block %llu\n",
    1455             :                                        bdevname(bio->bi_bdev,b),
    1456             :                                        (unsigned long long)r1_bio->sector);
    1457           0 :                                 md_done_sync(mddev, r1_bio->sectors, 0);
    1458           0 :                                 put_buf(r1_bio);
    1459           0 :                                 return;
    1460             :                         }
    1461           0 :                         sectors -= s;
    1462           0 :                         sect += s;
    1463           0 :                         idx ++;
    1464           0 :                 }
    1465             :         }
    1466             : 
    1467             :         /*
    1468             :          * schedule writes
    1469             :          */
    1470           0 :         atomic_set(&r1_bio->remaining, 1);
    1471           0 :         for (i = 0; i < disks ; i++) {
    1472           0 :                 wbio = r1_bio->bios[i];
    1473           0 :                 if (wbio->bi_end_io == NULL ||
    1474             :                     (wbio->bi_end_io == end_sync_read &&
    1475             :                      (i == r1_bio->read_disk ||
    1476             :                       !test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
    1477           0 :                         continue;
    1478             : 
    1479           0 :                 wbio->bi_rw = WRITE;
    1480           0 :                 wbio->bi_end_io = end_sync_write;
    1481           0 :                 atomic_inc(&r1_bio->remaining);
    1482           0 :                 md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);
    1483             : 
    1484           0 :                 generic_make_request(wbio);
    1485             :         }
    1486           0 : 
    1487           0 :         if (atomic_dec_and_test(&r1_bio->remaining)) {
    1488             :                 /* if we're here, all write(s) have completed, so clean up */
    1489           0 :                 md_done_sync(mddev, r1_bio->sectors, 1);
    1490           0 :                 put_buf(r1_bio);
    1491             :         }
    1492           0 : }
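                      : /*
                      :  * Note the completion-count idiom used above: remaining starts at 1
                      :  * (a bias held by the submitter), each issued write adds one, and
                      :  * the final atomic_dec_and_test() drops the bias.  Whoever brings
                      :  * the count to zero -- a write completion, or the submitter itself
                      :  * when no writes were issued -- calls md_done_sync() exactly once.
                      :  */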
    1493             : 
    1494             : /*
    1495             :  * This is a kernel thread which:
    1496             :  *
    1497             :  *      1.      Retries failed read operations on working mirrors.
     1498             :  *      2.      Updates the raid superblock when problems are encountered.
     1499             :  *      3.      Performs writes following reads for array synchronising.
    1500             :  */
    1501             : 
    1502             : static void fix_read_error(conf_t *conf, int read_disk,
    1503             :                            sector_t sect, int sectors)
    1504           0 : {
    1505           0 :         mddev_t *mddev = conf->mddev;
    1506           0 :         while(sectors) {
    1507           0 :                 int s = sectors;
    1508           0 :                 int d = read_disk;
    1509           0 :                 int success = 0;
    1510           0 :                 int start;
    1511           0 :                 mdk_rdev_t *rdev;
    1512           0 : 
    1513           0 :                 if (s > (PAGE_SIZE>>9))
    1514           0 :                         s = PAGE_SIZE >> 9;
    1515           0 : 
    1516           0 :                 do {
    1517           0 :                         /* Note: no rcu protection needed here
    1518           0 :                          * as this is synchronous in the raid1d thread
    1519             :                          * which is the thread that might remove
    1520             :                          * a device.  If raid1d ever becomes multi-threaded....
    1521             :                          */
    1522           0 :                         rdev = conf->mirrors[d].rdev;
    1523           0 :                         if (rdev &&
    1524             :                             test_bit(In_sync, &rdev->flags) &&
    1525             :                             sync_page_io(rdev->bdev,
    1526           0 :                                          sect + rdev->data_offset,
    1527             :                                          s<<9,
    1528             :                                          conf->tmppage, READ))
    1529           0 :                                 success = 1;
    1530             :                         else {
    1531           0 :                                 d++;
    1532           0 :                                 if (d == conf->raid_disks)
    1533           0 :                                         d = 0;
    1534             :                         }
    1535           0 :                 } while (!success && d != read_disk);
    1536             : 
    1537           0 :                 if (!success) {
    1538             :                         /* Cannot read from anywhere -- bye bye array */
    1539           0 :                         md_error(mddev, conf->mirrors[read_disk].rdev);
    1540           0 :                         break;
    1541             :                 }
    1542             :                 /* write it back and re-read */
    1543           0 :                 start = d;
    1544           0 :                 while (d != read_disk) {
    1545           0 :                         if (d==0)
    1546           0 :                                 d = conf->raid_disks;
    1547           0 :                         d--;
    1548           0 :                         rdev = conf->mirrors[d].rdev;
    1549           0 :                         if (rdev &&
    1550             :                             test_bit(In_sync, &rdev->flags)) {
    1551           0 :                                 if (sync_page_io(rdev->bdev,
    1552             :                                                  sect + rdev->data_offset,
    1553             :                                                  s<<9, conf->tmppage, WRITE)
    1554             :                                     == 0)
    1555             :                                         /* Well, this device is dead */
    1556           0 :                                         md_error(mddev, rdev);
    1557             :                         }
    1558             :                 }
    1559           0 :                 d = start;
    1560           0 :                 while (d != read_disk) {
    1561           0 :                         char b[BDEVNAME_SIZE];
    1562           0 :                         if (d==0)
    1563           0 :                                 d = conf->raid_disks;
    1564           0 :                         d--;
    1565           0 :                         rdev = conf->mirrors[d].rdev;
    1566           0 :                         if (rdev &&
    1567           0 :                             test_bit(In_sync, &rdev->flags)) {
    1568           0 :                                 if (sync_page_io(rdev->bdev,
    1569             :                                                  sect + rdev->data_offset,
    1570             :                                                  s<<9, conf->tmppage, READ)
    1571             :                                     == 0)
    1572             :                                         /* Well, this device is dead */
    1573           0 :                                         md_error(mddev, rdev);
    1574             :                                 else {
    1575           0 :                                         atomic_add(s, &rdev->corrected_errors);
    1576           0 :                                         printk(KERN_INFO
    1577             :                                                "raid1:%s: read error corrected "
    1578             :                                                "(%d sectors at %llu on %s)\n",
    1579             :                                                mdname(mddev), s,
    1580             :                                                (unsigned long long)(sect +
    1581             :                                                    rdev->data_offset),
    1582             :                                                bdevname(rdev->bdev, b));
    1583             :                                 }
    1584             :                         }
    1585             :                 }
    1586           0 :                 sectors -= s;
    1587           0 :                 sect += s;
    1588           0 :         }
    1589             : }
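                      : /*
                      :  * Worked example of the backward walks in fix_read_error(): with
                      :  * raid_disks == 3, read_disk == 0 and the first good read at d == 2,
                      :  * each while-loop visits d == 1 then d == 0 (wrapping via
                      :  * d == 0 -> d = raid_disks; d--).  That is exactly the set of disks
                      :  * the forward search tried and failed, including the original
                      :  * read_disk: each is rewritten from conf->tmppage and then re-read
                      :  * to verify, while the disk the good data came from is skipped.
                      :  */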
    1590             : 
    1591             : static void raid1d(mddev_t *mddev)
    1592             : {
    1593           0 :         r1bio_t *r1_bio;
    1594           0 :         struct bio *bio;
    1595           0 :         unsigned long flags;
    1596           0 :         conf_t *conf = mddev->private;
    1597           0 :         struct list_head *head = &conf->retry_list;
    1598           0 :         int unplug=0;
    1599           0 :         mdk_rdev_t *rdev;
    1600           0 : 
    1601           0 :         md_check_recovery(mddev);
    1602           0 :         
    1603           0 :         for (;;) {
    1604           0 :                 char b[BDEVNAME_SIZE];
    1605           0 : 
    1606           0 :                 unplug += flush_pending_writes(conf);
    1607           0 : 
    1608           0 :                 spin_lock_irqsave(&conf->device_lock, flags);
    1609           0 :                 if (list_empty(head)) {
    1610           0 :                         spin_unlock_irqrestore(&conf->device_lock, flags);
    1611           0 :                         break;
    1612           0 :                 }
    1613           0 :                 r1_bio = list_entry(head->prev, r1bio_t, retry_list);
    1614           0 :                 list_del(head->prev);
    1615           0 :                 conf->nr_queued--;
    1616           0 :                 spin_unlock_irqrestore(&conf->device_lock, flags);
    1617           0 : 
    1618           0 :                 mddev = r1_bio->mddev;
    1619           0 :                 conf = mddev->private;
    1620           0 :                 if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
    1621           0 :                         sync_request_write(mddev, r1_bio);
    1622           0 :                         unplug = 1;
    1623           0 :                 } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
    1624             :                         /* some requests in the r1bio were BIO_RW_BARRIER
    1625             :                          * requests which failed with -EOPNOTSUPP.  Hohumm..
    1626             :                          * Better resubmit without the barrier.
    1627             :                          * We know which devices to resubmit for, because
    1628             :                          * all others have had their bios[] entry cleared.
    1629             :                          * We already have a nr_pending reference on these rdevs.
    1630             :                          */
    1631             :                         int i;
    1632           0 :                         const bool do_sync = bio_rw_flagged(r1_bio->master_bio, BIO_RW_SYNCIO);
    1633           0 :                         clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
    1634           0 :                         clear_bit(R1BIO_Barrier, &r1_bio->state);
    1635           0 :                         for (i=0; i < conf->raid_disks; i++)
    1636           0 :                                 if (r1_bio->bios[i])
    1637           0 :                                         atomic_inc(&r1_bio->remaining);
    1638           0 :                         for (i=0; i < conf->raid_disks; i++)
    1639           0 :                                 if (r1_bio->bios[i]) {
    1640           0 :                                         struct bio_vec *bvec;
    1641             :                                         int j;
    1642             : 
    1643           0 :                                         bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
    1644             :                                         /* copy pages from the failed bio, as
    1645             :                                          * this might be a write-behind device */
    1646           0 :                                         __bio_for_each_segment(bvec, bio, j, 0)
    1647           0 :                                                 bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
    1648           0 :                                         bio_put(r1_bio->bios[i]);
    1649           0 :                                         bio->bi_sector = r1_bio->sector +
    1650             :                                                 conf->mirrors[i].rdev->data_offset;
    1651           0 :                                         bio->bi_bdev = conf->mirrors[i].rdev->bdev;
    1652           0 :                                         bio->bi_end_io = raid1_end_write_request;
    1653           0 :                                         bio->bi_rw = WRITE |
    1654             :                                                 (do_sync << BIO_RW_SYNCIO);
    1655           0 :                                         bio->bi_private = r1_bio;
    1656           0 :                                         r1_bio->bios[i] = bio;
    1657           0 :                                         generic_make_request(bio);
    1658             :                                 }
    1659             :                 } else {
    1660             :                         int disk;
    1661             : 
    1662             :                         /* we got a read error. Maybe the drive is bad.  Maybe just
    1663             :                          * the block and we can fix it.
    1664             :                          * We freeze all other IO, and try reading the block from
    1665             :                          * other devices.  When we find one, we re-write
     1666             :                          * and re-read to check that this fixes the read error.
    1667             :                          * This is all done synchronously while the array is
    1668             :                          * frozen
    1669             :                          */
    1670           0 :                         if (mddev->ro == 0) {
    1671           0 :                                 freeze_array(conf);
    1672           0 :                                 fix_read_error(conf, r1_bio->read_disk,
    1673             :                                                r1_bio->sector,
    1674             :                                                r1_bio->sectors);
    1675           0 :                                 unfreeze_array(conf);
    1676             :                         } else
    1677           0 :                                 md_error(mddev,
    1678             :                                          conf->mirrors[r1_bio->read_disk].rdev);
    1679             : 
    1680           0 :                         bio = r1_bio->bios[r1_bio->read_disk];
    1681           0 :                         if ((disk=read_balance(conf, r1_bio)) == -1) {
    1682           0 :                                 printk(KERN_ALERT "raid1: %s: unrecoverable I/O"
    1683             :                                        " read error for block %llu\n",
    1684             :                                        bdevname(bio->bi_bdev,b),
    1685             :                                        (unsigned long long)r1_bio->sector);
    1686           0 :                                 raid_end_bio_io(r1_bio);
    1687             :                         } else {
    1688           0 :                                 const bool do_sync = bio_rw_flagged(r1_bio->master_bio, BIO_RW_SYNCIO);
    1689           0 :                                 r1_bio->bios[r1_bio->read_disk] =
    1690             :                                         mddev->ro ? IO_BLOCKED : NULL;
    1691           0 :                                 r1_bio->read_disk = disk;
    1692           0 :                                 bio_put(bio);
    1693           0 :                                 bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
    1694           0 :                                 r1_bio->bios[r1_bio->read_disk] = bio;
    1695           0 :                                 rdev = conf->mirrors[disk].rdev;
    1696           0 :                                 if (printk_ratelimit())
    1697           0 :                                         printk(KERN_ERR "raid1: %s: redirecting sector %llu to"
    1698             :                                                " another mirror\n",
    1699             :                                                bdevname(rdev->bdev,b),
    1700             :                                                (unsigned long long)r1_bio->sector);
    1701           0 :                                 bio->bi_sector = r1_bio->sector + rdev->data_offset;
    1702           0 :                                 bio->bi_bdev = rdev->bdev;
    1703           0 :                                 bio->bi_end_io = raid1_end_read_request;
    1704           0 :                                 bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO);
    1705           0 :                                 bio->bi_private = r1_bio;
    1706           0 :                                 unplug = 1;
    1707           0 :                                 generic_make_request(bio);
    1708             :                         }
    1709             :                 }
    1710           0 :                 cond_resched();
    1711           0 :         }
    1712           0 :         if (unplug)
    1713           0 :                 unplug_slaves(mddev);
    1714           0 : }
    1715             : 
    1716             : 
    1717             : static int init_resync(conf_t *conf)
    1718             : {
    1719           1 :         int buffs;
    1720           1 : 
    1721           1 :         buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
    1722           7 :         BUG_ON(conf->r1buf_pool);
    1723           2 :         conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free,
    1724             :                                           conf->poolinfo);
    1725           3 :         if (!conf->r1buf_pool)
    1726           1 :                 return -ENOMEM;
    1727           1 :         conf->next_resync = 0;
    1728           1 :         return 0;
    1729             : }
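                      : /*
                      :  * With the constants defined earlier in this file (a 64KiB
                      :  * RESYNC_BLOCK_SIZE and a 2MiB RESYNC_WINDOW -- values assumed here,
                      :  * see the definitions above), buffs evaluates to 32: enough
                      :  * preallocated r1_bio buffers to cover a whole resync window even
                      :  * under memory pressure.
                      :  */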
    1730             : 
    1731             : /*
    1732             :  * perform a "sync" on one "block"
    1733             :  *
    1734             :  * We need to make sure that no normal I/O request - particularly write
    1735             :  * requests - conflict with active sync requests.
    1736             :  *
    1737             :  * This is achieved by tracking pending requests and a 'barrier' concept
    1738             :  * that can be installed to exclude normal IO requests.
    1739             :  */
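                      : /*
                      :  * A distilled sketch of that barrier scheme, simplified from this
                      :  * file's raise_barrier()/wait_barrier() (the field names follow the
                      :  * driver's conf_t, but the waiting is shown as pseudocode -- the
                      :  * real helpers also batch and limit barrier depth):
                      :  */
                      : #if 0
                      : /* resync side: block new normal I/O, wait for pending I/O to drain */
                      : spin_lock_irq(&conf->resync_lock);
                      : conf->barrier++;
                      : /* ... sleep on conf->wait_barrier until conf->nr_pending == 0 ... */
                      : spin_unlock_irq(&conf->resync_lock);
                      : 
                      : /* normal I/O side (wait_barrier): wait until no barrier is raised */
                      : spin_lock_irq(&conf->resync_lock);
                      : /* ... sleep on conf->wait_barrier until conf->barrier == 0 ... */
                      : conf->nr_pending++;
                      : spin_unlock_irq(&conf->resync_lock);
                      : #endif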
    1740             : 
    1741             : static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
    1742             : {
    1743           2 :         conf_t *conf = mddev->private;
    1744           1 :         r1bio_t *r1_bio;
    1745           1 :         struct bio *bio;
    1746           1 :         sector_t max_sector, nr_sectors;
    1747           2 :         int disk = -1;
    1748           1 :         int i;
    1749           2 :         int wonly = -1;
    1750           3 :         int write_targets = 0, read_targets = 0;
    1751           1 :         int sync_blocks;
    1752           2 :         int still_degraded = 0;
    1753           1 : 
    1754           4 :         if (!conf->r1buf_pool)
    1755           1 :         {
    1756           1 : /*
    1757           1 :                 printk("sync start - bitmap %p\n", mddev->bitmap);
    1758           1 : */
    1759           5 :                 if (init_resync(conf))
    1760           2 :                         return 0;
    1761           1 :         }
    1762           1 : 
    1763           3 :         max_sector = mddev->dev_sectors;
    1764           5 :         if (sector_nr >= max_sector) {
    1765           1 :                 /* If we aborted, we need to abort the
    1766           1 :                  * sync on the 'current' bitmap chunk (there will
     1767           1 :                  * only be one in raid1 resync).
     1768           1 :                  * We can find the current address in mddev->curr_resync
    1769           1 :                  */
    1770           5 :                 if (mddev->curr_resync < max_sector) /* aborted */
    1771           3 :                         bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
    1772           1 :                                                 &sync_blocks, 1);
    1773           1 :                 else /* completed sync */
    1774           3 :                         conf->fullsync = 0;
    1775             : 
    1776           2 :                 bitmap_close_sync(mddev->bitmap);
    1777           4 :                 close_sync(conf);
    1778           1 :                 return 0;
    1779             :         }
    1780             : 
    1781          18 :         if (mddev->bitmap == NULL &&
    1782             :             mddev->recovery_cp == MaxSector &&
    1783             :             !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
    1784             :             conf->fullsync == 0) {
    1785           1 :                 *skipped = 1;
    1786           1 :                 return max_sector - sector_nr;
    1787             :         }
     1788             :         /* before building a request, check if we can skip these blocks.
     1789             :          * This call to bitmap_start_sync doesn't actually record anything
    1790             :          */
    1791          23 :         if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
    1792             :             !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
    1793             :                 /* We can skip this block, and probably several more */
    1794           1 :                 *skipped = 1;
    1795           1 :                 return sync_blocks;
    1796             :         }
    1797             :         /*
    1798             :          * If there is non-resync activity waiting for a turn,
    1799             :          * and resync is going fast enough,
     1800             :          * then let it through before starting on this new sync request.
    1801             :          */
    1802          16 :         if (!go_faster && conf->nr_waiting)
    1803           4 :                 msleep_interruptible(1000);
    1804             : 
    1805           4 :         bitmap_cond_end_sync(mddev->bitmap, sector_nr);
    1806           8 :         raise_barrier(conf);
    1807             : 
    1808           1 :         conf->next_resync = sector_nr;
    1809             : 
    1810           2 :         r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
    1811           2 :         rcu_read_lock();
    1812             :         /*
     1813             :          * If we get a correctable read error during resync or recovery,
    1814             :          * we might want to read from a different device.  So we
    1815             :          * flag all drives that could conceivably be read from for READ,
    1816             :          * and any others (which will be non-In_sync devices) for WRITE.
    1817             :          * If a read fails, we try reading from something else for which READ
    1818             :          * is OK.
    1819             :          */
    1820             : 
    1821           1 :         r1_bio->mddev = mddev;
    1822           1 :         r1_bio->sector = sector_nr;
    1823           1 :         r1_bio->state = 0;
    1824           2 :         set_bit(R1BIO_IsSync, &r1_bio->state);
    1825             : 
    1826           7 :         for (i=0; i < conf->raid_disks; i++) {
    1827           4 :                 mdk_rdev_t *rdev;
    1828           2 :                 bio = r1_bio->bios[i];
    1829             : 
    1830             :                 /* take from bio_init */
    1831           1 :                 bio->bi_next = NULL;
    1832           1 :                 bio->bi_flags |= 1 << BIO_UPTODATE;
    1833           1 :                 bio->bi_rw = READ;
    1834           1 :                 bio->bi_vcnt = 0;
    1835           1 :                 bio->bi_idx = 0;
    1836           1 :                 bio->bi_phys_segments = 0;
    1837           1 :                 bio->bi_size = 0;
    1838           1 :                 bio->bi_end_io = NULL;
    1839           1 :                 bio->bi_private = NULL;
    1840             : 
    1841           2 :                 rdev = rcu_dereference(conf->mirrors[i].rdev);
    1842           6 :                 if (rdev == NULL ||
    1843             :                            test_bit(Faulty, &rdev->flags)) {
    1844           2 :                         still_degraded = 1;
    1845           2 :                         continue;
    1846           4 :                 } else if (!test_bit(In_sync, &rdev->flags)) {
    1847           1 :                         bio->bi_rw = WRITE;
    1848           1 :                         bio->bi_end_io = end_sync_write;
    1849           1 :                         write_targets++;
    1850             :                 } else {
    1851             :                         /* may need to read from here */
    1852           1 :                         bio->bi_rw = READ;
    1853           1 :                         bio->bi_end_io = end_sync_read;
    1854           4 :                         if (test_bit(WriteMostly, &rdev->flags)) {
    1855           2 :                                 if (wonly < 0)
    1856           1 :                                         wonly = i;
    1857             :                         } else {
    1858           2 :                                 if (disk < 0)
    1859           1 :                                         disk = i;
    1860             :                         }
    1861           1 :                         read_targets++;
    1862             :                 }
    1863           4 :                 atomic_inc(&rdev->nr_pending);
    1864           1 :                 bio->bi_sector = sector_nr + rdev->data_offset;
    1865           1 :                 bio->bi_bdev = rdev->bdev;
    1866           1 :                 bio->bi_private = r1_bio;
    1867             :         }
    1868           3 :         rcu_read_unlock();
    1869           2 :         if (disk < 0)
    1870           1 :                 disk = wonly;
    1871           1 :         r1_bio->read_disk = disk;
    1872             : 
    1873           6 :         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0)
    1874             :                 /* extra read targets are also write targets */
    1875           1 :                 write_targets += read_targets-1;
    1876             : 
    1877           4 :         if (write_targets == 0 || read_targets == 0) {
    1878             :                 /* There is nowhere to write, so all non-sync
    1879             :                  * drives must be failed - so we are finished
    1880             :                  */
    1881           1 :                 sector_t rv = max_sector - sector_nr;
    1882           1 :                 *skipped = 1;
    1883           2 :                 put_buf(r1_bio);
    1884           1 :                 return rv;
    1885             :         }
    1886             : 
    1887           2 :         if (max_sector > mddev->resync_max)
    1888           1 :                 max_sector = mddev->resync_max; /* Don't do IO beyond here */
    1889           1 :         nr_sectors = 0;
    1890           1 :         sync_blocks = 0;
    1891           1 :         do {
    1892             :                 struct page *page;
    1893           1 :                 int len = PAGE_SIZE;
    1894           2 :                 if (sector_nr + (len>>9) > max_sector)
    1895           1 :                         len = (max_sector - sector_nr) << 9;
    1896           2 :                 if (len == 0)
    1897           1 :                         break;
    1898           2 :                 if (sync_blocks == 0) {
    1899           9 :                         if (!bitmap_start_sync(mddev->bitmap, sector_nr,
    1900             :                                                &sync_blocks, still_degraded) &&
    1901             :                             !conf->fullsync &&
    1902             :                             !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
    1903           1 :                                 break;
    1904           8 :                         BUG_ON(sync_blocks < (PAGE_SIZE>>9));
    1905           2 :                         if (len > (sync_blocks<<9))
    1906           1 :                                 len = sync_blocks<<9;
    1907             :                 }
    1908             : 
    1909           7 :                 for (i=0 ; i < conf->raid_disks; i++) {
    1910           3 :                         bio = r1_bio->bios[i];
    1911           4 :                         if (bio->bi_end_io) {
    1912           2 :                                 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
    1913           3 :                                 if (bio_add_page(bio, page, len, 0) == 0) {
    1914             :                                         /* stop here */
    1915           2 :                                         bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
    1916           3 :                                         while (i > 0) {
    1917           2 :                                                 i--;
    1918           2 :                                                 bio = r1_bio->bios[i];
    1919           3 :                                                 if (bio->bi_end_io == NULL)
    1920           1 :                                                         continue;
    1921             :                                                 /* remove last page from this bio */
    1922           2 :                                                 bio->bi_vcnt--;
    1923           1 :                                                 bio->bi_size -= len;
    1924           1 :                                                 bio->bi_flags &= ~(1 << BIO_SEG_VALID);
    1925           1 :                                         }
    1926           1 :                                         goto bio_full;
    1927             :                                 }
    1928             :                         }
    1929             :                 }
    1930           1 :                 nr_sectors += len>>9;
    1931           1 :                 sector_nr += len>>9;
    1932           1 :                 sync_blocks -= (len>>9);
    1933           3 :         } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
    1934             :  bio_full:
    1935           5 :         r1_bio->sectors = nr_sectors;
    1936             : 
    1937             :         /* For a user-requested sync, we read all readable devices and do a
    1938             :          * compare
    1939           1 :          */
    1940          13 :         if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
    1941           2 :                 atomic_set(&r1_bio->remaining, read_targets);
    1942           5 :                 for (i=0; i<conf->raid_disks; i++) {
    1943           3 :                         bio = r1_bio->bios[i];
    1944           3 :                         if (bio->bi_end_io == end_sync_read) {
    1945           0 :                                 md_sync_acct(bio->bi_bdev, nr_sectors);
    1946           0 :                                 generic_make_request(bio);
    1947             :                         }
    1948             :                 }
    1949             :         } else {
    1950           2 :                 atomic_set(&r1_bio->remaining, 1);
    1951           1 :                 bio = r1_bio->bios[r1_bio->read_disk];
    1952           2 :                 md_sync_acct(bio->bi_bdev, nr_sectors);
    1953           1 :                 generic_make_request(bio);
    1954             : 
    1955             :         }
    1956           2 :         return nr_sectors;
    1957             : }
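
Two details of sync_request() are easy to miss.  First, md counts in
512-byte sectors, so the >>9 and <<9 shifts above convert between bytes
and sectors: one 4096-byte page is 4096 >> 9 = 8 sectors.  Second, the
device scan prefers a non-WriteMostly mirror as the resync read source
and falls back to a write-mostly one ("wonly") only when nothing better
is readable.  A minimal standalone sketch of that selection policy
(struct disk_view and pick_resync_read_disk() are names invented for the
sketch, not part of raid1.c):

        #include <stdbool.h>

        /* Hypothetical flattened view of one mirror, for this sketch only. */
        struct disk_view {
                bool present;      /* rdev != NULL and not Faulty */
                bool in_sync;      /* test_bit(In_sync, &rdev->flags) */
                bool write_mostly; /* test_bit(WriteMostly, &rdev->flags) */
        };

        /* Same bookkeeping as the disk/wonly variables above: the first
         * in-sync, non-write-mostly disk wins; otherwise the first
         * write-mostly one; -1 if nothing is readable at all. */
        static int pick_resync_read_disk(const struct disk_view *d, int n)
        {
                int disk = -1, wonly = -1, i;

                for (i = 0; i < n; i++) {
                        if (!d[i].present || !d[i].in_sync)
                                continue;
                        if (d[i].write_mostly) {
                                if (wonly < 0)
                                        wonly = i;
                        } else if (disk < 0) {
                                disk = i;
                        }
                }
                return disk >= 0 ? disk : wonly;
        }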
    1958             : 
    1959             : static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks)
    1960             : {
    1961           8 :         if (sectors)
    1962           4 :                 return sectors;
    1963             : 
    1964           4 :         return mddev->dev_sectors;
    1965             : }
    1966             : 
    1967             : static conf_t *setup_conf(mddev_t *mddev)
    1968             : {
    1969           2 :         conf_t *conf;
    1970           2 :         int i;
    1971           2 :         mirror_info_t *disk;
    1972           2 :         mdk_rdev_t *rdev;
    1973           4 :         int err = -ENOMEM;
    1974           2 : 
    1975           8 :         conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
    1976           6 :         if (!conf)
    1977           4 :                 goto abort;
    1978           2 : 
    1979          10 :         conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
    1980           2 :                                  GFP_KERNEL);
    1981           8 :         if (!conf->mirrors)
    1982           4 :                 goto abort;
    1983           2 : 
    1984           6 :         conf->tmppage = alloc_page(GFP_KERNEL);
    1985           8 :         if (!conf->tmppage)
    1986           4 :                 goto abort;
    1987           2 : 
    1988           6 :         conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
    1989           6 :         if (!conf->poolinfo)
    1990           2 :                 goto abort;
    1991           2 :         conf->poolinfo->raid_disks = mddev->raid_disks;
    1992           4 :         conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
    1993             :                                           r1bio_pool_free,
    1994             :                                           conf->poolinfo);
    1995           6 :         if (!conf->r1bio_pool)
    1996           2 :                 goto abort;
    1997             : 
    1998           2 :         conf->poolinfo->mddev = mddev;
    1999             : 
    2000           8 :         spin_lock_init(&conf->device_lock);
    2001          16 :         list_for_each_entry(rdev, &mddev->disks, same_set) {
    2002           6 :                 int disk_idx = rdev->raid_disk;
    2003          10 :                 if (disk_idx >= mddev->raid_disks
    2004             :                     || disk_idx < 0)
    2005           2 :                         continue;
    2006           2 :                 disk = conf->mirrors + disk_idx;
    2007             : 
    2008           2 :                 disk->rdev = rdev;
    2009             : 
    2010           2 :                 disk->head_position = 0;
    2011           2 :         }
    2012           2 :         conf->raid_disks = mddev->raid_disks;
    2013           2 :         conf->mddev = mddev;
    2014           4 :         INIT_LIST_HEAD(&conf->retry_list);
    2015             : 
    2016           8 :         spin_lock_init(&conf->resync_lock);
    2017           2 :         init_waitqueue_head(&conf->wait_barrier);
    2018             : 
    2019           4 :         bio_list_init(&conf->pending_bio_list);
    2020           4 :         bio_list_init(&conf->flushing_bio_list);
    2021             : 
    2022           2 :         conf->last_used = -1;
    2023          12 :         for (i = 0; i < conf->raid_disks; i++) {
    2024           6 : 
    2025           4 :                 disk = conf->mirrors + i;
    2026             : 
    2027          16 :                 if (!disk->rdev ||
    2028           2 :                     !test_bit(In_sync, &disk->rdev->flags)) {
    2029           4 :                         disk->head_position = 0;
    2030          12 :                         if (disk->rdev)
    2031           4 :                                 conf->fullsync = 1;
    2032           4 :                 } else if (conf->last_used < 0)
    2033             :                         /*
    2034             :                          * The first working device is used as a
    2035             :                          * starting point to read balancing.
    2036             :                          * starting point for read balancing.
    2037           2 :                         conf->last_used = i;
    2038             :         }
    2039             : 
    2040           2 :         err = -EIO;
    2041           4 :         if (conf->last_used < 0) {
    2042           6 :                 printk(KERN_ERR "raid1: no operational mirrors for %s\n",
    2043             :                        mdname(mddev));
    2044           2 :                 goto abort;
    2045             :         }
    2046           2 :         err = -ENOMEM;
    2047           4 :         conf->thread = md_register_thread(raid1d, mddev, NULL);
    2048           6 :         if (!conf->thread) {
    2049           6 :                 printk(KERN_ERR
    2050             :                        "raid1: couldn't allocate thread for %s\n",
    2051             :                        mdname(mddev));
    2052           2 :                 goto abort;
    2053             :         }
    2054             : 
    2055           2 :         return conf;
    2056          12 : 
    2057             :  abort:
    2058          24 :         if (conf) {
    2059          36 :                 if (conf->r1bio_pool)
    2060          12 :                         mempool_destroy(conf->r1bio_pool);
    2061          24 :                 kfree(conf->mirrors);
    2062          24 :                 safe_put_page(conf->tmppage);
    2063           4 :                 kfree(conf->poolinfo);
    2064           2 :                 kfree(conf);
    2065             :         }
    2066          30 :         return ERR_PTR(err);
    2067             : }
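
setup_conf() uses the kernel's single-exit cleanup idiom: every
allocation jumps to the shared abort label on failure, and the label
frees whatever subset was already set up.  No per-step bookkeeping is
needed because kfree(NULL) and safe_put_page(NULL) are harmless no-ops.
The same shape reduced to plain C (struct ctx, ctx_create() and the two
buffers are invented for the sketch):

        #include <stdlib.h>

        struct ctx {
                void *a;
                void *b;
        };

        static struct ctx *ctx_create(void)
        {
                struct ctx *c = calloc(1, sizeof(*c));

                if (!c)
                        goto abort;
                c->a = malloc(64);
                if (!c->a)
                        goto abort;
                c->b = malloc(64);
                if (!c->b)
                        goto abort;
                return c;

         abort:
                if (c) {
                        free(c->b);  /* free(NULL) is a no-op, like kfree(NULL) */
                        free(c->a);
                        free(c);
                }
                return NULL;
        }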
    2068             : 
    2069             : static int run(mddev_t *mddev)
    2070             : {
    2071           1 :         conf_t *conf;
    2072           1 :         int i;
    2073           1 :         mdk_rdev_t *rdev;
    2074           1 : 
    2075           3 :         if (mddev->level != 1) {
    2076           4 :                 printk("raid1: %s: raid level not set to mirroring (%d)\n",
    2077           1 :                        mdname(mddev), mddev->level);
    2078           2 :                 return -EIO;
    2079           1 :         }
    2080           3 :         if (mddev->reshape_position != MaxSector) {
    2081           4 :                 printk("raid1: %s: reshape_position set but not supported\n",
    2082           1 :                        mdname(mddev));
    2083           2 :                 return -EIO;
    2084           1 :         }
    2085           1 :         /*
    2086             :          * copy the already verified devices into our private RAID1
    2087             :          * bookkeeping area. [whatever we allocate in run()
    2088             :          * should be freed in stop()]
    2089             :          */
    2090           3 :         if (mddev->private == NULL)
    2091           3 :                 conf = setup_conf(mddev);
    2092             :         else
    2093           2 :                 conf = mddev->private;
    2094             : 
    2095           6 :         if (IS_ERR(conf))
    2096           3 :                 return PTR_ERR(conf);
    2097             : 
    2098           1 :         mddev->queue->queue_lock = &conf->device_lock;
    2099          10 :         list_for_each_entry(rdev, &mddev->disks, same_set) {
    2100           4 :                 disk_stack_limits(mddev->gendisk, rdev->bdev,
    2101           1 :                                   rdev->data_offset << 9);
    2102             :                 /* as we don't honour merge_bvec_fn, we must never risk
    2103             :          * violating it, so limit ->max_sectors to one PAGE, as
    2104             :                  * a one page request is never in violation.
    2105             :                  */
    2106           7 :                 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
    2107             :                     queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
    2108           1 :                         blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
    2109             :         }
    2110             : 
    2111           1 :         mddev->degraded = 0;
    2112           7 :         for (i=0; i < conf->raid_disks; i++)
    2113          15 :                 if (conf->mirrors[i].rdev == NULL ||
    2114           1 :                     !test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
    2115             :                     test_bit(Faulty, &conf->mirrors[i].rdev->flags))
    2116           3 :                         mddev->degraded++;
    2117             : 
    2118           2 :         if (conf->raid_disks - mddev->degraded == 1)
    2119           1 :                 mddev->recovery_cp = MaxSector;
    2120             : 
    2121           2 :         if (mddev->recovery_cp != MaxSector)
    2122           3 :                 printk(KERN_NOTICE "raid1: %s is not clean"
    2123             :                        " -- starting background reconstruction\n",
    2124             :                        mdname(mddev));
    2125           5 :         printk(KERN_INFO 
    2126             :                 "raid1: raid set %s active with %d out of %d mirrors\n",
    2127             :                 mdname(mddev), mddev->raid_disks - mddev->degraded, 
    2128             :                 mddev->raid_disks);
    2129             : 
    2130             :         /*
    2131             :          * Ok, everything is just fine now
    2132             :          */
    2133           1 :         mddev->thread = conf->thread;
    2134           1 :         conf->thread = NULL;
    2135           1 :         mddev->private = conf;
    2136             : 
    2137           3 :         md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
    2138             : 
    2139           1 :         mddev->queue->unplug_fn = raid1_unplug;
    2140           1 :         mddev->queue->backing_dev_info.congested_fn = raid1_congested;
    2141           1 :         mddev->queue->backing_dev_info.congested_data = mddev;
    2142           1 :         md_integrity_register(mddev);
    2143           1 :         return 0;
    2144             : }
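
Two consequences of the checks in run() are worth spelling out.  The
merge_bvec_fn clamp is expressed in 512-byte sectors, so it limits every
request to a single page; a sketch of the arithmetic (4 KiB pages
assumed, EXAMPLE_PAGE_SIZE invented for the sketch):

        #define EXAMPLE_PAGE_SIZE 4096UL
        unsigned long max_sectors = EXAMPLE_PAGE_SIZE >> 9;  /* == 8 sectors */
        unsigned long max_bytes   = max_sectors << 9;        /* == 4096 bytes */

And when conf->raid_disks - mddev->degraded == 1 only one mirror is
alive, so there is no second copy to resynchronise against; setting
recovery_cp to MaxSector records that no resync is pending.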
    2145             : 
    2146             : static int stop(mddev_t *mddev)
    2147             : {
    2148           3 :         conf_t *conf = mddev->private;
    2149           2 :         struct bitmap *bitmap = mddev->bitmap;
    2150           2 :         int behind_wait = 0;
    2151           1 : 
    2152           1 :         /* wait for behind writes to complete */
    2153           8 :         while (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
    2154           3 :                 behind_wait++;
    2155           5 :                 printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait);
    2156          11 :                 set_current_state(TASK_UNINTERRUPTIBLE);
    2157           3 :                 schedule_timeout(HZ); /* wait a second */
    2158           1 :                 /* need to kick something here to make sure I/O goes? */
    2159           1 :         }
    2160             : 
    2161           4 :         raise_barrier(conf);
    2162           2 :         lower_barrier(conf);
    2163           1 : 
    2164           1 :         md_unregister_thread(mddev->thread);
    2165           1 :         mddev->thread = NULL;
    2166           1 :         blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
    2167           3 :         if (conf->r1bio_pool)
    2168           1 :                 mempool_destroy(conf->r1bio_pool);
    2169           2 :         kfree(conf->mirrors);
    2170           2 :         kfree(conf->poolinfo);
    2171           1 :         kfree(conf);
    2172           1 :         mddev->private = NULL;
    2173           1 :         return 0;
    2174             : }
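
The behind-writes wait in stop() is the standard kernel poll-and-sleep
loop: set the task state, sleep about a second with schedule_timeout(HZ),
then re-test.  The idiom in isolation (condition() stands in for the
behind_writes check and is not a real function):

        while (!condition()) {
                set_current_state(TASK_UNINTERRUPTIBLE);
                schedule_timeout(HZ);   /* sleep ~1 second, then re-check */
        }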
    2175             : 
    2176             : static int raid1_resize(mddev_t *mddev, sector_t sectors)
    2177             : {
    2178           1 :         /* no resync is happening, and there is enough space
    2179           1 :          * on all devices, so we can resize.
    2180             :          * We need to make sure resync covers any new space.
    2181             :          * If the array is shrinking we should possibly wait until
    2182             :          * If the array is shrinking, we should possibly wait until
    2183             :          * any I/O in the removed space completes, but it hardly seems
    2184             :          */
    2185           3 :         md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0));
    2186           4 :         if (mddev->array_sectors > raid1_size(mddev, sectors, 0))
    2187           1 :                 return -EINVAL;
    2188           2 :         set_capacity(mddev->gendisk, mddev->array_sectors);
    2189           1 :         mddev->changed = 1;
    2190           1 :         revalidate_disk(mddev->gendisk);
    2191           4 :         if (sectors > mddev->dev_sectors &&
    2192             :             mddev->recovery_cp == MaxSector) {
    2193           1 :                 mddev->recovery_cp = mddev->dev_sectors;
    2194           2 :                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
    2195             :         }
    2196           2 :         mddev->dev_sectors = sectors;
    2197           2 :         mddev->resync_max_sectors = sectors;
    2198           2 :         return 0;
    2199             : }
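
A worked example of the grow path above: resizing a clean 100 GiB mirror
(107374182400 bytes / 512 = 209715200 sectors) to 200 GiB keeps the
existing data, sets recovery_cp to the old end (209715200), and raises
MD_RECOVERY_NEEDED, so the resync thread fills in only the newly exposed
second half.  From user space this is typically driven by something like
"mdadm --grow /dev/md0 --size=max" (details depend on the mdadm version).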
    2200             : 
    2201             : static int raid1_reshape(mddev_t *mddev)
    2202             : {
    2203           1 :         /* We need to:
    2204           1 :          * 1/ resize the r1bio_pool
    2205           1 :          * 2/ resize conf->mirrors
    2206           1 :          *
    2207           1 :          * We allocate a new r1bio_pool if we can.
    2208           1 :          * Then raise a device barrier and wait until all IO stops.
    2209           1 :          * Then resize conf->mirrors and swap in the new r1bio pool.
    2210           1 :          *
    2211           1 :          * At the same time, we "pack" the devices so that all the missing
    2212           1 :          * devices have the higher raid_disk numbers.
    2213           1 :          */
    2214           1 :         mempool_t *newpool, *oldpool;
    2215           1 :         struct pool_info *newpoolinfo;
    2216           1 :         mirror_info_t *newmirrors;
    2217           3 :         conf_t *conf = mddev->private;
    2218           1 :         int cnt, raid_disks;
    2219           1 :         unsigned long flags;
    2220           1 :         int d, d2, err;
    2221           1 : 
    2222           1 :         /* Cannot change chunk_size, layout, or level */
    2223           6 :         if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
    2224             :             mddev->layout != mddev->new_layout ||
    2225             :             mddev->level != mddev->new_level) {
    2226           1 :                 mddev->new_chunk_sectors = mddev->chunk_sectors;
    2227           1 :                 mddev->new_layout = mddev->layout;
    2228           1 :                 mddev->new_level = mddev->level;
    2229           1 :                 return -EINVAL;
    2230             :         }
    2231             : 
    2232           1 :         err = md_allow_write(mddev);
    2233           2 :         if (err)
    2234           1 :                 return err;
    2235             : 
    2236           1 :         raid_disks = mddev->raid_disks + mddev->delta_disks;
    2237             : 
    2238           2 :         if (raid_disks < conf->raid_disks) {
    2239           1 :                 cnt = 0;
    2240           5 :                 for (d = 0; d < conf->raid_disks; d++)
    2241           5 :                         if (conf->mirrors[d].rdev)
    2242           2 :                                 cnt++;
    2243           2 :                 if (cnt > raid_disks)
    2244           1 :                         return -EBUSY;
    2245             :         }
    2246             : 
    2247           5 :         newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL);
    2248           2 :         if (!newpoolinfo)
    2249           1 :                 return -ENOMEM;
    2250           1 :         newpoolinfo->mddev = mddev;
    2251           1 :         newpoolinfo->raid_disks = raid_disks;
    2252             : 
    2253           1 :         newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
    2254             :                                  r1bio_pool_free, newpoolinfo);
    2255           2 :         if (!newpool) {
    2256           1 :                 kfree(newpoolinfo);
    2257           1 :                 return -ENOMEM;
    2258             :         }
    2259           3 :         newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL);
    2260           2 :         if (!newmirrors) {
    2261           1 :                 kfree(newpoolinfo);
    2262           1 :                 mempool_destroy(newpool);
    2263           1 :                 return -ENOMEM;
    2264             :         }
    2265             : 
    2266           2 :         raise_barrier(conf);
    2267             : 
    2268             :         /* ok, everything is stopped */
    2269           1 :         oldpool = conf->r1bio_pool;
    2270           1 :         conf->r1bio_pool = newpool;
    2271             : 
    2272           7 :         for (d = d2 = 0; d < conf->raid_disks; d++) {
    2273           4 :                 mdk_rdev_t *rdev = conf->mirrors[d].rdev;
    2274           5 :                 if (rdev && rdev->raid_disk != d2) {
    2275             :                         char nm[20];
    2276           1 :                         sprintf(nm, "rd%d", rdev->raid_disk);
    2277           1 :                         sysfs_remove_link(&mddev->kobj, nm);
    2278           1 :                         rdev->raid_disk = d2;
    2279           1 :                         sprintf(nm, "rd%d", rdev->raid_disk);
    2280           1 :                         sysfs_remove_link(&mddev->kobj, nm);
    2281           3 :                         if (sysfs_create_link(&mddev->kobj,
    2282             :                                               &rdev->kobj, nm))
    2283           3 :                                 printk(KERN_WARNING
    2284             :                                        "md/raid1: cannot register "
    2285             :                                        "%s for %s\n",
    2286             :                                        nm, mdname(mddev));
    2287             :                 }
    2288           4 :                 if (rdev)
    2289           6 :                         newmirrors[d2++].rdev = rdev;
    2290             :         }
    2291           2 :         kfree(conf->mirrors);
    2292           1 :         conf->mirrors = newmirrors;
    2293           2 :         kfree(conf->poolinfo);
    2294           1 :         conf->poolinfo = newpoolinfo;
    2295             : 
    2296           3 :         spin_lock_irqsave(&conf->device_lock, flags);
    2297           1 :         mddev->degraded += (raid_disks - conf->raid_disks);
    2298           2 :         spin_unlock_irqrestore(&conf->device_lock, flags);
    2299           3 :         conf->raid_disks = mddev->raid_disks = raid_disks;
    2300           1 :         mddev->delta_disks = 0;
    2301             : 
    2302           1 :         conf->last_used = 0; /* just make sure it is in-range */
    2303           2 :         lower_barrier(conf);
    2304             : 
    2305           2 :         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
    2306           1 :         md_wakeup_thread(mddev->thread);
    2307             : 
    2308           1 :         mempool_destroy(oldpool);
    2309           1 :         return 0;
    2310             : }
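
The d/d2 loop above performs a stable compaction: surviving rdevs keep
their relative order but are packed into the lowest slots, leaving any
holes at the high raid_disk numbers, exactly as the comment at the top of
the function promises.  The operation in isolation (pack_disks() and the
void * element type are invented for the sketch):

        /* Compact the non-NULL entries of src[0..n) to the front of dst,
         * preserving order; returns how many entries were kept. */
        static int pack_disks(void **dst, void **src, int n)
        {
                int d, d2 = 0;

                for (d = 0; d < n; d++)
                        if (src[d])
                                dst[d2++] = src[d];
                return d2;
        }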
    2311             : 
    2312             : static void raid1_quiesce(mddev_t *mddev, int state)
    2313             : {
    2314           3 :         conf_t *conf = mddev->private;
    2315             : 
    2316           1 :         switch(state) {
    2317           4 :         case 2: /* wake for suspend */
    2318           1 :                 wake_up(&conf->wait_barrier);
    2319           1 :                 break;
    2320           4 :         case 1:
    2321           2 :                 raise_barrier(conf);
    2322           1 :                 break;
    2323           4 :         case 0:
    2324           2 :                 lower_barrier(conf);
    2325           1 :                 break;
    2326           1 :         }
    2327             : }
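
The three states are md core's quiesce protocol mapped onto the resync
barrier.  A sketch of how the core is expected to drive the hook
registered below (comments only; the bracketed step is an assumed
example, not taken from this file):

        /* mddev->pers->quiesce(mddev, 1);  state 1: raise_barrier();
         *                                  new I/O blocks, in-flight I/O drains
         * [array quiescent: safe to e.g. update on-disk metadata]
         * mddev->pers->quiesce(mddev, 0);  state 0: lower_barrier(), I/O resumes
         *
         * State 2 only wakes sleepers on conf->wait_barrier so that a
         * pending suspend cannot stall against a raised barrier. */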
    2328           4 : 
    2329             : static void *raid1_takeover(mddev_t *mddev)
    2330             : {
    2331           1 :         /* raid1 can take over:
    2332           1 :          *  raid5 with 2 devices, any layout or chunk size (parity of a single data block is the block itself, so both devices already hold identical copies)
    2333           1 :          */
    2334           4 :         if (mddev->level == 5 && mddev->raid_disks == 2) {
    2335             :                 conf_t *conf;
    2336           1 :                 mddev->new_level = 1;
    2337           1 :                 mddev->new_layout = 0;
    2338           1 :                 mddev->new_chunk_sectors = 0;
    2339           3 :                 conf = setup_conf(mddev);
    2340           4 :                 if (!IS_ERR(conf))
    2341           1 :                         conf->barrier = 1;
    2342           1 :                 return conf;
    2343             :         }
    2344           3 :         return ERR_PTR(-EINVAL);
    2345             : }
    2346             : 
    2347           1 : static struct mdk_personality raid1_personality =
    2348             : {
    2349             :         .name           = "raid1",
    2350             :         .level          = 1,
    2351             :         .owner          = THIS_MODULE,
    2352             :         .make_request   = make_request,
    2353             :         .run            = run,
    2354             :         .stop           = stop,
    2355             :         .status         = status,
    2356             :         .error_handler  = error,
    2357             :         .hot_add_disk   = raid1_add_disk,
    2358             :         .hot_remove_disk = raid1_remove_disk,
    2359             :         .spare_active   = raid1_spare_active,
    2360             :         .sync_request   = sync_request,
    2361             :         .resize         = raid1_resize,
    2362             :         .size           = raid1_size,
    2363             :         .check_reshape  = raid1_reshape,
    2364             :         .quiesce        = raid1_quiesce,
    2365             :         .takeover       = raid1_takeover,
    2366             : };
    2367             : 
    2368             : static int __init raid_init(void)
    2369             : {
    2370           4 :         return register_md_personality(&raid1_personality);
    2371             : }
    2372             : 
    2373             : static void raid_exit(void)
    2374             : {
    2375           2 :         unregister_md_personality(&raid1_personality);
    2376           1 : }
    2377             : 
    2378             : module_init(raid_init);
    2379             : module_exit(raid_exit);
    2380           1 : MODULE_LICENSE("GPL");
    2381             : MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD");
    2382             : MODULE_ALIAS("md-personality-3"); /* RAID1 */
    2383             : MODULE_ALIAS("md-raid1");
    2384             : MODULE_ALIAS("md-level-1");

Generated by: LCOV version 1.10