Line data Source code
1 : /*
2 : * raid1.c : Multiple Devices driver for Linux
3 : *
4 : * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
5 : *
6 : * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
7 : *
8 : * RAID-1 management functions.
9 : *
10 : * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
11 : *
12 : * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
13 : * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
14 : *
15 : * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
16 : * bitmapped intelligence in resync:
17 : *
18 : * - bitmap marked during normal i/o
19 : * - bitmap used to skip nondirty blocks during sync
20 : *
21 : * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
22 : * - persistent bitmap code
23 : *
24 : * This program is free software; you can redistribute it and/or modify
25 : * it under the terms of the GNU General Public License as published by
26 : * the Free Software Foundation; either version 2, or (at your option)
27 : * any later version.
28 : *
29 : * You should have received a copy of the GNU General Public License
30 : * (for example /usr/src/linux/COPYING); if not, write to the Free
31 : * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
32 : */
33 :
34 : #include <linux/delay.h>
35 : #include <linux/blkdev.h>
36 : #include <linux/seq_file.h>
37 : #include "md.h"
38 : #include "raid1.h"
39 : #include "bitmap.h"
40 :
41 : #define DEBUG 0
42 : #if DEBUG
43 : #define PRINTK(x...) printk(x)
44 : #else
45 : #define PRINTK(x...)
46 : #endif
47 :
48 : /*
49 : * Number of guaranteed r1bios in case of extreme VM load:
50 : */
51 : #define NR_RAID1_BIOS 256
52 :
53 :
54 : static void unplug_slaves(mddev_t *mddev);
55 :
56 : static void allow_barrier(conf_t *conf);
57 : static void lower_barrier(conf_t *conf);
58 :
59 : static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
60 : {
61 0 : struct pool_info *pi = data;
62 0 : r1bio_t *r1_bio;
63 0 : int size = offsetof(r1bio_t, bios[pi->raid_disks]);
64 0 :
65 : /* allocate a r1bio with room for raid_disks entries in the bios array */
66 0 : r1_bio = kzalloc(size, gfp_flags);
67 0 : if (!r1_bio && pi->mddev)
68 0 : unplug_slaves(pi->mddev);
69 :
70 0 : return r1_bio;
71 : }
72 :
73 : static void r1bio_pool_free(void *r1_bio, void *data)
74 : {
75 0 : kfree(r1_bio);
76 0 : }
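/*
 * Aside (illustrative sketch, not part of this file): r1bio_pool_alloc()
 * above sizes its allocation with offsetof(r1bio_t, bios[pi->raid_disks]),
 * i.e. a struct whose trailing array length is only known at run time.
 * The hypothetical userspace example below shows the same sizing trick;
 * it relies on the GNU offsetof extension that the kernel also uses.
 */
#include <stddef.h>
#include <stdlib.h>

struct demo {			/* hypothetical stand-in for r1bio_t */
	int nentries;
	void *entries[];	/* sized at allocation time, like ->bios[] */
};

static struct demo *demo_alloc(int nentries)
{
	/* one allocation covers the header plus 'nentries' pointers */
	struct demo *d = calloc(1, offsetof(struct demo, entries[nentries]));

	if (d)
		d->nentries = nentries;
	return d;
}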
77 :
78 : #define RESYNC_BLOCK_SIZE (64*1024)
79 : //#define RESYNC_BLOCK_SIZE PAGE_SIZE
80 : #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
81 : #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
82 : #define RESYNC_WINDOW (2048*1024)
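/*
 * Worked out for the common 4 KiB PAGE_SIZE (an assumption, not something
 * this file fixes): RESYNC_SECTORS = 65536/512 = 128, RESYNC_PAGES = 16,
 * and init_resync() below keeps RESYNC_WINDOW / RESYNC_BLOCK_SIZE =
 * 2 MiB / 64 KiB = 32 r1buf buffers in its mempool.
 */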
83 :
84 : static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
85 : {
86 0 : struct pool_info *pi = data;
87 0 : struct page *page;
88 0 : r1bio_t *r1_bio;
89 0 : struct bio *bio;
90 0 : int i, j;
91 0 :
92 0 : r1_bio = r1bio_pool_alloc(gfp_flags, pi);
93 0 : if (!r1_bio) {
94 0 : unplug_slaves(pi->mddev);
95 0 : return NULL;
96 0 : }
97 0 :
98 : /*
99 : * Allocate bios : 1 for reading, n-1 for writing
100 : */
101 0 : for (j = pi->raid_disks ; j-- ; ) {
102 0 : bio = bio_alloc(gfp_flags, RESYNC_PAGES);
103 0 : if (!bio)
104 0 : goto out_free_bio;
105 0 : r1_bio->bios[j] = bio;
106 0 : }
107 : /*
108 : * Allocate RESYNC_PAGES data pages and attach them to
109 : * the first bio.
110 : * If this is a user-requested check/repair, allocate
111 : * RESYNC_PAGES for each bio.
112 : */
113 0 : if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
114 0 : j = pi->raid_disks;
115 : else
116 0 : j = 1;
117 0 : while(j--) {
118 0 : bio = r1_bio->bios[j];
119 0 : for (i = 0; i < RESYNC_PAGES; i++) {
120 0 : page = alloc_page(gfp_flags);
121 0 : if (unlikely(!page))
122 0 : goto out_free_pages;
123 :
124 0 : bio->bi_io_vec[i].bv_page = page;
125 0 : bio->bi_vcnt = i+1;
126 0 : }
127 : }
128 : /* If not user-requested, copy the page pointers to all bios */
129 0 : if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
130 0 : for (i=0; i<RESYNC_PAGES ; i++)
131 0 : for (j=1; j<pi->raid_disks; j++)
132 0 : r1_bio->bios[j]->bi_io_vec[i].bv_page =
133 0 : r1_bio->bios[0]->bi_io_vec[i].bv_page;
134 : }
135 :
136 0 : r1_bio->master_bio = NULL;
137 :
138 0 : return r1_bio;
139 0 :
140 : out_free_pages:
141 0 : for (j=0 ; j < pi->raid_disks; j++)
142 0 : for (i=0; i < r1_bio->bios[j]->bi_vcnt ; i++)
143 0 : put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
144 0 : j = -1;
145 0 : out_free_bio:
146 0 : while ( ++j < pi->raid_disks )
147 0 : bio_put(r1_bio->bios[j]);
148 0 : r1bio_pool_free(r1_bio, data);
149 0 : return NULL;
150 : }
151 :
152 : static void r1buf_pool_free(void *__r1_bio, void *data)
153 : {
154 0 : struct pool_info *pi = data;
155 0 : int i,j;
156 0 : r1bio_t *r1bio = __r1_bio;
157 0 :
158 0 : for (i = 0; i < RESYNC_PAGES; i++)
159 0 : for (j = pi->raid_disks; j-- ;) {
160 0 : if (j == 0 ||
161 0 : r1bio->bios[j]->bi_io_vec[i].bv_page !=
162 : r1bio->bios[0]->bi_io_vec[i].bv_page)
163 0 : safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
164 : }
165 0 : for (i=0 ; i < pi->raid_disks; i++)
166 0 : bio_put(r1bio->bios[i]);
167 0 :
168 0 : r1bio_pool_free(r1bio, data);
169 0 : }
170 :
171 : static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
172 : {
173 2 : int i;
174 2 :
175 10 : for (i = 0; i < conf->raid_disks; i++) {
176 6 : struct bio **bio = r1_bio->bios + i;
177 6 : if (*bio && *bio != IO_BLOCKED)
178 2 : bio_put(*bio);
179 2 : *bio = NULL;
180 : }
181 : }
182 2 :
183 : static void free_r1bio(r1bio_t *r1_bio)
184 : {
185 6 : conf_t *conf = r1_bio->mddev->private;
186 :
187 : /*
188 : * Wake up any possible resync thread that waits for the device
189 : * to go idle.
190 : */
191 4 : allow_barrier(conf);
192 :
193 4 : put_all_bios(conf, r1_bio);
194 2 : mempool_free(r1_bio, conf->r1bio_pool);
195 2 : }
196 :
197 : static void put_buf(r1bio_t *r1_bio)
198 : {
199 3 : conf_t *conf = r1_bio->mddev->private;
200 1 : int i;
201 1 :
202 6 : for (i=0; i<conf->raid_disks; i++) {
203 4 : struct bio *bio = r1_bio->bios[i];
204 4 : if (bio->bi_end_io)
205 3 : rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
206 : }
207 :
208 1 : mempool_free(r1_bio, conf->r1buf_pool);
209 :
210 2 : lower_barrier(conf);
211 1 : }
212 :
213 : static void reschedule_retry(r1bio_t *r1_bio)
214 : {
215 0 : unsigned long flags;
216 0 : mddev_t *mddev = r1_bio->mddev;
217 0 : conf_t *conf = mddev->private;
218 0 :
219 0 : spin_lock_irqsave(&conf->device_lock, flags);
220 0 : list_add(&r1_bio->retry_list, &conf->retry_list);
221 0 : conf->nr_queued ++;
222 0 : spin_unlock_irqrestore(&conf->device_lock, flags);
223 :
224 0 : wake_up(&conf->wait_barrier);
225 0 : md_wakeup_thread(mddev->thread);
226 0 : }
227 :
228 : /*
229 : * raid_end_bio_io() is called when we have finished servicing a mirrored
230 : * operation and are ready to return a success/failure code to the buffer
231 : * cache layer.
232 : */
233 : static void raid_end_bio_io(r1bio_t *r1_bio)
234 : {
235 2 : struct bio *bio = r1_bio->master_bio;
236 1 :
237 1 : /* if nobody has done the final endio yet, do it now */
238 4 : if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
239 : PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n",
240 : (bio_data_dir(bio) == WRITE) ? "write" : "read",
241 : (unsigned long long) bio->bi_sector,
242 : (unsigned long long) bio->bi_sector +
243 : (bio->bi_size >> 9) - 1);
244 :
245 8 : bio_endio(bio,
246 : test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
247 : }
248 4 : free_r1bio(r1_bio);
249 1 : }
250 :
251 : /*
252 : * Update disk head position estimator based on IRQ completion info.
253 : */
254 : static inline void update_head_pos(int disk, r1bio_t *r1_bio)
255 : {
256 0 : conf_t *conf = r1_bio->mddev->private;
257 :
258 0 : conf->mirrors[disk].head_position =
259 : r1_bio->sector + (r1_bio->sectors);
260 0 : }
261 :
262 : static void raid1_end_read_request(struct bio *bio, int error)
263 : {
264 0 : int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
265 0 : r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
266 0 : int mirror;
267 0 : conf_t *conf = r1_bio->mddev->private;
268 0 :
269 0 : mirror = r1_bio->read_disk;
270 0 : /*
271 0 : * this branch is our 'one mirror IO has finished' event handler:
272 0 : */
273 0 : update_head_pos(mirror, r1_bio);
274 0 :
275 0 : if (uptodate)
276 0 : set_bit(R1BIO_Uptodate, &r1_bio->state);
277 : else {
278 : /* If all other devices have failed, we want to return
279 : * the error upwards rather than fail the last device.
280 : * Here we redefine "uptodate" to mean "Don't want to retry"
281 : */
282 : unsigned long flags;
283 0 : spin_lock_irqsave(&conf->device_lock, flags);
284 0 : if (r1_bio->mddev->degraded == conf->raid_disks ||
285 : (r1_bio->mddev->degraded == conf->raid_disks-1 &&
286 : !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags)))
287 0 : uptodate = 1;
288 0 : spin_unlock_irqrestore(&conf->device_lock, flags);
289 : }
290 :
291 0 : if (uptodate)
292 0 : raid_end_bio_io(r1_bio);
293 : else {
294 : /*
295 : * oops, read error:
296 : */
297 : char b[BDEVNAME_SIZE];
298 0 : if (printk_ratelimit())
299 0 : printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n",
300 : bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
301 0 : reschedule_retry(r1_bio);
302 : }
303 :
304 0 : rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
305 0 : }
306 :
307 : static void raid1_end_write_request(struct bio *bio, int error)
308 : {
309 0 : int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
310 0 : r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
311 0 : int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
312 0 : conf_t *conf = r1_bio->mddev->private;
313 0 : struct bio *to_put = NULL;
314 0 :
315 0 :
316 0 : for (mirror = 0; mirror < conf->raid_disks; mirror++)
317 0 : if (r1_bio->bios[mirror] == bio)
318 0 : break;
319 0 :
320 0 : if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) {
321 0 : set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags);
322 0 : set_bit(R1BIO_BarrierRetry, &r1_bio->state);
323 0 : r1_bio->mddev->barriers_work = 0;
324 0 : /* Don't rdev_dec_pending in this branch - keep it for the retry */
325 0 : } else {
326 0 : /*
327 0 : * this branch is our 'one mirror IO has finished' event handler:
328 0 : */
329 0 : r1_bio->bios[mirror] = NULL;
330 0 : to_put = bio;
331 0 : if (!uptodate) {
332 0 : md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
333 : /* an I/O failed, we can't clear the bitmap */
334 0 : set_bit(R1BIO_Degraded, &r1_bio->state);
335 : } else
336 : /*
337 : * Set R1BIO_Uptodate in our master bio, so that
338 : * we will return a good error code to the higher
339 : * levels even if IO on some other mirrored buffer fails.
340 : *
341 : * The 'master' represents the composite IO operation to
342 : * user-side. So if something waits for IO, then it will
343 : * wait for the 'master' bio.
344 : */
345 0 : set_bit(R1BIO_Uptodate, &r1_bio->state);
346 :
347 0 : update_head_pos(mirror, r1_bio);
348 :
349 0 : if (behind) {
350 0 : if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
351 0 : atomic_dec(&r1_bio->behind_remaining);
352 :
353 : /* In behind mode, we ACK the master bio once the I/O has safely
354 : * reached all non-writemostly disks. Setting the Returned bit
355 : * ensures that this gets done only once -- we don't ever want to
356 : * return -EIO here, instead we'll wait */
357 :
358 0 : if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
359 : test_bit(R1BIO_Uptodate, &r1_bio->state)) {
360 : /* Maybe we can return now */
361 0 : if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
362 0 : struct bio *mbio = r1_bio->master_bio;
363 : PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
364 : (unsigned long long) mbio->bi_sector,
365 : (unsigned long long) mbio->bi_sector +
366 : (mbio->bi_size >> 9) - 1);
367 0 : bio_endio(mbio, 0);
368 : }
369 : }
370 : }
371 0 : rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
372 : }
373 : /*
374 : *
375 : * Let's see if all mirrored write operations have finished
376 : * already.
377 : */
378 0 : if (atomic_dec_and_test(&r1_bio->remaining)) {
379 0 : if (test_bit(R1BIO_BarrierRetry, &r1_bio->state))
380 0 : reschedule_retry(r1_bio);
381 : else {
382 : /* it really is the end of this request */
383 0 : if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
384 : /* free extra copy of the data pages */
385 0 : int i = bio->bi_vcnt;
386 0 : while (i--)
387 0 : safe_put_page(bio->bi_io_vec[i].bv_page);
388 0 : }
389 0 : /* clear the bitmap if all writes complete successfully */
390 0 : bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
391 : r1_bio->sectors,
392 : !test_bit(R1BIO_Degraded, &r1_bio->state),
393 : behind);
394 0 : md_write_end(r1_bio->mddev);
395 0 : raid_end_bio_io(r1_bio);
396 : }
397 : }
398 :
399 0 : if (to_put)
400 0 : bio_put(to_put);
401 0 : }
402 :
403 :
404 : /*
405 : * This routine returns the disk from which the requested read should
406 : * be done. There is a per-array 'next expected sequential IO' sector
407 : * number - if this matches on the next IO then we use the last disk.
408 : * There is also a per-disk 'last known head position' sector that is
409 : * maintained from IRQ contexts; both the normal and the resync IO
410 : * completion handlers update this position correctly. If there is no
411 : * perfect sequential match then we pick the disk whose head is closest.
412 : *
413 : * If there are 2 mirrors in the same 2 devices, performance degrades
414 : * because position is mirror, not device based.
415 : *
416 : * The rdev for the device selected will have nr_pending incremented.
417 : */
418 : static int read_balance(conf_t *conf, r1bio_t *r1_bio)
419 : {
420 2 : const sector_t this_sector = r1_bio->sector;
421 3 : int new_disk = conf->last_used, disk = new_disk;
422 2 : int wonly_disk = -1;
423 2 : const int sectors = r1_bio->sectors;
424 1 : sector_t new_distance, current_distance;
425 1 : mdk_rdev_t *rdev;
426 1 :
427 3 : rcu_read_lock();
428 1 : /*
429 2 : * Check if we can balance. We can balance on the whole
430 1 : * device if no resync is going on, or below the resync window.
431 1 : * We take the first readable disk when above the resync window.
432 1 : */
433 1 : retry:
434 5 : if (conf->mddev->recovery_cp < MaxSector &&
435 1 : (this_sector + sectors >= conf->next_resync)) {
436 1 : /* Choose the first operational device, for consistency */
437 2 : new_disk = 0;
438 1 :
439 16 : for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
440 4 : r1_bio->bios[new_disk] == IO_BLOCKED ||
441 1 : !rdev || !test_bit(In_sync, &rdev->flags)
442 1 : || test_bit(WriteMostly, &rdev->flags);
443 13 : rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) {
444 8 :
445 17 : if (rdev && test_bit(In_sync, &rdev->flags) &&
446 : r1_bio->bios[new_disk] != IO_BLOCKED)
447 2 : wonly_disk = new_disk;
448 :
449 8 : if (new_disk == conf->raid_disks - 1) {
450 4 : new_disk = wonly_disk;
451 8 : break;
452 : }
453 : }
454 4 : goto rb_out;
455 : }
456 :
457 :
458 : /* make sure the disk is operational */
459 15 : for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
460 3 : r1_bio->bios[new_disk] == IO_BLOCKED ||
461 : !rdev || !test_bit(In_sync, &rdev->flags) ||
462 : test_bit(WriteMostly, &rdev->flags);
463 11 : rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) {
464 4 :
465 16 : if (rdev && test_bit(In_sync, &rdev->flags) &&
466 1 : r1_bio->bios[new_disk] != IO_BLOCKED)
467 1 : wonly_disk = new_disk;
468 :
469 8 : if (new_disk <= 0)
470 8 : new_disk = conf->raid_disks;
471 4 : new_disk--;
472 8 : if (new_disk == disk) {
473 4 : new_disk = wonly_disk;
474 4 : break;
475 : }
476 : }
477 :
478 8 : if (new_disk < 0)
479 4 : goto rb_out;
480 :
481 4 : disk = new_disk;
482 : /* now disk == new_disk == starting point for search */
483 :
484 : /*
485 : * Don't change to another disk for sequential reads:
486 : */
487 8 : if (conf->next_seq_sect == this_sector)
488 4 : goto rb_out;
489 8 : if (this_sector == conf->mirrors[new_disk].head_position)
490 4 : goto rb_out;
491 :
492 28 : current_distance = abs(this_sector - conf->mirrors[disk].head_position);
493 4 :
494 : /* Find the disk whose head is closest */
495 :
496 : do {
497 2 : if (disk <= 0)
498 1 : disk = conf->raid_disks;
499 1 : disk--;
500 :
501 2 : rdev = rcu_dereference(conf->mirrors[disk].rdev);
502 :
503 12 : if (!rdev || r1_bio->bios[disk] == IO_BLOCKED ||
504 : !test_bit(In_sync, &rdev->flags) ||
505 : test_bit(WriteMostly, &rdev->flags))
506 3 : continue;
507 :
508 4 : if (!atomic_read(&rdev->nr_pending)) {
509 1 : new_disk = disk;
510 1 : break;
511 : }
512 7 : new_distance = abs(this_sector - conf->mirrors[disk].head_position);
513 2 : if (new_distance < current_distance) {
514 1 : current_distance = new_distance;
515 1 : new_disk = disk;
516 : }
517 8 : } while (disk != conf->last_used);
518 :
519 5 : rb_out:
520 :
521 :
522 24 : if (new_disk >= 0) {
523 28 : rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
524 28 : if (!rdev)
525 12 : goto retry;
526 24 : atomic_inc(&rdev->nr_pending);
527 4 : if (!test_bit(In_sync, &rdev->flags)) {
528 : /* cannot risk returning a device that failed
529 : * before we inc'ed nr_pending
530 : */
531 3 : rdev_dec_pending(rdev, conf->mddev);
532 1 : goto retry;
533 : }
534 1 : conf->next_seq_sect = this_sector + sectors;
535 1 : conf->last_used = new_disk;
536 : }
537 26 : rcu_read_unlock();
538 :
539 1 : return new_disk;
540 : }
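/*
 * Aside (illustrative sketch, not driver code): the "closest head" rule
 * described in the comment above read_balance(), reduced to plain C.
 * 'head[]' is a hypothetical stand-in for mirrors[].head_position; the
 * real code additionally skips Faulty/WriteMostly/IO_BLOCKED devices,
 * prefers an idle disk, and keeps sequential reads on the last disk used.
 */
static int closest_head(const unsigned long long *head, int ndisks,
			unsigned long long sector)
{
	unsigned long long best_dist = ~0ULL;
	int i, best = -1;

	for (i = 0; i < ndisks; i++) {
		unsigned long long dist = head[i] > sector ?
			head[i] - sector : sector - head[i];
		if (dist < best_dist) {	/* smallest seek distance wins */
			best_dist = dist;
			best = i;
		}
	}
	return best;
}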
541 :
542 : static void unplug_slaves(mddev_t *mddev)
543 : {
544 63 : conf_t *conf = mddev->private;
545 21 : int i;
546 21 :
547 63 : rcu_read_lock();
548 189 : for (i=0; i<mddev->raid_disks; i++) {
549 168 : mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
550 252 : if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
551 84 : struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
552 :
553 42 : atomic_inc(&rdev->nr_pending);
554 42 : rcu_read_unlock();
555 :
556 21 : blk_unplug(r_queue);
557 :
558 63 : rdev_dec_pending(rdev, mddev);
559 42 : rcu_read_lock();
560 : }
561 : }
562 42 : rcu_read_unlock();
563 21 : }
564 :
565 : static void raid1_unplug(struct request_queue *q)
566 : {
567 63 : mddev_t *mddev = q->queuedata;
568 :
569 42 : unplug_slaves(mddev);
570 21 : md_wakeup_thread(mddev->thread);
571 21 : }
572 :
573 : static int raid1_congested(void *data, int bits)
574 : {
575 0 : mddev_t *mddev = data;
576 0 : conf_t *conf = mddev->private;
577 0 : int i, ret = 0;
578 0 :
579 0 : if (mddev_congested(mddev, bits))
580 0 : return 1;
581 0 :
582 0 : rcu_read_lock();
583 0 : for (i = 0; i < mddev->raid_disks; i++) {
584 0 : mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
585 0 : if (rdev && !test_bit(Faulty, &rdev->flags)) {
586 0 : struct request_queue *q = bdev_get_queue(rdev->bdev);
587 :
588 : /* Note the '|| 1' - when read_balance prefers
589 : * non-congested targets, it can be removed
590 : */
591 : if ((bits & (1<<BDI_async_congested)) || 1)
592 0 : ret |= bdi_congested(&q->backing_dev_info, bits);
593 : else
594 : ret &= bdi_congested(&q->backing_dev_info, bits);
595 : }
596 : }
597 0 : rcu_read_unlock();
598 0 : return ret;
599 : }
600 :
601 :
602 : static int flush_pending_writes(conf_t *conf)
603 : {
604 0 : /* Any writes that have been queued but are awaiting
605 0 : * bitmap updates get flushed here.
606 0 : * We return 1 if any requests were actually submitted.
607 : */
608 0 : int rv = 0;
609 :
610 0 : spin_lock_irq(&conf->device_lock);
611 :
612 0 : if (conf->pending_bio_list.head) {
613 : struct bio *bio;
614 0 : bio = bio_list_get(&conf->pending_bio_list);
615 0 : blk_remove_plug(conf->mddev->queue);
616 0 : spin_unlock_irq(&conf->device_lock);
617 : /* flush any pending bitmap writes to
618 : * disk before proceeding w/ I/O */
619 0 : bitmap_unplug(conf->mddev->bitmap);
620 :
621 0 : while (bio) { /* submit pending writes */
622 0 : struct bio *next = bio->bi_next;
623 0 : bio->bi_next = NULL;
624 0 : generic_make_request(bio);
625 0 : bio = next;
626 : }
627 0 : rv = 1;
628 : } else
629 0 : spin_unlock_irq(&conf->device_lock);
630 0 : return rv;
631 : }
632 :
633 : /* Barriers....
634 : * Sometimes we need to suspend IO while we do something else,
635 : * either some resync/recovery, or reconfigure the array.
636 : * To do this we raise a 'barrier'.
637 : * The 'barrier' is a counter that can be raised multiple times
638 : * to count how many activities are happening which preclude
639 : * normal IO.
640 : * We can only raise the barrier if there is no pending IO.
641 : * i.e. if nr_pending == 0.
642 : * We choose only to raise the barrier if no-one is waiting for the
643 : * barrier to go down. This means that as soon as an IO request
644 : * is ready, no other operations which require a barrier will start
645 : * until the IO request has had a chance.
646 : *
647 : * So: regular IO calls 'wait_barrier'. When that returns there
648 : * is no background IO happening. It must arrange to call
649 : * allow_barrier when it has finished its IO.
650 : * background IO calls must call raise_barrier. Once that returns
651 : * there is no normal IO happening. It must arrange to call
652 : * lower_barrier when the particular background IO completes.
653 : */
654 : #define RESYNC_DEPTH 32
655 :
656 : static void raise_barrier(conf_t *conf)
657 : {
658 24 : spin_lock_irq(&conf->resync_lock);
659 8 :
660 8 : /* Wait until no block IO is waiting */
661 272 : wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
662 24 : conf->resync_lock,
663 24 : raid1_unplug(conf->mddev->queue));
664 8 :
665 8 : /* block any new IO from starting */
666 24 : conf->barrier++;
667 8 :
668 8 : /* Now wait for all pending IO to complete */
669 360 : wait_event_lock_irq(conf->wait_barrier,
670 24 : !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
671 24 : conf->resync_lock,
672 8 : raid1_unplug(conf->mddev->queue));
673 8 :
674 48 : spin_unlock_irq(&conf->resync_lock);
675 8 : }
676 :
677 : static void lower_barrier(conf_t *conf)
678 : {
679 4 : unsigned long flags;
680 28 : BUG_ON(conf->barrier <= 0);
681 16 : spin_lock_irqsave(&conf->resync_lock, flags);
682 4 : conf->barrier--;
683 8 : spin_unlock_irqrestore(&conf->resync_lock, flags);
684 4 : wake_up(&conf->wait_barrier);
685 4 : }
686 :
687 : static void wait_barrier(conf_t *conf)
688 : {
689 15 : spin_lock_irq(&conf->resync_lock);
690 15 : if (conf->barrier) {
691 10 : conf->nr_waiting++;
692 170 : wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
693 15 : conf->resync_lock,
694 15 : raid1_unplug(conf->mddev->queue));
695 15 : conf->nr_waiting--;
696 5 : }
697 10 : conf->nr_pending++;
698 20 : spin_unlock_irq(&conf->resync_lock);
699 5 : }
700 :
701 : static void allow_barrier(conf_t *conf)
702 : {
703 5 : unsigned long flags;
704 20 : spin_lock_irqsave(&conf->resync_lock, flags);
705 5 : conf->nr_pending--;
706 10 : spin_unlock_irqrestore(&conf->resync_lock, flags);
707 5 : wake_up(&conf->wait_barrier);
708 5 : }
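/*
 * Aside (illustrative userspace sketch, not driver code): the counter
 * scheme from the "Barriers...." comment above, reduced to pthreads.
 * All names here are hypothetical; the real code additionally tracks
 * nr_waiting, caps the barrier depth at RESYNC_DEPTH and unplugs the
 * member queues while it waits.
 */
#include <pthread.h>

struct barrier_sketch {
	pthread_mutex_t lock;
	pthread_cond_t	wait;
	int barrier;		/* background (resync) activities in progress */
	int nr_pending;		/* normal IO in flight */
};

static struct barrier_sketch bs = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
	.wait = PTHREAD_COND_INITIALIZER,
};

static void sketch_wait_barrier(struct barrier_sketch *b)	/* normal IO entry */
{
	pthread_mutex_lock(&b->lock);
	while (b->barrier)
		pthread_cond_wait(&b->wait, &b->lock);
	b->nr_pending++;
	pthread_mutex_unlock(&b->lock);
}

static void sketch_allow_barrier(struct barrier_sketch *b)	/* normal IO exit */
{
	pthread_mutex_lock(&b->lock);
	b->nr_pending--;
	pthread_mutex_unlock(&b->lock);
	pthread_cond_broadcast(&b->wait);
}

static void sketch_raise_barrier(struct barrier_sketch *b)	/* resync entry */
{
	pthread_mutex_lock(&b->lock);
	b->barrier++;			/* block new normal IO ... */
	while (b->nr_pending)		/* ... then drain what is in flight */
		pthread_cond_wait(&b->wait, &b->lock);
	pthread_mutex_unlock(&b->lock);
}

static void sketch_lower_barrier(struct barrier_sketch *b)	/* resync exit */
{
	pthread_mutex_lock(&b->lock);
	b->barrier--;
	pthread_mutex_unlock(&b->lock);
	pthread_cond_broadcast(&b->wait);
}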
709 :
710 : static void freeze_array(conf_t *conf)
711 : {
712 0 : /* stop syncio and normal IO and wait for everything to
713 0 : * go quiet.
714 0 : * We increment barrier and nr_waiting, and then
715 0 : * wait until nr_pending match nr_queued+1
716 0 : * wait until nr_pending matches nr_queued+1
717 0 : * that has failed. Thus any sync request that might be pending
718 0 : * will be blocked by nr_pending, and we need to wait for
719 0 : * pending IO requests to complete or be queued for re-try.
720 : * Thus the number queued (nr_queued) plus this request (1)
721 : * must match the number of pending IOs (nr_pending) before
722 : * we continue.
723 : */
724 0 : spin_lock_irq(&conf->resync_lock);
725 0 : conf->barrier++;
726 0 : conf->nr_waiting++;
727 0 : wait_event_lock_irq(conf->wait_barrier,
728 0 : conf->nr_pending == conf->nr_queued+1,
729 0 : conf->resync_lock,
730 : ({ flush_pending_writes(conf);
731 : raid1_unplug(conf->mddev->queue); }));
732 0 : spin_unlock_irq(&conf->resync_lock);
733 0 : }
734 : static void unfreeze_array(conf_t *conf)
735 : {
736 : /* reverse the effect of the freeze */
737 0 : spin_lock_irq(&conf->resync_lock);
738 0 : conf->barrier--;
739 0 : conf->nr_waiting--;
740 0 : wake_up(&conf->wait_barrier);
741 0 : spin_unlock_irq(&conf->resync_lock);
742 0 : }
743 :
744 :
745 : /* duplicate the data pages for behind I/O */
746 : static struct page **alloc_behind_pages(struct bio *bio)
747 : {
748 1 : int i;
749 1 : struct bio_vec *bvec;
750 5 : struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page *),
751 1 : GFP_NOIO);
752 5 : if (unlikely(!pages))
753 2 : goto do_sync_io;
754 1 :
755 11 : bio_for_each_segment(bvec, bio, i) {
756 3 : pages[i] = alloc_page(GFP_NOIO);
757 6 : if (unlikely(!pages[i]))
758 1 : goto do_sync_io;
759 8 : memcpy(kmap(pages[i]) + bvec->bv_offset,
760 : kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
761 2 : kunmap(pages[i]);
762 2 : kunmap(bvec->bv_page);
763 : }
764 :
765 1 : return pages;
766 2 :
767 : do_sync_io:
768 4 : if (pages)
769 10 : for (i = 0; i < bio->bi_vcnt && pages[i]; i++)
770 2 : put_page(pages[i]);
771 5 : kfree(pages);
772 : PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
773 3 : return NULL;
774 : }
775 :
776 : static int make_request(struct request_queue *q, struct bio * bio)
777 : {
778 3 : mddev_t *mddev = q->queuedata;
779 3 : conf_t *conf = mddev->private;
780 1 : mirror_info_t *mirror;
781 1 : r1bio_t *r1_bio;
782 1 : struct bio *read_bio;
783 2 : int i, targets = 0, disks;
784 1 : struct bitmap *bitmap;
785 1 : unsigned long flags;
786 1 : struct bio_list bl;
787 2 : struct page **behind_pages = NULL;
788 3 : const int rw = bio_data_dir(bio);
789 4 : const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
790 1 : int cpu;
791 1 : bool do_barriers;
792 1 : mdk_rdev_t *blocked_rdev;
793 1 :
794 1 : /*
795 1 : * Register the new request and wait if the reconstruction
796 1 : * thread has put up a bar for new requests.
797 1 : * Continue immediately if no resync is active currently.
798 1 : * We test barriers_work *after* md_write_start as md_write_start
799 1 : * may cause the first superblock write, and that will check out
800 1 : * if barriers work.
801 1 : */
802 1 :
803 2 : md_write_start(mddev, bio); /* wait on superblock update early */
804 1 :
805 8 : if (bio_data_dir(bio) == WRITE &&
806 1 : bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo &&
807 1 : bio->bi_sector < mddev->suspend_hi) {
808 1 : /* As the suspend_* range is controlled by
809 1 : * userspace, we want an interruptible
810 1 : * wait.
811 1 : */
812 8 : DEFINE_WAIT(w);
813 1 : for (;;) {
814 5 : flush_signals(current);
815 2 : prepare_to_wait(&conf->wait_barrier,
816 1 : &w, TASK_INTERRUPTIBLE);
817 5 : if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo ||
818 1 : bio->bi_sector >= mddev->suspend_hi)
819 2 : break;
820 2 : schedule();
821 2 : }
822 3 : finish_wait(&conf->wait_barrier, &w);
823 1 : }
824 13 : if (unlikely(!mddev->barriers_work &&
825 1 : bio_rw_flagged(bio, BIO_RW_BARRIER))) {
826 3 : if (rw == WRITE)
827 2 : md_write_end(mddev);
828 1 : bio_endio(bio, -EOPNOTSUPP);
829 1 : return 0;
830 : }
831 :
832 4 : wait_barrier(conf);
833 :
834 1 : bitmap = mddev->bitmap;
835 :
836 9 : cpu = part_stat_lock();
837 9 : part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
838 14 : part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
839 : bio_sectors(bio));
840 6 : part_stat_unlock();
841 :
842 : /*
843 : * make_request() can abort the operation when READA is being
844 : * used and no empty request is available.
845 : *
846 : */
847 2 : r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
848 :
849 1 : r1_bio->master_bio = bio;
850 1 : r1_bio->sectors = bio->bi_size >> 9;
851 1 : r1_bio->state = 0;
852 1 : r1_bio->mddev = mddev;
853 1 : r1_bio->sector = bio->bi_sector;
854 :
855 2 : if (rw == READ) {
856 : /*
857 : * read balancing logic:
858 : */
859 3 : int rdisk = read_balance(conf, r1_bio);
860 :
861 2 : if (rdisk < 0) {
862 : /* couldn't find anywhere to read from */
863 2 : raid_end_bio_io(r1_bio);
864 1 : return 0;
865 : }
866 1 : mirror = conf->mirrors + rdisk;
867 :
868 1 : r1_bio->read_disk = rdisk;
869 :
870 1 : read_bio = bio_clone(bio, GFP_NOIO);
871 :
872 1 : r1_bio->bios[rdisk] = read_bio;
873 :
874 1 : read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
875 1 : read_bio->bi_bdev = mirror->rdev->bdev;
876 1 : read_bio->bi_end_io = raid1_end_read_request;
877 1 : read_bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO);
878 1 : read_bio->bi_private = r1_bio;
879 :
880 1 : generic_make_request(read_bio);
881 1 : return 0;
882 : }
883 :
884 : /*
885 : * WRITE:
886 : */
887 : /* first select target devices under spinlock and
888 : * inc refcount on their rdev. Record them by setting
889 : * bios[x] to bio
890 : */
891 1 : disks = conf->raid_disks;
892 1 : #if 0
893 : { static int first=1;
894 : if (first) printk("First Write sector %llu disks %d\n",
895 : (unsigned long long)r1_bio->sector, disks);
896 : first = 0;
897 : }
898 : #endif
899 : retry_write:
900 1 : blocked_rdev = NULL;
901 2 : rcu_read_lock();
902 9 : for (i = 0; i < disks; i++) {
903 8 : mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
904 9 : if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
905 2 : atomic_inc(&rdev->nr_pending);
906 1 : blocked_rdev = rdev;
907 1 : break;
908 1 : }
909 10 : if (rdev && !test_bit(Faulty, &rdev->flags)) {
910 2 : atomic_inc(&rdev->nr_pending);
911 4 : if (test_bit(Faulty, &rdev->flags)) {
912 3 : rdev_dec_pending(rdev, mddev);
913 1 : r1_bio->bios[i] = NULL;
914 : } else {
915 1 : r1_bio->bios[i] = bio;
916 1 : targets++;
917 : }
918 : } else
919 3 : r1_bio->bios[i] = NULL;
920 : }
921 4 : rcu_read_unlock();
922 :
923 4 : if (unlikely(blocked_rdev)) {
924 : /* Wait for this device to become unblocked */
925 : int j;
926 :
927 6 : for (j = 0; j < i; j++)
928 5 : if (r1_bio->bios[j])
929 4 : rdev_dec_pending(conf->mirrors[j].rdev, mddev);
930 :
931 2 : allow_barrier(conf);
932 1 : md_wait_for_blocked_rdev(blocked_rdev, mddev);
933 2 : wait_barrier(conf);
934 1 : goto retry_write;
935 : }
936 :
937 6 : BUG_ON(targets == 0); /* we never fail the last device */
938 :
939 2 : if (targets < conf->raid_disks) {
940 : /* array is degraded, we will not clear the bitmap
941 : * on I/O completion (see raid1_end_write_request) */
942 2 : set_bit(R1BIO_Degraded, &r1_bio->state);
943 : }
944 :
945 : /* do behind I/O ? */
946 17 : if (bitmap &&
947 : (atomic_read(&bitmap->behind_writes)
948 : < mddev->bitmap_info.max_write_behind) &&
949 : (behind_pages = alloc_behind_pages(bio)) != NULL)
950 2 : set_bit(R1BIO_BehindIO, &r1_bio->state);
951 :
952 10 : atomic_set(&r1_bio->remaining, 0);
953 2 : atomic_set(&r1_bio->behind_remaining, 0);
954 :
955 2 : do_barriers = bio_rw_flagged(bio, BIO_RW_BARRIER);
956 2 : if (do_barriers)
957 2 : set_bit(R1BIO_Barrier, &r1_bio->state);
958 :
959 4 : bio_list_init(&bl);
960 6 : for (i = 0; i < disks; i++) {
961 3 : struct bio *mbio;
962 3 : if (!r1_bio->bios[i])
963 1 : continue;
964 :
965 1 : mbio = bio_clone(bio, GFP_NOIO);
966 1 : r1_bio->bios[i] = mbio;
967 :
968 1 : mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
969 1 : mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
970 1 : mbio->bi_end_io = raid1_end_write_request;
971 1 : mbio->bi_rw = WRITE | (do_barriers << BIO_RW_BARRIER) |
972 : (do_sync << BIO_RW_SYNCIO);
973 1 : mbio->bi_private = r1_bio;
974 :
975 2 : if (behind_pages) {
976 : struct bio_vec *bvec;
977 : int j;
978 :
979 : /* Yes, I really want the '__' version so that
980 : * we clear any unused pointer in the io_vec, rather
981 : * than leave them unchanged. This is important
982 : * because when we come to free the pages, we won't
983 : * know the original bi_idx, so we just free
984 : * them all
985 : */
986 8 : __bio_for_each_segment(bvec, mbio, j, 0)
987 3 : bvec->bv_page = behind_pages[j];
988 5 : if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
989 2 : atomic_inc(&r1_bio->behind_remaining);
990 : }
991 :
992 6 : atomic_inc(&r1_bio->remaining);
993 :
994 2 : bio_list_add(&bl, mbio);
995 : }
996 2 : kfree(behind_pages); /* the behind pages are attached to the bios now */
997 :
998 4 : bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
999 : test_bit(R1BIO_BehindIO, &r1_bio->state));
1000 3 : spin_lock_irqsave(&conf->device_lock, flags);
1001 2 : bio_list_merge(&conf->pending_bio_list, &bl);
1002 2 : bio_list_init(&bl);
1003 :
1004 1 : blk_plug_device(mddev->queue);
1005 2 : spin_unlock_irqrestore(&conf->device_lock, flags);
1006 :
1007 : /* In case raid1d snuck into freeze_array */
1008 1 : wake_up(&conf->wait_barrier);
1009 :
1010 2 : if (do_sync)
1011 1 : md_wakeup_thread(mddev->thread);
1012 : #if 0
1013 : while ((bio = bio_list_pop(&bl)) != NULL)
1014 : generic_make_request(bio);
1015 : #endif
1016 :
1017 1 : return 0;
1018 : }
1019 :
1020 : static void status(struct seq_file *seq, mddev_t *mddev)
1021 : {
1022 3 : conf_t *conf = mddev->private;
1023 1 : int i;
1024 1 :
1025 2 : seq_printf(seq, " [%d/%d] [", conf->raid_disks,
1026 1 : conf->raid_disks - mddev->degraded);
1027 3 : rcu_read_lock();
1028 6 : for (i = 0; i < conf->raid_disks; i++) {
1029 3 : mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
1030 14 : seq_printf(seq, "%s",
1031 : rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
1032 : }
1033 2 : rcu_read_unlock();
1034 1 : seq_printf(seq, "]");
1035 1 : }
1036 :
1037 :
1038 : static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1039 : {
1040 1 : char b[BDEVNAME_SIZE];
1041 3 : conf_t *conf = mddev->private;
1042 1 :
1043 1 : /*
1044 1 : * If it is not operational, then we have already marked it as dead
1045 1 : * else if it is the last working disk, ignore the error, let the
1046 1 : * next level up know.
1047 : * else mark the drive as failed
1048 : */
1049 6 : if (test_bit(In_sync, &rdev->flags)
1050 : && (conf->raid_disks - mddev->degraded) == 1) {
1051 : /*
1052 : * Don't fail the drive, act as though we were just a
1053 : * normal single drive.
1054 : * However don't try a recovery from this drive as
1055 : * it is very likely to fail.
1056 : */
1057 1 : mddev->recovery_disabled = 1;
1058 1 : return;
1059 : }
1060 4 : if (test_and_clear_bit(In_sync, &rdev->flags)) {
1061 : unsigned long flags;
1062 3 : spin_lock_irqsave(&conf->device_lock, flags);
1063 1 : mddev->degraded++;
1064 2 : set_bit(Faulty, &rdev->flags);
1065 2 : spin_unlock_irqrestore(&conf->device_lock, flags);
1066 : /*
1067 : * if recovery is running, make sure it aborts.
1068 : */
1069 2 : set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1070 : } else
1071 2 : set_bit(Faulty, &rdev->flags);
1072 4 : set_bit(MD_CHANGE_DEVS, &mddev->flags);
1073 2 : printk(KERN_ALERT "raid1: Disk failure on %s, disabling device.\n"
1074 : "raid1: Operation continuing on %d devices.\n",
1075 : bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
1076 1 : }
1077 :
1078 : static void print_conf(conf_t *conf)
1079 : {
1080 9 : int i;
1081 9 :
1082 18 : printk("RAID1 conf printout:\n");
1083 27 : if (!conf) {
1084 18 : printk("(!conf)\n");
1085 18 : return;
1086 9 : }
1087 9 : printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
1088 : conf->raid_disks);
1089 :
1090 18 : rcu_read_lock();
1091 54 : for (i = 0; i < conf->raid_disks; i++) {
1092 27 : char b[BDEVNAME_SIZE];
1093 27 : mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
1094 18 : if (rdev)
1095 54 : printk(" disk %d, wo:%d, o:%d, dev:%s\n",
1096 : i, !test_bit(In_sync, &rdev->flags),
1097 : !test_bit(Faulty, &rdev->flags),
1098 : bdevname(rdev->bdev,b));
1099 : }
1100 18 : rcu_read_unlock();
1101 9 : }
1102 :
1103 : static void close_sync(conf_t *conf)
1104 : {
1105 4 : wait_barrier(conf);
1106 4 : allow_barrier(conf);
1107 :
1108 2 : mempool_destroy(conf->r1buf_pool);
1109 2 : conf->r1buf_pool = NULL;
1110 2 : }
1111 :
1112 : static int raid1_spare_active(mddev_t *mddev)
1113 : {
1114 1 : int i;
1115 3 : conf_t *conf = mddev->private;
1116 1 :
1117 1 : /*
1118 1 : * Find all failed disks within the RAID1 configuration
1119 1 : * and mark them readable.
1120 1 : * Called under mddev lock, so rcu protection not needed.
1121 : */
1122 8 : for (i = 0; i < conf->raid_disks; i++) {
1123 6 : mdk_rdev_t *rdev = conf->mirrors[i].rdev;
1124 11 : if (rdev
1125 : && !test_bit(Faulty, &rdev->flags)
1126 : && !test_and_set_bit(In_sync, &rdev->flags)) {
1127 : unsigned long flags;
1128 3 : spin_lock_irqsave(&conf->device_lock, flags);
1129 1 : mddev->degraded--;
1130 2 : spin_unlock_irqrestore(&conf->device_lock, flags);
1131 : }
1132 : }
1133 :
1134 3 : print_conf(conf);
1135 1 : return 0;
1136 : }
1137 :
1138 :
1139 : static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1140 : {
1141 3 : conf_t *conf = mddev->private;
1142 2 : int err = -EEXIST;
1143 2 : int mirror = 0;
1144 1 : mirror_info_t *p;
1145 2 : int first = 0;
1146 2 : int last = mddev->raid_disks - 1;
1147 1 :
1148 2 : if (rdev->raid_disk >= 0)
1149 2 : first = last = rdev->raid_disk;
1150 :
1151 5 : for (mirror = first; mirror <= last; mirror++)
1152 6 : if ( !(p=conf->mirrors+mirror)->rdev) {
1153 1 :
1154 1 : disk_stack_limits(mddev->gendisk, rdev->bdev,
1155 : rdev->data_offset << 9);
1156 : /* as we don't honour merge_bvec_fn, we must never risk
1157 1 : * violating it, so limit ->max_sector to one PAGE, as
1158 : * a one page request is never in violation.
1159 : */
1160 7 : if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
1161 : queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
1162 1 : blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
1163 :
1164 2 : p->head_position = 0;
1165 2 : rdev->raid_disk = mirror;
1166 2 : err = 0;
1167 : /* As all devices are equivalent, we don't need a full recovery
1168 : * if this was recently any drive of the array
1169 : */
1170 4 : if (rdev->saved_raid_disk < 0)
1171 2 : conf->fullsync = 1;
1172 4 : rcu_assign_pointer(p->rdev, rdev);
1173 2 : break;
1174 : }
1175 2 : md_integrity_add_rdev(rdev, mddev);
1176 6 : print_conf(conf);
1177 1 : return err;
1178 : }
1179 :
1180 : static int raid1_remove_disk(mddev_t *mddev, int number)
1181 : {
1182 3 : conf_t *conf = mddev->private;
1183 2 : int err = 0;
1184 1 : mdk_rdev_t *rdev;
1185 2 : mirror_info_t *p = conf->mirrors+ number;
1186 1 :
1187 4 : print_conf(conf);
1188 2 : rdev = p->rdev;
1189 3 : if (rdev) {
1190 8 : if (test_bit(In_sync, &rdev->flags) ||
1191 : atomic_read(&rdev->nr_pending)) {
1192 2 : err = -EBUSY;
1193 2 : goto abort;
1194 : }
1195 : /* Only remove non-faulty devices if recovery
1196 : * is not possible.
1197 : */
1198 8 : if (!test_bit(Faulty, &rdev->flags) &&
1199 : !mddev->recovery_disabled &&
1200 : mddev->degraded < conf->raid_disks) {
1201 1 : err = -EBUSY;
1202 1 : goto abort;
1203 : }
1204 1 : p->rdev = NULL;
1205 1 : synchronize_rcu();
1206 4 : if (atomic_read(&rdev->nr_pending)) {
1207 : /* lost the race, try later */
1208 1 : err = -EBUSY;
1209 1 : p->rdev = rdev;
1210 1 : goto abort;
1211 : }
1212 1 : md_integrity_register(mddev);
1213 : }
1214 : abort:
1215 :
1216 15 : print_conf(conf);
1217 3 : return err;
1218 : }
1219 :
1220 :
1221 : static void end_sync_read(struct bio *bio, int error)
1222 : {
1223 0 : r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
1224 0 : int i;
1225 0 :
1226 0 : for (i=r1_bio->mddev->raid_disks; i--; )
1227 0 : if (r1_bio->bios[i] == bio)
1228 0 : break;
1229 0 : BUG_ON(i < 0);
1230 0 : update_head_pos(i, r1_bio);
1231 : /*
1232 0 : * we have read a block, now it needs to be re-written,
1233 : * or re-read if the read failed.
1234 : * We don't do much here, just schedule handling by raid1d
1235 : */
1236 0 : if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1237 0 : set_bit(R1BIO_Uptodate, &r1_bio->state);
1238 :
1239 0 : if (atomic_dec_and_test(&r1_bio->remaining))
1240 0 : reschedule_retry(r1_bio);
1241 0 : }
1242 :
1243 : static void end_sync_write(struct bio *bio, int error)
1244 : {
1245 0 : int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1246 0 : r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
1247 0 : mddev_t *mddev = r1_bio->mddev;
1248 0 : conf_t *conf = mddev->private;
1249 0 : int i;
1250 0 : int mirror=0;
1251 0 :
1252 0 : for (i = 0; i < conf->raid_disks; i++)
1253 0 : if (r1_bio->bios[i] == bio) {
1254 0 : mirror = i;
1255 0 : break;
1256 0 : }
1257 0 : if (!uptodate) {
1258 0 : int sync_blocks = 0;
1259 0 : sector_t s = r1_bio->sector;
1260 0 : long sectors_to_go = r1_bio->sectors;
1261 0 : /* make sure these bits don't get cleared. */
1262 : do {
1263 0 : bitmap_end_sync(mddev->bitmap, s,
1264 : &sync_blocks, 1);
1265 0 : s += sync_blocks;
1266 0 : sectors_to_go -= sync_blocks;
1267 0 : } while (sectors_to_go > 0);
1268 0 : md_error(mddev, conf->mirrors[mirror].rdev);
1269 0 : }
1270 :
1271 0 : update_head_pos(mirror, r1_bio);
1272 :
1273 0 : if (atomic_dec_and_test(&r1_bio->remaining)) {
1274 0 : sector_t s = r1_bio->sectors;
1275 0 : put_buf(r1_bio);
1276 0 : md_done_sync(mddev, s, uptodate);
1277 : }
1278 0 : }
1279 :
1280 : static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1281 : {
1282 0 : conf_t *conf = mddev->private;
1283 0 : int i;
1284 0 : int disks = conf->raid_disks;
1285 0 : struct bio *bio, *wbio;
1286 0 :
1287 0 : bio = r1_bio->bios[r1_bio->read_disk];
1288 0 :
1289 0 :
1290 0 : if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1291 0 : /* We have read all readable devices. If we haven't
1292 0 : * got the block, then there is no hope left.
1293 0 : * If we have, then we want to do a comparison
1294 0 : * and skip the write if everything is the same.
1295 0 : * If any blocks failed to read, then we need to
1296 0 : * attempt an over-write
1297 0 : */
1298 0 : int primary;
1299 0 : if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
1300 0 : for (i=0; i<mddev->raid_disks; i++)
1301 0 : if (r1_bio->bios[i]->bi_end_io == end_sync_read)
1302 0 : md_error(mddev, conf->mirrors[i].rdev);
1303 0 :
1304 0 : md_done_sync(mddev, r1_bio->sectors, 1);
1305 0 : put_buf(r1_bio);
1306 0 : return;
1307 0 : }
1308 0 : for (primary=0; primary<mddev->raid_disks; primary++)
1309 0 : if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
1310 0 : test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
1311 0 : r1_bio->bios[primary]->bi_end_io = NULL;
1312 0 : rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
1313 0 : break;
1314 0 : }
1315 0 : r1_bio->read_disk = primary;
1316 0 : for (i=0; i<mddev->raid_disks; i++)
1317 0 : if (r1_bio->bios[i]->bi_end_io == end_sync_read) {
1318 0 : int j;
1319 0 : int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
1320 0 : struct bio *pbio = r1_bio->bios[primary];
1321 0 : struct bio *sbio = r1_bio->bios[i];
1322 0 :
1323 0 : if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
1324 0 : for (j = vcnt; j-- ; ) {
1325 0 : struct page *p, *s;
1326 0 : p = pbio->bi_io_vec[j].bv_page;
1327 0 : s = sbio->bi_io_vec[j].bv_page;
1328 0 : if (memcmp(page_address(p),
1329 : page_address(s),
1330 0 : PAGE_SIZE))
1331 0 : break;
1332 : }
1333 : } else
1334 0 : j = 0;
1335 0 : if (j >= 0)
1336 0 : mddev->resync_mismatches += r1_bio->sectors;
1337 0 : if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
1338 : && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
1339 0 : sbio->bi_end_io = NULL;
1340 0 : rdev_dec_pending(conf->mirrors[i].rdev, mddev);
1341 : } else {
1342 : /* fixup the bio for reuse */
1343 : int size;
1344 0 : sbio->bi_vcnt = vcnt;
1345 0 : sbio->bi_size = r1_bio->sectors << 9;
1346 0 : sbio->bi_idx = 0;
1347 0 : sbio->bi_phys_segments = 0;
1348 0 : sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
1349 0 : sbio->bi_flags |= 1 << BIO_UPTODATE;
1350 0 : sbio->bi_next = NULL;
1351 0 : sbio->bi_sector = r1_bio->sector +
1352 : conf->mirrors[i].rdev->data_offset;
1353 0 : sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1354 0 : size = sbio->bi_size;
1355 0 : for (j = 0; j < vcnt ; j++) {
1356 0 : struct bio_vec *bi;
1357 0 : bi = &sbio->bi_io_vec[j];
1358 0 : bi->bv_offset = 0;
1359 0 : if (size > PAGE_SIZE)
1360 0 : bi->bv_len = PAGE_SIZE;
1361 : else
1362 0 : bi->bv_len = size;
1363 0 : size -= PAGE_SIZE;
1364 0 : memcpy(page_address(bi->bv_page),
1365 : page_address(pbio->bi_io_vec[j].bv_page),
1366 : PAGE_SIZE);
1367 : }
1368 :
1369 : }
1370 : }
1371 : }
1372 0 : if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
1373 : /* ouch - failed to read all of that.
1374 : * Try some synchronous reads of other devices to get
1375 : * good data, much like with normal read errors. Only
1376 : * read into the pages we already have so we don't
1377 : * need to re-issue the read request.
1378 : * We don't need to freeze the array, because being in an
1379 : * active sync request, there is no normal IO, and
1380 : * no overlapping syncs.
1381 : */
1382 0 : sector_t sect = r1_bio->sector;
1383 0 : int sectors = r1_bio->sectors;
1384 0 : int idx = 0;
1385 :
1386 0 : while(sectors) {
1387 0 : int s = sectors;
1388 0 : int d = r1_bio->read_disk;
1389 0 : int success = 0;
1390 : mdk_rdev_t *rdev;
1391 :
1392 0 : if (s > (PAGE_SIZE>>9))
1393 0 : s = PAGE_SIZE >> 9;
1394 : do {
1395 0 : if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
1396 : /* No rcu protection needed here; devices
1397 0 : * can only be removed when no resync is
1398 : * active, and resync is currently active
1399 : */
1400 0 : rdev = conf->mirrors[d].rdev;
1401 0 : if (sync_page_io(rdev->bdev,
1402 : sect + rdev->data_offset,
1403 : s<<9,
1404 : bio->bi_io_vec[idx].bv_page,
1405 : READ)) {
1406 0 : success = 1;
1407 0 : break;
1408 : }
1409 : }
1410 0 : d++;
1411 0 : if (d == conf->raid_disks)
1412 0 : d = 0;
1413 0 : } while (!success && d != r1_bio->read_disk);
1414 :
1415 0 : if (success) {
1416 0 : int start = d;
1417 : /* write it back and re-read */
1418 0 : set_bit(R1BIO_Uptodate, &r1_bio->state);
1419 0 : while (d != r1_bio->read_disk) {
1420 0 : if (d == 0)
1421 0 : d = conf->raid_disks;
1422 0 : d--;
1423 0 : if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1424 0 : continue;
1425 0 : rdev = conf->mirrors[d].rdev;
1426 0 : atomic_add(s, &rdev->corrected_errors);
1427 0 : if (sync_page_io(rdev->bdev,
1428 : sect + rdev->data_offset,
1429 : s<<9,
1430 : bio->bi_io_vec[idx].bv_page,
1431 : WRITE) == 0)
1432 0 : md_error(mddev, rdev);
1433 : }
1434 0 : d = start;
1435 0 : while (d != r1_bio->read_disk) {
1436 0 : if (d == 0)
1437 0 : d = conf->raid_disks;
1438 0 : d--;
1439 0 : if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1440 0 : continue;
1441 0 : rdev = conf->mirrors[d].rdev;
1442 0 : if (sync_page_io(rdev->bdev,
1443 : sect + rdev->data_offset,
1444 : s<<9,
1445 : bio->bi_io_vec[idx].bv_page,
1446 : READ) == 0)
1447 0 : md_error(mddev, rdev);
1448 : }
1449 : } else {
1450 : char b[BDEVNAME_SIZE];
1451 : /* Cannot read from anywhere, array is toast */
1452 0 : md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
1453 0 : printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error"
1454 : " for block %llu\n",
1455 : bdevname(bio->bi_bdev,b),
1456 : (unsigned long long)r1_bio->sector);
1457 0 : md_done_sync(mddev, r1_bio->sectors, 0);
1458 0 : put_buf(r1_bio);
1459 0 : return;
1460 : }
1461 0 : sectors -= s;
1462 0 : sect += s;
1463 0 : idx ++;
1464 0 : }
1465 : }
1466 :
1467 : /*
1468 : * schedule writes
1469 : */
1470 0 : atomic_set(&r1_bio->remaining, 1);
1471 0 : for (i = 0; i < disks ; i++) {
1472 0 : wbio = r1_bio->bios[i];
1473 0 : if (wbio->bi_end_io == NULL ||
1474 : (wbio->bi_end_io == end_sync_read &&
1475 : (i == r1_bio->read_disk ||
1476 : !test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
1477 0 : continue;
1478 :
1479 0 : wbio->bi_rw = WRITE;
1480 0 : wbio->bi_end_io = end_sync_write;
1481 0 : atomic_inc(&r1_bio->remaining);
1482 0 : md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);
1483 :
1484 0 : generic_make_request(wbio);
1485 : }
1486 0 :
1487 0 : if (atomic_dec_and_test(&r1_bio->remaining)) {
1488 : /* if we're here, all write(s) have completed, so clean up */
1489 0 : md_done_sync(mddev, r1_bio->sectors, 1);
1490 0 : put_buf(r1_bio);
1491 : }
1492 0 : }
1493 :
1494 : /*
1495 : * This is a kernel thread which:
1496 : *
1497 : * 1. Retries failed read operations on working mirrors.
1498 : * 2. Updates the raid superblock when problems are encountered.
1499 : * 3. Performs writes following reads for array synchronising.
1500 : */
1501 :
1502 : static void fix_read_error(conf_t *conf, int read_disk,
1503 : sector_t sect, int sectors)
1504 0 : {
1505 0 : mddev_t *mddev = conf->mddev;
1506 0 : while(sectors) {
1507 0 : int s = sectors;
1508 0 : int d = read_disk;
1509 0 : int success = 0;
1510 0 : int start;
1511 0 : mdk_rdev_t *rdev;
1512 0 :
1513 0 : if (s > (PAGE_SIZE>>9))
1514 0 : s = PAGE_SIZE >> 9;
1515 0 :
1516 0 : do {
1517 0 : /* Note: no rcu protection needed here
1518 0 : * as this is synchronous in the raid1d thread
1519 : * which is the thread that might remove
1520 : * a device. If raid1d ever becomes multi-threaded....
1521 : */
1522 0 : rdev = conf->mirrors[d].rdev;
1523 0 : if (rdev &&
1524 : test_bit(In_sync, &rdev->flags) &&
1525 : sync_page_io(rdev->bdev,
1526 0 : sect + rdev->data_offset,
1527 : s<<9,
1528 : conf->tmppage, READ))
1529 0 : success = 1;
1530 : else {
1531 0 : d++;
1532 0 : if (d == conf->raid_disks)
1533 0 : d = 0;
1534 : }
1535 0 : } while (!success && d != read_disk);
1536 :
1537 0 : if (!success) {
1538 : /* Cannot read from anywhere -- bye bye array */
1539 0 : md_error(mddev, conf->mirrors[read_disk].rdev);
1540 0 : break;
1541 : }
1542 : /* write it back and re-read */
1543 0 : start = d;
1544 0 : while (d != read_disk) {
1545 0 : if (d==0)
1546 0 : d = conf->raid_disks;
1547 0 : d--;
1548 0 : rdev = conf->mirrors[d].rdev;
1549 0 : if (rdev &&
1550 : test_bit(In_sync, &rdev->flags)) {
1551 0 : if (sync_page_io(rdev->bdev,
1552 : sect + rdev->data_offset,
1553 : s<<9, conf->tmppage, WRITE)
1554 : == 0)
1555 : /* Well, this device is dead */
1556 0 : md_error(mddev, rdev);
1557 : }
1558 : }
1559 0 : d = start;
1560 0 : while (d != read_disk) {
1561 0 : char b[BDEVNAME_SIZE];
1562 0 : if (d==0)
1563 0 : d = conf->raid_disks;
1564 0 : d--;
1565 0 : rdev = conf->mirrors[d].rdev;
1566 0 : if (rdev &&
1567 0 : test_bit(In_sync, &rdev->flags)) {
1568 0 : if (sync_page_io(rdev->bdev,
1569 : sect + rdev->data_offset,
1570 : s<<9, conf->tmppage, READ)
1571 : == 0)
1572 : /* Well, this device is dead */
1573 0 : md_error(mddev, rdev);
1574 : else {
1575 0 : atomic_add(s, &rdev->corrected_errors);
1576 0 : printk(KERN_INFO
1577 : "raid1:%s: read error corrected "
1578 : "(%d sectors at %llu on %s)\n",
1579 : mdname(mddev), s,
1580 : (unsigned long long)(sect +
1581 : rdev->data_offset),
1582 : bdevname(rdev->bdev, b));
1583 : }
1584 : }
1585 : }
1586 0 : sectors -= s;
1587 0 : sect += s;
1588 0 : }
1589 : }
1590 :
1591 : static void raid1d(mddev_t *mddev)
1592 : {
1593 0 : r1bio_t *r1_bio;
1594 0 : struct bio *bio;
1595 0 : unsigned long flags;
1596 0 : conf_t *conf = mddev->private;
1597 0 : struct list_head *head = &conf->retry_list;
1598 0 : int unplug=0;
1599 0 : mdk_rdev_t *rdev;
1600 0 :
1601 0 : md_check_recovery(mddev);
1602 0 :
1603 0 : for (;;) {
1604 0 : char b[BDEVNAME_SIZE];
1605 0 :
1606 0 : unplug += flush_pending_writes(conf);
1607 0 :
1608 0 : spin_lock_irqsave(&conf->device_lock, flags);
1609 0 : if (list_empty(head)) {
1610 0 : spin_unlock_irqrestore(&conf->device_lock, flags);
1611 0 : break;
1612 0 : }
1613 0 : r1_bio = list_entry(head->prev, r1bio_t, retry_list);
1614 0 : list_del(head->prev);
1615 0 : conf->nr_queued--;
1616 0 : spin_unlock_irqrestore(&conf->device_lock, flags);
1617 0 :
1618 0 : mddev = r1_bio->mddev;
1619 0 : conf = mddev->private;
1620 0 : if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
1621 0 : sync_request_write(mddev, r1_bio);
1622 0 : unplug = 1;
1623 0 : } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
1624 : /* some requests in the r1bio were BIO_RW_BARRIER
1625 : * requests which failed with -EOPNOTSUPP. Hohumm..
1626 : * Better resubmit without the barrier.
1627 : * We know which devices to resubmit for, because
1628 : * all others have had their bios[] entry cleared.
1629 : * We already have a nr_pending reference on these rdevs.
1630 : */
1631 : int i;
1632 0 : const bool do_sync = bio_rw_flagged(r1_bio->master_bio, BIO_RW_SYNCIO);
1633 0 : clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
1634 0 : clear_bit(R1BIO_Barrier, &r1_bio->state);
1635 0 : for (i=0; i < conf->raid_disks; i++)
1636 0 : if (r1_bio->bios[i])
1637 0 : atomic_inc(&r1_bio->remaining);
1638 0 : for (i=0; i < conf->raid_disks; i++)
1639 0 : if (r1_bio->bios[i]) {
1640 0 : struct bio_vec *bvec;
1641 : int j;
1642 :
1643 0 : bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
1644 : /* copy pages from the failed bio, as
1645 : * this might be a write-behind device */
1646 0 : __bio_for_each_segment(bvec, bio, j, 0)
1647 0 : bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
1648 0 : bio_put(r1_bio->bios[i]);
1649 0 : bio->bi_sector = r1_bio->sector +
1650 : conf->mirrors[i].rdev->data_offset;
1651 0 : bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1652 0 : bio->bi_end_io = raid1_end_write_request;
1653 0 : bio->bi_rw = WRITE |
1654 : (do_sync << BIO_RW_SYNCIO);
1655 0 : bio->bi_private = r1_bio;
1656 0 : r1_bio->bios[i] = bio;
1657 0 : generic_make_request(bio);
1658 : }
1659 : } else {
1660 : int disk;
1661 :
1662 : /* we got a read error. Maybe the drive is bad. Maybe just
1663 : * the block and we can fix it.
1664 : * We freeze all other IO, and try reading the block from
1665 : * other devices. When we find one, we re-write
1666 : * and check that it fixes the read error.
1667 : * This is all done synchronously while the array is
1668 : * frozen
1669 : */
1670 0 : if (mddev->ro == 0) {
1671 0 : freeze_array(conf);
1672 0 : fix_read_error(conf, r1_bio->read_disk,
1673 : r1_bio->sector,
1674 : r1_bio->sectors);
1675 0 : unfreeze_array(conf);
1676 : } else
1677 0 : md_error(mddev,
1678 : conf->mirrors[r1_bio->read_disk].rdev);
1679 :
1680 0 : bio = r1_bio->bios[r1_bio->read_disk];
1681 0 : if ((disk=read_balance(conf, r1_bio)) == -1) {
1682 0 : printk(KERN_ALERT "raid1: %s: unrecoverable I/O"
1683 : " read error for block %llu\n",
1684 : bdevname(bio->bi_bdev,b),
1685 : (unsigned long long)r1_bio->sector);
1686 0 : raid_end_bio_io(r1_bio);
1687 : } else {
1688 0 : const bool do_sync = bio_rw_flagged(r1_bio->master_bio, BIO_RW_SYNCIO);
1689 0 : r1_bio->bios[r1_bio->read_disk] =
1690 : mddev->ro ? IO_BLOCKED : NULL;
1691 0 : r1_bio->read_disk = disk;
1692 0 : bio_put(bio);
1693 0 : bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
1694 0 : r1_bio->bios[r1_bio->read_disk] = bio;
1695 0 : rdev = conf->mirrors[disk].rdev;
1696 0 : if (printk_ratelimit())
1697 0 : printk(KERN_ERR "raid1: %s: redirecting sector %llu to"
1698 : " another mirror\n",
1699 : bdevname(rdev->bdev,b),
1700 : (unsigned long long)r1_bio->sector);
1701 0 : bio->bi_sector = r1_bio->sector + rdev->data_offset;
1702 0 : bio->bi_bdev = rdev->bdev;
1703 0 : bio->bi_end_io = raid1_end_read_request;
1704 0 : bio->bi_rw = READ | (do_sync << BIO_RW_SYNCIO);
1705 0 : bio->bi_private = r1_bio;
1706 0 : unplug = 1;
1707 0 : generic_make_request(bio);
1708 : }
1709 : }
1710 0 : cond_resched();
1711 0 : }
1712 0 : if (unplug)
1713 0 : unplug_slaves(mddev);
1714 0 : }
1715 :
1716 :
1717 : static int init_resync(conf_t *conf)
1718 : {
1719 1 : int buffs;
1720 1 :
1721 1 : buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
1722 7 : BUG_ON(conf->r1buf_pool);
1723 2 : conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free,
1724 : conf->poolinfo);
1725 3 : if (!conf->r1buf_pool)
1726 1 : return -ENOMEM;
1727 1 : conf->next_resync = 0;
1728 1 : return 0;
1729 : }
1730 :
1731 : /*
1732 : * perform a "sync" on one "block"
1733 : *
1734 : * We need to make sure that no normal I/O request - particularly write
1735 : * requests - conflict with active sync requests.
1736 : *
1737 : * This is achieved by tracking pending requests and a 'barrier' concept
1738 : * that can be installed to exclude normal IO requests.
1739 : */
1740 :
1741 : static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
1742 : {
1743 2 : conf_t *conf = mddev->private;
1744 1 : r1bio_t *r1_bio;
1745 1 : struct bio *bio;
1746 1 : sector_t max_sector, nr_sectors;
1747 2 : int disk = -1;
1748 1 : int i;
1749 2 : int wonly = -1;
1750 3 : int write_targets = 0, read_targets = 0;
1751 1 : int sync_blocks;
1752 2 : int still_degraded = 0;
1753 1 :
1754 4 : if (!conf->r1buf_pool)
1755 1 : {
1756 1 : /*
1757 1 : printk("sync start - bitmap %p\n", mddev->bitmap);
1758 1 : */
1759 5 : if (init_resync(conf))
1760 2 : return 0;
1761 1 : }
1762 1 :
1763 3 : max_sector = mddev->dev_sectors;
1764 5 : if (sector_nr >= max_sector) {
1765 1 : /* If we aborted, we need to abort the
1766 1 : * sync on the 'current' bitmap chunk (there will
1767 1 : * only be one in raid1 resync).
1768 1 : * We can find the current address in mddev->curr_resync.
1769 1 : */
1770 5 : if (mddev->curr_resync < max_sector) /* aborted */
1771 3 : bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
1772 1 : &sync_blocks, 1);
1773 1 : else /* completed sync */
1774 3 : conf->fullsync = 0;
1775 :
1776 2 : bitmap_close_sync(mddev->bitmap);
1777 4 : close_sync(conf);
1778 1 : return 0;
1779 : }
1780 :
1781 18 : if (mddev->bitmap == NULL &&
1782 : mddev->recovery_cp == MaxSector &&
1783 : !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
1784 : conf->fullsync == 0) {
1785 1 : *skipped = 1;
1786 1 : return max_sector - sector_nr;
1787 : }
1788 : /* Before building a request, check if we can skip these blocks.
1789 : * This call to bitmap_start_sync doesn't actually record anything.
1790 : */
1791 23 : if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
1792 : !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1793 : /* We can skip this block, and probably several more */
1794 1 : *skipped = 1;
1795 1 : return sync_blocks;
1796 : }
1797 : /*
1798 : * If there is non-resync activity waiting for a turn,
1799 : * and resync is going fast enough,
1800 : * then let it through before starting on this new sync request.
1801 : */
1802 16 : if (!go_faster && conf->nr_waiting)
1803 4 : msleep_interruptible(1000);
1804 :
1805 4 : bitmap_cond_end_sync(mddev->bitmap, sector_nr);
1806 8 : raise_barrier(conf);
1807 :
1808 1 : conf->next_resync = sector_nr;
1809 :
1810 2 : r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
1811 2 : rcu_read_lock();
1812 : /*
1813 : * If we get a correctable read error during resync or recovery,
1814 : * we might want to read from a different device. So we
1815 : * flag all drives that could conceivably be read from for READ,
1816 : * and any others (which will be non-In_sync devices) for WRITE.
1817 : * If a read fails, we try reading from something else for which READ
1818 : * is OK.
1819 : */
1820 :
1821 1 : r1_bio->mddev = mddev;
1822 1 : r1_bio->sector = sector_nr;
1823 1 : r1_bio->state = 0;
1824 2 : set_bit(R1BIO_IsSync, &r1_bio->state);
1825 :
1826 7 : for (i=0; i < conf->raid_disks; i++) {
1827 4 : mdk_rdev_t *rdev;
1828 2 : bio = r1_bio->bios[i];
1829 :
1830 : /* take from bio_init */
1831 1 : bio->bi_next = NULL;
1832 1 : bio->bi_flags |= 1 << BIO_UPTODATE;
1833 1 : bio->bi_rw = READ;
1834 1 : bio->bi_vcnt = 0;
1835 1 : bio->bi_idx = 0;
1836 1 : bio->bi_phys_segments = 0;
1837 1 : bio->bi_size = 0;
1838 1 : bio->bi_end_io = NULL;
1839 1 : bio->bi_private = NULL;
1840 :
1841 2 : rdev = rcu_dereference(conf->mirrors[i].rdev);
1842 6 : if (rdev == NULL ||
1843 : test_bit(Faulty, &rdev->flags)) {
1844 2 : still_degraded = 1;
1845 2 : continue;
1846 4 : } else if (!test_bit(In_sync, &rdev->flags)) {
1847 1 : bio->bi_rw = WRITE;
1848 1 : bio->bi_end_io = end_sync_write;
1849 1 : write_targets ++;
1850 : } else {
1851 : /* may need to read from here */
1852 1 : bio->bi_rw = READ;
1853 1 : bio->bi_end_io = end_sync_read;
1854 4 : if (test_bit(WriteMostly, &rdev->flags)) {
1855 2 : if (wonly < 0)
1856 1 : wonly = i;
1857 : } else {
1858 2 : if (disk < 0)
1859 1 : disk = i;
1860 : }
1861 1 : read_targets++;
1862 : }
1863 4 : atomic_inc(&rdev->nr_pending);
1864 1 : bio->bi_sector = sector_nr + rdev->data_offset;
1865 1 : bio->bi_bdev = rdev->bdev;
1866 1 : bio->bi_private = r1_bio;
1867 : }
1868 3 : rcu_read_unlock();
1869 2 : if (disk < 0)
1870 1 : disk = wonly;
1871 1 : r1_bio->read_disk = disk;
1872 :
1873 6 : if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0)
1874 : /* extra read targets are also write targets */
1875 1 : write_targets += read_targets-1;
1876 :
1877 4 : if (write_targets == 0 || read_targets == 0) {
1878 : /* There is nowhere to write, so all non-sync
1879 : * drives must be failed - we are finished.
1880 : */
1881 1 : sector_t rv = max_sector - sector_nr;
1882 1 : *skipped = 1;
1883 2 : put_buf(r1_bio);
1884 1 : return rv;
1885 : }
1886 :
1887 2 : if (max_sector > mddev->resync_max)
1888 1 : max_sector = mddev->resync_max; /* Don't do IO beyond here */
1889 1 : nr_sectors = 0;
1890 1 : sync_blocks = 0;
1891 1 : do {
1892 : struct page *page;
1893 1 : int len = PAGE_SIZE;
1894 2 : if (sector_nr + (len>>9) > max_sector)
1895 1 : len = (max_sector - sector_nr) << 9;
1896 2 : if (len == 0)
1897 1 : break;
1898 2 : if (sync_blocks == 0) {
1899 9 : if (!bitmap_start_sync(mddev->bitmap, sector_nr,
1900 : &sync_blocks, still_degraded) &&
1901 : !conf->fullsync &&
1902 : !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
1903 1 : break;
1904 8 : BUG_ON(sync_blocks < (PAGE_SIZE>>9));
1905 2 : if (len > (sync_blocks<<9))
1906 1 : len = sync_blocks<<9;
1907 : }
1908 :
1909 7 : for (i=0 ; i < conf->raid_disks; i++) {
1910 3 : bio = r1_bio->bios[i];
1911 4 : if (bio->bi_end_io) {
1912 2 : page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
1913 3 : if (bio_add_page(bio, page, len, 0) == 0) {
1914 : /* stop here */
1915 2 : bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
1916 3 : while (i > 0) {
1917 2 : i--;
1918 2 : bio = r1_bio->bios[i];
1919 3 : if (bio->bi_end_io==NULL)
1920 1 : continue;
1921 : /* remove last page from this bio */
1922 2 : bio->bi_vcnt--;
1923 1 : bio->bi_size -= len;
1924 1 : bio->bi_flags &= ~(1<< BIO_SEG_VALID);
1925 1 : }
1926 1 : goto bio_full;
1927 : }
1928 : }
1929 : }
1930 1 : nr_sectors += len>>9;
1931 1 : sector_nr += len>>9;
1932 1 : sync_blocks -= (len>>9);
1933 3 : } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
1934 : bio_full:
1935 5 : r1_bio->sectors = nr_sectors;
1936 :
1937 : /* For a user-requested sync, we read all readable devices and
1938 : * compare their contents.
1939 1 : */
1940 13 : if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1941 2 : atomic_set(&r1_bio->remaining, read_targets);
1942 5 : for (i=0; i<conf->raid_disks; i++) {
1943 3 : bio = r1_bio->bios[i];
1944 3 : if (bio->bi_end_io == end_sync_read) {
1945 0 : md_sync_acct(bio->bi_bdev, nr_sectors);
1946 0 : generic_make_request(bio);
1947 : }
1948 : }
1949 : } else {
1950 2 : atomic_set(&r1_bio->remaining, 1);
1951 1 : bio = r1_bio->bios[r1_bio->read_disk];
1952 2 : md_sync_acct(bio->bi_bdev, nr_sectors);
1953 1 : generic_make_request(bio);
1954 :
1955 : }
1956 2 : return nr_sectors;
1957 : }
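
/*
 * Illustrative sketch only, not kernel code: the bitmap-driven skip that
 * sync_request() applies above - if the bitmap says a run of blocks was
 * never dirtied, that run is reported back as "skipped" instead of being
 * read and rewritten.  The bitmap here is a plain bool array with one
 * entry per chunk, and all names and sizes are hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

#define SKETCH_CHUNK_SECTORS 128UL	/* hypothetical sectors per bitmap chunk */

/* Advance one chunk; set *skipped when no resync I/O is needed for it. */
static unsigned long sync_step(const bool *dirty, unsigned long nchunks,
			       unsigned long sector_nr, int *skipped)
{
	unsigned long chunk = sector_nr / SKETCH_CHUNK_SECTORS;

	if (chunk >= nchunks)
		return 0;
	if (!dirty[chunk]) {
		*skipped = 1;
		return SKETCH_CHUNK_SECTORS;	/* advance without any I/O */
	}
	*skipped = 0;
	return SKETCH_CHUNK_SECTORS;	/* caller would issue reads here */
}

int main(void)
{
	bool dirty[4] = { false, true, false, false };
	unsigned long sector = 0;
	int skipped;

	while (sector < 4 * SKETCH_CHUNK_SECTORS) {
		unsigned long n = sync_step(dirty, 4, sector, &skipped);

		printf("sector %4lu: %s %lu sectors\n", sector,
		       skipped ? "skipping" : "resyncing", n);
		sector += n;
	}
	return 0;
}
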
1958 :
1959 : static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks)
1960 : {
1961 8 : if (sectors)
1962 4 : return sectors;
1963 :
1964 4 : return mddev->dev_sectors;
1965 : }
1966 :
1967 : static conf_t *setup_conf(mddev_t *mddev)
1968 : {
1969 2 : conf_t *conf;
1970 2 : int i;
1971 2 : mirror_info_t *disk;
1972 2 : mdk_rdev_t *rdev;
1973 4 : int err = -ENOMEM;
1974 2 :
1975 8 : conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
1976 6 : if (!conf)
1977 4 : goto abort;
1978 2 :
1979 10 : conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
1980 2 : GFP_KERNEL);
1981 8 : if (!conf->mirrors)
1982 4 : goto abort;
1983 2 :
1984 6 : conf->tmppage = alloc_page(GFP_KERNEL);
1985 8 : if (!conf->tmppage)
1986 4 : goto abort;
1987 2 :
1988 6 : conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
1989 6 : if (!conf->poolinfo)
1990 2 : goto abort;
1991 2 : conf->poolinfo->raid_disks = mddev->raid_disks;
1992 4 : conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
1993 : r1bio_pool_free,
1994 : conf->poolinfo);
1995 6 : if (!conf->r1bio_pool)
1996 2 : goto abort;
1997 :
1998 2 : conf->poolinfo->mddev = mddev;
1999 :
2000 8 : spin_lock_init(&conf->device_lock);
2001 16 : list_for_each_entry(rdev, &mddev->disks, same_set) {
2002 6 : int disk_idx = rdev->raid_disk;
2003 10 : if (disk_idx >= mddev->raid_disks
2004 : || disk_idx < 0)
2005 2 : continue;
2006 2 : disk = conf->mirrors + disk_idx;
2007 :
2008 2 : disk->rdev = rdev;
2009 :
2010 2 : disk->head_position = 0;
2011 2 : }
2012 2 : conf->raid_disks = mddev->raid_disks;
2013 2 : conf->mddev = mddev;
2014 4 : INIT_LIST_HEAD(&conf->retry_list);
2015 :
2016 8 : spin_lock_init(&conf->resync_lock);
2017 2 : init_waitqueue_head(&conf->wait_barrier);
2018 :
2019 4 : bio_list_init(&conf->pending_bio_list);
2020 4 : bio_list_init(&conf->flushing_bio_list);
2021 :
2022 2 : conf->last_used = -1;
2023 12 : for (i = 0; i < conf->raid_disks; i++) {
2024 6 :
2025 4 : disk = conf->mirrors + i;
2026 :
2027 16 : if (!disk->rdev ||
2028 2 : !test_bit(In_sync, &disk->rdev->flags)) {
2029 4 : disk->head_position = 0;
2030 12 : if (disk->rdev)
2031 4 : conf->fullsync = 1;
2032 4 : } else if (conf->last_used < 0)
2033 : /*
2034 : * The first working device is used as a
2035 : * starting point for read balancing.
2036 : */
2037 2 : conf->last_used = i;
2038 : }
2039 :
2040 2 : err = -EIO;
2041 4 : if (conf->last_used < 0) {
2042 6 : printk(KERN_ERR "raid1: no operational mirrors for %s\n",
2043 : mdname(mddev));
2044 2 : goto abort;
2045 : }
2046 2 : err = -ENOMEM;
2047 4 : conf->thread = md_register_thread(raid1d, mddev, NULL);
2048 6 : if (!conf->thread) {
2049 6 : printk(KERN_ERR
2050 : "raid1: couldn't allocate thread for %s\n",
2051 : mdname(mddev));
2052 2 : goto abort;
2053 : }
2054 :
2055 2 : return conf;
2056 12 :
2057 : abort:
2058 24 : if (conf) {
2059 36 : if (conf->r1bio_pool)
2060 12 : mempool_destroy(conf->r1bio_pool);
2061 24 : kfree(conf->mirrors);
2062 24 : safe_put_page(conf->tmppage);
2063 4 : kfree(conf->poolinfo);
2064 2 : kfree(conf);
2065 : }
2066 30 : return ERR_PTR(err);
2067 : }
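
/*
 * Illustrative sketch only, not kernel code: the allocate-then-"goto abort"
 * cleanup pattern that setup_conf() uses above, reduced to a userspace
 * malloc/free example.  Every allocation is checked; on any failure a single
 * abort label releases whatever was already allocated (free(NULL) is a
 * no-op, so a partially initialised structure is safe to tear down).
 * The struct and sizes are hypothetical.
 */
#include <stdlib.h>

struct sketch_conf {
	void *mirrors;
	void *tmppage;
	void *poolinfo;
};

static struct sketch_conf *sketch_setup_conf(int raid_disks)
{
	struct sketch_conf *conf;

	conf = calloc(1, sizeof(*conf));
	if (!conf)
		goto abort;

	conf->mirrors = calloc(raid_disks, 64);	/* stand-in per-disk records */
	if (!conf->mirrors)
		goto abort;

	conf->tmppage = malloc(4096);		/* stand-in scratch page */
	if (!conf->tmppage)
		goto abort;

	conf->poolinfo = calloc(1, 32);		/* stand-in pool descriptor */
	if (!conf->poolinfo)
		goto abort;

	return conf;

abort:
	if (conf) {
		free(conf->poolinfo);
		free(conf->tmppage);
		free(conf->mirrors);
		free(conf);
	}
	return NULL;
}

int main(void)
{
	struct sketch_conf *conf = sketch_setup_conf(2);

	if (!conf)
		return 1;
	free(conf->poolinfo);
	free(conf->tmppage);
	free(conf->mirrors);
	free(conf);
	return 0;
}
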
2068 :
2069 : static int run(mddev_t *mddev)
2070 : {
2071 1 : conf_t *conf;
2072 1 : int i;
2073 1 : mdk_rdev_t *rdev;
2074 1 :
2075 3 : if (mddev->level != 1) {
2076 4 : printk("raid1: %s: raid level not set to mirroring (%d)\n",
2077 1 : mdname(mddev), mddev->level);
2078 2 : return -EIO;
2079 1 : }
2080 3 : if (mddev->reshape_position != MaxSector) {
2081 4 : printk("raid1: %s: reshape_position set but not supported\n",
2082 1 : mdname(mddev));
2083 2 : return -EIO;
2084 1 : }
2085 1 : /*
2086 : * copy the already verified devices into our private RAID1
2087 : * bookkeeping area. [Whatever we allocate in run()
2088 : * should be freed in stop().]
2089 : */
2090 3 : if (mddev->private == NULL)
2091 3 : conf = setup_conf(mddev);
2092 : else
2093 2 : conf = mddev->private;
2094 :
2095 6 : if (IS_ERR(conf))
2096 3 : return PTR_ERR(conf);
2097 :
2098 1 : mddev->queue->queue_lock = &conf->device_lock;
2099 10 : list_for_each_entry(rdev, &mddev->disks, same_set) {
2100 4 : disk_stack_limits(mddev->gendisk, rdev->bdev,
2101 1 : rdev->data_offset << 9);
2102 : /* as we don't honour merge_bvec_fn, we must never risk
2103 : * violating it, so limit ->max_sector to one PAGE, as
2104 : * a one page request is never in violation.
2105 : */
2106 7 : if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
2107 : queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
2108 1 : blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
2109 : }
2110 :
2111 1 : mddev->degraded = 0;
2112 7 : for (i=0; i < conf->raid_disks; i++)
2113 15 : if (conf->mirrors[i].rdev == NULL ||
2114 1 : !test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
2115 : test_bit(Faulty, &conf->mirrors[i].rdev->flags))
2116 3 : mddev->degraded++;
2117 :
2118 2 : if (conf->raid_disks - mddev->degraded == 1)
2119 1 : mddev->recovery_cp = MaxSector;
2120 :
2121 2 : if (mddev->recovery_cp != MaxSector)
2122 3 : printk(KERN_NOTICE "raid1: %s is not clean"
2123 : " -- starting background reconstruction\n",
2124 : mdname(mddev));
2125 5 : printk(KERN_INFO
2126 : "raid1: raid set %s active with %d out of %d mirrors\n",
2127 : mdname(mddev), mddev->raid_disks - mddev->degraded,
2128 : mddev->raid_disks);
2129 :
2130 : /*
2131 : * Ok, everything is just fine now
2132 : */
2133 1 : mddev->thread = conf->thread;
2134 1 : conf->thread = NULL;
2135 1 : mddev->private = conf;
2136 :
2137 3 : md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
2138 :
2139 1 : mddev->queue->unplug_fn = raid1_unplug;
2140 1 : mddev->queue->backing_dev_info.congested_fn = raid1_congested;
2141 1 : mddev->queue->backing_dev_info.congested_data = mddev;
2142 1 : md_integrity_register(mddev);
2143 1 : return 0;
2144 : }
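
/*
 * Illustrative sketch only, not kernel code: how run() above arrives at
 * mddev->degraded - count every slot whose device is missing, not in sync,
 * or marked faulty - plus the special case that a single surviving mirror
 * needs no resync.  The struct and field names are hypothetical.
 */
#include <stdio.h>

struct sketch_disk {
	int present;
	int in_sync;
	int faulty;
};

static int count_degraded(const struct sketch_disk *d, int raid_disks)
{
	int i, degraded = 0;

	for (i = 0; i < raid_disks; i++)
		if (!d[i].present || !d[i].in_sync || d[i].faulty)
			degraded++;
	return degraded;
}

int main(void)
{
	struct sketch_disk disks[2] = {
		{ .present = 1, .in_sync = 1, .faulty = 0 },
		{ .present = 1, .in_sync = 0, .faulty = 0 },	/* spare being rebuilt */
	};
	int raid_disks = 2;
	int degraded = count_degraded(disks, raid_disks);

	printf("%d out of %d mirrors active\n", raid_disks - degraded, raid_disks);
	if (raid_disks - degraded == 1)
		printf("only one working mirror: nothing to resync against\n");
	return 0;
}
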
2145 :
2146 : static int stop(mddev_t *mddev)
2147 : {
2148 3 : conf_t *conf = mddev->private;
2149 2 : struct bitmap *bitmap = mddev->bitmap;
2150 2 : int behind_wait = 0;
2151 1 :
2152 1 : /* wait for behind writes to complete */
2153 8 : while (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
2154 3 : behind_wait++;
2155 5 : printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait);
2156 11 : set_current_state(TASK_UNINTERRUPTIBLE);
2157 3 : schedule_timeout(HZ); /* wait a second */
2158 1 : /* need to kick something here to make sure I/O goes? */
2159 1 : }
2160 :
2161 4 : raise_barrier(conf);
2162 2 : lower_barrier(conf);
2163 1 :
2164 1 : md_unregister_thread(mddev->thread);
2165 1 : mddev->thread = NULL;
2166 1 : blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
2167 3 : if (conf->r1bio_pool)
2168 1 : mempool_destroy(conf->r1bio_pool);
2169 2 : kfree(conf->mirrors);
2170 2 : kfree(conf->poolinfo);
2171 1 : kfree(conf);
2172 1 : mddev->private = NULL;
2173 1 : return 0;
2174 : }
2175 :
2176 : static int raid1_resize(mddev_t *mddev, sector_t sectors)
2177 : {
2178 1 : /* no resync is happening, and there is enough space
2179 1 : * on all devices, so we can resize.
2180 : * We need to make sure resync covers any new space.
2181 : * If the array is shrinking we should possibly wait until
2182 : * any io in the removed space completes, but it hardly seems
2183 : * worth it.
2184 : */
2185 3 : md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0));
2186 4 : if (mddev->array_sectors > raid1_size(mddev, sectors, 0))
2187 1 : return -EINVAL;
2188 2 : set_capacity(mddev->gendisk, mddev->array_sectors);
2189 1 : mddev->changed = 1;
2190 1 : revalidate_disk(mddev->gendisk);
2191 4 : if (sectors > mddev->dev_sectors &&
2192 : mddev->recovery_cp == MaxSector) {
2193 1 : mddev->recovery_cp = mddev->dev_sectors;
2194 2 : set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2195 : }
2196 2 : mddev->dev_sectors = sectors;
2197 2 : mddev->resync_max_sectors = sectors;
2198 2 : return 0;
2199 : }
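
/*
 * Illustrative sketch only, not kernel code: the decision raid1_resize()
 * makes above when an array grows - the resync checkpoint is pulled back to
 * the old end of the device so that only the newly exposed sectors get
 * reconstructed.  SKETCH_MAX_SECTOR stands in for MaxSector ("fully clean").
 */
#include <stdio.h>

typedef unsigned long long sketch_sector_t;

#define SKETCH_MAX_SECTOR (~0ULL)

int main(void)
{
	sketch_sector_t dev_sectors = 1000000;			/* old per-device size */
	sketch_sector_t new_sectors = 1500000;			/* grown size          */
	sketch_sector_t recovery_cp = SKETCH_MAX_SECTOR;	/* array was clean     */

	if (new_sectors > dev_sectors && recovery_cp == SKETCH_MAX_SECTOR) {
		/* resync only the added tail, not the whole array */
		recovery_cp = dev_sectors;
	}
	dev_sectors = new_sectors;

	printf("resync will start at sector %llu of %llu\n",
	       recovery_cp, dev_sectors);
	return 0;
}
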
2200 :
2201 : static int raid1_reshape(mddev_t *mddev)
2202 : {
2203 1 : /* We need to:
2204 1 : * 1/ resize the r1bio_pool
2205 1 : * 2/ resize conf->mirrors
2206 1 : *
2207 1 : * We allocate a new r1bio_pool if we can.
2208 1 : * Then raise a device barrier and wait until all IO stops.
2209 1 : * Then resize conf->mirrors and swap in the new r1bio pool.
2210 1 : *
2211 1 : * At the same time, we "pack" the devices so that all the missing
2212 1 : * devices have the higher raid_disk numbers.
2213 1 : */
2214 1 : mempool_t *newpool, *oldpool;
2215 1 : struct pool_info *newpoolinfo;
2216 1 : mirror_info_t *newmirrors;
2217 3 : conf_t *conf = mddev->private;
2218 1 : int cnt, raid_disks;
2219 1 : unsigned long flags;
2220 1 : int d, d2, err;
2221 1 :
2222 1 : /* Cannot change chunk_size, layout, or level */
2223 6 : if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
2224 : mddev->layout != mddev->new_layout ||
2225 : mddev->level != mddev->new_level) {
2226 1 : mddev->new_chunk_sectors = mddev->chunk_sectors;
2227 1 : mddev->new_layout = mddev->layout;
2228 1 : mddev->new_level = mddev->level;
2229 1 : return -EINVAL;
2230 : }
2231 :
2232 1 : err = md_allow_write(mddev);
2233 2 : if (err)
2234 1 : return err;
2235 :
2236 1 : raid_disks = mddev->raid_disks + mddev->delta_disks;
2237 :
2238 2 : if (raid_disks < conf->raid_disks) {
2239 1 : cnt=0;
2240 5 : for (d= 0; d < conf->raid_disks; d++)
2241 5 : if (conf->mirrors[d].rdev)
2242 2 : cnt++;
2243 2 : if (cnt > raid_disks)
2244 1 : return -EBUSY;
2245 : }
2246 :
2247 5 : newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL);
2248 2 : if (!newpoolinfo)
2249 1 : return -ENOMEM;
2250 1 : newpoolinfo->mddev = mddev;
2251 1 : newpoolinfo->raid_disks = raid_disks;
2252 :
2253 1 : newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
2254 : r1bio_pool_free, newpoolinfo);
2255 2 : if (!newpool) {
2256 1 : kfree(newpoolinfo);
2257 1 : return -ENOMEM;
2258 : }
2259 3 : newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL);
2260 2 : if (!newmirrors) {
2261 1 : kfree(newpoolinfo);
2262 1 : mempool_destroy(newpool);
2263 1 : return -ENOMEM;
2264 : }
2265 :
2266 2 : raise_barrier(conf);
2267 :
2268 : /* ok, everything is stopped */
2269 1 : oldpool = conf->r1bio_pool;
2270 1 : conf->r1bio_pool = newpool;
2271 :
2272 7 : for (d = d2 = 0; d < conf->raid_disks; d++) {
2273 4 : mdk_rdev_t *rdev = conf->mirrors[d].rdev;
2274 5 : if (rdev && rdev->raid_disk != d2) {
2275 : char nm[20];
2276 1 : sprintf(nm, "rd%d", rdev->raid_disk);
2277 1 : sysfs_remove_link(&mddev->kobj, nm);
2278 1 : rdev->raid_disk = d2;
2279 1 : sprintf(nm, "rd%d", rdev->raid_disk);
2280 1 : sysfs_remove_link(&mddev->kobj, nm);
2281 3 : if (sysfs_create_link(&mddev->kobj,
2282 : &rdev->kobj, nm))
2283 3 : printk(KERN_WARNING
2284 : "md/raid1: cannot register "
2285 : "%s for %s\n",
2286 : nm, mdname(mddev));
2287 : }
2288 4 : if (rdev)
2289 6 : newmirrors[d2++].rdev = rdev;
2290 : }
2291 2 : kfree(conf->mirrors);
2292 1 : conf->mirrors = newmirrors;
2293 2 : kfree(conf->poolinfo);
2294 1 : conf->poolinfo = newpoolinfo;
2295 :
2296 3 : spin_lock_irqsave(&conf->device_lock, flags);
2297 1 : mddev->degraded += (raid_disks - conf->raid_disks);
2298 2 : spin_unlock_irqrestore(&conf->device_lock, flags);
2299 3 : conf->raid_disks = mddev->raid_disks = raid_disks;
2300 1 : mddev->delta_disks = 0;
2301 :
2302 1 : conf->last_used = 0; /* just make sure it is in-range */
2303 2 : lower_barrier(conf);
2304 :
2305 2 : set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2306 1 : md_wakeup_thread(mddev->thread);
2307 :
2308 1 : mempool_destroy(oldpool);
2309 1 : return 0;
2310 : }
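
/*
 * Illustrative sketch only, not kernel code: the "packing" step that
 * raid1_reshape() performs above, reduced to compacting an array.  Every
 * present device is moved to the lowest free slot (d2), so all the gaps
 * end up at the highest raid_disk numbers.  Here -1 marks a missing device.
 */
#include <stdio.h>

int main(void)
{
	int mirrors[5] = { 7, -1, 3, -1, 9 };	/* -1 == empty slot */
	int packed[5]  = { -1, -1, -1, -1, -1 };
	int d, d2 = 0;

	for (d = 0; d < 5; d++)
		if (mirrors[d] != -1)
			packed[d2++] = mirrors[d];	/* device moves to slot d2 */

	for (d = 0; d < 5; d++)
		printf("slot %d: %d\n", d, packed[d]);
	return 0;
}
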
2311 :
2312 : static void raid1_quiesce(mddev_t *mddev, int state)
2313 : {
2314 3 : conf_t *conf = mddev->private;
2315 :
2316 1 : switch(state) {
2317 4 : case 2: /* wake for suspend */
2318 1 : wake_up(&conf->wait_barrier);
2319 1 : break;
2320 4 : case 1:
2321 2 : raise_barrier(conf);
2322 1 : break;
2323 4 : case 0:
2324 2 : lower_barrier(conf);
2325 1 : break;
2326 1 : }
2327 : }
2328 4 :
2329 : static void *raid1_takeover(mddev_t *mddev)
2330 : {
2331 1 : /* raid1 can take over:
2332 1 : * raid5 with 2 devices, any layout or chunk size
2333 1 : */
2334 4 : if (mddev->level == 5 && mddev->raid_disks == 2) {
2335 : conf_t *conf;
2336 1 : mddev->new_level = 1;
2337 1 : mddev->new_layout = 0;
2338 1 : mddev->new_chunk_sectors = 0;
2339 3 : conf = setup_conf(mddev);
2340 4 : if (!IS_ERR(conf))
2341 1 : conf->barrier = 1;
2342 1 : return conf;
2343 : }
2344 3 : return ERR_PTR(-EINVAL);
2345 : }
2346 :
2347 1 : static struct mdk_personality raid1_personality =
2348 : {
2349 : .name = "raid1",
2350 : .level = 1,
2351 : .owner = THIS_MODULE,
2352 : .make_request = make_request,
2353 : .run = run,
2354 : .stop = stop,
2355 : .status = status,
2356 : .error_handler = error,
2357 : .hot_add_disk = raid1_add_disk,
2358 : .hot_remove_disk= raid1_remove_disk,
2359 : .spare_active = raid1_spare_active,
2360 : .sync_request = sync_request,
2361 : .resize = raid1_resize,
2362 : .size = raid1_size,
2363 : .check_reshape = raid1_reshape,
2364 : .quiesce = raid1_quiesce,
2365 : .takeover = raid1_takeover,
2366 : };
2367 :
2368 : static int __init raid_init(void)
2369 : {
2370 4 : return register_md_personality(&raid1_personality);
2371 : }
2372 :
2373 : static void raid_exit(void)
2374 : {
2375 2 : unregister_md_personality(&raid1_personality);
2376 1 : }
2377 :
2378 : module_init(raid_init);
2379 : module_exit(raid_exit);
2380 1 : MODULE_LICENSE("GPL");
2381 : MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD");
2382 : MODULE_ALIAS("md-personality-3"); /* RAID1 */
2383 : MODULE_ALIAS("md-raid1");
2384 : MODULE_ALIAS("md-level-1");
|