Merge branch 'for-linus' of git://neil.brown.name/md
* 'for-linus' of git://neil.brown.name/md: (75 commits) md/raid10: handle further errors during fix_read_error better. md/raid10: Handle read errors during recovery better. md/raid10: simplify read error handling during recovery. md/raid10: record bad blocks due to write errors during resync/recovery. md/raid10: attempt to fix read errors during resync/check md/raid10: Handle write errors by updating badblock log. md/raid10: clear bad-block record when write succeeds. md/raid10: avoid writing to known bad blocks on known bad drives. md/raid10 record bad blocks as needed during recovery. md/raid10: avoid reading known bad blocks during resync/recovery. md/raid10 - avoid reading from known bad blocks - part 3 md/raid10: avoid reading from known bad blocks - part 2 md/raid10: avoid reading from known bad blocks - part 1 md/raid10: Split handle_read_error out from raid10d. md/raid10: simplify/reindent some loops. md/raid5: Clear bad blocks on successful write. md/raid5. Don't write to known bad block on doubtful devices. md/raid5: write errors should be recorded as bad blocks if possible. md/raid5: use bad-block log to improve handling of uncorrectable read errors. md/raid5: avoid reading from known bad blocks. ...
This commit is contained in:
commit
6140333d36
12 changed files with 3116 additions and 1402 deletions
|
@ -360,18 +360,20 @@ Each directory contains:
|
|||
A file recording the current state of the device in the array
|
||||
which can be a comma separated list of
|
||||
faulty - device has been kicked from active use due to
|
||||
a detected fault
|
||||
a detected fault or it has unacknowledged bad
|
||||
blocks
|
||||
in_sync - device is a fully in-sync member of the array
|
||||
writemostly - device will only be subject to read
|
||||
requests if there are no other options.
|
||||
This applies only to raid1 arrays.
|
||||
blocked - device has failed, metadata is "external",
|
||||
and the failure hasn't been acknowledged yet.
|
||||
blocked - device has failed, and the failure hasn't been
|
||||
acknowledged yet by the metadata handler.
|
||||
Writes that would write to this device if
|
||||
it were not faulty are blocked.
|
||||
spare - device is working, but not a full member.
|
||||
This includes spares that are in the process
|
||||
of being recovered to
|
||||
write_error - device has ever seen a write error.
|
||||
This list may grow in future.
|
||||
This can be written to.
|
||||
Writing "faulty" simulates a failure on the device.
|
||||
|
@ -379,9 +381,11 @@ Each directory contains:
|
|||
Writing "writemostly" sets the writemostly flag.
|
||||
Writing "-writemostly" clears the writemostly flag.
|
||||
Writing "blocked" sets the "blocked" flag.
|
||||
Writing "-blocked" clears the "blocked" flag and allows writes
|
||||
to complete.
|
||||
Writing "-blocked" clears the "blocked" flags and allows writes
|
||||
to complete and possibly simulates an error.
|
||||
Writing "in_sync" sets the in_sync flag.
|
||||
Writing "write_error" sets writeerrorseen flag.
|
||||
Writing "-write_error" clears writeerrorseen flag.
|
||||
|
||||
This file responds to select/poll. Any change to 'faulty'
|
||||
or 'blocked' causes an event.
|
||||
|
@ -419,7 +423,6 @@ Each directory contains:
|
|||
written, it will be rejected.
|
||||
|
||||
recovery_start
|
||||
|
||||
When the device is not 'in_sync', this records the number of
|
||||
sectors from the start of the device which are known to be
|
||||
correct. This is normally zero, but during a recovery
|
||||
|
@ -435,6 +438,20 @@ Each directory contains:
|
|||
Setting this to 'none' is equivalent to setting 'in_sync'.
|
||||
Setting to any other value also clears the 'in_sync' flag.
|
||||
|
||||
bad_blocks
|
||||
This gives the list of all known bad blocks in the form of
|
||||
start address and length (both in sectors). If output
|
||||
is too big to fit in a page, it will be truncated. Writing
|
||||
"sector length" to this file adds new acknowledged (i.e.
|
||||
recorded to disk safely) bad blocks.
|
||||
|
||||
unacknowledged_bad_blocks
|
||||
This gives the list of known-but-not-yet-saved-to-disk bad
|
||||
blocks in the same form as 'bad_blocks'. If output is too big
|
||||
to fit in a page, it will be truncated. Writing to this file
|
||||
adds bad blocks without acknowledging them. This is largely
|
||||
for testing.
|
||||
|
||||
|
||||
|
||||
An active md device will also contain an entry for each active device
|
||||
|
|
|
@ -29,7 +29,6 @@
|
|||
#include "md.h"
|
||||
#include "bitmap.h"
|
||||
|
||||
#include <linux/dm-dirty-log.h>
|
||||
/* debug macros */
|
||||
|
||||
#define DEBUG 0
|
||||
|
@ -775,10 +774,8 @@ static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned lon
|
|||
* 0 or page 1
|
||||
*/
|
||||
static inline struct page *filemap_get_page(struct bitmap *bitmap,
|
||||
unsigned long chunk)
|
||||
unsigned long chunk)
|
||||
{
|
||||
if (bitmap->filemap == NULL)
|
||||
return NULL;
|
||||
if (file_page_index(bitmap, chunk) >= bitmap->file_pages)
|
||||
return NULL;
|
||||
return bitmap->filemap[file_page_index(bitmap, chunk)
|
||||
|
@ -878,28 +875,19 @@ enum bitmap_page_attr {
|
|||
static inline void set_page_attr(struct bitmap *bitmap, struct page *page,
|
||||
enum bitmap_page_attr attr)
|
||||
{
|
||||
if (page)
|
||||
__set_bit((page->index<<2) + attr, bitmap->filemap_attr);
|
||||
else
|
||||
__set_bit(attr, &bitmap->logattrs);
|
||||
__set_bit((page->index<<2) + attr, bitmap->filemap_attr);
|
||||
}
|
||||
|
||||
static inline void clear_page_attr(struct bitmap *bitmap, struct page *page,
|
||||
enum bitmap_page_attr attr)
|
||||
{
|
||||
if (page)
|
||||
__clear_bit((page->index<<2) + attr, bitmap->filemap_attr);
|
||||
else
|
||||
__clear_bit(attr, &bitmap->logattrs);
|
||||
__clear_bit((page->index<<2) + attr, bitmap->filemap_attr);
|
||||
}
|
||||
|
||||
static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page,
|
||||
enum bitmap_page_attr attr)
|
||||
{
|
||||
if (page)
|
||||
return test_bit((page->index<<2) + attr, bitmap->filemap_attr);
|
||||
else
|
||||
return test_bit(attr, &bitmap->logattrs);
|
||||
return test_bit((page->index<<2) + attr, bitmap->filemap_attr);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -912,30 +900,26 @@ static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *p
|
|||
static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
|
||||
{
|
||||
unsigned long bit;
|
||||
struct page *page = NULL;
|
||||
struct page *page;
|
||||
void *kaddr;
|
||||
unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap);
|
||||
|
||||
if (!bitmap->filemap) {
|
||||
struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log;
|
||||
if (log)
|
||||
log->type->mark_region(log, chunk);
|
||||
} else {
|
||||
if (!bitmap->filemap)
|
||||
return;
|
||||
|
||||
page = filemap_get_page(bitmap, chunk);
|
||||
if (!page)
|
||||
return;
|
||||
bit = file_page_offset(bitmap, chunk);
|
||||
page = filemap_get_page(bitmap, chunk);
|
||||
if (!page)
|
||||
return;
|
||||
bit = file_page_offset(bitmap, chunk);
|
||||
|
||||
/* set the bit */
|
||||
kaddr = kmap_atomic(page, KM_USER0);
|
||||
if (bitmap->flags & BITMAP_HOSTENDIAN)
|
||||
set_bit(bit, kaddr);
|
||||
else
|
||||
__test_and_set_bit_le(bit, kaddr);
|
||||
kunmap_atomic(kaddr, KM_USER0);
|
||||
PRINTK("set file bit %lu page %lu\n", bit, page->index);
|
||||
}
|
||||
/* set the bit */
|
||||
kaddr = kmap_atomic(page, KM_USER0);
|
||||
if (bitmap->flags & BITMAP_HOSTENDIAN)
|
||||
set_bit(bit, kaddr);
|
||||
else
|
||||
__set_bit_le(bit, kaddr);
|
||||
kunmap_atomic(kaddr, KM_USER0);
|
||||
PRINTK("set file bit %lu page %lu\n", bit, page->index);
|
||||
/* record page number so it gets flushed to disk when unplug occurs */
|
||||
set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
|
||||
}
|
||||
|
@ -952,16 +936,6 @@ void bitmap_unplug(struct bitmap *bitmap)
|
|||
|
||||
if (!bitmap)
|
||||
return;
|
||||
if (!bitmap->filemap) {
|
||||
/* Must be using a dirty_log */
|
||||
struct dm_dirty_log *log = bitmap->mddev->bitmap_info.log;
|
||||
dirty = test_and_clear_bit(BITMAP_PAGE_DIRTY, &bitmap->logattrs);
|
||||
need_write = test_and_clear_bit(BITMAP_PAGE_NEEDWRITE, &bitmap->logattrs);
|
||||
if (dirty || need_write)
|
||||
if (log->type->flush(log))
|
||||
bitmap->flags |= BITMAP_WRITE_ERROR;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* look at each page to see if there are any set bits that need to be
|
||||
* flushed out to disk */
|
||||
|
@ -990,7 +964,6 @@ void bitmap_unplug(struct bitmap *bitmap)
|
|||
else
|
||||
md_super_wait(bitmap->mddev);
|
||||
}
|
||||
out:
|
||||
if (bitmap->flags & BITMAP_WRITE_ERROR)
|
||||
bitmap_file_kick(bitmap);
|
||||
}
|
||||
|
@ -1199,7 +1172,6 @@ void bitmap_daemon_work(mddev_t *mddev)
|
|||
struct page *page = NULL, *lastpage = NULL;
|
||||
sector_t blocks;
|
||||
void *paddr;
|
||||
struct dm_dirty_log *log = mddev->bitmap_info.log;
|
||||
|
||||
/* Use a mutex to guard daemon_work against
|
||||
* bitmap_destroy.
|
||||
|
@ -1224,12 +1196,11 @@ void bitmap_daemon_work(mddev_t *mddev)
|
|||
spin_lock_irqsave(&bitmap->lock, flags);
|
||||
for (j = 0; j < bitmap->chunks; j++) {
|
||||
bitmap_counter_t *bmc;
|
||||
if (!bitmap->filemap) {
|
||||
if (!log)
|
||||
/* error or shutdown */
|
||||
break;
|
||||
} else
|
||||
page = filemap_get_page(bitmap, j);
|
||||
if (!bitmap->filemap)
|
||||
/* error or shutdown */
|
||||
break;
|
||||
|
||||
page = filemap_get_page(bitmap, j);
|
||||
|
||||
if (page != lastpage) {
|
||||
/* skip this page unless it's marked as needing cleaning */
|
||||
|
@ -1298,17 +1269,16 @@ void bitmap_daemon_work(mddev_t *mddev)
|
|||
-1);
|
||||
|
||||
/* clear the bit */
|
||||
if (page) {
|
||||
paddr = kmap_atomic(page, KM_USER0);
|
||||
if (bitmap->flags & BITMAP_HOSTENDIAN)
|
||||
clear_bit(file_page_offset(bitmap, j),
|
||||
paddr);
|
||||
else
|
||||
__test_and_clear_bit_le(file_page_offset(bitmap, j),
|
||||
paddr);
|
||||
kunmap_atomic(paddr, KM_USER0);
|
||||
} else
|
||||
log->type->clear_region(log, j);
|
||||
paddr = kmap_atomic(page, KM_USER0);
|
||||
if (bitmap->flags & BITMAP_HOSTENDIAN)
|
||||
clear_bit(file_page_offset(bitmap, j),
|
||||
paddr);
|
||||
else
|
||||
__clear_bit_le(
|
||||
file_page_offset(bitmap,
|
||||
j),
|
||||
paddr);
|
||||
kunmap_atomic(paddr, KM_USER0);
|
||||
}
|
||||
} else
|
||||
j |= PAGE_COUNTER_MASK;
|
||||
|
@ -1316,16 +1286,12 @@ void bitmap_daemon_work(mddev_t *mddev)
|
|||
spin_unlock_irqrestore(&bitmap->lock, flags);
|
||||
|
||||
/* now sync the final page */
|
||||
if (lastpage != NULL || log != NULL) {
|
||||
if (lastpage != NULL) {
|
||||
spin_lock_irqsave(&bitmap->lock, flags);
|
||||
if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) {
|
||||
clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
|
||||
spin_unlock_irqrestore(&bitmap->lock, flags);
|
||||
if (lastpage)
|
||||
write_page(bitmap, lastpage, 0);
|
||||
else
|
||||
if (log->type->flush(log))
|
||||
bitmap->flags |= BITMAP_WRITE_ERROR;
|
||||
write_page(bitmap, lastpage, 0);
|
||||
} else {
|
||||
set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
|
||||
spin_unlock_irqrestore(&bitmap->lock, flags);
|
||||
|
@ -1767,12 +1733,10 @@ int bitmap_create(mddev_t *mddev)
|
|||
BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);
|
||||
|
||||
if (!file
|
||||
&& !mddev->bitmap_info.offset
|
||||
&& !mddev->bitmap_info.log) /* bitmap disabled, nothing to do */
|
||||
&& !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */
|
||||
return 0;
|
||||
|
||||
BUG_ON(file && mddev->bitmap_info.offset);
|
||||
BUG_ON(mddev->bitmap_info.offset && mddev->bitmap_info.log);
|
||||
|
||||
bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
|
||||
if (!bitmap)
|
||||
|
@ -1863,6 +1827,7 @@ int bitmap_create(mddev_t *mddev)
|
|||
int bitmap_load(mddev_t *mddev)
|
||||
{
|
||||
int err = 0;
|
||||
sector_t start = 0;
|
||||
sector_t sector = 0;
|
||||
struct bitmap *bitmap = mddev->bitmap;
|
||||
|
||||
|
@ -1881,24 +1846,14 @@ int bitmap_load(mddev_t *mddev)
|
|||
}
|
||||
bitmap_close_sync(bitmap);
|
||||
|
||||
if (mddev->bitmap_info.log) {
|
||||
unsigned long i;
|
||||
struct dm_dirty_log *log = mddev->bitmap_info.log;
|
||||
for (i = 0; i < bitmap->chunks; i++)
|
||||
if (!log->type->in_sync(log, i, 1))
|
||||
bitmap_set_memory_bits(bitmap,
|
||||
(sector_t)i << CHUNK_BLOCK_SHIFT(bitmap),
|
||||
1);
|
||||
} else {
|
||||
sector_t start = 0;
|
||||
if (mddev->degraded == 0
|
||||
|| bitmap->events_cleared == mddev->events)
|
||||
/* no need to keep dirty bits to optimise a
|
||||
* re-add of a missing device */
|
||||
start = mddev->recovery_cp;
|
||||
if (mddev->degraded == 0
|
||||
|| bitmap->events_cleared == mddev->events)
|
||||
/* no need to keep dirty bits to optimise a
|
||||
* re-add of a missing device */
|
||||
start = mddev->recovery_cp;
|
||||
|
||||
err = bitmap_init_from_disk(bitmap, start);
|
||||
|
||||
err = bitmap_init_from_disk(bitmap, start);
|
||||
}
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
|
|
|
@ -212,10 +212,6 @@ struct bitmap {
|
|||
unsigned long file_pages; /* number of pages in the file */
|
||||
int last_page_size; /* bytes in the last page */
|
||||
|
||||
unsigned long logattrs; /* used when filemap_attr doesn't exist
|
||||
* because we are working with a dirty_log
|
||||
*/
|
||||
|
||||
unsigned long flags;
|
||||
|
||||
int allclean;
|
||||
|
@ -237,7 +233,6 @@ struct bitmap {
|
|||
wait_queue_head_t behind_wait;
|
||||
|
||||
struct sysfs_dirent *sysfs_can_clear;
|
||||
|
||||
};
|
||||
|
||||
/* the bitmap API */
|
||||
|
|
871
drivers/md/md.c
871
drivers/md/md.c
File diff suppressed because it is too large
Load diff
110
drivers/md/md.h
110
drivers/md/md.h
|
@ -29,6 +29,13 @@
|
|||
typedef struct mddev_s mddev_t;
|
||||
typedef struct mdk_rdev_s mdk_rdev_t;
|
||||
|
||||
/* Bad block numbers are stored sorted in a single page.
|
||||
* 64bits is used for each block or extent.
|
||||
* 54 bits are sector number, 9 bits are extent size,
|
||||
* 1 bit is an 'acknowledged' flag.
|
||||
*/
|
||||
#define MD_MAX_BADBLOCKS (PAGE_SIZE/8)
|
||||
|
||||
/*
|
||||
* MD's 'extended' device
|
||||
*/
|
||||
|
@ -48,7 +55,7 @@ struct mdk_rdev_s
|
|||
struct block_device *meta_bdev;
|
||||
struct block_device *bdev; /* block device handle */
|
||||
|
||||
struct page *sb_page;
|
||||
struct page *sb_page, *bb_page;
|
||||
int sb_loaded;
|
||||
__u64 sb_events;
|
||||
sector_t data_offset; /* start of data in array */
|
||||
|
@ -74,9 +81,29 @@ struct mdk_rdev_s
|
|||
#define In_sync 2 /* device is in_sync with rest of array */
|
||||
#define WriteMostly 4 /* Avoid reading if at all possible */
|
||||
#define AutoDetected 7 /* added by auto-detect */
|
||||
#define Blocked 8 /* An error occurred on an externally
|
||||
* managed array, don't allow writes
|
||||
#define Blocked 8 /* An error occurred but has not yet
|
||||
* been acknowledged by the metadata
|
||||
* handler, so don't allow writes
|
||||
* until it is cleared */
|
||||
#define WriteErrorSeen 9 /* A write error has been seen on this
|
||||
* device
|
||||
*/
|
||||
#define FaultRecorded 10 /* Intermediate state for clearing
|
||||
* Blocked. The Fault is/will-be
|
||||
* recorded in the metadata, but that
|
||||
* metadata hasn't been stored safely
|
||||
* on disk yet.
|
||||
*/
|
||||
#define BlockedBadBlocks 11 /* A writer is blocked because they
|
||||
* found an unacknowledged bad-block.
|
||||
* This can safely be cleared at any
|
||||
* time, and the writer will re-check.
|
||||
* It may be set at any time, and at
|
||||
* worst the writer will timeout and
|
||||
* re-check. So setting it as
|
||||
* accurately as possible is good, but
|
||||
* not absolutely critical.
|
||||
*/
|
||||
wait_queue_head_t blocked_wait;
|
||||
|
||||
int desc_nr; /* descriptor index in the superblock */
|
||||
|
@ -111,8 +138,54 @@ struct mdk_rdev_s
|
|||
|
||||
struct sysfs_dirent *sysfs_state; /* handle for 'state'
|
||||
* sysfs entry */
|
||||
|
||||
struct badblocks {
|
||||
int count; /* count of bad blocks */
|
||||
int unacked_exist; /* there probably are unacknowledged
|
||||
* bad blocks. This is only cleared
|
||||
* when a read discovers none
|
||||
*/
|
||||
int shift; /* shift from sectors to block size
|
||||
* a -ve shift means badblocks are
|
||||
* disabled.*/
|
||||
u64 *page; /* badblock list */
|
||||
int changed;
|
||||
seqlock_t lock;
|
||||
|
||||
sector_t sector;
|
||||
sector_t size; /* in sectors */
|
||||
} badblocks;
|
||||
};
|
||||
|
||||
#define BB_LEN_MASK (0x00000000000001FFULL)
|
||||
#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
|
||||
#define BB_ACK_MASK (0x8000000000000000ULL)
|
||||
#define BB_MAX_LEN 512
|
||||
#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9)
|
||||
#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1)
|
||||
#define BB_ACK(x) (!!((x) & BB_ACK_MASK))
|
||||
#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
|
||||
|
||||
extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
|
||||
sector_t *first_bad, int *bad_sectors);
|
||||
/*
 * is_badblock - query the bad-block list of @rdev for a range of sectors.
 *
 * @s is offset by rdev->data_offset before being handed to
 * md_is_badblock(), and on a non-zero result *first_bad is shifted back
 * by the same amount, so callers pass and receive sector numbers that do
 * not include data_offset.
 *
 * When rdev->badblocks.count is zero the lookup is skipped and 0 is
 * returned; otherwise the return value is whatever md_is_badblock()
 * returned (its exact meaning is defined by that function, which is not
 * visible here).
 */
static inline int is_badblock(mdk_rdev_t *rdev, sector_t s, int sectors,
|
||||
sector_t *first_bad, int *bad_sectors)
|
||||
{
|
||||
if (unlikely(rdev->badblocks.count)) {
|
||||
int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s,
|
||||
sectors,
|
||||
first_bad, bad_sectors);
|
||||
if (rv)
|
||||
*first_bad -= rdev->data_offset;
|
||||
return rv;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
extern int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors,
|
||||
int acknowledged);
|
||||
extern int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors);
|
||||
extern void md_ack_all_badblocks(struct badblocks *bb);
|
||||
|
||||
struct mddev_s
|
||||
{
|
||||
void *private;
|
||||
|
@ -239,9 +312,12 @@ struct mddev_s
|
|||
#define MD_RECOVERY_FROZEN 9
|
||||
|
||||
unsigned long recovery;
|
||||
int recovery_disabled; /* if we detect that recovery
|
||||
* will always fail, set this
|
||||
* so we don't loop trying */
|
||||
/* If a RAID personality determines that recovery (of a particular
|
||||
* device) will fail due to a read error on the source device, it
|
||||
* takes a copy of this number and does not attempt recovery again
|
||||
* until this number changes.
|
||||
*/
|
||||
int recovery_disabled;
|
||||
|
||||
int in_sync; /* know to not need resync */
|
||||
/* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
|
||||
|
@ -304,11 +380,6 @@ struct mddev_s
|
|||
* hot-adding a bitmap. It should
|
||||
* eventually be settable by sysfs.
|
||||
*/
|
||||
/* When md is serving under dm, it might use a
|
||||
* dirty_log to store the bits.
|
||||
*/
|
||||
struct dm_dirty_log *log;
|
||||
|
||||
struct mutex mutex;
|
||||
unsigned long chunksize;
|
||||
unsigned long daemon_sleep; /* how many jiffies between updates? */
|
||||
|
@ -413,6 +484,20 @@ static inline char * mdname (mddev_t * mddev)
|
|||
return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
|
||||
}
|
||||
|
||||
/*
 * sysfs_link_rdev - create the "rd<N>" sysfs link for a member device.
 *
 * Builds the link name from rdev->raid_disk ("rd%d") and creates a link
 * with that name under mddev's kobject pointing at rdev's kobject.
 * Returns the result of sysfs_create_link().
 */
static inline int sysfs_link_rdev(mddev_t *mddev, mdk_rdev_t *rdev)
|
||||
{
|
||||
char nm[20];
|
||||
sprintf(nm, "rd%d", rdev->raid_disk);
|
||||
return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
|
||||
}
|
||||
|
||||
/*
 * sysfs_unlink_rdev - remove the "rd<N>" sysfs link for a member device.
 *
 * Builds the same "rd%d" name from rdev->raid_disk that
 * sysfs_link_rdev() uses and removes the link with that name from
 * mddev's kobject.
 */
static inline void sysfs_unlink_rdev(mddev_t *mddev, mdk_rdev_t *rdev)
|
||||
{
|
||||
char nm[20];
|
||||
sprintf(nm, "rd%d", rdev->raid_disk);
|
||||
sysfs_remove_link(&mddev->kobj, nm);
|
||||
}
|
||||
|
||||
/*
|
||||
* iterates through some rdev ringlist. It's safe to remove the
|
||||
* current 'rdev'. Don't touch 'tmp' though.
|
||||
|
@ -505,7 +590,7 @@ extern void mddev_init(mddev_t *mddev);
|
|||
extern int md_run(mddev_t *mddev);
|
||||
extern void md_stop(mddev_t *mddev);
|
||||
extern void md_stop_writes(mddev_t *mddev);
|
||||
extern void md_rdev_init(mdk_rdev_t *rdev);
|
||||
extern int md_rdev_init(mdk_rdev_t *rdev);
|
||||
|
||||
extern void mddev_suspend(mddev_t *mddev);
|
||||
extern void mddev_resume(mddev_t *mddev);
|
||||
|
@ -514,4 +599,5 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
|
|||
extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
|
||||
mddev_t *mddev);
|
||||
extern int mddev_check_plugged(mddev_t *mddev);
|
||||
extern void md_trim_bio(struct bio *bio, int offset, int size);
|
||||
#endif /* _MD_MD_H */
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -48,6 +48,12 @@ struct r1_private_data_s {
|
|||
* (fresh device added).
|
||||
* Cleared when a sync completes.
|
||||
*/
|
||||
int recovery_disabled; /* when the same as
|
||||
* mddev->recovery_disabled
|
||||
* we don't allow recovery
|
||||
* to be attempted as we
|
||||
* expect a read error
|
||||
*/
|
||||
|
||||
wait_queue_head_t wait_barrier;
|
||||
|
||||
|
@ -95,7 +101,7 @@ struct r1bio_s {
|
|||
|
||||
struct list_head retry_list;
|
||||
/* Next two are only valid when R1BIO_BehindIO is set */
|
||||
struct page **behind_pages;
|
||||
struct bio_vec *behind_bvecs;
|
||||
int behind_page_count;
|
||||
/*
|
||||
* if the IO is in WRITE direction, then multiple bios are used.
|
||||
|
@ -110,13 +116,24 @@ struct r1bio_s {
|
|||
* correct the read error. To keep track of bad blocks on a per-bio
|
||||
* level, we store IO_BLOCKED in the appropriate 'bios' pointer
|
||||
*/
|
||||
#define IO_BLOCKED ((struct bio*)1)
|
||||
#define IO_BLOCKED ((struct bio *)1)
|
||||
/* When we successfully write to a known bad-block, we need to remove the
|
||||
* bad-block marking which must be done from process context. So we record
|
||||
* the success by setting bios[n] to IO_MADE_GOOD
|
||||
*/
|
||||
#define IO_MADE_GOOD ((struct bio *)2)
|
||||
|
||||
#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
|
||||
|
||||
/* bits for r1bio.state */
|
||||
#define R1BIO_Uptodate 0
|
||||
#define R1BIO_IsSync 1
|
||||
#define R1BIO_Degraded 2
|
||||
#define R1BIO_BehindIO 3
|
||||
/* Set ReadError on bios that experience a readerror so that
|
||||
* raid1d knows what to do with them.
|
||||
*/
|
||||
#define R1BIO_ReadError 4
|
||||
/* For write-behind requests, we call bi_end_io when
|
||||
* the last non-write-behind device completes, providing
|
||||
* any write was successful. Otherwise we call when
|
||||
|
@ -125,6 +142,11 @@ struct r1bio_s {
|
|||
* Record that bi_end_io was called with this flag...
|
||||
*/
|
||||
#define R1BIO_Returned 6
|
||||
/* If a write for this request means we can clear some
|
||||
* known-bad-block records, we set this flag
|
||||
*/
|
||||
#define R1BIO_MadeGood 7
|
||||
#define R1BIO_WriteError 8
|
||||
|
||||
extern int md_raid1_congested(mddev_t *mddev, int bits);
|
||||
|
||||
|
|
1191
drivers/md/raid10.c
1191
drivers/md/raid10.c
File diff suppressed because it is too large
Load diff
|
@ -6,6 +6,11 @@ typedef struct mirror_info mirror_info_t;
|
|||
struct mirror_info {
|
||||
mdk_rdev_t *rdev;
|
||||
sector_t head_position;
|
||||
int recovery_disabled; /* matches
|
||||
* mddev->recovery_disabled
|
||||
* when we shouldn't try
|
||||
* recovering this device.
|
||||
*/
|
||||
};
|
||||
|
||||
typedef struct r10bio_s r10bio_t;
|
||||
|
@ -113,10 +118,26 @@ struct r10bio_s {
|
|||
* level, we store IO_BLOCKED in the appropriate 'bios' pointer
|
||||
*/
|
||||
#define IO_BLOCKED ((struct bio*)1)
|
||||
/* When we successfully write to a known bad-block, we need to remove the
|
||||
* bad-block marking which must be done from process context. So we record
|
||||
* the success by setting devs[n].bio to IO_MADE_GOOD
|
||||
*/
|
||||
#define IO_MADE_GOOD ((struct bio *)2)
|
||||
|
||||
#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
|
||||
|
||||
/* bits for r10bio.state */
|
||||
#define R10BIO_Uptodate 0
|
||||
#define R10BIO_IsSync 1
|
||||
#define R10BIO_IsRecover 2
|
||||
#define R10BIO_Degraded 3
|
||||
/* Set ReadError on bios that experience a read error
|
||||
* so that raid10d knows what to do with them.
|
||||
*/
|
||||
#define R10BIO_ReadError 4
|
||||
/* If a write for this request means we can clear some
|
||||
* known-bad-block records, we set this flag.
|
||||
*/
|
||||
#define R10BIO_MadeGood 5
|
||||
#define R10BIO_WriteError 6
|
||||
#endif
|
||||
|
|
1059
drivers/md/raid5.c
1059
drivers/md/raid5.c
File diff suppressed because it is too large
Load diff
|
@ -6,11 +6,11 @@
|
|||
|
||||
/*
|
||||
*
|
||||
* Each stripe contains one buffer per disc. Each buffer can be in
|
||||
* Each stripe contains one buffer per device. Each buffer can be in
|
||||
* one of a number of states stored in "flags". Changes between
|
||||
* these states happen *almost* exclusively under a per-stripe
|
||||
* spinlock. Some very specific changes can happen in bi_end_io, and
|
||||
* these are not protected by the spin lock.
|
||||
* these states happen *almost* exclusively under the protection of the
|
||||
* STRIPE_ACTIVE flag. Some very specific changes can happen in bi_end_io, and
|
||||
* these are not protected by STRIPE_ACTIVE.
|
||||
*
|
||||
* The flag bits that are used to represent these states are:
|
||||
* R5_UPTODATE and R5_LOCKED
|
||||
|
@ -76,12 +76,10 @@
|
|||
* block and the cached buffer are successfully written, any buffer on
|
||||
* a written list can be returned with b_end_io.
|
||||
*
|
||||
* The write list and read list both act as fifos. The read list is
|
||||
* protected by the device_lock. The write and written lists are
|
||||
* protected by the stripe lock. The device_lock, which can be
|
||||
* claimed while the stipe lock is held, is only for list
|
||||
* manipulations and will only be held for a very short time. It can
|
||||
* be claimed from interrupts.
|
||||
* The write list and read list both act as fifos. The read list,
|
||||
* write list and written list are protected by the device_lock.
|
||||
* The device_lock is only for list manipulations and will only be
|
||||
* held for a very short time. It can be claimed from interrupts.
|
||||
*
|
||||
*
|
||||
* Stripes in the stripe cache can be on one of two lists (or on
|
||||
|
@ -96,7 +94,6 @@
|
|||
*
|
||||
* The inactive_list, handle_list and hash bucket lists are all protected by the
|
||||
* device_lock.
|
||||
* - stripes on the inactive_list never have their stripe_lock held.
|
||||
* - stripes have a reference counter. If count==0, they are on a list.
|
||||
* - If a stripe might need handling, STRIPE_HANDLE is set.
|
||||
* - When refcount reaches zero, then if STRIPE_HANDLE it is put on
|
||||
|
@ -116,10 +113,10 @@
|
|||
* attach a request to an active stripe (add_stripe_bh())
|
||||
* lockdev attach-buffer unlockdev
|
||||
* handle a stripe (handle_stripe())
|
||||
* lockstripe clrSTRIPE_HANDLE ...
|
||||
* setSTRIPE_ACTIVE, clrSTRIPE_HANDLE ...
|
||||
* (lockdev check-buffers unlockdev) ..
|
||||
* change-state ..
|
||||
* record io/ops needed unlockstripe schedule io/ops
|
||||
* record io/ops needed clearSTRIPE_ACTIVE schedule io/ops
|
||||
* release an active stripe (release_stripe())
|
||||
* lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
|
||||
*
|
||||
|
@ -128,8 +125,7 @@
|
|||
* on a cached buffer, and plus one if the stripe is undergoing stripe
|
||||
* operations.
|
||||
*
|
||||
* Stripe operations are performed outside the stripe lock,
|
||||
* the stripe operations are:
|
||||
* The stripe operations are:
|
||||
* -copying data between the stripe cache and user application buffers
|
||||
* -computing blocks to save a disk access, or to recover a missing block
|
||||
* -updating the parity on a write operation (reconstruct write and
|
||||
|
@ -159,7 +155,8 @@
|
|||
*/
|
||||
|
||||
/*
|
||||
* Operations state - intermediate states that are visible outside of sh->lock
|
||||
* Operations state - intermediate states that are visible outside of
|
||||
* STRIPE_ACTIVE.
|
||||
* In general _idle indicates nothing is running, _run indicates a data
|
||||
* processing operation is active, and _result means the data processing result
|
||||
* is stable and can be acted upon. For simple operations like biofill and
|
||||
|
@ -209,7 +206,6 @@ struct stripe_head {
|
|||
short ddf_layout;/* use DDF ordering to calculate Q */
|
||||
unsigned long state; /* state flags */
|
||||
atomic_t count; /* nr of active thread/requests */
|
||||
spinlock_t lock;
|
||||
int bm_seq; /* sequence number for bitmap flushes */
|
||||
int disks; /* disks in stripe */
|
||||
enum check_states check_state;
|
||||
|
@ -240,19 +236,20 @@ struct stripe_head {
|
|||
};
|
||||
|
||||
/* stripe_head_state - collects and tracks the dynamic state of a stripe_head
|
||||
* for handle_stripe. It is only valid under spin_lock(sh->lock);
|
||||
* for handle_stripe.
|
||||
*/
|
||||
struct stripe_head_state {
|
||||
int syncing, expanding, expanded;
|
||||
int locked, uptodate, to_read, to_write, failed, written;
|
||||
int to_fill, compute, req_compute, non_overwrite;
|
||||
int failed_num;
|
||||
int failed_num[2];
|
||||
int p_failed, q_failed;
|
||||
int dec_preread_active;
|
||||
unsigned long ops_request;
|
||||
};
|
||||
|
||||
/* r6_state - extra state data only relevant to r6 */
|
||||
struct r6_state {
|
||||
int p_failed, q_failed, failed_num[2];
|
||||
struct bio *return_bi;
|
||||
mdk_rdev_t *blocked_rdev;
|
||||
int handle_bad_blocks;
|
||||
};
|
||||
|
||||
/* Flags */
|
||||
|
@ -268,14 +265,16 @@ struct r6_state {
|
|||
#define R5_ReWrite 9 /* have tried to over-write the readerror */
|
||||
|
||||
#define R5_Expanded 10 /* This block now has post-expand data */
|
||||
#define R5_Wantcompute 11 /* compute_block in progress treat as
|
||||
* uptodate
|
||||
*/
|
||||
#define R5_Wantfill 12 /* dev->toread contains a bio that needs
|
||||
* filling
|
||||
*/
|
||||
#define R5_Wantdrain 13 /* dev->towrite needs to be drained */
|
||||
#define R5_WantFUA 14 /* Write should be FUA */
|
||||
#define R5_Wantcompute 11 /* compute_block in progress treat as
|
||||
* uptodate
|
||||
*/
|
||||
#define R5_Wantfill 12 /* dev->toread contains a bio that needs
|
||||
* filling
|
||||
*/
|
||||
#define R5_Wantdrain 13 /* dev->towrite needs to be drained */
|
||||
#define R5_WantFUA 14 /* Write should be FUA */
|
||||
#define R5_WriteError 15 /* got a write error - need to record it */
|
||||
#define R5_MadeGood 16 /* A bad block has been fixed by writing to it*/
|
||||
/*
|
||||
* Write method
|
||||
*/
|
||||
|
@ -289,21 +288,25 @@ struct r6_state {
|
|||
/*
|
||||
* Stripe state
|
||||
*/
|
||||
#define STRIPE_HANDLE 2
|
||||
#define STRIPE_SYNCING 3
|
||||
#define STRIPE_INSYNC 4
|
||||
#define STRIPE_PREREAD_ACTIVE 5
|
||||
#define STRIPE_DELAYED 6
|
||||
#define STRIPE_DEGRADED 7
|
||||
#define STRIPE_BIT_DELAY 8
|
||||
#define STRIPE_EXPANDING 9
|
||||
#define STRIPE_EXPAND_SOURCE 10
|
||||
#define STRIPE_EXPAND_READY 11
|
||||
#define STRIPE_IO_STARTED 12 /* do not count towards 'bypass_count' */
|
||||
#define STRIPE_FULL_WRITE 13 /* all blocks are set to be overwritten */
|
||||
#define STRIPE_BIOFILL_RUN 14
|
||||
#define STRIPE_COMPUTE_RUN 15
|
||||
#define STRIPE_OPS_REQ_PENDING 16
|
||||
enum {
|
||||
STRIPE_ACTIVE,
|
||||
STRIPE_HANDLE,
|
||||
STRIPE_SYNC_REQUESTED,
|
||||
STRIPE_SYNCING,
|
||||
STRIPE_INSYNC,
|
||||
STRIPE_PREREAD_ACTIVE,
|
||||
STRIPE_DELAYED,
|
||||
STRIPE_DEGRADED,
|
||||
STRIPE_BIT_DELAY,
|
||||
STRIPE_EXPANDING,
|
||||
STRIPE_EXPAND_SOURCE,
|
||||
STRIPE_EXPAND_READY,
|
||||
STRIPE_IO_STARTED, /* do not count towards 'bypass_count' */
|
||||
STRIPE_FULL_WRITE, /* all blocks are set to be overwritten */
|
||||
STRIPE_BIOFILL_RUN,
|
||||
STRIPE_COMPUTE_RUN,
|
||||
STRIPE_OPS_REQ_PENDING,
|
||||
};
|
||||
|
||||
/*
|
||||
* Operation request flags
|
||||
|
@ -336,7 +339,7 @@ struct r6_state {
|
|||
* PREREAD_ACTIVE.
|
||||
* In stripe_handle, if we find pre-reading is necessary, we do it if
|
||||
* PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue.
|
||||
* HANDLE gets cleared if stripe_handle leave nothing locked.
|
||||
* HANDLE gets cleared if stripe_handle leaves nothing locked.
|
||||
*/
|
||||
|
||||
|
||||
|
@ -399,7 +402,7 @@ struct raid5_private_data {
|
|||
* (fresh device added).
|
||||
* Cleared when a sync completes.
|
||||
*/
|
||||
|
||||
int recovery_disabled;
|
||||
/* per cpu variables */
|
||||
struct raid5_percpu {
|
||||
struct page *spare_page; /* Used when checking P/Q in raid6 */
|
||||
|
|
|
@ -245,10 +245,16 @@ struct mdp_superblock_1 {
|
|||
__u8 device_uuid[16]; /* user-space setable, ignored by kernel */
|
||||
__u8 devflags; /* per-device flags. Only one defined...*/
|
||||
#define WriteMostly1 1 /* mask for writemostly flag in above */
|
||||
__u8 pad2[64-57]; /* set to 0 when writing */
|
||||
/* Bad block log. If there are any bad blocks the feature flag is set.
|
||||
* If offset and size are non-zero, that space is reserved and available
|
||||
*/
|
||||
__u8 bblog_shift; /* shift from sectors to block size */
|
||||
__le16 bblog_size; /* number of sectors reserved for list */
|
||||
__le32 bblog_offset; /* sector offset from superblock to bblog,
|
||||
* signed - not unsigned */
|
||||
|
||||
/* array state information - 64 bytes */
|
||||
__le64 utime; /* 40 bits second, 24 btes microseconds */
|
||||
__le64 utime; /* 40 bits second, 24 bits microseconds */
|
||||
__le64 events; /* incremented when superblock updated */
|
||||
__le64 resync_offset; /* data before this offset (from data_offset) known to be in sync */
|
||||
__le32 sb_csum; /* checksum up to devs[max_dev] */
|
||||
|
@ -270,8 +276,8 @@ struct mdp_superblock_1 {
|
|||
* must be honoured
|
||||
*/
|
||||
#define MD_FEATURE_RESHAPE_ACTIVE 4
|
||||
#define MD_FEATURE_BAD_BLOCKS 8 /* badblock list is not empty */
|
||||
|
||||
#define MD_FEATURE_ALL (1|2|4)
|
||||
#define MD_FEATURE_ALL (1|2|4|8)
|
||||
|
||||
#endif
|
||||
|
||||
|
|
Loading…
Reference in a new issue