From ece5cff0da9e696c360fff592cb5f51b6419e4d6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 14 Dec 2009 12:49:56 +1100 Subject: [PATCH] md: Support write-intent bitmaps with externally managed metadata. In this case, the metadata needs to not be in the same sector as the bitmap. md will not read/write any bitmap metadata. Config must be done via sysfs and when a recovery makes the array non-degraded again, writing 'true' to 'bitmap/can_clear' will allow bits in the bitmap to be cleared again. Signed-off-by: NeilBrown --- Documentation/md.txt | 16 +++++ drivers/md/bitmap.c | 142 ++++++++++++++++++++++++++++++++++++------- drivers/md/bitmap.h | 11 +--- drivers/md/md.h | 1 + 4 files changed, 137 insertions(+), 33 deletions(-) diff --git a/Documentation/md.txt b/Documentation/md.txt index 18fad6876228..21d26fb5d02b 100644 --- a/Documentation/md.txt +++ b/Documentation/md.txt @@ -322,6 +322,22 @@ All md devices contain: 'backlog' sets a limit on the number of concurrent background writes. If there are more than this, new writes will by synchronous. + bitmap/metadata + This can be either 'internal' or 'external'. + 'internal' is the default and means the metadata for the bitmap + is stored in the first 256 bytes of the allocated space and is + managed by the md module. + 'external' means that bitmap metadata is managed externally to + the kernel (i.e. by some userspace program) + bitmap/can_clear + This is either 'true' or 'false'. If 'true', then bits in the + bitmap will be cleared when the corresponding blocks are thought + to be in-sync. If 'false', bits will never be cleared. + This is automatically set to 'false' if a write happens on a + degraded array, or if the array becomes degraded during a write. + When metadata is managed externally, it should be set to true + once the array becomes non-degraded, and this fact has been + recorded in the metadata. diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 62958491f329..de5c42df8d17 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -497,6 +497,8 @@ void bitmap_update_sb(struct bitmap *bitmap) if (!bitmap || !bitmap->mddev) /* no bitmap for this array */ return; + if (bitmap->mddev->bitmap_info.external) + return; spin_lock_irqsave(&bitmap->lock, flags); if (!bitmap->sb_page) { /* no superblock */ spin_unlock_irqrestore(&bitmap->lock, flags); @@ -676,16 +678,26 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, * general bitmap file operations */ +/* + * on-disk bitmap: + * + * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap + * file a page at a time. There's a superblock at the start of the file. + */ /* calculate the index of the page that contains this bit */ -static inline unsigned long file_page_index(unsigned long chunk) +static inline unsigned long file_page_index(struct bitmap *bitmap, unsigned long chunk) { - return CHUNK_BIT_OFFSET(chunk) >> PAGE_BIT_SHIFT; + if (!bitmap->mddev->bitmap_info.external) + chunk += sizeof(bitmap_super_t) << 3; + return chunk >> PAGE_BIT_SHIFT; } /* calculate the (bit) offset of this bit within a page */ -static inline unsigned long file_page_offset(unsigned long chunk) +static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned long chunk) { - return CHUNK_BIT_OFFSET(chunk) & (PAGE_BITS - 1); + if (!bitmap->mddev->bitmap_info.external) + chunk += sizeof(bitmap_super_t) << 3; + return chunk & (PAGE_BITS - 1); } /* @@ -698,8 +710,9 @@ static inline unsigned long file_page_offset(unsigned long chunk) static inline struct page *filemap_get_page(struct bitmap *bitmap, unsigned long chunk) { - if (file_page_index(chunk) >= bitmap->file_pages) return NULL; - return bitmap->filemap[file_page_index(chunk) - file_page_index(0)]; + if (file_page_index(bitmap, chunk) >= bitmap->file_pages) return NULL; + return bitmap->filemap[file_page_index(bitmap, chunk) + - file_page_index(bitmap, 0)]; } @@ -722,7 +735,7 @@ static void bitmap_file_unmap(struct bitmap *bitmap) spin_unlock_irqrestore(&bitmap->lock, flags); while (pages--) - if (map[pages]->index != 0) /* 0 is sb_page, release it below */ + if (map[pages] != sb_page) /* 0 is sb_page, release it below */ free_buffers(map[pages]); kfree(map); kfree(attr); @@ -833,7 +846,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) page = filemap_get_page(bitmap, chunk); if (!page) return; - bit = file_page_offset(chunk); + bit = file_page_offset(bitmap, chunk); /* set the bit */ kaddr = kmap_atomic(page, KM_USER0); @@ -931,14 +944,17 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) "recovery\n", bmname(bitmap)); bytes = (chunks + 7) / 8; + if (!bitmap->mddev->bitmap_info.external) + bytes += sizeof(bitmap_super_t); - num_pages = (bytes + sizeof(bitmap_super_t) + PAGE_SIZE - 1) / PAGE_SIZE; + + num_pages = (bytes + PAGE_SIZE - 1) / PAGE_SIZE; - if (file && i_size_read(file->f_mapping->host) < bytes + sizeof(bitmap_super_t)) { + if (file && i_size_read(file->f_mapping->host) < bytes) { printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", bmname(bitmap), (unsigned long) i_size_read(file->f_mapping->host), - bytes + sizeof(bitmap_super_t)); + bytes); goto err; } @@ -959,17 +975,16 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) for (i = 0; i < chunks; i++) { int b; - index = file_page_index(i); - bit = file_page_offset(i); + index = file_page_index(bitmap, i); + bit = file_page_offset(bitmap, i); if (index != oldindex) { /* this is a new page, read it in */ int count; /* unmap the old page, we're done with it */ if (index == num_pages-1) - count = bytes + sizeof(bitmap_super_t) - - index * PAGE_SIZE; + count = bytes - index * PAGE_SIZE; else count = PAGE_SIZE; - if (index == 0) { + if (index == 0 && bitmap->sb_page) { /* * if we're here then the superblock page * contains some bits (PAGE_SIZE != sizeof sb) @@ -1164,7 +1179,8 @@ void bitmap_daemon_work(mddev_t *mddev) /* We are possibly going to clear some bits, so make * sure that events_cleared is up-to-date. */ - if (bitmap->need_sync) { + if (bitmap->need_sync && + bitmap->mddev->bitmap_info.external == 0) { bitmap_super_t *sb; bitmap->need_sync = 0; sb = kmap_atomic(bitmap->sb_page, KM_USER0); @@ -1174,7 +1190,8 @@ void bitmap_daemon_work(mddev_t *mddev) write_page(bitmap, bitmap->sb_page, 1); } spin_lock_irqsave(&bitmap->lock, flags); - clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); + if (!bitmap->need_sync) + clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); } bmc = bitmap_get_counter(bitmap, (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap), @@ -1189,7 +1206,7 @@ void bitmap_daemon_work(mddev_t *mddev) if (*bmc == 2) { *bmc=1; /* maybe clear the bit next time */ set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); - } else if (*bmc == 1) { + } else if (*bmc == 1 && !bitmap->need_sync) { /* we can clear the bit */ *bmc = 0; bitmap_count_page(bitmap, @@ -1199,9 +1216,11 @@ void bitmap_daemon_work(mddev_t *mddev) /* clear the bit */ paddr = kmap_atomic(page, KM_USER0); if (bitmap->flags & BITMAP_HOSTENDIAN) - clear_bit(file_page_offset(j), paddr); + clear_bit(file_page_offset(bitmap, j), + paddr); else - ext2_clear_bit(file_page_offset(j), paddr); + ext2_clear_bit(file_page_offset(bitmap, j), + paddr); kunmap_atomic(paddr, KM_USER0); } } else @@ -1356,6 +1375,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto bitmap->events_cleared < bitmap->mddev->events) { bitmap->events_cleared = bitmap->mddev->events; bitmap->need_sync = 1; + sysfs_notify_dirent(bitmap->sysfs_can_clear); } if (!success && ! (*bmc & NEEDED_MASK)) @@ -1613,6 +1633,9 @@ void bitmap_destroy(mddev_t *mddev) if (mddev->thread) mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + if (bitmap->sysfs_can_clear) + sysfs_put(bitmap->sysfs_can_clear); + bitmap_free(bitmap); } @@ -1629,6 +1652,7 @@ int bitmap_create(mddev_t *mddev) struct file *file = mddev->bitmap_info.file; int err; sector_t start; + struct sysfs_dirent *bm; BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); @@ -1648,6 +1672,13 @@ int bitmap_create(mddev_t *mddev) bitmap->mddev = mddev; + bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap"); + if (bm) { + bitmap->sysfs_can_clear = sysfs_get_dirent(bm, "can_clear"); + sysfs_put(bm); + } else + bitmap->sysfs_can_clear = NULL; + bitmap->file = file; if (file) { get_file(file); @@ -1658,7 +1689,16 @@ int bitmap_create(mddev_t *mddev) vfs_fsync(file, file->f_dentry, 1); } /* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */ - err = bitmap_read_sb(bitmap); + if (!mddev->bitmap_info.external) + err = bitmap_read_sb(bitmap); + else { + err = 0; + if (mddev->bitmap_info.chunksize == 0 || + mddev->bitmap_info.daemon_sleep == 0) + /* chunksize and time_base need to be + * set first. */ + err = -EINVAL; + } if (err) goto error; @@ -1777,7 +1817,8 @@ location_store(mddev_t *mddev, const char *buf, size_t len) return rv; if (offset == 0) return -EINVAL; - if (mddev->major_version == 0 && + if (mddev->bitmap_info.external == 0 && + mddev->major_version == 0 && offset != mddev->bitmap_info.default_offset) return -EINVAL; mddev->bitmap_info.offset = offset; @@ -1906,11 +1947,66 @@ chunksize_store(mddev_t *mddev, const char *buf, size_t len) static struct md_sysfs_entry bitmap_chunksize = __ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store); +static ssize_t metadata_show(mddev_t *mddev, char *page) +{ + return sprintf(page, "%s\n", (mddev->bitmap_info.external + ? "external" : "internal")); +} + +static ssize_t metadata_store(mddev_t *mddev, const char *buf, size_t len) +{ + if (mddev->bitmap || + mddev->bitmap_info.file || + mddev->bitmap_info.offset) + return -EBUSY; + if (strncmp(buf, "external", 8) == 0) + mddev->bitmap_info.external = 1; + else if (strncmp(buf, "internal", 8) == 0) + mddev->bitmap_info.external = 0; + else + return -EINVAL; + return len; +} + +static struct md_sysfs_entry bitmap_metadata = +__ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store); + +static ssize_t can_clear_show(mddev_t *mddev, char *page) +{ + int len; + if (mddev->bitmap) + len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ? + "false" : "true")); + else + len = sprintf(page, "\n"); + return len; +} + +static ssize_t can_clear_store(mddev_t *mddev, const char *buf, size_t len) +{ + if (mddev->bitmap == NULL) + return -ENOENT; + if (strncmp(buf, "false", 5) == 0) + mddev->bitmap->need_sync = 1; + else if (strncmp(buf, "true", 4) == 0) { + if (mddev->degraded) + return -EBUSY; + mddev->bitmap->need_sync = 0; + } else + return -EINVAL; + return len; +} + +static struct md_sysfs_entry bitmap_can_clear = +__ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store); + static struct attribute *md_bitmap_attrs[] = { &bitmap_location.attr, &bitmap_timeout.attr, &bitmap_backlog.attr, &bitmap_chunksize.attr, + &bitmap_metadata.attr, + &bitmap_can_clear.attr, NULL }; struct attribute_group md_bitmap_group = { diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index 50ee4240f5db..cb821d76d1b4 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h @@ -118,16 +118,6 @@ typedef __u16 bitmap_counter_t; (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1) #define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1) -/* - * on-disk bitmap: - * - * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap - * file a page at a time. There's a superblock at the start of the file. - */ - -/* map chunks (bits) to file pages - offset by the size of the superblock */ -#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3)) - #endif /* @@ -250,6 +240,7 @@ struct bitmap { wait_queue_head_t write_wait; wait_queue_head_t overflow_wait; + struct sysfs_dirent *sysfs_can_clear; }; /* the bitmap API */ diff --git a/drivers/md/md.h b/drivers/md/md.h index fce02073f1a4..d9138885b87f 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -296,6 +296,7 @@ struct mddev_s unsigned long chunksize; unsigned long daemon_sleep; /* how many seconds between updates? */ unsigned long max_write_behind; /* write-behind mode */ + int external; } bitmap_info; struct list_head all_mddevs;