exofs: Write sbi->s_nextid as part of the Create command
Before, when creating a new inode, we'd set the sb->s_dirt flag, and sometime later the system would write out s_nextid as part of the sb_info. Also, on inode sync we would force the sb sync as well. Define s_nextid as a new partition attribute and set it every time we create a new object. At mount we read it from its new place. We now never set sb->s_dirt anywhere in exofs; write_super is actually never called. The call to exofs_write_super from exofs_put_super is also removed because the VFS always calls ->sync_fs before calling ->put_super twice. To stay backward-and-forward compatible we also write the old s_nextid in the super_block object at unmount, and support a zero-length attribute on mount. This also fixes a BUG in layouts where group_width was not a divisor of EXOFS_SUPER_ID (0x10000): the s_nextid was not read from the device it was written to, because of the sliding-window layout trick and because the read was always done from device 0 while the write was done via the raid engine, which might slide the device view. Now we both read and write through the raid engine. Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
This commit is contained in:
parent
9ed9648431
commit
1cea312ad4
5 changed files with 142 additions and 32 deletions
|
@ -53,10 +53,14 @@
|
|||
#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */
|
||||
|
||||
/* exofs Application specific page/attribute */
|
||||
/* Inode attrs */
|
||||
# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3)
|
||||
# define EXOFS_ATTR_INODE_DATA 1
|
||||
# define EXOFS_ATTR_INODE_FILE_LAYOUT 2
|
||||
# define EXOFS_ATTR_INODE_DIR_LAYOUT 3
|
||||
/* Partition attrs */
|
||||
# define EXOFS_APAGE_SB_DATA (0xF0000000U + 3)
|
||||
# define EXOFS_ATTR_SB_STATS 1
|
||||
|
||||
/*
|
||||
* The maximum number of files we can have is limited by the size of the
|
||||
|
@ -86,8 +90,8 @@ enum {
|
|||
*/
|
||||
enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1};
|
||||
struct exofs_fscb {
|
||||
__le64 s_nextid; /* Highest object ID used */
|
||||
__le64 s_numfiles; /* Number of files on fs */
|
||||
__le64 s_nextid; /* Only used after mkfs */
|
||||
__le64 s_numfiles; /* Only used after mkfs */
|
||||
__le32 s_version; /* == EXOFS_FSCB_VER */
|
||||
__le16 s_magic; /* Magic signature */
|
||||
__le16 s_newfs; /* Non-zero if this is a new fs */
|
||||
|
@ -97,6 +101,16 @@ struct exofs_fscb {
|
|||
__le64 s_dev_table_count; /* == 0 means no dev_table */
|
||||
} __packed;
|
||||
|
||||
/*
|
||||
* This struct is set on the FS partition's attributes.
|
||||
* [EXOFS_APAGE_SB_DATA, EXOFS_ATTR_SB_STATS] and is written together
|
||||
* with the create command, to atomically persist the sb writeable information.
|
||||
*/
|
||||
struct exofs_sb_stats {
|
||||
__le64 s_nextid; /* Highest object ID used */
|
||||
__le64 s_numfiles; /* Number of files on fs */
|
||||
} __packed;
|
||||
|
||||
/*
|
||||
* Describes the raid used in the FS. It is part of the device table.
|
||||
* This here is taken from the pNFS-objects definition. In exofs we
|
||||
|
|
|
@ -77,7 +77,7 @@ struct exofs_layout {
|
|||
* our extension to the in-memory superblock
|
||||
*/
|
||||
struct exofs_sb_info {
|
||||
struct exofs_fscb s_fscb; /* Written often, pre-allocate*/
|
||||
struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/
|
||||
int s_timeout; /* timeout for OSD operations */
|
||||
uint64_t s_nextid; /* highest object ID used */
|
||||
uint32_t s_numfiles; /* number of files on fs */
|
||||
|
@ -281,7 +281,7 @@ int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
|
|||
struct inode *);
|
||||
|
||||
/* super.c */
|
||||
int exofs_sync_fs(struct super_block *sb, int wait);
|
||||
int exofs_sbi_write_stats(struct exofs_sb_info *sbi);
|
||||
|
||||
/*********************
|
||||
* operation vectors *
|
||||
|
|
|
@ -45,17 +45,8 @@ static int exofs_release_file(struct inode *inode, struct file *filp)
|
|||
static int exofs_file_fsync(struct file *filp, int datasync)
|
||||
{
|
||||
int ret;
|
||||
struct inode *inode = filp->f_mapping->host;
|
||||
struct super_block *sb;
|
||||
|
||||
ret = sync_inode_metadata(inode, 1);
|
||||
|
||||
/* This is a good place to write the sb */
|
||||
/* TODO: Sechedule an sb-sync on create */
|
||||
sb = inode->i_sb;
|
||||
if (sb->s_dirt)
|
||||
exofs_sync_fs(sb, 1);
|
||||
|
||||
ret = sync_inode_metadata(filp->f_mapping->host, 1);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
|
|
@ -1102,6 +1102,7 @@ int __exofs_wait_obj_created(struct exofs_i_info *oi)
|
|||
}
|
||||
return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Callback function from exofs_new_inode(). The important thing is that we
|
||||
* set the obj_created flag so that other methods know that the object exists on
|
||||
|
@ -1160,7 +1161,6 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
|
|||
sbi = sb->s_fs_info;
|
||||
|
||||
inode->i_mapping->backing_dev_info = sb->s_bdi;
|
||||
sb->s_dirt = 1;
|
||||
inode_init_owner(inode, dir, mode);
|
||||
inode->i_ino = sbi->s_nextid++;
|
||||
inode->i_blkbits = EXOFS_BLKSHIFT;
|
||||
|
@ -1171,6 +1171,8 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
|
|||
spin_unlock(&sbi->s_next_gen_lock);
|
||||
insert_inode_hash(inode);
|
||||
|
||||
exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */
|
||||
|
||||
mark_inode_dirty(inode);
|
||||
|
||||
ret = exofs_get_io_state(&sbi->layout, &ios);
|
||||
|
|
137
fs/exofs/super.c
137
fs/exofs/super.c
|
@ -213,6 +213,101 @@ static void destroy_inodecache(void)
|
|||
static const struct super_operations exofs_sops;
|
||||
static const struct export_operations exofs_export_ops;
|
||||
|
||||
static const struct osd_attr g_attr_sb_stats = ATTR_DEF(
|
||||
EXOFS_APAGE_SB_DATA,
|
||||
EXOFS_ATTR_SB_STATS,
|
||||
sizeof(struct exofs_sb_stats));
|
||||
|
||||
static int __sbi_read_stats(struct exofs_sb_info *sbi)
|
||||
{
|
||||
struct osd_attr attrs[] = {
|
||||
[0] = g_attr_sb_stats,
|
||||
};
|
||||
struct exofs_io_state *ios;
|
||||
int ret;
|
||||
|
||||
ret = exofs_get_io_state(&sbi->layout, &ios);
|
||||
if (unlikely(ret)) {
|
||||
EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
|
||||
return ret;
|
||||
}
|
||||
|
||||
ios->cred = sbi->s_cred;
|
||||
|
||||
ios->in_attr = attrs;
|
||||
ios->in_attr_len = ARRAY_SIZE(attrs);
|
||||
|
||||
ret = exofs_sbi_read(ios);
|
||||
if (unlikely(ret)) {
|
||||
EXOFS_ERR("Error reading super_block stats => %d\n", ret);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = extract_attr_from_ios(ios, &attrs[0]);
|
||||
if (ret) {
|
||||
EXOFS_ERR("%s: extract_attr of sb_stats failed\n", __func__);
|
||||
goto out;
|
||||
}
|
||||
if (attrs[0].len) {
|
||||
struct exofs_sb_stats *ess;
|
||||
|
||||
if (unlikely(attrs[0].len != sizeof(*ess))) {
|
||||
EXOFS_ERR("%s: Wrong version of exofs_sb_stats "
|
||||
"size(%d) != expected(%zd)\n",
|
||||
__func__, attrs[0].len, sizeof(*ess));
|
||||
goto out;
|
||||
}
|
||||
|
||||
ess = attrs[0].val_ptr;
|
||||
sbi->s_nextid = le64_to_cpu(ess->s_nextid);
|
||||
sbi->s_numfiles = le32_to_cpu(ess->s_numfiles);
|
||||
}
|
||||
|
||||
out:
|
||||
exofs_put_io_state(ios);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Completion callback installed by exofs_sbi_write_stats(). Nothing needs
 * to be inspected once the write is done; just release the io state.
 * @p is unused.
 */
static void stats_done(struct exofs_io_state *ios, void *p)
{
	exofs_put_io_state(ios);
}
|
||||
|
||||
/* Asynchronously write the stats attribute */
|
||||
int exofs_sbi_write_stats(struct exofs_sb_info *sbi)
|
||||
{
|
||||
struct osd_attr attrs[] = {
|
||||
[0] = g_attr_sb_stats,
|
||||
};
|
||||
struct exofs_io_state *ios;
|
||||
int ret;
|
||||
|
||||
ret = exofs_get_io_state(&sbi->layout, &ios);
|
||||
if (unlikely(ret)) {
|
||||
EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
|
||||
return ret;
|
||||
}
|
||||
|
||||
sbi->s_ess.s_nextid = cpu_to_le64(sbi->s_nextid);
|
||||
sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles);
|
||||
attrs[0].val_ptr = &sbi->s_ess;
|
||||
|
||||
ios->cred = sbi->s_cred;
|
||||
ios->done = stats_done;
|
||||
ios->private = sbi;
|
||||
ios->out_attr = attrs;
|
||||
ios->out_attr_len = ARRAY_SIZE(attrs);
|
||||
|
||||
ret = exofs_sbi_write(ios);
|
||||
if (unlikely(ret)) {
|
||||
EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__);
|
||||
exofs_put_io_state(ios);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Write the superblock to the OSD
|
||||
*/
|
||||
|
@ -223,18 +318,25 @@ int exofs_sync_fs(struct super_block *sb, int wait)
|
|||
struct exofs_io_state *ios;
|
||||
int ret = -ENOMEM;
|
||||
|
||||
lock_super(sb);
|
||||
sbi = sb->s_fs_info;
|
||||
fscb = &sbi->s_fscb;
|
||||
fscb = kmalloc(sizeof(*fscb), GFP_KERNEL);
|
||||
if (unlikely(!fscb))
|
||||
return -ENOMEM;
|
||||
|
||||
sbi = sb->s_fs_info;
|
||||
|
||||
/* NOTE: We no longer dirty the super_block anywhere in exofs. The
|
||||
* reason we write the fscb here on unmount is so we can stay backwards
|
||||
* compatible with fscb->s_version == 1. (What we are not compatible
|
||||
* with is if a new version FS crashed and then we try to mount an old
|
||||
* version). Otherwise the exofs_fscb is read-only from mkfs time. All
|
||||
* the writeable info is set in exofs_sbi_write_stats() above.
|
||||
*/
|
||||
ret = exofs_get_io_state(&sbi->layout, &ios);
|
||||
if (ret)
|
||||
if (unlikely(ret))
|
||||
goto out;
|
||||
|
||||
/* Note: We only write the changing part of the fscb. .i.e upto the
|
||||
* the fscb->s_dev_table_oid member. There is no read-modify-write
|
||||
* here.
|
||||
*/
|
||||
lock_super(sb);
|
||||
|
||||
ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
|
||||
memset(fscb, 0, ios->length);
|
||||
fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
|
||||
|
@ -249,16 +351,17 @@ int exofs_sync_fs(struct super_block *sb, int wait)
|
|||
ios->cred = sbi->s_cred;
|
||||
|
||||
ret = exofs_sbi_write(ios);
|
||||
if (unlikely(ret)) {
|
||||
if (unlikely(ret))
|
||||
EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__);
|
||||
goto out;
|
||||
}
|
||||
sb->s_dirt = 0;
|
||||
else
|
||||
sb->s_dirt = 0;
|
||||
|
||||
|
||||
unlock_super(sb);
|
||||
out:
|
||||
EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
|
||||
exofs_put_io_state(ios);
|
||||
unlock_super(sb);
|
||||
kfree(fscb);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -302,9 +405,6 @@ static void exofs_put_super(struct super_block *sb)
|
|||
int num_pend;
|
||||
struct exofs_sb_info *sbi = sb->s_fs_info;
|
||||
|
||||
if (sb->s_dirt)
|
||||
exofs_write_super(sb);
|
||||
|
||||
/* make sure there are no pending commands */
|
||||
for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
|
||||
num_pend = atomic_read(&sbi->s_curr_pending)) {
|
||||
|
@ -629,6 +729,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
|
|||
goto free_sbi;
|
||||
|
||||
sb->s_magic = le16_to_cpu(fscb.s_magic);
|
||||
/* NOTE: we read below to be backward compatible with old versions */
|
||||
sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
|
||||
sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles);
|
||||
|
||||
|
@ -639,7 +740,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
|
|||
ret = -EINVAL;
|
||||
goto free_sbi;
|
||||
}
|
||||
if (le32_to_cpu(fscb.s_version) != EXOFS_FSCB_VER) {
|
||||
if (le32_to_cpu(fscb.s_version) > EXOFS_FSCB_VER) {
|
||||
EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n",
|
||||
EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version));
|
||||
ret = -EINVAL;
|
||||
|
@ -657,6 +758,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
|
|||
goto free_sbi;
|
||||
}
|
||||
|
||||
__sbi_read_stats(sbi);
|
||||
|
||||
/* set up operation vectors */
|
||||
sbi->bdi.ra_pages = __ra_pages(&sbi->layout);
|
||||
sb->s_bdi = &sbi->bdi;
|
||||
|
|
Loading…
Reference in a new issue