From db3540522e955c1ebb391f4f5324dff4f20ecd09 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 24 May 2011 11:46:31 -0700 Subject: [PATCH] ceph: fix cap flush race reentrancy In e9964c10 we change cap flushing to do a delicate dance because some inodes on the cap_dirty list could be in a migrating state (got EXPORT but not IMPORT) in which we couldn't actually flush and move from dirty->flushing, breaking the while (!empty) { process first } loop structure. It worked for a single sync thread, but was not reentrant and triggered infinite loops when multiple syncers came along. Instead, move inodes with dirty to a separate cap_dirty_migrating list when in the limbo export-but-no-import state, allowing us to go back to the simple loop structure (which was reentrant). This is cleaner and more robust. Audited the cap_dirty users and this looks fine: list_empty(&ci->i_dirty_item) is still a reliable indicator of whether we have dirty caps (which list we're on is irrelevant) and list_del_init() calls still do the right thing. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 58 ++++++++++++++++++++++---------------------- fs/ceph/mds_client.c | 1 + fs/ceph/mds_client.h | 1 + 3 files changed, 31 insertions(+), 29 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 591202bc9668..1f72b00447c4 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2635,6 +2635,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, struct ceph_mds_session *session, int *open_target_sessions) { + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; unsigned mseq = le32_to_cpu(ex->migrate_seq); @@ -2671,6 +2672,19 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, * export targets, so that we get the matching IMPORT */ *open_target_sessions = 1; + + /* + * we can't flush dirty caps that we've seen the + * EXPORT but no IMPORT for + */ + spin_lock(&mdsc->cap_dirty_lock); + if (!list_empty(&ci->i_dirty_item)) { + dout(" moving %p to cap_dirty_migrating\n", + inode); + list_move(&ci->i_dirty_item, + &mdsc->cap_dirty_migrating); + } + spin_unlock(&mdsc->cap_dirty_lock); } __ceph_remove_cap(cap); } @@ -2708,6 +2722,13 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, ci->i_cap_exporting_issued = 0; ci->i_cap_exporting_mseq = 0; ci->i_cap_exporting_mds = -1; + + spin_lock(&mdsc->cap_dirty_lock); + if (!list_empty(&ci->i_dirty_item)) { + dout(" moving %p back to cap_dirty\n", inode); + list_move(&ci->i_dirty_item, &mdsc->cap_dirty); + } + spin_unlock(&mdsc->cap_dirty_lock); } else { dout("handle_cap_import inode %p ci %p mds%d mseq %d\n", inode, ci, mds, mseq); @@ -2911,38 +2932,16 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) */ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) { - struct ceph_inode_info *ci, *nci = NULL; - struct inode *inode, *ninode = NULL; - struct list_head *p, *n; + struct ceph_inode_info *ci; + struct inode *inode; dout("flush_dirty_caps\n"); spin_lock(&mdsc->cap_dirty_lock); - list_for_each_safe(p, n, &mdsc->cap_dirty) { - if (nci) { - ci = nci; - inode = ninode; - ci->i_ceph_flags &= ~CEPH_I_NOFLUSH; - dout("flush_dirty_caps inode %p (was next inode)\n", - inode); - } else { - ci = list_entry(p, struct ceph_inode_info, - i_dirty_item); - inode = igrab(&ci->vfs_inode); - BUG_ON(!inode); - dout("flush_dirty_caps inode %p\n", inode); - } - if (n != &mdsc->cap_dirty) { - nci = list_entry(n, struct ceph_inode_info, - i_dirty_item); - ninode = igrab(&nci->vfs_inode); - BUG_ON(!ninode); - nci->i_ceph_flags |= CEPH_I_NOFLUSH; - dout("flush_dirty_caps next inode %p, noflush\n", - ninode); - } else { - nci = NULL; - ninode = NULL; - } + while (!list_empty(&mdsc->cap_dirty)) { + ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info, + i_dirty_item); + inode = igrab(&ci->vfs_inode); + dout("flush_dirty_caps %p\n", inode); spin_unlock(&mdsc->cap_dirty_lock); if (inode) { ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, @@ -2952,6 +2951,7 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) spin_lock(&mdsc->cap_dirty_lock); } spin_unlock(&mdsc->cap_dirty_lock); + dout("flush_dirty_caps done\n"); } /* diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index c12d2e9a0ec6..79743d146be6 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3004,6 +3004,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) spin_lock_init(&mdsc->snap_flush_lock); mdsc->cap_flush_seq = 0; INIT_LIST_HEAD(&mdsc->cap_dirty); + INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); mdsc->num_cap_flushing = 0; spin_lock_init(&mdsc->cap_dirty_lock); init_waitqueue_head(&mdsc->cap_flushing_wq); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 4e3a9cc0bba6..7d8a0d662d56 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -278,6 +278,7 @@ struct ceph_mds_client { u64 cap_flush_seq; struct list_head cap_dirty; /* inodes with dirty caps */ + struct list_head cap_dirty_migrating; /* ...that are migration... */ int num_cap_flushing; /* # caps we are flushing */ spinlock_t cap_dirty_lock; /* protects above items */ wait_queue_head_t cap_flushing_wq;