btrfs: fix dead lock while running replace and defrag concurrently

This can be reproduced by fstests: btrfs/070

The scenario is like the following:

replace worker thread		defrag thread
---------------------		-------------
copy_nocow_pages_worker		btrfs_defrag_file
  copy_nocow_pages_for_inode	    ...
				  btrfs_writepages
  |A| lock_extent_bits		    extent_write_cache_pages
				|B|   lock_page
					__extent_writepage
		...			  writepage_delalloc
					    find_lock_delalloc_range
				|B| 	      lock_extent_bits
  find_or_create_page
    pagecache_get_page
  |A| lock_page

This leads to an ABBA pattern deadlock. To fix it,
o we just change it to an AABB pattern which means to @unlock_extent_bits()
  before we @lock_page(), and in this way the @extent_read_full_page_nolock()
  is no longer in an locked context, so change it back to @extent_read_full_page()
  to regain protection.

o Since we @unlock_extent_bits() earlier, then before @write_page_nocow(),
  the extent may not really point at the physical block we want, so we
  have to check it before write.

Signed-off-by: Gui Hecheng <guihc.fnst@cn.fujitsu.com>
Tested-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
This commit is contained in:
Gui Hecheng 2014-11-10 15:36:08 +08:00 committed by Chris Mason
parent 5f5bc6b1e2
commit 321592427c

View file

@ -3310,6 +3310,50 @@ static void copy_nocow_pages_worker(struct btrfs_work *work)
scrub_pending_trans_workers_dec(sctx);
}
static int check_extent_to_block(struct inode *inode, u64 start, u64 len,
u64 logical)
{
struct extent_state *cached_state = NULL;
struct btrfs_ordered_extent *ordered;
struct extent_io_tree *io_tree;
struct extent_map *em;
u64 lockstart = start, lockend = start + len - 1;
int ret = 0;
io_tree = &BTRFS_I(inode)->io_tree;
lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
if (ordered) {
btrfs_put_ordered_extent(ordered);
ret = 1;
goto out_unlock;
}
em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out_unlock;
}
/*
* This extent does not actually cover the logical extent anymore,
* move on to the next inode.
*/
if (em->block_start > logical ||
em->block_start + em->block_len < logical + len) {
free_extent_map(em);
ret = 1;
goto out_unlock;
}
free_extent_map(em);
out_unlock:
unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
GFP_NOFS);
return ret;
}
static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
struct scrub_copy_nocow_ctx *nocow_ctx)
{
@ -3318,13 +3362,10 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
struct inode *inode;
struct page *page;
struct btrfs_root *local_root;
struct btrfs_ordered_extent *ordered;
struct extent_map *em;
struct extent_state *cached_state = NULL;
struct extent_io_tree *io_tree;
u64 physical_for_dev_replace;
u64 nocow_ctx_logical;
u64 len = nocow_ctx->len;
u64 lockstart = offset, lockend = offset + len - 1;
unsigned long index;
int srcu_index;
int ret = 0;
@ -3356,31 +3397,14 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
io_tree = &BTRFS_I(inode)->io_tree;
nocow_ctx_logical = nocow_ctx->logical;
lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
if (ordered) {
btrfs_put_ordered_extent(ordered);
goto out_unlock;
ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical);
if (ret) {
ret = ret > 0 ? 0 : ret;
goto out;
}
em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto out_unlock;
}
/*
* This extent does not actually cover the logical extent anymore,
* move on to the next inode.
*/
if (em->block_start > nocow_ctx->logical ||
em->block_start + em->block_len < nocow_ctx->logical + len) {
free_extent_map(em);
goto out_unlock;
}
free_extent_map(em);
while (len >= PAGE_CACHE_SIZE) {
index = offset >> PAGE_CACHE_SHIFT;
again:
@ -3396,7 +3420,7 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
goto next_page;
} else {
ClearPageError(page);
err = extent_read_full_page_nolock(io_tree, page,
err = extent_read_full_page(io_tree, page,
btrfs_get_extent,
nocow_ctx->mirror_num);
if (err) {
@ -3421,6 +3445,14 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
goto next_page;
}
}
ret = check_extent_to_block(inode, offset, len,
nocow_ctx_logical);
if (ret) {
ret = ret > 0 ? 0 : ret;
goto next_page;
}
err = write_page_nocow(nocow_ctx->sctx,
physical_for_dev_replace, page);
if (err)
@ -3434,12 +3466,10 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
offset += PAGE_CACHE_SIZE;
physical_for_dev_replace += PAGE_CACHE_SIZE;
nocow_ctx_logical += PAGE_CACHE_SIZE;
len -= PAGE_CACHE_SIZE;
}
ret = COPY_COMPLETE;
out_unlock:
unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
GFP_NOFS);
out:
mutex_unlock(&inode->i_mutex);
iput(inode);