2007-06-12 07:07:21 -06:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2007 Oracle. All rights reserved.
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public
|
|
|
|
* License v2 as published by the Free Software Foundation.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
* General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public
|
|
|
|
* License along with this program; if not, write to the
|
|
|
|
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
|
|
|
* Boston, MA 02111-1307, USA.
|
|
|
|
*/
|
|
|
|
|
2008-04-25 14:53:30 -06:00
|
|
|
#include <linux/kernel.h>
|
2008-02-20 10:07:25 -07:00
|
|
|
#include <linux/bio.h>
|
2007-06-12 04:35:45 -06:00
|
|
|
#include <linux/buffer_head.h>
|
2008-05-02 12:43:14 -06:00
|
|
|
#include <linux/file.h>
|
2007-06-12 04:35:45 -06:00
|
|
|
#include <linux/fs.h>
|
|
|
|
#include <linux/pagemap.h>
|
|
|
|
#include <linux/highmem.h>
|
|
|
|
#include <linux/time.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/string.h>
|
|
|
|
#include <linux/backing-dev.h>
|
|
|
|
#include <linux/mpage.h>
|
|
|
|
#include <linux/swap.h>
|
|
|
|
#include <linux/writeback.h>
|
|
|
|
#include <linux/statfs.h>
|
|
|
|
#include <linux/compat.h>
|
2007-06-15 11:50:00 -06:00
|
|
|
#include <linux/bit_spinlock.h>
|
2007-11-16 09:45:54 -07:00
|
|
|
#include <linux/xattr.h>
|
2008-07-24 10:16:36 -06:00
|
|
|
#include <linux/posix_acl.h>
|
2008-10-30 12:25:28 -06:00
|
|
|
#include <linux/falloc.h>
|
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities to include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the following.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and tries to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
widely available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build tests were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 02:04:11 -06:00
|
|
|
#include <linux/slab.h>
|
2011-05-06 07:33:15 -06:00
|
|
|
#include <linux/ratelimit.h>
|
2008-11-20 08:22:27 -07:00
|
|
|
#include "compat.h"
|
2007-06-12 04:35:45 -06:00
|
|
|
#include "ctree.h"
|
|
|
|
#include "disk-io.h"
|
|
|
|
#include "transaction.h"
|
|
|
|
#include "btrfs_inode.h"
|
|
|
|
#include "ioctl.h"
|
|
|
|
#include "print-tree.h"
|
2008-03-24 13:01:56 -06:00
|
|
|
#include "volumes.h"
|
2008-07-17 10:53:50 -06:00
|
|
|
#include "ordered-data.h"
|
2008-08-28 04:21:17 -06:00
|
|
|
#include "xattr.h"
|
2008-09-05 14:13:11 -06:00
|
|
|
#include "tree-log.h"
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
#include "compression.h"
|
Btrfs: Change btree locking to use explicit blocking points
Most of the btrfs metadata operations can be protected by a spinlock,
but some operations still need to schedule.
So far, btrfs has been using a mutex along with a trylock loop,
most of the time it is able to avoid going for the full mutex, so
the trylock loop is a big performance gain.
This commit is step one for getting rid of the blocking locks entirely.
btrfs_tree_lock takes a spinlock, and the code explicitly switches
to a blocking lock when it starts an operation that can schedule.
We'll be able to get rid of the blocking locks in smaller pieces over time.
Tracing allows us to find the most common cause of blocking, so we
can start with the hot spots first.
The basic idea is:
btrfs_tree_lock() returns with the spin lock held
btrfs_set_lock_blocking() sets the EXTENT_BUFFER_BLOCKING bit in
the extent buffer flags, and then drops the spin lock. The buffer is
still considered locked by all of the btrfs code.
If btrfs_tree_lock gets the spinlock but finds the blocking bit set, it drops
the spin lock and waits on a wait queue for the blocking bit to go away.
Much of the code that needs to set the blocking bit finishes without actually
blocking a good percentage of the time. So, an adaptive spin is still
used against the blocking bit to avoid very high context switch rates.
btrfs_clear_lock_blocking() clears the blocking bit and returns
with the spinlock held again.
btrfs_tree_unlock() can be called on either blocking or spinning locks,
it does the right thing based on the blocking bit.
ctree.c has a helper function to set/clear all the locked buffers in a
path as blocking.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-02-04 07:25:08 -07:00
|
|
|
#include "locking.h"
|
2011-01-28 15:05:48 -07:00
|
|
|
#include "free-space-cache.h"
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep creating/deleting files on 32-bit machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-19 20:06:11 -06:00
|
|
|
#include "inode-map.h"
|
2007-06-12 04:35:45 -06:00
|
|
|
|
|
|
|
struct btrfs_iget_args {
|
|
|
|
u64 ino;
|
|
|
|
struct btrfs_root *root;
|
|
|
|
};
|
|
|
|
|
2009-09-21 18:01:11 -06:00
|
|
|
static const struct inode_operations btrfs_dir_inode_operations;
|
|
|
|
static const struct inode_operations btrfs_symlink_inode_operations;
|
|
|
|
static const struct inode_operations btrfs_dir_ro_inode_operations;
|
|
|
|
static const struct inode_operations btrfs_special_inode_operations;
|
|
|
|
static const struct inode_operations btrfs_file_inode_operations;
|
2009-09-21 18:01:10 -06:00
|
|
|
static const struct address_space_operations btrfs_aops;
|
|
|
|
static const struct address_space_operations btrfs_symlink_aops;
|
2009-10-01 16:43:56 -06:00
|
|
|
static const struct file_operations btrfs_dir_file_operations;
|
2008-01-24 14:13:08 -07:00
|
|
|
static struct extent_io_ops btrfs_extent_io_ops;
|
2007-06-12 04:35:45 -06:00
|
|
|
|
|
|
|
/*
 * Slab cache pointers.  Only btrfs_inode_cachep is static (private to
 * this file); the remaining pointers have external linkage.
 */
static struct kmem_cache *btrfs_inode_cachep;

struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;
|
2007-06-12 04:35:45 -06:00
|
|
|
|
|
|
|
#define S_SHIFT 12
|
|
|
|
static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
|
|
|
|
[S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE,
|
|
|
|
[S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR,
|
|
|
|
[S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV,
|
|
|
|
[S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV,
|
|
|
|
[S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO,
|
|
|
|
[S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK,
|
|
|
|
[S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
|
|
|
|
};
|
|
|
|
|
2011-01-31 13:30:16 -07:00
|
|
|
/* Prototypes for file-local helpers that are used before their definitions. */
static int btrfs_setsize(struct inode *inode, loff_t newsize);
static int btrfs_truncate(struct inode *inode);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
|
2008-11-06 20:02:51 -07:00
|
|
|
static noinline int cow_file_range(struct inode *inode,
|
|
|
|
struct page *locked_page,
|
|
|
|
u64 start, u64 end, int *page_started,
|
|
|
|
unsigned long *nr_written, int unlock);
|
2008-07-24 10:17:14 -06:00
|
|
|
|
2009-11-12 02:35:27 -07:00
|
|
|
/*
 * Run the two security initialisation steps for @inode in order:
 * btrfs_init_acl() first, then btrfs_xattr_security_init().
 *
 * The xattr step is only attempted when the ACL step returned 0.
 *
 * Returns the first non-zero result, or the result of
 * btrfs_xattr_security_init() when the ACL step succeeded.
 */
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct inode *inode, struct inode *dir,
				     const struct qstr *qstr)
{
	int ret = btrfs_init_acl(trans, inode, dir);

	if (ret)
		return ret;

	return btrfs_xattr_security_init(trans, inode, dir, qstr);
}
|
|
|
|
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
/*
|
|
|
|
* this does all the hard work for inserting an inline extent into
|
|
|
|
* the btree. The caller should have done a btrfs_drop_extents so that
|
|
|
|
* no overlapping inline items exist in the btree
|
|
|
|
*/
|
2009-01-05 19:25:51 -07:00
|
|
|
static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
struct btrfs_root *root, struct inode *inode,
|
|
|
|
u64 start, size_t size, size_t compressed_size,
|
2011-03-28 02:30:38 -06:00
|
|
|
int compress_type,
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
struct page **compressed_pages)
|
|
|
|
{
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct page *page = NULL;
|
|
|
|
char *kaddr;
|
|
|
|
unsigned long ptr;
|
|
|
|
struct btrfs_file_extent_item *ei;
|
|
|
|
int err = 0;
|
|
|
|
int ret;
|
|
|
|
size_t cur_size = size;
|
|
|
|
size_t datasize;
|
|
|
|
unsigned long offset;
|
|
|
|
|
2011-03-28 02:30:38 -06:00
|
|
|
if (compressed_size && compressed_pages)
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
cur_size = compressed_size;
|
|
|
|
|
2009-01-05 19:25:51 -07:00
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
return -ENOMEM;
|
|
|
|
|
2009-03-13 09:00:37 -06:00
|
|
|
path->leave_spinning = 1;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
|
2011-04-19 20:31:50 -06:00
|
|
|
key.objectid = btrfs_ino(inode);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
key.offset = start;
|
|
|
|
btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
|
|
|
|
datasize = btrfs_file_extent_calc_inline_size(cur_size);
|
|
|
|
|
|
|
|
inode_add_bytes(inode, size);
|
|
|
|
ret = btrfs_insert_empty_item(trans, root, path, &key,
|
|
|
|
datasize);
|
|
|
|
BUG_ON(ret);
|
|
|
|
if (ret) {
|
|
|
|
err = ret;
|
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
ei = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
btrfs_set_file_extent_generation(leaf, ei, trans->transid);
|
|
|
|
btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
|
|
|
|
btrfs_set_file_extent_encryption(leaf, ei, 0);
|
|
|
|
btrfs_set_file_extent_other_encoding(leaf, ei, 0);
|
|
|
|
btrfs_set_file_extent_ram_bytes(leaf, ei, size);
|
|
|
|
ptr = btrfs_file_extent_inline_start(ei);
|
|
|
|
|
2010-12-16 23:21:50 -07:00
|
|
|
if (compress_type != BTRFS_COMPRESS_NONE) {
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
struct page *cpage;
|
|
|
|
int i = 0;
|
2009-01-05 19:25:51 -07:00
|
|
|
while (compressed_size > 0) {
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
cpage = compressed_pages[i];
|
2008-11-11 07:34:41 -07:00
|
|
|
cur_size = min_t(unsigned long, compressed_size,
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
PAGE_CACHE_SIZE);
|
|
|
|
|
2009-03-13 09:00:37 -06:00
|
|
|
kaddr = kmap_atomic(cpage, KM_USER0);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
write_extent_buffer(leaf, kaddr, ptr, cur_size);
|
2009-03-13 09:00:37 -06:00
|
|
|
kunmap_atomic(kaddr, KM_USER0);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
|
|
|
|
i++;
|
|
|
|
ptr += cur_size;
|
|
|
|
compressed_size -= cur_size;
|
|
|
|
}
|
|
|
|
btrfs_set_file_extent_compression(leaf, ei,
|
2010-12-16 23:21:50 -07:00
|
|
|
compress_type);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
} else {
|
|
|
|
page = find_get_page(inode->i_mapping,
|
|
|
|
start >> PAGE_CACHE_SHIFT);
|
|
|
|
btrfs_set_file_extent_compression(leaf, ei, 0);
|
|
|
|
kaddr = kmap_atomic(page, KM_USER0);
|
|
|
|
offset = start & (PAGE_CACHE_SIZE - 1);
|
|
|
|
write_extent_buffer(leaf, kaddr + offset, ptr, size);
|
|
|
|
kunmap_atomic(kaddr, KM_USER0);
|
|
|
|
page_cache_release(page);
|
|
|
|
}
|
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
|
|
|
btrfs_free_path(path);
|
|
|
|
|
2009-11-12 02:34:21 -07:00
|
|
|
/*
|
|
|
|
* we're an inline extent, so nobody can
|
|
|
|
* extend the file past i_size without locking
|
|
|
|
* a page we already have locked.
|
|
|
|
*
|
|
|
|
* We must do any isize and inode updates
|
|
|
|
* before we unlock the pages. Otherwise we
|
|
|
|
* could end up racing with unlink.
|
|
|
|
*/
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
BTRFS_I(inode)->disk_i_size = inode->i_size;
|
|
|
|
btrfs_update_inode(trans, root, inode);
|
2009-11-12 02:34:21 -07:00
|
|
|
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
return 0;
|
|
|
|
fail:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* conditionally insert an inline extent into the file. This
|
|
|
|
* does the checks required to make sure the data is small enough
|
|
|
|
* to fit as an inline extent.
|
|
|
|
*/
|
2009-03-12 18:12:45 -06:00
|
|
|
static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
struct btrfs_root *root,
|
|
|
|
struct inode *inode, u64 start, u64 end,
|
2011-03-28 02:30:38 -06:00
|
|
|
size_t compressed_size, int compress_type,
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
struct page **compressed_pages)
|
|
|
|
{
|
|
|
|
u64 isize = i_size_read(inode);
|
|
|
|
u64 actual_end = min(end + 1, isize);
|
|
|
|
u64 inline_len = actual_end - start;
|
|
|
|
u64 aligned_end = (end + root->sectorsize - 1) &
|
|
|
|
~((u64)root->sectorsize - 1);
|
|
|
|
u64 hint_byte;
|
|
|
|
u64 data_len = inline_len;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (compressed_size)
|
|
|
|
data_len = compressed_size;
|
|
|
|
|
|
|
|
if (start > 0 ||
|
2008-10-31 10:46:39 -06:00
|
|
|
actual_end >= PAGE_CACHE_SIZE ||
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
|
|
|
|
(!compressed_size &&
|
|
|
|
(actual_end & (root->sectorsize - 1)) == 0) ||
|
|
|
|
end + 1 < isize ||
|
|
|
|
data_len > root->fs_info->max_inline) {
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2009-11-12 02:34:08 -07:00
|
|
|
ret = btrfs_drop_extents(trans, inode, start, aligned_end,
|
2009-09-11 10:27:37 -06:00
|
|
|
&hint_byte, 1);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
BUG_ON(ret);
|
|
|
|
|
|
|
|
if (isize > actual_end)
|
|
|
|
inline_len = min_t(u64, isize, actual_end);
|
|
|
|
ret = insert_inline_extent(trans, root, inode, start,
|
|
|
|
inline_len, compressed_size,
|
2011-03-28 02:30:38 -06:00
|
|
|
compress_type, compressed_pages);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
BUG_ON(ret);
|
2010-05-16 08:48:47 -06:00
|
|
|
btrfs_delalloc_release_metadata(inode, end + 1 - start);
|
2009-09-11 10:27:37 -06:00
|
|
|
btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-11-06 20:02:51 -07:00
|
|
|
struct async_extent {
|
|
|
|
u64 start;
|
|
|
|
u64 ram_size;
|
|
|
|
u64 compressed_size;
|
|
|
|
struct page **pages;
|
|
|
|
unsigned long nr_pages;
|
2010-12-16 23:21:50 -07:00
|
|
|
int compress_type;
|
2008-11-06 20:02:51 -07:00
|
|
|
struct list_head list;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct async_cow {
|
|
|
|
struct inode *inode;
|
|
|
|
struct btrfs_root *root;
|
|
|
|
struct page *locked_page;
|
|
|
|
u64 start;
|
|
|
|
u64 end;
|
|
|
|
struct list_head extents;
|
|
|
|
struct btrfs_work work;
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Record one range on @cow's extent list.
 *
 * A struct async_extent is allocated with GFP_NOFS, filled in from the
 * arguments and appended to the tail of cow->extents, so entries stay in the
 * order in which they were added.  @pages is stored by pointer; the array is
 * not copied here.
 *
 * Allocation failure trips BUG_ON(), so the only value ever returned is 0.
 */
static noinline int add_async_extent(struct async_cow *cow,
				     u64 start, u64 ram_size,
				     u64 compressed_size,
				     struct page **pages,
				     unsigned long nr_pages,
				     int compress_type)
{
	struct async_extent *entry;

	entry = kmalloc(sizeof(struct async_extent), GFP_NOFS);
	BUG_ON(!entry);

	/* describe the range itself */
	entry->start = start;
	entry->ram_size = ram_size;
	entry->compressed_size = compressed_size;
	entry->compress_type = compress_type;

	/* remember the caller's page array */
	entry->pages = pages;
	entry->nr_pages = nr_pages;

	/* tail insertion preserves submission order */
	list_add_tail(&entry->list, &cow->extents);

	return 0;
}
|
|
|
|
|
2008-09-29 13:18:18 -06:00
|
|
|
/*
|
2008-11-06 20:02:51 -07:00
|
|
|
* we create compressed extents in two phases. The first
|
|
|
|
* phase compresses a range of pages that have already been
|
|
|
|
* locked (both pages and state bits are locked).
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
*
|
2008-11-06 20:02:51 -07:00
|
|
|
* This is done inside an ordered work queue, and the compression
|
|
|
|
* is spread across many cpus. The actual IO submission is step
|
|
|
|
* two, and the ordered work queue takes care of making sure that
|
|
|
|
* happens in the same order things were put onto the queue by
|
|
|
|
* writepages and friends.
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
*
|
2008-11-06 20:02:51 -07:00
|
|
|
* If this code finds it can't get good compression, it puts an
|
|
|
|
* entry onto the work queue to write the uncompressed bytes. This
|
|
|
|
* makes sure that both compressed inodes and uncompressed inodes
|
|
|
|
* are written in the same order that pdflush sent them down.
|
2008-09-29 13:18:18 -06:00
|
|
|
*/
|
2008-11-06 20:02:51 -07:00
|
|
|
static noinline int compress_file_range(struct inode *inode,
|
|
|
|
struct page *locked_page,
|
|
|
|
u64 start, u64 end,
|
|
|
|
struct async_cow *async_cow,
|
|
|
|
int *num_added)
|
2007-08-27 14:49:44 -06:00
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_trans_handle *trans;
|
2007-10-15 14:15:53 -06:00
|
|
|
u64 num_bytes;
|
|
|
|
u64 blocksize = root->sectorsize;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
u64 actual_end;
|
2008-12-15 09:44:56 -07:00
|
|
|
u64 isize = i_size_read(inode);
|
2008-07-17 10:53:50 -06:00
|
|
|
int ret = 0;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
struct page **pages = NULL;
|
|
|
|
unsigned long nr_pages;
|
|
|
|
unsigned long nr_pages_ret = 0;
|
|
|
|
unsigned long total_compressed = 0;
|
|
|
|
unsigned long total_in = 0;
|
|
|
|
unsigned long max_compressed = 128 * 1024;
|
2008-11-06 20:02:51 -07:00
|
|
|
unsigned long max_uncompressed = 128 * 1024;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
int i;
|
|
|
|
int will_compress;
|
2010-12-16 23:21:50 -07:00
|
|
|
int compress_type = root->fs_info->compress_type;
|
2007-08-27 14:49:44 -06:00
|
|
|
|
2011-05-24 13:35:30 -06:00
|
|
|
/* if this is a small write inside eof, kick off a defragbot */
|
|
|
|
if (end <= BTRFS_I(inode)->disk_i_size && (end - start + 1) < 16 * 1024)
|
|
|
|
btrfs_add_inode_defrag(NULL, inode);
|
|
|
|
|
2008-12-15 09:44:56 -07:00
|
|
|
actual_end = min_t(u64, isize, end + 1);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
again:
|
|
|
|
will_compress = 0;
|
|
|
|
nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
|
|
|
|
nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
|
2007-12-17 18:14:01 -07:00
|
|
|
|
2009-02-04 07:31:06 -07:00
|
|
|
/*
|
|
|
|
* we don't want to send crud past the end of i_size through
|
|
|
|
* compression, that's just a waste of CPU time. So, if the
|
|
|
|
* end of the file is before the start of our current
|
|
|
|
* requested range of bytes, we bail out to the uncompressed
|
|
|
|
* cleanup code that can deal with all of this.
|
|
|
|
*
|
|
|
|
* It isn't really the fastest way to fix things, but this is a
|
|
|
|
* very uncommon corner.
|
|
|
|
*/
|
|
|
|
if (actual_end <= start)
|
|
|
|
goto cleanup_and_bail_uncompressed;
|
|
|
|
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
total_compressed = actual_end - start;
|
|
|
|
|
|
|
|
/* we want to make sure that amount of ram required to uncompress
|
|
|
|
* an extent is reasonable, so we limit the total size in ram
|
2008-11-06 20:02:51 -07:00
|
|
|
* of a compressed extent to 128k. This is a crucial number
|
|
|
|
* because it also controls how easily we can spread reads across
|
|
|
|
* cpus for decompression.
|
|
|
|
*
|
|
|
|
* We also want to make sure the amount of IO required to do
|
|
|
|
* a random read is reasonably small, so we limit the size of
|
|
|
|
* a compressed extent to 128k.
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
*/
|
|
|
|
total_compressed = min(total_compressed, max_uncompressed);
|
2007-10-15 14:15:53 -06:00
|
|
|
num_bytes = (end - start + blocksize) & ~(blocksize - 1);
|
2007-12-17 18:14:01 -07:00
|
|
|
num_bytes = max(blocksize, num_bytes);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
total_in = 0;
|
|
|
|
ret = 0;
|
2007-10-15 14:15:53 -06:00
|
|
|
|
2008-11-06 20:02:51 -07:00
|
|
|
/*
|
|
|
|
* we do compression for mount -o compress and when the
|
|
|
|
* inode has not been flagged as nocompress. This flag can
|
|
|
|
* change at any time if we discover bad compression ratios.
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
*/
|
2009-04-17 02:37:41 -06:00
|
|
|
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
|
2010-03-11 07:42:04 -07:00
|
|
|
(btrfs_test_opt(root, COMPRESS) ||
|
Btrfs: Per file/directory controls for COW and compression
Data compression and data cow are controlled across the entire FS by mount
options right now. ioctls are needed to set this on a per file or per
directory basis. This has been proposed previously, but VFS developers
wanted us to use generic ioctls rather than btrfs-specific ones.
According to Chris's comment, there should be just one true compression
method (probably LZO) stored in the super. However, before this, we would
wait until that one method is stable enough to be adopted into the super.
So I list it as a long term goal, and just store it in ram today.
After applying this patch, we can use the generic "FS_IOC_SETFLAGS" ioctl to
control file and directory's datacow and compression attribute.
NOTE:
- The compression type is selected by such rules:
If we mount btrfs with compress options, ie, zlib/lzo, the type is it.
Otherwise, we'll use the default compress type (zlib today).
v1->v2:
- rebase to the latest btrfs.
v2->v3:
- fix a problem, i.e. when a file is set NOCOW via mount option, then this NOCOW
will be screwed by inheritance from parent directory.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-22 04:12:20 -06:00
|
|
|
(BTRFS_I(inode)->force_compress) ||
|
|
|
|
(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
WARN_ON(pages);
|
2008-10-30 11:22:14 -06:00
|
|
|
pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
|
2011-02-15 05:01:42 -07:00
|
|
|
BUG_ON(!pages);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
|
2010-12-16 23:21:50 -07:00
|
|
|
if (BTRFS_I(inode)->force_compress)
|
|
|
|
compress_type = BTRFS_I(inode)->force_compress;
|
|
|
|
|
|
|
|
ret = btrfs_compress_pages(compress_type,
|
|
|
|
inode->i_mapping, start,
|
|
|
|
total_compressed, pages,
|
|
|
|
nr_pages, &nr_pages_ret,
|
|
|
|
&total_in,
|
|
|
|
&total_compressed,
|
|
|
|
max_compressed);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
|
|
|
|
if (!ret) {
|
|
|
|
unsigned long offset = total_compressed &
|
|
|
|
(PAGE_CACHE_SIZE - 1);
|
|
|
|
struct page *page = pages[nr_pages_ret - 1];
|
|
|
|
char *kaddr;
|
|
|
|
|
|
|
|
/* zero the tail end of the last page, we might be
|
|
|
|
* sending it down to disk
|
|
|
|
*/
|
|
|
|
if (offset) {
|
|
|
|
kaddr = kmap_atomic(page, KM_USER0);
|
|
|
|
memset(kaddr + offset, 0,
|
|
|
|
PAGE_CACHE_SIZE - offset);
|
|
|
|
kunmap_atomic(kaddr, KM_USER0);
|
|
|
|
}
|
|
|
|
will_compress = 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (start == 0) {
|
2011-04-13 10:54:33 -06:00
|
|
|
trans = btrfs_join_transaction(root);
|
2011-01-24 19:51:38 -07:00
|
|
|
BUG_ON(IS_ERR(trans));
|
2010-05-16 08:48:47 -06:00
|
|
|
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
|
2008-11-06 20:02:51 -07:00
|
|
|
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
/* lets try to make an inline extent */
|
2008-11-06 20:02:51 -07:00
|
|
|
if (ret || total_in < (actual_end - start)) {
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
/* we didn't compress the entire range, try
|
2008-11-06 20:02:51 -07:00
|
|
|
* to make an uncompressed inline extent.
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
*/
|
|
|
|
ret = cow_file_range_inline(trans, root, inode,
|
2011-03-28 02:30:38 -06:00
|
|
|
start, end, 0, 0, NULL);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
} else {
|
2008-11-06 20:02:51 -07:00
|
|
|
/* try making a compressed inline extent */
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
ret = cow_file_range_inline(trans, root, inode,
|
|
|
|
start, end,
|
2011-03-28 02:30:38 -06:00
|
|
|
total_compressed,
|
|
|
|
compress_type, pages);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
}
|
|
|
|
if (ret == 0) {
|
2008-11-06 20:02:51 -07:00
|
|
|
/*
|
|
|
|
* inline extent creation worked, we don't need
|
|
|
|
* to create any more async work items. Unlock
|
|
|
|
* and free up our temp pages.
|
|
|
|
*/
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
extent_clear_unlock_delalloc(inode,
|
2009-10-08 09:27:10 -06:00
|
|
|
&BTRFS_I(inode)->io_tree,
|
|
|
|
start, end, NULL,
|
|
|
|
EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
|
2009-10-08 10:30:20 -06:00
|
|
|
EXTENT_CLEAR_DELALLOC |
|
2009-10-08 09:27:10 -06:00
|
|
|
EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
|
2009-11-12 02:34:21 -07:00
|
|
|
|
|
|
|
btrfs_end_transaction(trans, root);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
goto free_pages_out;
|
|
|
|
}
|
2009-11-12 02:34:21 -07:00
|
|
|
btrfs_end_transaction(trans, root);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
if (will_compress) {
|
|
|
|
/*
|
|
|
|
* we aren't doing an inline extent round the compressed size
|
|
|
|
* up to a block size boundary so the allocator does sane
|
|
|
|
* things
|
|
|
|
*/
|
|
|
|
total_compressed = (total_compressed + blocksize - 1) &
|
|
|
|
~(blocksize - 1);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* one last check to make sure the compression is really a
|
|
|
|
* win, compare the page count read with the blocks on disk
|
|
|
|
*/
|
|
|
|
total_in = (total_in + PAGE_CACHE_SIZE - 1) &
|
|
|
|
~(PAGE_CACHE_SIZE - 1);
|
|
|
|
if (total_compressed >= total_in) {
|
|
|
|
will_compress = 0;
|
|
|
|
} else {
|
|
|
|
num_bytes = total_in;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!will_compress && pages) {
|
|
|
|
/*
|
|
|
|
* the compression code ran but failed to make things smaller,
|
|
|
|
* free any pages it allocated and our page pointer array
|
|
|
|
*/
|
|
|
|
for (i = 0; i < nr_pages_ret; i++) {
|
2008-10-31 10:46:39 -06:00
|
|
|
WARN_ON(pages[i]->mapping);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
page_cache_release(pages[i]);
|
|
|
|
}
|
|
|
|
kfree(pages);
|
|
|
|
pages = NULL;
|
|
|
|
total_compressed = 0;
|
|
|
|
nr_pages_ret = 0;
|
|
|
|
|
|
|
|
/* flag the file so we don't compress in the future */
|
2010-03-11 07:42:04 -07:00
|
|
|
if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
|
|
|
|
!(BTRFS_I(inode)->force_compress)) {
|
2010-01-28 14:18:15 -07:00
|
|
|
BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
|
2010-03-11 07:42:04 -07:00
|
|
|
}
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
}
|
2008-11-06 20:02:51 -07:00
|
|
|
if (will_compress) {
|
|
|
|
*num_added += 1;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single-threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
|
2008-11-06 20:02:51 -07:00
|
|
|
/* the async work queues will take care of doing actual
|
|
|
|
* allocation on disk for these compressed pages,
|
|
|
|
* and will submit them to the elevator.
|
|
|
|
*/
|
|
|
|
add_async_extent(async_cow, start, num_bytes,
|
2010-12-16 23:21:50 -07:00
|
|
|
total_compressed, pages, nr_pages_ret,
|
|
|
|
compress_type);
|
2007-11-01 09:28:41 -06:00
|
|
|
|
2010-12-06 00:02:36 -07:00
|
|
|
if (start + num_bytes < end) {
|
2008-11-06 20:02:51 -07:00
|
|
|
start += num_bytes;
|
|
|
|
pages = NULL;
|
|
|
|
cond_resched();
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
} else {
|
2009-02-04 07:31:06 -07:00
|
|
|
cleanup_and_bail_uncompressed:
|
2008-11-06 20:02:51 -07:00
|
|
|
/*
|
|
|
|
* No compression, but we still need to write the pages in
|
|
|
|
* the file we've been given so far. redirty the locked
|
|
|
|
* page if it corresponds to our extent and set things up
|
|
|
|
* for the async work queue to run cow_file_range to do
|
|
|
|
* the normal delalloc dance
|
|
|
|
*/
|
|
|
|
if (page_offset(locked_page) >= start &&
|
|
|
|
page_offset(locked_page) <= end) {
|
|
|
|
__set_page_dirty_nobuffers(locked_page);
|
|
|
|
/* unlocked later on in the async handlers */
|
|
|
|
}
|
2010-12-16 23:21:50 -07:00
|
|
|
add_async_extent(async_cow, start, end - start + 1,
|
|
|
|
0, NULL, 0, BTRFS_COMPRESS_NONE);
|
2008-11-06 20:02:51 -07:00
|
|
|
*num_added += 1;
|
|
|
|
}
|
2008-04-17 09:29:12 -06:00
|
|
|
|
2008-11-06 20:02:51 -07:00
|
|
|
out:
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
free_pages_out:
|
|
|
|
for (i = 0; i < nr_pages_ret; i++) {
|
|
|
|
WARN_ON(pages[i]->mapping);
|
|
|
|
page_cache_release(pages[i]);
|
|
|
|
}
|
2009-01-05 19:25:51 -07:00
|
|
|
kfree(pages);
|
2008-11-06 20:02:51 -07:00
|
|
|
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* phase two of compressed writeback. This is the ordered portion
|
|
|
|
* of the code, which only gets called in the order the work was
|
|
|
|
* queued. We walk all the async extents created by compress_file_range
|
|
|
|
* and send them down to the disk.
|
|
|
|
*/
|
|
|
|
static noinline int submit_compressed_extents(struct inode *inode,
|
|
|
|
struct async_cow *async_cow)
|
|
|
|
{
|
|
|
|
struct async_extent *async_extent;
|
|
|
|
u64 alloc_hint = 0;
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct btrfs_key ins;
|
|
|
|
struct extent_map *em;
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
|
|
|
|
struct extent_io_tree *io_tree;
|
2009-11-10 19:23:48 -07:00
|
|
|
int ret = 0;
|
2008-11-06 20:02:51 -07:00
|
|
|
|
|
|
|
if (list_empty(&async_cow->extents))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
2009-01-05 19:25:51 -07:00
|
|
|
while (!list_empty(&async_cow->extents)) {
|
2008-11-06 20:02:51 -07:00
|
|
|
async_extent = list_entry(async_cow->extents.next,
|
|
|
|
struct async_extent, list);
|
|
|
|
list_del(&async_extent->list);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
|
2008-11-06 20:02:51 -07:00
|
|
|
io_tree = &BTRFS_I(inode)->io_tree;
|
|
|
|
|
2009-11-10 19:23:48 -07:00
|
|
|
retry:
|
2008-11-06 20:02:51 -07:00
|
|
|
/* did the compression code fall back to uncompressed IO? */
|
|
|
|
if (!async_extent->pages) {
|
|
|
|
int page_started = 0;
|
|
|
|
unsigned long nr_written = 0;
|
|
|
|
|
|
|
|
lock_extent(io_tree, async_extent->start,
|
2010-02-03 12:33:23 -07:00
|
|
|
async_extent->start +
|
|
|
|
async_extent->ram_size - 1, GFP_NOFS);
|
2008-11-06 20:02:51 -07:00
|
|
|
|
|
|
|
/* allocate blocks */
|
2009-11-10 19:23:48 -07:00
|
|
|
ret = cow_file_range(inode, async_cow->locked_page,
|
|
|
|
async_extent->start,
|
|
|
|
async_extent->start +
|
|
|
|
async_extent->ram_size - 1,
|
|
|
|
&page_started, &nr_written, 0);
|
2008-11-06 20:02:51 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* if page_started, cow_file_range inserted an
|
|
|
|
* inline extent and took care of all the unlocking
|
|
|
|
* and IO for us. Otherwise, we need to submit
|
|
|
|
* all those pages down to the drive.
|
|
|
|
*/
|
2009-11-10 19:23:48 -07:00
|
|
|
if (!page_started && !ret)
|
2008-11-06 20:02:51 -07:00
|
|
|
extent_write_locked_range(io_tree,
|
|
|
|
inode, async_extent->start,
|
2009-01-05 19:25:51 -07:00
|
|
|
async_extent->start +
|
2008-11-06 20:02:51 -07:00
|
|
|
async_extent->ram_size - 1,
|
|
|
|
btrfs_get_extent,
|
|
|
|
WB_SYNC_ALL);
|
|
|
|
kfree(async_extent);
|
|
|
|
cond_resched();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
lock_extent(io_tree, async_extent->start,
|
|
|
|
async_extent->start + async_extent->ram_size - 1,
|
|
|
|
GFP_NOFS);
|
|
|
|
|
2011-04-13 10:54:33 -06:00
|
|
|
trans = btrfs_join_transaction(root);
|
2011-01-24 19:51:38 -07:00
|
|
|
BUG_ON(IS_ERR(trans));
|
2011-04-13 10:02:53 -06:00
|
|
|
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
|
2008-11-06 20:02:51 -07:00
|
|
|
ret = btrfs_reserve_extent(trans, root,
|
|
|
|
async_extent->compressed_size,
|
|
|
|
async_extent->compressed_size,
|
|
|
|
0, alloc_hint,
|
|
|
|
(u64)-1, &ins, 1);
|
2009-11-12 02:34:21 -07:00
|
|
|
btrfs_end_transaction(trans, root);
|
|
|
|
|
2009-11-10 19:23:48 -07:00
|
|
|
if (ret) {
|
|
|
|
int i;
|
|
|
|
for (i = 0; i < async_extent->nr_pages; i++) {
|
|
|
|
WARN_ON(async_extent->pages[i]->mapping);
|
|
|
|
page_cache_release(async_extent->pages[i]);
|
|
|
|
}
|
|
|
|
kfree(async_extent->pages);
|
|
|
|
async_extent->nr_pages = 0;
|
|
|
|
async_extent->pages = NULL;
|
|
|
|
unlock_extent(io_tree, async_extent->start,
|
|
|
|
async_extent->start +
|
|
|
|
async_extent->ram_size - 1, GFP_NOFS);
|
|
|
|
goto retry;
|
|
|
|
}
|
|
|
|
|
2009-11-12 02:34:21 -07:00
|
|
|
/*
|
|
|
|
* here we're doing allocation and writeback of the
|
|
|
|
* compressed pages
|
|
|
|
*/
|
|
|
|
btrfs_drop_extent_cache(inode, async_extent->start,
|
|
|
|
async_extent->start +
|
|
|
|
async_extent->ram_size - 1, 0);
|
|
|
|
|
2011-04-20 16:48:27 -06:00
|
|
|
em = alloc_extent_map();
|
2011-02-13 17:45:29 -07:00
|
|
|
BUG_ON(!em);
|
2008-11-06 20:02:51 -07:00
|
|
|
em->start = async_extent->start;
|
|
|
|
em->len = async_extent->ram_size;
|
2008-11-10 09:53:33 -07:00
|
|
|
em->orig_start = em->start;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
|
2008-11-06 20:02:51 -07:00
|
|
|
em->block_start = ins.objectid;
|
|
|
|
em->block_len = ins.offset;
|
|
|
|
em->bdev = root->fs_info->fs_devices->latest_bdev;
|
2010-12-16 23:21:50 -07:00
|
|
|
em->compress_type = async_extent->compress_type;
|
2008-11-06 20:02:51 -07:00
|
|
|
set_bit(EXTENT_FLAG_PINNED, &em->flags);
|
|
|
|
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
|
|
|
|
|
2009-01-05 19:25:51 -07:00
|
|
|
while (1) {
|
2009-09-02 14:24:52 -06:00
|
|
|
write_lock(&em_tree->lock);
|
2008-11-06 20:02:51 -07:00
|
|
|
ret = add_extent_mapping(em_tree, em);
|
2009-09-02 14:24:52 -06:00
|
|
|
write_unlock(&em_tree->lock);
|
2008-11-06 20:02:51 -07:00
|
|
|
if (ret != -EEXIST) {
|
|
|
|
free_extent_map(em);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
btrfs_drop_extent_cache(inode, async_extent->start,
|
|
|
|
async_extent->start +
|
|
|
|
async_extent->ram_size - 1, 0);
|
|
|
|
}
|
|
|
|
|
2010-12-16 23:21:50 -07:00
|
|
|
ret = btrfs_add_ordered_extent_compress(inode,
|
|
|
|
async_extent->start,
|
|
|
|
ins.objectid,
|
|
|
|
async_extent->ram_size,
|
|
|
|
ins.offset,
|
|
|
|
BTRFS_ORDERED_COMPRESSED,
|
|
|
|
async_extent->compress_type);
|
2008-11-06 20:02:51 -07:00
|
|
|
BUG_ON(ret);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* clear dirty, set writeback and unlock the pages.
|
|
|
|
*/
|
|
|
|
extent_clear_unlock_delalloc(inode,
|
2009-10-08 09:27:10 -06:00
|
|
|
&BTRFS_I(inode)->io_tree,
|
|
|
|
async_extent->start,
|
|
|
|
async_extent->start +
|
|
|
|
async_extent->ram_size - 1,
|
|
|
|
NULL, EXTENT_CLEAR_UNLOCK_PAGE |
|
|
|
|
EXTENT_CLEAR_UNLOCK |
|
2009-10-08 10:30:20 -06:00
|
|
|
EXTENT_CLEAR_DELALLOC |
|
2009-10-08 09:27:10 -06:00
|
|
|
EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK);
|
2008-11-06 20:02:51 -07:00
|
|
|
|
|
|
|
ret = btrfs_submit_compressed_write(inode,
|
2009-01-05 19:25:51 -07:00
|
|
|
async_extent->start,
|
|
|
|
async_extent->ram_size,
|
|
|
|
ins.objectid,
|
|
|
|
ins.offset, async_extent->pages,
|
|
|
|
async_extent->nr_pages);
|
2008-11-06 20:02:51 -07:00
|
|
|
|
|
|
|
BUG_ON(ret);
|
|
|
|
alloc_hint = ins.objectid + ins.offset;
|
|
|
|
kfree(async_extent);
|
|
|
|
cond_resched();
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2010-05-23 09:00:55 -06:00
|
|
|
/*
 * Pick a disk byte number to use as an allocation hint for the range
 * [start, start + num_bytes) of this inode.
 *
 * The extent map tree is searched under its read lock.  If the mapping
 * found for the range has a block_start below EXTENT_MAP_LAST_BYTE, that
 * value is the hint.  Otherwise the tree is searched again with
 * (start 0, len 0) and that mapping's block_start is used if it is below
 * EXTENT_MAP_LAST_BYTE.  Returns 0 when no usable mapping is found.
 */
static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
				      u64 num_bytes)
{
	struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
	struct extent_map *map;
	u64 hint = 0;

	read_lock(&tree->lock);

	map = search_extent_mapping(tree, start, num_bytes);
	if (!map)
		goto unlock;

	if (map->block_start < EXTENT_MAP_LAST_BYTE) {
		/* the mapping for the range points at a real block */
		hint = map->block_start;
		free_extent_map(map);
		goto unlock;
	}

	/*
	 * block_start is not an actual block number; fall back to the
	 * mapping returned for offset 0 and, if that one is bogus as
	 * well, leave the hint at 0.
	 */
	free_extent_map(map);
	map = search_extent_mapping(tree, 0, 0);
	if (map) {
		if (map->block_start < EXTENT_MAP_LAST_BYTE)
			hint = map->block_start;
		free_extent_map(map);
	}

unlock:
	read_unlock(&tree->lock);
	return hint;
}
|
|
|
|
|
2011-04-19 20:33:24 -06:00
|
|
|
static inline bool is_free_space_inode(struct btrfs_root *root,
|
|
|
|
struct inode *inode)
|
|
|
|
{
|
|
|
|
if (root == root->fs_info->tree_root ||
|
|
|
|
BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
|
|
|
|
return true;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2008-11-06 20:02:51 -07:00
|
|
|
/*
|
|
|
|
* when extent_io.c finds a delayed allocation range in the file,
|
|
|
|
* the call backs end up in this code. The basic idea is to
|
|
|
|
* allocate extents on disk for the range, and create ordered data structs
|
|
|
|
* in ram to track those extents.
|
|
|
|
*
|
|
|
|
* locked_page is the page that writepage had locked already. We use
|
|
|
|
* it to make sure we don't do extra locks or unlocks.
|
|
|
|
*
|
|
|
|
* *page_started is set to one if we unlock locked_page and do everything
|
|
|
|
* required to start IO on it. It may be clean and already done with
|
|
|
|
* IO when we return.
|
|
|
|
*/
|
|
|
|
static noinline int cow_file_range(struct inode *inode,
|
|
|
|
struct page *locked_page,
|
|
|
|
u64 start, u64 end, int *page_started,
|
|
|
|
unsigned long *nr_written,
|
|
|
|
int unlock)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
u64 alloc_hint = 0;
|
|
|
|
u64 num_bytes;
|
|
|
|
unsigned long ram_size;
|
|
|
|
u64 disk_num_bytes;
|
|
|
|
u64 cur_alloc_size;
|
|
|
|
u64 blocksize = root->sectorsize;
|
|
|
|
struct btrfs_key ins;
|
|
|
|
struct extent_map *em;
|
|
|
|
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
|
|
|
|
int ret = 0;
|
|
|
|
|
2011-04-19 20:33:24 -06:00
|
|
|
BUG_ON(is_free_space_inode(root, inode));
|
2011-04-13 10:54:33 -06:00
|
|
|
trans = btrfs_join_transaction(root);
|
2011-01-24 19:51:38 -07:00
|
|
|
BUG_ON(IS_ERR(trans));
|
2010-05-16 08:48:47 -06:00
|
|
|
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
|
2008-11-06 20:02:51 -07:00
|
|
|
|
|
|
|
num_bytes = (end - start + blocksize) & ~(blocksize - 1);
|
|
|
|
num_bytes = max(blocksize, num_bytes);
|
|
|
|
disk_num_bytes = num_bytes;
|
|
|
|
ret = 0;
|
|
|
|
|
2011-05-24 13:35:30 -06:00
|
|
|
/* if this is a small write inside eof, kick off defrag */
|
|
|
|
if (end <= BTRFS_I(inode)->disk_i_size && num_bytes < 64 * 1024)
|
|
|
|
btrfs_add_inode_defrag(trans, inode);
|
|
|
|
|
2008-11-06 20:02:51 -07:00
|
|
|
if (start == 0) {
|
|
|
|
/* lets try to make an inline extent */
|
|
|
|
ret = cow_file_range_inline(trans, root, inode,
|
2011-03-28 02:30:38 -06:00
|
|
|
start, end, 0, 0, NULL);
|
2008-11-06 20:02:51 -07:00
|
|
|
if (ret == 0) {
|
|
|
|
extent_clear_unlock_delalloc(inode,
|
2009-10-08 09:27:10 -06:00
|
|
|
&BTRFS_I(inode)->io_tree,
|
|
|
|
start, end, NULL,
|
|
|
|
EXTENT_CLEAR_UNLOCK_PAGE |
|
|
|
|
EXTENT_CLEAR_UNLOCK |
|
|
|
|
EXTENT_CLEAR_DELALLOC |
|
|
|
|
EXTENT_CLEAR_DIRTY |
|
|
|
|
EXTENT_SET_WRITEBACK |
|
|
|
|
EXTENT_END_WRITEBACK);
|
2009-11-12 02:34:21 -07:00
|
|
|
|
2008-11-06 20:02:51 -07:00
|
|
|
*nr_written = *nr_written +
|
|
|
|
(end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
|
|
|
|
*page_started = 1;
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
BUG_ON(disk_num_bytes >
|
|
|
|
btrfs_super_total_bytes(&root->fs_info->super_copy));
|
|
|
|
|
2010-05-23 09:00:55 -06:00
|
|
|
alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
|
2008-11-06 20:02:51 -07:00
|
|
|
btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
|
|
|
|
|
2009-01-05 19:25:51 -07:00
|
|
|
while (disk_num_bytes > 0) {
|
2009-10-08 09:27:10 -06:00
|
|
|
unsigned long op;
|
|
|
|
|
2010-03-19 12:07:23 -06:00
|
|
|
cur_alloc_size = disk_num_bytes;
|
2008-07-17 10:53:50 -06:00
|
|
|
ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
|
2008-11-06 20:02:51 -07:00
|
|
|
root->sectorsize, 0, alloc_hint,
|
2008-07-17 10:53:50 -06:00
|
|
|
(u64)-1, &ins, 1);
|
2009-01-05 19:25:51 -07:00
|
|
|
BUG_ON(ret);
|
|
|
|
|
2011-04-20 16:48:27 -06:00
|
|
|
em = alloc_extent_map();
|
2011-02-13 17:45:29 -07:00
|
|
|
BUG_ON(!em);
|
2008-07-17 10:53:50 -06:00
|
|
|
em->start = start;
|
2008-11-10 09:53:33 -07:00
|
|
|
em->orig_start = em->start;
|
2008-11-06 20:02:51 -07:00
|
|
|
ram_size = ins.offset;
|
|
|
|
em->len = ins.offset;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
|
2008-07-17 10:53:50 -06:00
|
|
|
em->block_start = ins.objectid;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
em->block_len = ins.offset;
|
2008-07-17 10:53:50 -06:00
|
|
|
em->bdev = root->fs_info->fs_devices->latest_bdev;
|
2008-07-18 10:01:11 -06:00
|
|
|
set_bit(EXTENT_FLAG_PINNED, &em->flags);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
|
2009-01-05 19:25:51 -07:00
|
|
|
while (1) {
|
2009-09-02 14:24:52 -06:00
|
|
|
write_lock(&em_tree->lock);
|
2008-07-17 10:53:50 -06:00
|
|
|
ret = add_extent_mapping(em_tree, em);
|
2009-09-02 14:24:52 -06:00
|
|
|
write_unlock(&em_tree->lock);
|
2008-07-17 10:53:50 -06:00
|
|
|
if (ret != -EEXIST) {
|
|
|
|
free_extent_map(em);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
btrfs_drop_extent_cache(inode, start,
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
start + ram_size - 1, 0);
|
2008-07-17 10:53:50 -06:00
|
|
|
}
|
|
|
|
|
2008-04-14 07:46:10 -06:00
|
|
|
cur_alloc_size = ins.offset;
|
2008-07-17 10:53:50 -06:00
|
|
|
ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
|
2008-11-06 20:02:51 -07:00
|
|
|
ram_size, cur_alloc_size, 0);
|
2008-07-17 10:53:50 -06:00
|
|
|
BUG_ON(ret);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
|
2008-12-12 08:03:38 -07:00
|
|
|
if (root->root_key.objectid ==
|
|
|
|
BTRFS_DATA_RELOC_TREE_OBJECTID) {
|
|
|
|
ret = btrfs_reloc_clone_csums(inode, start,
|
|
|
|
cur_alloc_size);
|
|
|
|
BUG_ON(ret);
|
|
|
|
}
|
|
|
|
|
2009-01-05 19:25:51 -07:00
|
|
|
if (disk_num_bytes < cur_alloc_size)
|
2008-04-17 09:29:12 -06:00
|
|
|
break;
|
2009-01-05 19:25:51 -07:00
|
|
|
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
/* we're not doing compressed IO, don't unlock the first
|
|
|
|
* page (which the caller expects to stay locked), don't
|
|
|
|
* clear any dirty bits and don't set any writeback bits
|
2009-09-02 14:53:46 -06:00
|
|
|
*
|
|
|
|
* Do set the Private2 bit so we know this page was properly
|
|
|
|
* setup for writepage
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
*/
|
2009-10-08 09:27:10 -06:00
|
|
|
op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0;
|
|
|
|
op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
|
|
|
|
EXTENT_SET_PRIVATE2;
|
|
|
|
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
|
|
|
|
start, start + ram_size - 1,
|
2009-10-08 09:27:10 -06:00
|
|
|
locked_page, op);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
disk_num_bytes -= cur_alloc_size;
|
2007-12-17 18:14:04 -07:00
|
|
|
num_bytes -= cur_alloc_size;
|
|
|
|
alloc_hint = ins.objectid + ins.offset;
|
|
|
|
start += cur_alloc_size;
|
2007-08-27 14:49:44 -06:00
|
|
|
}
|
|
|
|
out:
|
2008-11-06 20:02:51 -07:00
|
|
|
ret = 0;
|
2007-08-27 14:49:44 -06:00
|
|
|
btrfs_end_transaction(trans, root);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
|
2007-12-17 18:14:01 -07:00
|
|
|
return ret;
|
2008-11-06 20:02:51 -07:00
|
|
|
}
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
|
2008-11-06 20:02:51 -07:00
|
|
|
/*
|
|
|
|
* work queue call back to started compression on a file and pages
|
|
|
|
*/
|
|
|
|
static noinline void async_cow_start(struct btrfs_work *work)
|
|
|
|
{
|
|
|
|
struct async_cow *async_cow;
|
|
|
|
int num_added = 0;
|
|
|
|
async_cow = container_of(work, struct async_cow, work);
|
|
|
|
|
|
|
|
compress_file_range(async_cow->inode, async_cow->locked_page,
|
|
|
|
async_cow->start, async_cow->end, async_cow,
|
|
|
|
&num_added);
|
|
|
|
if (num_added == 0)
|
|
|
|
async_cow->inode = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* work queue call back to submit previously compressed pages
|
|
|
|
*/
|
|
|
|
static noinline void async_cow_submit(struct btrfs_work *work)
|
|
|
|
{
|
|
|
|
struct async_cow *async_cow;
|
|
|
|
struct btrfs_root *root;
|
|
|
|
unsigned long nr_pages;
|
|
|
|
|
|
|
|
async_cow = container_of(work, struct async_cow, work);
|
|
|
|
|
|
|
|
root = async_cow->root;
|
|
|
|
nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
|
|
|
|
PAGE_CACHE_SHIFT;
|
|
|
|
|
|
|
|
atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
|
|
|
|
|
|
|
|
if (atomic_read(&root->fs_info->async_delalloc_pages) <
|
|
|
|
5 * 1042 * 1024 &&
|
|
|
|
waitqueue_active(&root->fs_info->async_submit_wait))
|
|
|
|
wake_up(&root->fs_info->async_submit_wait);
|
|
|
|
|
2009-01-05 19:25:51 -07:00
|
|
|
if (async_cow->inode)
|
2008-11-06 20:02:51 -07:00
|
|
|
submit_compressed_extents(async_cow->inode, async_cow);
|
|
|
|
}
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
|
2008-11-06 20:02:51 -07:00
|
|
|
static noinline void async_cow_free(struct btrfs_work *work)
|
|
|
|
{
|
|
|
|
struct async_cow *async_cow;
|
|
|
|
async_cow = container_of(work, struct async_cow, work);
|
|
|
|
kfree(async_cow);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Queue the range [start, end] of @inode for asynchronous processing on
 * the delalloc worker threads.
 *
 * The EXTENT_LOCKED bit is cleared on the whole range first.  The range is
 * then cut into pieces: at most 512K each, or a single piece covering the
 * rest of the range when the inode has BTRFS_INODE_NOCOMPRESS set.  Each
 * piece gets its own kmalloc'ed async_cow whose work item runs
 * async_cow_start(), then async_cow_submit(), then async_cow_free().
 *
 * The page count of every queued piece is added to
 * fs_info->async_delalloc_pages and to *nr_written.  After queueing, the
 * caller is throttled:
 *   - if the counter exceeds the limit, sleep until it drops below it;
 *   - while fs_info->async_submit_draining is set and pages are still
 *     outstanding, sleep until the counter reaches zero.
 *
 * @locked_page is stored in every async_cow and handed on unchanged.
 * *page_started is set to 1 on return.
 *
 * Returns 0.  Allocation failure is not reported to the caller; it hits
 * BUG_ON().
 */
static int cow_file_range_async(struct inode *inode, struct page *locked_page,
				u64 start, u64 end, int *page_started,
				unsigned long *nr_written)
{
	struct async_cow *async_cow;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	unsigned long nr_pages;
	u64 cur_end;
	/* 10 * 1024 * 1024; one factor used to be misspelled as 1042 */
	int limit = 10 * 1024 * 1024;

	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
			 1, 0, NULL, GFP_NOFS);
	while (start < end) {
		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
		BUG_ON(!async_cow);
		async_cow->inode = inode;
		async_cow->root = root;
		async_cow->locked_page = locked_page;
		async_cow->start = start;

		/* no-compress inodes go down as one piece, others in 512K pieces */
		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
			cur_end = end;
		else
			cur_end = min(end, start + 512 * 1024 - 1);

		async_cow->end = cur_end;
		INIT_LIST_HEAD(&async_cow->extents);

		async_cow->work.func = async_cow_start;
		async_cow->work.ordered_func = async_cow_submit;
		async_cow->work.ordered_free = async_cow_free;
		async_cow->work.flags = 0;

		/* cur_end is inclusive, so adding a full page rounds up */
		nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
			PAGE_CACHE_SHIFT;
		atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);

		btrfs_queue_worker(&root->fs_info->delalloc_workers,
				   &async_cow->work);

		/* throttle: too much queued, wait for the workers to catch up */
		if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
			wait_event(root->fs_info->async_submit_wait,
			   (atomic_read(&root->fs_info->async_delalloc_pages) <
			    limit));
		}

		/* while draining is requested, wait for everything queued */
		while (atomic_read(&root->fs_info->async_submit_draining) &&
		      atomic_read(&root->fs_info->async_delalloc_pages)) {
			wait_event(root->fs_info->async_submit_wait,
			  (atomic_read(&root->fs_info->async_delalloc_pages) ==
			   0));
		}

		*nr_written += nr_pages;
		start = cur_end + 1;
	}
	*page_started = 1;
	return 0;
}
|
|
|
|
|
2009-01-05 19:25:51 -07:00
|
|
|
/*
 * Report whether checksum items cover any part of the disk range
 * [bytenr, bytenr + num_bytes - 1] in the csum root.
 *
 * Returns 0 only when the lookup succeeded and produced no entries.
 * Returns 1 otherwise, which includes the case where the lookup itself
 * returned an error.  Every entry the lookup handed back is unlinked and
 * freed before returning.
 */
static noinline int csum_exist_in_range(struct btrfs_root *root,
					u64 bytenr, u64 num_bytes)
{
	LIST_HEAD(found);
	struct btrfs_ordered_sum *entry;
	u64 last_byte = bytenr + num_bytes - 1;
	int err;

	err = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
				       last_byte, &found, 0);
	if (!err && list_empty(&found))
		return 0;

	/* only the existence of entries matters; drop them all */
	for (;;) {
		if (list_empty(&found))
			break;
		entry = list_entry(found.next, struct btrfs_ordered_sum, list);
		list_del(&entry->list);
		kfree(entry);
	}
	return 1;
}
|
|
|
|
|
2008-09-29 13:18:18 -06:00
|
|
|
/*
|
|
|
|
* nocow writeback callback.  This checks for snapshots or COW copies
|
|
|
|
* of the extents that exist in the file, and COWs the file as required.
|
|
|
|
*
|
|
|
|
* If no cow copies or snapshots exist, we write directly to the existing
|
|
|
|
* blocks on disk
|
|
|
|
*/
|
2009-03-12 18:12:45 -06:00
|
|
|
static noinline int run_delalloc_nocow(struct inode *inode,
|
|
|
|
struct page *locked_page,
|
2008-11-06 20:02:51 -07:00
|
|
|
u64 start, u64 end, int *page_started, int force,
|
|
|
|
unsigned long *nr_written)
|
2007-12-17 18:14:01 -07:00
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2008-08-05 11:05:02 -06:00
|
|
|
struct btrfs_trans_handle *trans;
|
2007-12-17 18:14:01 -07:00
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct btrfs_path *path;
|
2008-10-30 12:20:02 -06:00
|
|
|
struct btrfs_file_extent_item *fi;
|
2007-12-17 18:14:01 -07:00
|
|
|
struct btrfs_key found_key;
|
2008-10-30 12:20:02 -06:00
|
|
|
u64 cow_start;
|
|
|
|
u64 cur_offset;
|
|
|
|
u64 extent_end;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers are within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 08:45:14 -06:00
|
|
|
u64 extent_offset;
|
2008-10-30 12:20:02 -06:00
|
|
|
u64 disk_bytenr;
|
|
|
|
u64 num_bytes;
|
|
|
|
int extent_type;
|
|
|
|
int ret;
|
2008-10-30 12:25:28 -06:00
|
|
|
int type;
|
2008-10-30 12:20:02 -06:00
|
|
|
int nocow;
|
|
|
|
int check_prev = 1;
|
2011-04-19 20:33:24 -06:00
|
|
|
bool nolock;
|
2011-04-19 20:31:50 -06:00
|
|
|
u64 ino = btrfs_ino(inode);
|
2007-12-17 18:14:01 -07:00
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
btrfs: don't BUG_ON btrfs_alloc_path() errors
This patch fixes many callers of btrfs_alloc_path() which BUG_ON allocation
failure. All the sites that are fixed in this patch were checked by me to
be fairly trivial to fix because of at least one of two criteria:
- Callers of the function catch errors from it already so bubbling the
error up will be handled.
- Callers of the function might BUG_ON any nonzero return code in which
case there is no behavior changed (but we still got to remove a BUG_ON)
The following functions were updated:
btrfs_lookup_extent, alloc_reserved_tree_block, btrfs_remove_block_group,
btrfs_lookup_csums_range, btrfs_csum_file_blocks, btrfs_mark_extent_written,
btrfs_inode_by_name, btrfs_new_inode, btrfs_symlink,
insert_reserved_file_extent, and run_delalloc_nocow
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
2011-07-13 11:38:47 -06:00
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
2011-04-19 20:33:24 -06:00
|
|
|
|
|
|
|
nolock = is_free_space_inode(root, inode);
|
|
|
|
|
|
|
|
if (nolock)
|
2011-04-13 10:54:33 -06:00
|
|
|
trans = btrfs_join_transaction_nolock(root);
|
2011-04-19 20:33:24 -06:00
|
|
|
else
|
2011-04-13 10:54:33 -06:00
|
|
|
trans = btrfs_join_transaction(root);
|
2011-05-28 05:00:39 -06:00
|
|
|
|
2011-01-24 19:51:38 -07:00
|
|
|
BUG_ON(IS_ERR(trans));
|
2011-04-13 10:02:53 -06:00
|
|
|
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
|
2007-12-17 18:14:01 -07:00
|
|
|
|
2008-10-30 12:20:02 -06:00
|
|
|
cow_start = (u64)-1;
|
|
|
|
cur_offset = start;
|
|
|
|
while (1) {
|
2011-04-19 20:31:50 -06:00
|
|
|
ret = btrfs_lookup_file_extent(trans, root, path, ino,
|
2008-10-30 12:20:02 -06:00
|
|
|
cur_offset, 0);
|
|
|
|
BUG_ON(ret < 0);
|
|
|
|
if (ret > 0 && path->slots[0] > 0 && check_prev) {
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key,
|
|
|
|
path->slots[0] - 1);
|
2011-04-19 20:31:50 -06:00
|
|
|
if (found_key.objectid == ino &&
|
2008-10-30 12:20:02 -06:00
|
|
|
found_key.type == BTRFS_EXTENT_DATA_KEY)
|
|
|
|
path->slots[0]--;
|
|
|
|
}
|
|
|
|
check_prev = 0;
|
|
|
|
next_slot:
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
|
|
|
|
ret = btrfs_next_leaf(root, path);
|
|
|
|
if (ret < 0)
|
|
|
|
BUG_ON(1);
|
|
|
|
if (ret > 0)
|
|
|
|
break;
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
}
|
2007-12-17 18:14:01 -07:00
|
|
|
|
2008-10-30 12:20:02 -06:00
|
|
|
nocow = 0;
|
|
|
|
disk_bytenr = 0;
|
2008-12-12 08:03:38 -07:00
|
|
|
num_bytes = 0;
|
2008-10-30 12:20:02 -06:00
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
|
|
|
|
|
2011-04-19 20:31:50 -06:00
|
|
|
if (found_key.objectid > ino ||
|
2008-10-30 12:20:02 -06:00
|
|
|
found_key.type > BTRFS_EXTENT_DATA_KEY ||
|
|
|
|
found_key.offset > end)
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (found_key.offset > cur_offset) {
|
|
|
|
extent_end = found_key.offset;
|
2009-10-09 07:57:45 -06:00
|
|
|
extent_type = 0;
|
2008-10-30 12:20:02 -06:00
|
|
|
goto out_check;
|
|
|
|
}
|
|
|
|
|
|
|
|
fi = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
extent_type = btrfs_file_extent_type(leaf, fi);
|
|
|
|
|
2008-10-30 12:25:28 -06:00
|
|
|
if (extent_type == BTRFS_FILE_EXTENT_REG ||
|
|
|
|
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
|
2008-10-30 12:20:02 -06:00
|
|
|
disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 08:45:14 -06:00
|
|
|
extent_offset = btrfs_file_extent_offset(leaf, fi);
|
2008-10-30 12:20:02 -06:00
|
|
|
extent_end = found_key.offset +
|
|
|
|
btrfs_file_extent_num_bytes(leaf, fi);
|
|
|
|
if (extent_end <= start) {
|
|
|
|
path->slots[0]++;
|
|
|
|
goto next_slot;
|
|
|
|
}
|
2008-12-12 08:03:38 -07:00
|
|
|
if (disk_bytenr == 0)
|
|
|
|
goto out_check;
|
2008-10-30 12:20:02 -06:00
|
|
|
if (btrfs_file_extent_compression(leaf, fi) ||
|
|
|
|
btrfs_file_extent_encryption(leaf, fi) ||
|
|
|
|
btrfs_file_extent_other_encoding(leaf, fi))
|
|
|
|
goto out_check;
|
2008-10-30 12:25:28 -06:00
|
|
|
if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
|
|
|
|
goto out_check;
|
2008-12-11 14:30:39 -07:00
|
|
|
if (btrfs_extent_readonly(root, disk_bytenr))
|
2008-10-30 12:20:02 -06:00
|
|
|
goto out_check;
|
2011-04-19 20:31:50 -06:00
|
|
|
if (btrfs_cross_ref_exist(trans, root, ino,
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 08:45:14 -06:00
|
|
|
found_key.offset -
|
|
|
|
extent_offset, disk_bytenr))
|
2008-12-12 08:03:38 -07:00
|
|
|
goto out_check;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 08:45:14 -06:00
|
|
|
disk_bytenr += extent_offset;
|
2008-12-12 08:03:38 -07:00
|
|
|
disk_bytenr += cur_offset - found_key.offset;
|
|
|
|
num_bytes = min(end + 1, extent_end) - cur_offset;
|
|
|
|
/*
|
|
|
|
* force cow if csum exists in the range.
|
|
|
|
* this ensure that csum for a given extent are
|
|
|
|
* either valid or do not exist.
|
|
|
|
*/
|
|
|
|
if (csum_exist_in_range(root, disk_bytenr, num_bytes))
|
|
|
|
goto out_check;
|
2008-10-30 12:20:02 -06:00
|
|
|
nocow = 1;
|
|
|
|
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
|
|
|
|
extent_end = found_key.offset +
|
|
|
|
btrfs_file_extent_inline_len(leaf, fi);
|
|
|
|
extent_end = ALIGN(extent_end, root->sectorsize);
|
|
|
|
} else {
|
|
|
|
BUG_ON(1);
|
|
|
|
}
|
|
|
|
out_check:
|
|
|
|
if (extent_end <= start) {
|
|
|
|
path->slots[0]++;
|
|
|
|
goto next_slot;
|
|
|
|
}
|
|
|
|
if (!nocow) {
|
|
|
|
if (cow_start == (u64)-1)
|
|
|
|
cow_start = cur_offset;
|
|
|
|
cur_offset = extent_end;
|
|
|
|
if (cur_offset > end)
|
|
|
|
break;
|
|
|
|
path->slots[0]++;
|
|
|
|
goto next_slot;
|
2008-08-05 11:05:02 -06:00
|
|
|
}
|
|
|
|
|
2011-04-20 17:20:15 -06:00
|
|
|
btrfs_release_path(path);
|
2008-10-30 12:20:02 -06:00
|
|
|
if (cow_start != (u64)-1) {
|
|
|
|
ret = cow_file_range(inode, locked_page, cow_start,
|
2008-11-06 20:02:51 -07:00
|
|
|
found_key.offset - 1, page_started,
|
|
|
|
nr_written, 1);
|
2008-10-30 12:20:02 -06:00
|
|
|
BUG_ON(ret);
|
|
|
|
cow_start = (u64)-1;
|
2008-08-05 11:05:02 -06:00
|
|
|
}
|
2008-10-30 12:20:02 -06:00
|
|
|
|
2008-10-30 12:25:28 -06:00
|
|
|
if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
|
|
|
|
struct extent_map *em;
|
|
|
|
struct extent_map_tree *em_tree;
|
|
|
|
em_tree = &BTRFS_I(inode)->extent_tree;
|
2011-04-20 16:48:27 -06:00
|
|
|
em = alloc_extent_map();
|
2011-02-13 17:45:29 -07:00
|
|
|
BUG_ON(!em);
|
2008-10-30 12:25:28 -06:00
|
|
|
em->start = cur_offset;
|
2008-11-10 09:53:33 -07:00
|
|
|
em->orig_start = em->start;
|
2008-10-30 12:25:28 -06:00
|
|
|
em->len = num_bytes;
|
|
|
|
em->block_len = num_bytes;
|
|
|
|
em->block_start = disk_bytenr;
|
|
|
|
em->bdev = root->fs_info->fs_devices->latest_bdev;
|
|
|
|
set_bit(EXTENT_FLAG_PINNED, &em->flags);
|
|
|
|
while (1) {
|
2009-09-02 14:24:52 -06:00
|
|
|
write_lock(&em_tree->lock);
|
2008-10-30 12:25:28 -06:00
|
|
|
ret = add_extent_mapping(em_tree, em);
|
2009-09-02 14:24:52 -06:00
|
|
|
write_unlock(&em_tree->lock);
|
2008-10-30 12:25:28 -06:00
|
|
|
if (ret != -EEXIST) {
|
|
|
|
free_extent_map(em);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
btrfs_drop_extent_cache(inode, em->start,
|
|
|
|
em->start + em->len - 1, 0);
|
|
|
|
}
|
|
|
|
type = BTRFS_ORDERED_PREALLOC;
|
|
|
|
} else {
|
|
|
|
type = BTRFS_ORDERED_NOCOW;
|
|
|
|
}
|
2008-10-30 12:20:02 -06:00
|
|
|
|
|
|
|
ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
|
2008-10-30 12:25:28 -06:00
|
|
|
num_bytes, num_bytes, type);
|
|
|
|
BUG_ON(ret);
|
2008-11-06 20:02:51 -07:00
|
|
|
|
2010-05-16 08:49:59 -06:00
|
|
|
if (root->root_key.objectid ==
|
|
|
|
BTRFS_DATA_RELOC_TREE_OBJECTID) {
|
|
|
|
ret = btrfs_reloc_clone_csums(inode, cur_offset,
|
|
|
|
num_bytes);
|
|
|
|
BUG_ON(ret);
|
|
|
|
}
|
|
|
|
|
2008-10-30 12:25:28 -06:00
|
|
|
extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
|
2009-10-08 09:27:10 -06:00
|
|
|
cur_offset, cur_offset + num_bytes - 1,
|
|
|
|
locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
|
|
|
|
EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
|
|
|
|
EXTENT_SET_PRIVATE2);
|
2008-10-30 12:20:02 -06:00
|
|
|
cur_offset = extent_end;
|
|
|
|
if (cur_offset > end)
|
|
|
|
break;
|
2007-12-17 18:14:01 -07:00
|
|
|
}
|
2011-04-20 17:20:15 -06:00
|
|
|
btrfs_release_path(path);
|
2008-10-30 12:20:02 -06:00
|
|
|
|
|
|
|
if (cur_offset <= end && cow_start == (u64)-1)
|
|
|
|
cow_start = cur_offset;
|
|
|
|
if (cow_start != (u64)-1) {
|
|
|
|
ret = cow_file_range(inode, locked_page, cow_start, end,
|
2008-11-06 20:02:51 -07:00
|
|
|
page_started, nr_written, 1);
|
2008-10-30 12:20:02 -06:00
|
|
|
BUG_ON(ret);
|
|
|
|
}
|
|
|
|
|
2010-07-02 10:14:14 -06:00
|
|
|
if (nolock) {
|
|
|
|
ret = btrfs_end_transaction_nolock(trans, root);
|
|
|
|
BUG_ON(ret);
|
|
|
|
} else {
|
|
|
|
ret = btrfs_end_transaction(trans, root);
|
|
|
|
BUG_ON(ret);
|
|
|
|
}
|
2008-08-05 11:05:02 -06:00
|
|
|
btrfs_free_path(path);
|
2008-10-30 12:20:02 -06:00
|
|
|
return 0;
|
2007-12-17 18:14:01 -07:00
|
|
|
}
|
|
|
|
|
2008-09-29 13:18:18 -06:00
|
|
|
/*
|
|
|
|
* extent_io.c call back to do delayed allocation processing
|
|
|
|
*/
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
static int run_delalloc_range(struct inode *inode, struct page *locked_page,
|
2008-11-06 20:02:51 -07:00
|
|
|
u64 start, u64 end, int *page_started,
|
|
|
|
unsigned long *nr_written)
|
2007-12-17 18:14:01 -07:00
|
|
|
{
|
|
|
|
int ret;
|
2009-03-12 18:12:45 -06:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2008-06-25 14:01:30 -06:00
|
|
|
|
2009-04-17 02:37:41 -06:00
|
|
|
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
ret = run_delalloc_nocow(inode, locked_page, start, end,
|
2009-01-05 19:25:51 -07:00
|
|
|
page_started, 1, nr_written);
|
2009-04-17 02:37:41 -06:00
|
|
|
else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
|
2008-10-30 12:25:28 -06:00
|
|
|
ret = run_delalloc_nocow(inode, locked_page, start, end,
|
2009-01-05 19:25:51 -07:00
|
|
|
page_started, 0, nr_written);
|
2010-03-11 07:42:04 -07:00
|
|
|
else if (!btrfs_test_opt(root, COMPRESS) &&
|
Btrfs: Per file/directory controls for COW and compression
Data compression and data cow are controlled across the entire FS by mount
options right now. ioctls are needed to set this on a per file or per
directory basis. This has been proposed previously, but VFS developers
wanted us to use generic ioctls rather than btrfs-specific ones.
According to Chris's comment, there should be just one true compression
method(probably LZO) stored in the super. However, before this, we would
wait for that one method is stable enough to be adopted into the super.
So I list it as a long term goal, and just store it in ram today.
After applying this patch, we can use the generic "FS_IOC_SETFLAGS" ioctl to
control file and directory's datacow and compression attribute.
NOTE:
- The compression type is selected by such rules:
If we mount btrfs with compress options, ie, zlib/lzo, the type is it.
Otherwise, we'll use the default compress type (zlib today).
v1->v2:
- rebase to the latest btrfs.
v2->v3:
- fix a problem, i.e. when a file is set NOCOW via mount option, then this NOCOW
will be screwed by inheritance from parent directory.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-22 04:12:20 -06:00
|
|
|
!(BTRFS_I(inode)->force_compress) &&
|
|
|
|
!(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))
|
2009-03-12 18:12:45 -06:00
|
|
|
ret = cow_file_range(inode, locked_page, start, end,
|
|
|
|
page_started, nr_written, 1);
|
2007-12-17 18:14:01 -07:00
|
|
|
else
|
2008-11-06 20:02:51 -07:00
|
|
|
ret = cow_file_range_async(inode, locked_page, start, end,
|
2009-01-05 19:25:51 -07:00
|
|
|
page_started, nr_written);
|
2007-08-27 14:49:44 -06:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2009-09-11 14:12:44 -06:00
|
|
|
/*
 * extent_io.c split_extent_hook: called with the extent state @orig that is
 * about to be split.  When @orig carries EXTENT_DELALLOC the inode's
 * outstanding_extents counter is incremented by one; states without that
 * bit are ignored.  @split is not used here.
 *
 * Always returns 0.
 */
static int btrfs_split_extent_hook(struct inode *inode,
				   struct extent_state *orig, u64 split)
{
	/* only delalloc state takes part in the outstanding extent count */
	if (orig->state & EXTENT_DELALLOC)
		atomic_inc(&BTRFS_I(inode)->outstanding_extents);

	return 0;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* extent_io.c merge_extent_hook, used to track merged delayed allocation
|
|
|
|
* extents so we can keep track of new extents that are just merged onto old
|
|
|
|
* extents, such as when we are doing sequential writes, so we can properly
|
|
|
|
* account for the metadata space we'll need.
|
|
|
|
*/
|
|
|
|
static int btrfs_merge_extent_hook(struct inode *inode,
|
|
|
|
struct extent_state *new,
|
|
|
|
struct extent_state *other)
|
|
|
|
{
|
|
|
|
/* not delalloc, ignore it */
|
|
|
|
if (!(other->state & EXTENT_DELALLOC))
|
|
|
|
return 0;
|
|
|
|
|
2010-05-16 08:48:47 -06:00
|
|
|
atomic_dec(&BTRFS_I(inode)->outstanding_extents);
|
2009-09-11 14:12:44 -06:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-09-29 13:18:18 -06:00
|
|
|
/*
|
|
|
|
* extent_io.c set_bit_hook, used to track delayed allocation
|
|
|
|
* bytes in this file, and to maintain the list of inodes that
|
|
|
|
* have pending delalloc work to be done.
|
|
|
|
*/
|
2010-05-16 08:48:47 -06:00
|
|
|
static int btrfs_set_bit_hook(struct inode *inode,
|
|
|
|
struct extent_state *state, int *bits)
|
2008-01-29 13:55:23 -07:00
|
|
|
{
|
2009-09-11 14:12:44 -06:00
|
|
|
|
2008-12-15 13:54:40 -07:00
|
|
|
/*
|
|
|
|
* set_bit and clear bit hooks normally require _irqsave/restore
|
2011-05-20 14:20:32 -06:00
|
|
|
* but in this case, we are only testing for the DELALLOC
|
2008-12-15 13:54:40 -07:00
|
|
|
* bit, which is only set or cleared with irqs on
|
|
|
|
*/
|
2010-05-16 08:48:47 -06:00
|
|
|
if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
|
2008-01-29 13:55:23 -07:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2010-05-16 08:48:47 -06:00
|
|
|
u64 len = state->end + 1 - state->start;
|
2011-04-19 20:33:24 -06:00
|
|
|
bool do_list = !is_free_space_inode(root, inode);
|
2009-09-11 14:12:44 -06:00
|
|
|
|
2010-05-16 08:48:47 -06:00
|
|
|
if (*bits & EXTENT_FIRST_DELALLOC)
|
|
|
|
*bits &= ~EXTENT_FIRST_DELALLOC;
|
|
|
|
else
|
|
|
|
atomic_inc(&BTRFS_I(inode)->outstanding_extents);
|
2010-03-19 12:07:23 -06:00
|
|
|
|
2008-12-15 13:54:40 -07:00
|
|
|
spin_lock(&root->fs_info->delalloc_lock);
|
2010-05-16 08:48:47 -06:00
|
|
|
BTRFS_I(inode)->delalloc_bytes += len;
|
|
|
|
root->fs_info->delalloc_bytes += len;
|
2010-07-02 10:14:14 -06:00
|
|
|
if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
|
2008-08-04 21:17:27 -06:00
|
|
|
list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
|
|
|
|
&root->fs_info->delalloc_inodes);
|
|
|
|
}
|
2008-12-15 13:54:40 -07:00
|
|
|
spin_unlock(&root->fs_info->delalloc_lock);
|
2008-01-29 13:55:23 -07:00
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-09-29 13:18:18 -06:00
|
|
|
/*
|
|
|
|
* extent_io.c clear_bit_hook, see set_bit_hook for why
|
|
|
|
*/
|
2009-09-11 14:12:44 -06:00
|
|
|
static int btrfs_clear_bit_hook(struct inode *inode,
|
2010-05-16 08:48:47 -06:00
|
|
|
struct extent_state *state, int *bits)
|
2008-01-29 13:55:23 -07:00
|
|
|
{
|
2008-12-15 13:54:40 -07:00
|
|
|
/*
|
|
|
|
* set_bit and clear bit hooks normally require _irqsave/restore
|
2011-05-20 14:20:32 -06:00
|
|
|
* but in this case, we are only testing for the DELALLOC
|
2008-12-15 13:54:40 -07:00
|
|
|
* bit, which is only set or cleared with irqs on
|
|
|
|
*/
|
2010-05-16 08:48:47 -06:00
|
|
|
if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
|
2008-01-29 13:55:23 -07:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2010-05-16 08:48:47 -06:00
|
|
|
u64 len = state->end + 1 - state->start;
|
2011-04-19 20:33:24 -06:00
|
|
|
bool do_list = !is_free_space_inode(root, inode);
|
2008-04-22 11:26:47 -06:00
|
|
|
|
2010-05-16 08:48:47 -06:00
|
|
|
if (*bits & EXTENT_FIRST_DELALLOC)
|
|
|
|
*bits &= ~EXTENT_FIRST_DELALLOC;
|
|
|
|
else if (!(*bits & EXTENT_DO_ACCOUNTING))
|
|
|
|
atomic_dec(&BTRFS_I(inode)->outstanding_extents);
|
|
|
|
|
|
|
|
if (*bits & EXTENT_DO_ACCOUNTING)
|
|
|
|
btrfs_delalloc_release_metadata(inode, len);
|
|
|
|
|
2010-07-02 10:14:14 -06:00
|
|
|
if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
|
|
|
|
&& do_list)
|
2010-05-16 08:48:47 -06:00
|
|
|
btrfs_free_reserved_data_space(inode, len);
|
2009-09-11 14:12:44 -06:00
|
|
|
|
2008-12-15 13:54:40 -07:00
|
|
|
spin_lock(&root->fs_info->delalloc_lock);
|
2010-05-16 08:48:47 -06:00
|
|
|
root->fs_info->delalloc_bytes -= len;
|
|
|
|
BTRFS_I(inode)->delalloc_bytes -= len;
|
|
|
|
|
2010-07-02 10:14:14 -06:00
|
|
|
if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
|
2008-08-04 21:17:27 -06:00
|
|
|
!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
|
|
|
|
list_del_init(&BTRFS_I(inode)->delalloc_inodes);
|
|
|
|
}
|
2008-12-15 13:54:40 -07:00
|
|
|
spin_unlock(&root->fs_info->delalloc_lock);
|
2008-01-29 13:55:23 -07:00
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-09-29 13:18:18 -06:00
|
|
|
/*
|
|
|
|
* extent_io.c merge_bio_hook, this must check the chunk tree to make sure
|
|
|
|
* we don't create bios that span stripes or chunks
|
|
|
|
*/
|
2008-03-24 13:02:07 -06:00
|
|
|
int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
size_t size, struct bio *bio,
|
|
|
|
unsigned long bio_flags)
|
2008-03-24 13:02:07 -06:00
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
|
|
|
|
struct btrfs_mapping_tree *map_tree;
|
2008-10-03 14:31:08 -06:00
|
|
|
u64 logical = (u64)bio->bi_sector << 9;
|
2008-03-24 13:02:07 -06:00
|
|
|
u64 length = 0;
|
|
|
|
u64 map_length;
|
|
|
|
int ret;
|
|
|
|
|
2008-11-06 20:02:51 -07:00
|
|
|
if (bio_flags & EXTENT_BIO_COMPRESSED)
|
|
|
|
return 0;
|
|
|
|
|
2008-04-21 08:03:05 -06:00
|
|
|
length = bio->bi_size;
|
2008-03-24 13:02:07 -06:00
|
|
|
map_tree = &root->fs_info->mapping_tree;
|
|
|
|
map_length = length;
|
2008-04-09 14:28:12 -06:00
|
|
|
ret = btrfs_map_block(map_tree, READ, logical,
|
2008-04-09 14:28:12 -06:00
|
|
|
&map_length, NULL, 0);
|
2008-04-09 14:28:12 -06:00
|
|
|
|
2009-01-05 19:25:51 -07:00
|
|
|
if (map_length < length + size)
|
2008-03-24 13:02:07 -06:00
|
|
|
return 1;
|
2010-10-29 13:14:31 -06:00
|
|
|
return ret;
|
2008-03-24 13:02:07 -06:00
|
|
|
}
|
|
|
|
|
2008-09-29 13:18:18 -06:00
|
|
|
/*
|
|
|
|
* in order to insert checksums into the metadata in large chunks,
|
|
|
|
* we wait until bio submission time. All the pages in the bio are
|
|
|
|
* checksummed and sums are attached onto the ordered extent record.
|
|
|
|
*
|
|
|
|
* At IO completion time the csums attached on the ordered extent record
|
|
|
|
* are inserted into the btree
|
|
|
|
*/
|
2009-01-05 19:25:51 -07:00
|
|
|
static int __btrfs_submit_bio_start(struct inode *inode, int rw,
|
|
|
|
struct bio *bio, int mirror_num,
|
2010-05-25 07:48:28 -06:00
|
|
|
unsigned long bio_flags,
|
|
|
|
u64 bio_offset)
|
2008-02-20 10:07:25 -07:00
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
int ret = 0;
|
2008-04-16 09:15:20 -06:00
|
|
|
|
Btrfs: move data checksumming into a dedicated tree
Btrfs stores checksums for each data block. Until now, they have
been stored in the subvolume trees, indexed by the inode that is
referencing the data block. This means that when we read the inode,
we've probably read in at least some checksums as well.
But, this has a few problems:
* The checksums are indexed by logical offset in the file. When
compression is on, this means we have to do the expensive checksumming
on the uncompressed data. It would be faster if we could checksum
the compressed data instead.
* If we implement encryption, we'll be checksumming the plain text and
storing that on disk. This is significantly less secure.
* For either compression or encryption, we have to get the plain text
back before we can verify the checksum as correct. This makes the raid
layer balancing and extent moving much more expensive.
* It makes the front end caching code more complex, as we have touch
the subvolume and inodes as we cache extents.
* There is potentitally one copy of the checksum in each subvolume
referencing an extent.
The solution used here is to store the extent checksums in a dedicated
tree. This allows us to index the checksums by phyiscal extent
start and length. It means:
* The checksum is against the data stored on disk, after any compression
or encryption is done.
* The checksum is stored in a central location, and can be verified without
following back references, or reading inodes.
This makes compression significantly faster by reducing the amount of
data that needs to be checksummed. It will also allow much faster
raid management code in general.
The checksums are indexed by a key with a fixed objectid (a magic value
in ctree.h) and offset set to the starting byte of the extent. This
allows us to copy the checksum items into the fsync log tree directly (or
any other tree), without having to invent a second format for them.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-12-08 14:58:54 -07:00
|
|
|
ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
|
2008-04-16 09:14:51 -06:00
|
|
|
BUG_ON(ret);
|
Btrfs: Add ordered async work queues
Btrfs uses kernel threads to create async work queues for cpu intensive
operations such as checksumming and decompression. These work well,
but they make it difficult to keep IO order intact.
A single writepages call from pdflush or fsync will turn into a number
of bios, and each bio is checksummed in parallel. Once the checksum is
computed, the bio is sent down to the disk, and since we don't control
the order in which the parallel operations happen, they might go down to
the disk in almost any order.
The code deals with this somewhat by having deep work queues for a single
kernel thread, making it very likely that a single thread will process all
the bios for a single inode.
This patch introduces an explicitly ordered work queue. As work structs
are placed into the queue they are put onto the tail of a list. They have
three callbacks:
->func (cpu intensive processing here)
->ordered_func (order sensitive processing here)
->ordered_free (free the work struct, all processing is done)
The work struct has three callbacks. The func callback does the cpu intensive
work, and when it completes the work struct is marked as done.
Every time a work struct completes, the list is checked to see if the head
is marked as done. If so the ordered_func callback is used to do the
order sensitive processing and the ordered_free callback is used to do
any cleanup. Then we loop back and check the head of the list again.
This patch also changes the checksumming code to use the ordered workqueues.
One a 4 drive array, it increases streaming writes from 280MB/s to 350MB/s.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-11-06 20:03:00 -07:00
|
|
|
return 0;
|
|
|
|
}
|
2008-04-16 09:15:20 -06:00
|
|
|
|
Btrfs: Add ordered async work queues
Btrfs uses kernel threads to create async work queues for cpu intensive
operations such as checksumming and decompression. These work well,
but they make it difficult to keep IO order intact.
A single writepages call from pdflush or fsync will turn into a number
of bios, and each bio is checksummed in parallel. Once the checksum is
computed, the bio is sent down to the disk, and since we don't control
the order in which the parallel operations happen, they might go down to
the disk in almost any order.
The code deals with this somewhat by having deep work queues for a single
kernel thread, making it very likely that a single thread will process all
the bios for a single inode.
This patch introduces an explicitly ordered work queue. As work structs
are placed into the queue they are put onto the tail of a list. They have
three callbacks:
->func (cpu intensive processing here)
->ordered_func (order sensitive processing here)
->ordered_free (free the work struct, all processing is done)
The work struct has three callbacks. The func callback does the cpu intensive
work, and when it completes the work struct is marked as done.
Every time a work struct completes, the list is checked to see if the head
is marked as done. If so the ordered_func callback is used to do the
order sensitive processing and the ordered_free callback is used to do
any cleanup. Then we loop back and check the head of the list again.
This patch also changes the checksumming code to use the ordered workqueues.
On a 4 drive array, it increases streaming writes from 280MB/s to 350MB/s.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-11-06 20:03:00 -07:00
|
|
|
/*
|
|
|
|
* in order to insert checksums into the metadata in large chunks,
|
|
|
|
* we wait until bio submission time. All the pages in the bio are
|
|
|
|
* checksummed and sums are attached onto the ordered extent record.
|
|
|
|
*
|
|
|
|
* At IO completion time the csums attached on the ordered extent record
|
|
|
|
* are inserted into the btree
|
|
|
|
*/
|
2008-12-02 07:54:17 -07:00
|
|
|
static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
|
2010-05-25 07:48:28 -06:00
|
|
|
int mirror_num, unsigned long bio_flags,
|
|
|
|
u64 bio_offset)
|
Btrfs: Add ordered async work queues
Btrfs uses kernel threads to create async work queues for cpu intensive
operations such as checksumming and decompression. These work well,
but they make it difficult to keep IO order intact.
A single writepages call from pdflush or fsync will turn into a number
of bios, and each bio is checksummed in parallel. Once the checksum is
computed, the bio is sent down to the disk, and since we don't control
the order in which the parallel operations happen, they might go down to
the disk in almost any order.
The code deals with this somewhat by having deep work queues for a single
kernel thread, making it very likely that a single thread will process all
the bios for a single inode.
This patch introduces an explicitly ordered work queue. As work structs
are placed into the queue they are put onto the tail of a list. They have
three callbacks:
->func (cpu intensive processing here)
->ordered_func (order sensitive processing here)
->ordered_free (free the work struct, all processing is done)
The work struct has three callbacks. The func callback does the cpu intensive
work, and when it completes the work struct is marked as done.
Every time a work struct completes, the list is checked to see if the head
is marked as done. If so the ordered_func callback is used to do the
order sensitive processing and the ordered_free callback is used to do
any cleanup. Then we loop back and check the head of the list again.
This patch also changes the checksumming code to use the ordered workqueues.
One a 4 drive array, it increases streaming writes from 280MB/s to 350MB/s.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-11-06 20:03:00 -07:00
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2008-06-11 14:50:36 -06:00
|
|
|
return btrfs_map_bio(root, rw, bio, mirror_num, 1);
|
2008-04-16 09:14:51 -06:00
|
|
|
}
|
|
|
|
|
2008-09-29 13:18:18 -06:00
|
|
|
/*
|
2008-12-17 12:51:42 -07:00
|
|
|
* extent_io.c submission hook. This does the right thing for csum calculation
|
|
|
|
* on write, or reading the csums from the tree before a read
|
2008-09-29 13:18:18 -06:00
|
|
|
*/
|
2008-12-02 07:54:17 -07:00
|
|
|
static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
|
2010-05-25 07:48:28 -06:00
|
|
|
int mirror_num, unsigned long bio_flags,
|
|
|
|
u64 bio_offset)
|
2008-04-16 09:14:51 -06:00
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
int ret = 0;
|
2008-10-30 12:23:13 -06:00
|
|
|
int skip_sum;
|
2008-04-16 09:14:51 -06:00
|
|
|
|
2009-04-17 02:37:41 -06:00
|
|
|
skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
|
2008-12-17 12:51:42 -07:00
|
|
|
|
2011-04-19 20:33:24 -06:00
|
|
|
if (is_free_space_inode(root, inode))
|
2010-07-02 10:14:14 -06:00
|
|
|
ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2);
|
|
|
|
else
|
|
|
|
ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
|
2008-07-17 10:53:50 -06:00
|
|
|
BUG_ON(ret);
|
2008-02-20 10:07:25 -07:00
|
|
|
|
2010-08-07 10:20:39 -06:00
|
|
|
if (!(rw & REQ_WRITE)) {
|
Btrfs: move data checksumming into a dedicated tree
Btrfs stores checksums for each data block. Until now, they have
been stored in the subvolume trees, indexed by the inode that is
referencing the data block. This means that when we read the inode,
we've probably read in at least some checksums as well.
But, this has a few problems:
* The checksums are indexed by logical offset in the file. When
compression is on, this means we have to do the expensive checksumming
on the uncompressed data. It would be faster if we could checksum
the compressed data instead.
* If we implement encryption, we'll be checksumming the plain text and
storing that on disk. This is significantly less secure.
* For either compression or encryption, we have to get the plain text
back before we can verify the checksum as correct. This makes the raid
layer balancing and extent moving much more expensive.
* It makes the front end caching code more complex, as we have touch
the subvolume and inodes as we cache extents.
* There is potentitally one copy of the checksum in each subvolume
referencing an extent.
The solution used here is to store the extent checksums in a dedicated
tree. This allows us to index the checksums by phyiscal extent
start and length. It means:
* The checksum is against the data stored on disk, after any compression
or encryption is done.
* The checksum is stored in a central location, and can be verified without
following back references, or reading inodes.
This makes compression significantly faster by reducing the amount of
data that needs to be checksummed. It will also allow much faster
raid management code in general.
The checksums are indexed by a key with a fixed objectid (a magic value
in ctree.h) and offset set to the starting byte of the extent. This
allows us to copy the checksum items into the fsync log tree directly (or
any other tree), without having to invent a second format for them.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-12-08 14:58:54 -07:00
|
|
|
if (bio_flags & EXTENT_BIO_COMPRESSED) {
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
return btrfs_submit_compressed_read(inode, bio,
|
|
|
|
mirror_num, bio_flags);
|
2011-02-28 23:48:31 -07:00
|
|
|
} else if (!skip_sum) {
|
|
|
|
ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
}
|
2008-08-20 07:44:52 -06:00
|
|
|
goto mapit;
|
2008-10-30 12:23:13 -06:00
|
|
|
} else if (!skip_sum) {
|
2008-12-12 08:03:38 -07:00
|
|
|
/* csum items have already been cloned */
|
|
|
|
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
|
|
|
|
goto mapit;
|
2008-10-30 12:23:13 -06:00
|
|
|
/* we're doing a write, do the async checksumming */
|
|
|
|
return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
|
2008-04-16 09:14:51 -06:00
|
|
|
inode, rw, bio, mirror_num,
|
2010-05-25 07:48:28 -06:00
|
|
|
bio_flags, bio_offset,
|
|
|
|
__btrfs_submit_bio_start,
|
Btrfs: Add ordered async work queues
Btrfs uses kernel threads to create async work queues for cpu intensive
operations such as checksumming and decompression. These work well,
but they make it difficult to keep IO order intact.
A single writepages call from pdflush or fsync will turn into a number
of bios, and each bio is checksummed in parallel. Once the checksum is
computed, the bio is sent down to the disk, and since we don't control
the order in which the parallel operations happen, they might go down to
the disk in almost any order.
The code deals with this somewhat by having deep work queues for a single
kernel thread, making it very likely that a single thread will process all
the bios for a single inode.
This patch introduces an explicitly ordered work queue. As work structs
are placed into the queue they are put onto the tail of a list. They have
three callbacks:
->func (cpu intensive processing here)
->ordered_func (order sensitive processing here)
->ordered_free (free the work struct, all processing is done)
The work struct has three callbacks. The func callback does the cpu intensive
work, and when it completes the work struct is marked as done.
Every time a work struct completes, the list is checked to see if the head
is marked as done. If so the ordered_func callback is used to do the
order sensitive processing and the ordered_free callback is used to do
any cleanup. Then we loop back and check the head of the list again.
This patch also changes the checksumming code to use the ordered workqueues.
One a 4 drive array, it increases streaming writes from 280MB/s to 350MB/s.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-11-06 20:03:00 -07:00
|
|
|
__btrfs_submit_bio_done);
|
2008-10-30 12:23:13 -06:00
|
|
|
}
|
|
|
|
|
2008-03-24 13:01:56 -06:00
|
|
|
mapit:
|
2008-06-11 14:50:36 -06:00
|
|
|
return btrfs_map_bio(root, rw, bio, mirror_num, 0);
|
2008-02-20 10:07:25 -07:00
|
|
|
}
|
2008-02-20 14:11:05 -07:00
|
|
|
|
2008-09-29 13:18:18 -06:00
|
|
|
/*
|
|
|
|
* given a list of ordered sums record them in the inode. This happens
|
|
|
|
* at IO completion time based on sums calculated at bio submission time.
|
|
|
|
*/
|
2008-07-17 10:54:15 -06:00
|
|
|
static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
			     struct inode *inode, u64 file_offset,
			     struct list_head *list)
{
	struct btrfs_root *csum_root = BTRFS_I(inode)->root->fs_info->csum_root;
	struct btrfs_ordered_sum *sum;
	int first_err = 0;
	int ret;

	/*
	 * Insert every pending sum into the csum tree.  file_offset is part
	 * of the signature but is not used here.
	 *
	 * A failure used to be dropped on the floor and 0 returned
	 * unconditionally.  We still attempt every entry (so callers that
	 * ignore the result see the same insertions as before), but the
	 * first error is now reported to the caller.
	 */
	list_for_each_entry(sum, list, list) {
		ret = btrfs_csum_file_blocks(trans, csum_root, sum);
		if (ret && !first_err)
			first_err = ret;
	}

	return first_err;
}
|
|
|
|
|
2010-02-03 12:33:23 -07:00
|
|
|
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
			      struct extent_state **cached_state)
{
	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;

	/*
	 * Warn when 'end' is page aligned.  Callers in this file pass an
	 * inclusive end (start + PAGE_CACHE_SIZE - 1), so an aligned value
	 * presumably indicates an off-by-one in the caller.
	 */
	WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);

	return set_extent_delalloc(tree, start, end, cached_state, GFP_NOFS);
}
|
|
|
|
|
2008-09-29 13:18:18 -06:00
|
|
|
/* see btrfs_writepage_start_hook for details on why this is required */
|
2008-07-17 10:53:51 -06:00
|
|
|
struct btrfs_writepage_fixup {
|
|
|
|
struct page *page;
|
|
|
|
struct btrfs_work work;
|
|
|
|
};
|
|
|
|
|
2008-12-02 07:54:17 -07:00
|
|
|
static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
|
2008-07-17 10:53:51 -06:00
|
|
|
{
|
|
|
|
struct btrfs_writepage_fixup *fixup;
|
|
|
|
struct btrfs_ordered_extent *ordered;
|
2010-02-03 12:33:23 -07:00
|
|
|
struct extent_state *cached_state = NULL;
|
2008-07-17 10:53:51 -06:00
|
|
|
struct page *page;
|
|
|
|
struct inode *inode;
|
|
|
|
u64 page_start;
|
|
|
|
u64 page_end;
|
|
|
|
|
|
|
|
fixup = container_of(work, struct btrfs_writepage_fixup, work);
|
|
|
|
page = fixup->page;
|
2008-07-21 08:29:44 -06:00
|
|
|
again:
|
2008-07-17 10:53:51 -06:00
|
|
|
lock_page(page);
|
|
|
|
if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
|
|
|
|
ClearPageChecked(page);
|
|
|
|
goto out_page;
|
|
|
|
}
|
|
|
|
|
|
|
|
inode = page->mapping->host;
|
|
|
|
page_start = page_offset(page);
|
|
|
|
page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
|
|
|
|
|
2010-02-03 12:33:23 -07:00
|
|
|
lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
|
|
|
|
&cached_state, GFP_NOFS);
|
2008-07-21 08:29:44 -06:00
|
|
|
|
|
|
|
/* already ordered? We're done */
|
2009-09-02 14:53:46 -06:00
|
|
|
if (PagePrivate2(page))
|
2008-07-17 10:53:51 -06:00
|
|
|
goto out;
|
2008-07-21 08:29:44 -06:00
|
|
|
|
|
|
|
ordered = btrfs_lookup_ordered_extent(inode, page_start);
|
|
|
|
if (ordered) {
|
2010-02-03 12:33:23 -07:00
|
|
|
unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
|
|
|
|
page_end, &cached_state, GFP_NOFS);
|
2008-07-21 08:29:44 -06:00
|
|
|
unlock_page(page);
|
|
|
|
btrfs_start_ordered_extent(inode, ordered, 1);
|
|
|
|
goto again;
|
|
|
|
}
|
2008-07-17 10:53:51 -06:00
|
|
|
|
2010-05-16 08:48:47 -06:00
|
|
|
BUG();
|
2010-02-03 12:33:23 -07:00
|
|
|
btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
|
2008-07-17 10:53:51 -06:00
|
|
|
ClearPageChecked(page);
|
|
|
|
out:
|
2010-02-03 12:33:23 -07:00
|
|
|
unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
|
|
|
|
&cached_state, GFP_NOFS);
|
2008-07-17 10:53:51 -06:00
|
|
|
out_page:
|
|
|
|
unlock_page(page);
|
|
|
|
page_cache_release(page);
|
2011-01-26 01:19:22 -07:00
|
|
|
kfree(fixup);
|
2008-07-17 10:53:51 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* There are a few paths in the higher layers of the kernel that directly
|
|
|
|
* set the page dirty bit without asking the filesystem if it is a
|
|
|
|
* good idea. This causes problems because we want to make sure COW
|
|
|
|
* properly happens and the data=ordered rules are followed.
|
|
|
|
*
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
* In our case any range that doesn't have the ORDERED bit set
|
2008-07-17 10:53:51 -06:00
|
|
|
* hasn't been properly setup for IO. We kick off an async process
|
|
|
|
* to fix it up. The async helper will wait for ordered extents, set
|
|
|
|
* the delalloc bit and make it safe to write the page.
|
|
|
|
*/
|
2008-12-02 07:54:17 -07:00
|
|
|
static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
{
	struct inode *host = page->mapping->host;
	struct btrfs_root *root = BTRFS_I(host)->root;
	struct btrfs_writepage_fixup *req;

	/* Private2 set means the page is properly in the ordered list */
	if (TestClearPagePrivate2(page))
		return 0;

	/*
	 * Queue a fixup unless one is already pending (PageChecked).  If the
	 * allocation fails we queue nothing and still return -EAGAIN, as
	 * in every other non-ordered case.  start and end are unused.
	 */
	if (!PageChecked(page)) {
		req = kzalloc(sizeof(*req), GFP_NOFS);
		if (req) {
			SetPageChecked(page);
			/* reference is dropped by the fixup worker */
			page_cache_get(page);
			req->work.func = btrfs_writepage_fixup_worker;
			req->page = page;
			btrfs_queue_worker(&root->fs_info->fixup_workers,
					   &req->work);
		}
	}

	return -EAGAIN;
}
|
|
|
|
|
2008-10-30 12:25:28 -06:00
|
|
|
static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
|
|
|
|
struct inode *inode, u64 file_pos,
|
|
|
|
u64 disk_bytenr, u64 disk_num_bytes,
|
|
|
|
u64 num_bytes, u64 ram_bytes,
|
|
|
|
u8 compression, u8 encryption,
|
|
|
|
u16 other_encoding, int extent_type)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_file_extent_item *fi;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct btrfs_key ins;
|
|
|
|
u64 hint;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
btrfs: don't BUG_ON btrfs_alloc_path() errors
This patch fixes many callers of btrfs_alloc_path() which BUG_ON allocation
failure. All the sites that are fixed in this patch were checked by me to
be fairly trivial to fix because of at least one of two criteria:
- Callers of the function catch errors from it already so bubbling the
error up will be handled.
- Callers of the function might BUG_ON any nonzero return code in which
case there is no behavior changed (but we still got to remove a BUG_ON)
The following functions were updated:
btrfs_lookup_extent, alloc_reserved_tree_block, btrfs_remove_block_group,
btrfs_lookup_csums_range, btrfs_csum_file_blocks, btrfs_mark_extent_written,
btrfs_inode_by_name, btrfs_new_inode, btrfs_symlink,
insert_reserved_file_extent, and run_delalloc_nocow
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
2011-07-13 11:38:47 -06:00
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
2008-10-30 12:25:28 -06:00
|
|
|
|
2009-03-13 09:00:37 -06:00
|
|
|
path->leave_spinning = 1;
|
2009-09-11 10:27:37 -06:00
|
|
|
|
|
|
|
/*
|
|
|
|
* we may be replacing one extent in the tree with another.
|
|
|
|
* The new extent is pinned in the extent map, and we don't want
|
|
|
|
* to drop it from the cache until it is completely in the btree.
|
|
|
|
*
|
|
|
|
* So, tell btrfs_drop_extents to leave this extent in the cache.
|
|
|
|
* the caller is expected to unpin it and allow it to be merged
|
|
|
|
* with the others.
|
|
|
|
*/
|
2009-11-12 02:34:08 -07:00
|
|
|
ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes,
|
|
|
|
&hint, 0);
|
2008-10-30 12:25:28 -06:00
|
|
|
BUG_ON(ret);
|
|
|
|
|
2011-04-19 20:31:50 -06:00
|
|
|
ins.objectid = btrfs_ino(inode);
|
2008-10-30 12:25:28 -06:00
|
|
|
ins.offset = file_pos;
|
|
|
|
ins.type = BTRFS_EXTENT_DATA_KEY;
|
|
|
|
ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
|
|
|
|
BUG_ON(ret);
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
fi = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
btrfs_set_file_extent_generation(leaf, fi, trans->transid);
|
|
|
|
btrfs_set_file_extent_type(leaf, fi, extent_type);
|
|
|
|
btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
|
|
|
|
btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
|
|
|
|
btrfs_set_file_extent_offset(leaf, fi, 0);
|
|
|
|
btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
|
|
|
|
btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
|
|
|
|
btrfs_set_file_extent_compression(leaf, fi, compression);
|
|
|
|
btrfs_set_file_extent_encryption(leaf, fi, encryption);
|
|
|
|
btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
|
2009-03-13 09:00:37 -06:00
|
|
|
|
|
|
|
btrfs_unlock_up_safe(path, 1);
|
|
|
|
btrfs_set_lock_blocking(leaf);
|
|
|
|
|
2008-10-30 12:25:28 -06:00
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
|
|
|
|
|
|
|
inode_add_bytes(inode, num_bytes);
|
|
|
|
|
|
|
|
ins.objectid = disk_bytenr;
|
|
|
|
ins.offset = disk_num_bytes;
|
|
|
|
ins.type = BTRFS_EXTENT_ITEM_KEY;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 08:45:14 -06:00
|
|
|
ret = btrfs_alloc_reserved_file_extent(trans, root,
|
|
|
|
root->root_key.objectid,
|
2011-04-19 20:31:50 -06:00
|
|
|
btrfs_ino(inode), file_pos, &ins);
|
2008-10-30 12:25:28 -06:00
|
|
|
BUG_ON(ret);
|
|
|
|
btrfs_free_path(path);
|
2009-03-13 09:00:37 -06:00
|
|
|
|
2008-10-30 12:25:28 -06:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-03-13 09:41:46 -06:00
|
|
|
/*
|
|
|
|
* helper function for btrfs_finish_ordered_io, this
|
|
|
|
* just reads in some of the csum leaves to prime them into ram
|
|
|
|
* before we start the transaction. It limits the amount of btree
|
|
|
|
* reads required while inside the transaction.
|
|
|
|
*/
|
2008-09-29 13:18:18 -06:00
|
|
|
/* as ordered data IO finishes, this gets called so we can finish
|
|
|
|
* an ordered extent if the range of bytes in the file it covers are
|
|
|
|
* fully written.
|
|
|
|
*/
|
2008-07-18 09:56:15 -06:00
|
|
|
static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
|
2008-07-17 10:53:50 -06:00
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2010-05-16 08:48:47 -06:00
|
|
|
struct btrfs_trans_handle *trans = NULL;
|
2009-03-13 09:41:46 -06:00
|
|
|
struct btrfs_ordered_extent *ordered_extent = NULL;
|
2008-07-17 10:53:50 -06:00
|
|
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
2010-02-03 12:33:23 -07:00
|
|
|
struct extent_state *cached_state = NULL;
|
2010-12-16 23:21:50 -07:00
|
|
|
int compress_type = 0;
|
2008-07-17 10:53:50 -06:00
|
|
|
int ret;
|
2011-04-19 20:33:24 -06:00
|
|
|
bool nolock;
|
2008-07-17 10:53:50 -06:00
|
|
|
|
2010-02-02 13:51:14 -07:00
|
|
|
ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
|
|
|
|
end - start + 1);
|
2008-07-17 10:54:15 -06:00
|
|
|
if (!ret)
|
2008-07-17 10:53:50 -06:00
|
|
|
return 0;
|
|
|
|
BUG_ON(!ordered_extent);
|
2010-02-02 13:50:10 -07:00
|
|
|
|
2011-04-19 20:33:24 -06:00
|
|
|
nolock = is_free_space_inode(root, inode);
|
2010-07-02 10:14:14 -06:00
|
|
|
|
2009-11-12 02:34:21 -07:00
|
|
|
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
|
|
|
|
BUG_ON(!list_empty(&ordered_extent->list));
|
|
|
|
ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
|
|
|
|
if (!ret) {
|
2010-07-02 10:14:14 -06:00
|
|
|
if (nolock)
|
2011-04-13 10:54:33 -06:00
|
|
|
trans = btrfs_join_transaction_nolock(root);
|
2010-07-02 10:14:14 -06:00
|
|
|
else
|
2011-04-13 10:54:33 -06:00
|
|
|
trans = btrfs_join_transaction(root);
|
2011-01-24 19:51:38 -07:00
|
|
|
BUG_ON(IS_ERR(trans));
|
2010-05-16 08:48:47 -06:00
|
|
|
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
|
2009-11-12 02:34:21 -07:00
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
|
|
|
BUG_ON(ret);
|
|
|
|
}
|
|
|
|
goto out;
|
|
|
|
}
|
2008-07-17 10:53:50 -06:00
|
|
|
|
2010-02-03 12:33:23 -07:00
|
|
|
lock_extent_bits(io_tree, ordered_extent->file_offset,
|
|
|
|
ordered_extent->file_offset + ordered_extent->len - 1,
|
|
|
|
0, &cached_state, GFP_NOFS);
|
2008-07-17 10:53:50 -06:00
|
|
|
|
2010-07-02 10:14:14 -06:00
|
|
|
if (nolock)
|
2011-04-13 10:54:33 -06:00
|
|
|
trans = btrfs_join_transaction_nolock(root);
|
2010-07-02 10:14:14 -06:00
|
|
|
else
|
2011-04-13 10:54:33 -06:00
|
|
|
trans = btrfs_join_transaction(root);
|
2011-01-24 19:51:38 -07:00
|
|
|
BUG_ON(IS_ERR(trans));
|
2010-05-16 08:48:47 -06:00
|
|
|
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
|
2009-11-12 02:34:21 -07:00
|
|
|
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
|
2010-12-16 23:21:50 -07:00
|
|
|
compress_type = ordered_extent->compress_type;
|
2008-10-30 12:25:28 -06:00
|
|
|
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
|
2010-12-16 23:21:50 -07:00
|
|
|
BUG_ON(compress_type);
|
2009-11-12 02:34:08 -07:00
|
|
|
ret = btrfs_mark_extent_written(trans, inode,
|
2008-10-30 12:25:28 -06:00
|
|
|
ordered_extent->file_offset,
|
|
|
|
ordered_extent->file_offset +
|
|
|
|
ordered_extent->len);
|
|
|
|
BUG_ON(ret);
|
|
|
|
} else {
|
2010-06-21 12:48:16 -06:00
|
|
|
BUG_ON(root == root->fs_info->tree_root);
|
2008-10-30 12:25:28 -06:00
|
|
|
ret = insert_reserved_file_extent(trans, inode,
|
|
|
|
ordered_extent->file_offset,
|
|
|
|
ordered_extent->start,
|
|
|
|
ordered_extent->disk_len,
|
|
|
|
ordered_extent->len,
|
|
|
|
ordered_extent->len,
|
2010-12-16 23:21:50 -07:00
|
|
|
compress_type, 0, 0,
|
2008-10-30 12:25:28 -06:00
|
|
|
BTRFS_FILE_EXTENT_REG);
|
2009-09-11 10:27:37 -06:00
|
|
|
unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
|
|
|
|
ordered_extent->file_offset,
|
|
|
|
ordered_extent->len);
|
2008-10-30 12:25:28 -06:00
|
|
|
BUG_ON(ret);
|
|
|
|
}
|
2010-02-03 12:33:23 -07:00
|
|
|
unlock_extent_cached(io_tree, ordered_extent->file_offset,
|
|
|
|
ordered_extent->file_offset +
|
|
|
|
ordered_extent->len - 1, &cached_state, GFP_NOFS);
|
|
|
|
|
2008-07-17 10:53:50 -06:00
|
|
|
add_pending_csums(trans, inode, ordered_extent->file_offset,
|
|
|
|
&ordered_extent->list);
|
|
|
|
|
2011-04-05 17:25:36 -06:00
|
|
|
ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
|
|
|
|
if (!ret) {
|
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
|
|
|
BUG_ON(ret);
|
|
|
|
}
|
|
|
|
ret = 0;
|
2009-11-12 02:34:21 -07:00
|
|
|
out:
|
2010-07-02 10:14:14 -06:00
|
|
|
if (nolock) {
|
|
|
|
if (trans)
|
|
|
|
btrfs_end_transaction_nolock(trans, root);
|
|
|
|
} else {
|
|
|
|
btrfs_delalloc_release_metadata(inode, ordered_extent->len);
|
|
|
|
if (trans)
|
|
|
|
btrfs_end_transaction(trans, root);
|
|
|
|
}
|
|
|
|
|
2008-07-17 10:53:50 -06:00
|
|
|
/* once for us */
|
|
|
|
btrfs_put_ordered_extent(ordered_extent);
|
|
|
|
/* once for the tree */
|
|
|
|
btrfs_put_ordered_extent(ordered_extent);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-12-02 07:54:17 -07:00
|
|
|
/*
 * Write completion hook for a page range: emit the tracepoint, clear the
 * page's Private2 flag and then try to finish the ordered extent covering
 * [start, end].  @state is not used here.
 *
 * Returns whatever btrfs_finish_ordered_io() returns.
 */
static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
				struct extent_state *state, int uptodate)
{
	struct inode *inode;

	trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);

	ClearPagePrivate2(page);

	inode = page->mapping->host;
	return btrfs_finish_ordered_io(inode, start, end);
}
|
|
|
|
|
2008-09-29 13:18:18 -06:00
|
|
|
/*
|
|
|
|
* When IO fails, either with EIO or csum verification fails, we
|
|
|
|
* try other mirrors that might have a good copy of the data. This
|
|
|
|
* io_failure_record is used to record state as we go through all the
|
|
|
|
* mirrors. If another mirror has good data, the page is set up to date
|
|
|
|
* and things continue. If a good mirror can't be found, the original
|
|
|
|
* bio end_io callback is called to indicate things have failed.
|
|
|
|
*/
|
2008-04-09 14:28:12 -06:00
|
|
|
struct io_failure_record {
|
|
|
|
struct page *page;
|
|
|
|
u64 start;
|
|
|
|
u64 len;
|
|
|
|
u64 logical;
|
Btrfs: move data checksumming into a dedicated tree
Btrfs stores checksums for each data block. Until now, they have
been stored in the subvolume trees, indexed by the inode that is
referencing the data block. This means that when we read the inode,
we've probably read in at least some checksums as well.
But, this has a few problems:
* The checksums are indexed by logical offset in the file. When
compression is on, this means we have to do the expensive checksumming
on the uncompressed data. It would be faster if we could checksum
the compressed data instead.
* If we implement encryption, we'll be checksumming the plain text and
storing that on disk. This is significantly less secure.
* For either compression or encryption, we have to get the plain text
back before we can verify the checksum as correct. This makes the raid
layer balancing and extent moving much more expensive.
* It makes the front end caching code more complex, as we have touch
the subvolume and inodes as we cache extents.
* There is potentitally one copy of the checksum in each subvolume
referencing an extent.
The solution used here is to store the extent checksums in a dedicated
tree. This allows us to index the checksums by phyiscal extent
start and length. It means:
* The checksum is against the data stored on disk, after any compression
or encryption is done.
* The checksum is stored in a central location, and can be verified without
following back references, or reading inodes.
This makes compression significantly faster by reducing the amount of
data that needs to be checksummed. It will also allow much faster
raid management code in general.
The checksums are indexed by a key with a fixed objectid (a magic value
in ctree.h) and offset set to the starting byte of the extent. This
allows us to copy the checksum items into the fsync log tree directly (or
any other tree), without having to invent a second format for them.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-12-08 14:58:54 -07:00
|
|
|
unsigned long bio_flags;
|
2008-04-09 14:28:12 -06:00
|
|
|
int last_mirror;
|
|
|
|
};
|
|
|
|
|
2008-12-02 07:54:17 -07:00
|
|
|
/*
 * Called when an IO on @page for the file range [start, end] has failed.
 * Looks up (or creates) the io_failure_record for @start in the inode's
 * failure tree, advances it to the next mirror and resubmits a one-page bio
 * that reuses the end_io handler and block device of @failed_bio.
 *
 * @state may be NULL, in which case the locked extent state starting at the
 * record's start is looked up in the inode's io_tree.
 *
 * Returns -ENOMEM if a new record cannot be allocated, -EIO if no extent
 * mapping covers @start, if no matching locked state exists, if every mirror
 * has already been tried or if the retry bio cannot be allocated; otherwise
 * the return value of submit_bio_hook.  On the -EIO paths taken after the
 * record exists, the record is removed from the failure tree and freed.
 */
static int btrfs_io_failed_hook(struct bio *failed_bio,
			 struct page *page, u64 start, u64 end,
			 struct extent_state *state)
{
	struct io_failure_record *failrec = NULL;
	u64 private;
	struct extent_map *em;
	struct inode *inode = page->mapping->host;
	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct bio *bio;
	int num_copies;
	int ret;
	int rw;
	u64 logical;

	ret = get_state_private(failure_tree, start, &private);
	if (ret) {
		/* first failure for this range: build a fresh record */
		failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
		if (!failrec)
			return -ENOMEM;
		failrec->start = start;
		failrec->len = end - start + 1;
		failrec->last_mirror = 0;
		failrec->bio_flags = 0;

		read_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, start, failrec->len);
		/*
		 * Only inspect the mapping if the lookup actually produced
		 * one, and reject a mapping that does not contain @start
		 * (its range is [em->start, em->start + em->len), so one
		 * ending exactly at @start does not cover it).
		 */
		if (!IS_ERR_OR_NULL(em) &&
		    (em->start > start || em->start + em->len <= start)) {
			free_extent_map(em);
			em = NULL;
		}
		read_unlock(&em_tree->lock);

		if (IS_ERR_OR_NULL(em)) {
			kfree(failrec);
			return -EIO;
		}
		logical = start - em->start;
		logical = em->block_start + logical;
		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
			/* compressed extents are re-read from their start */
			logical = em->block_start;
			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
			extent_set_compress_type(&failrec->bio_flags,
						 em->compress_type);
		}
		failrec->logical = logical;
		free_extent_map(em);
		set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
				EXTENT_DIRTY, GFP_NOFS);
		/* stash the record pointer as the state's private value */
		set_state_private(failure_tree, start,
				 (u64)(unsigned long)failrec);
	} else {
		failrec = (struct io_failure_record *)(unsigned long)private;
	}
	num_copies = btrfs_num_copies(
			      &BTRFS_I(inode)->root->fs_info->mapping_tree,
			      failrec->logical, failrec->len);
	failrec->last_mirror++;
	if (!state) {
		spin_lock(&BTRFS_I(inode)->io_tree.lock);
		state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
						    failrec->start,
						    EXTENT_LOCKED);
		if (state && state->start != failrec->start)
			state = NULL;
		spin_unlock(&BTRFS_I(inode)->io_tree.lock);
	}
	/* nothing to attach the retry to, or no mirrors left to try */
	if (!state || failrec->last_mirror > num_copies)
		goto out_free_record;

	bio = bio_alloc(GFP_NOFS, 1);
	if (!bio)
		goto out_free_record;
	bio->bi_private = state;
	bio->bi_end_io = failed_bio->bi_end_io;
	bio->bi_sector = failrec->logical >> 9;
	bio->bi_bdev = failed_bio->bi_bdev;
	bio->bi_size = 0;

	bio_add_page(bio, page, failrec->len, start - page_offset(page));
	if (failed_bio->bi_rw & REQ_WRITE)
		rw = WRITE;
	else
		rw = READ;

	/* last_mirror doubles as the mirror number for the resubmission */
	ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
						      failrec->last_mirror,
						      failrec->bio_flags, 0);
	return ret;

out_free_record:
	/* give up: drop the record from the failure tree and free it */
	set_state_private(failure_tree, failrec->start, 0);
	clear_extent_bits(failure_tree, failrec->start,
			  failrec->start + failrec->len - 1,
			  EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
	kfree(failrec);
	return -EIO;
}
|
|
|
|
|
2008-09-29 13:18:18 -06:00
|
|
|
/*
|
|
|
|
* each time an IO finishes, we do a fast check in the IO failure tree
|
|
|
|
* to see if we need to process or clean up an io_failure_record
|
|
|
|
*/
|
2008-12-02 07:54:17 -07:00
|
|
|
/*
 * Drop the io_failure_record registered for @start, if there is one.
 *
 * The failure tree is first probed for any EXTENT_DIRTY range at all
 * (stopping at the first hit); only then is the private value stored for
 * @start looked up.  When a record is found it is detached from the tree,
 * its DIRTY/LOCKED bits are cleared and it is freed.
 *
 * Always returns 0.
 */
static int btrfs_clean_io_failures(struct inode *inode, u64 start)
{
	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
	struct io_failure_record *rec;
	u64 search_start = 0;
	u64 state_private;

	if (!count_range_bits(failure_tree, &search_start, (u64)-1, 1,
			      EXTENT_DIRTY, 0))
		return 0;

	if (get_state_private(failure_tree, start, &state_private) != 0)
		return 0;

	/* the private value is the record pointer stored as a u64 */
	rec = (struct io_failure_record *)(unsigned long)state_private;

	set_state_private(failure_tree, rec->start, 0);
	clear_extent_bits(failure_tree, rec->start,
			  rec->start + rec->len - 1,
			  EXTENT_DIRTY | EXTENT_LOCKED, GFP_NOFS);
	kfree(rec);

	return 0;
}
|
|
|
|
|
2008-09-29 13:18:18 -06:00
|
|
|
/*
|
|
|
|
* when reads are done, we need to check csums to verify the data is correct
|
|
|
|
* if there's a match, we allow the bio to finish. If not, we go through
|
|
|
|
* the io_failure_record routines to find good copies
|
|
|
|
*/
|
2008-12-02 07:54:17 -07:00
|
|
|
static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
|
2008-01-29 07:59:12 -07:00
|
|
|
struct extent_state *state)
|
2007-08-30 06:50:51 -06:00
|
|
|
{
|
2007-10-30 14:56:53 -06:00
|
|
|
size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
|
2007-08-30 06:50:51 -06:00
|
|
|
struct inode *inode = page->mapping->host;
|
2008-01-24 14:13:08 -07:00
|
|
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
2007-08-30 06:50:51 -06:00
|
|
|
char *kaddr;
|
2008-01-29 07:10:27 -07:00
|
|
|
u64 private = ~(u32)0;
|
2007-08-30 06:50:51 -06:00
|
|
|
int ret;
|
2007-10-15 14:22:25 -06:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
u32 csum = ~(u32)0;
|
2008-01-24 14:13:08 -07:00
|
|
|
|
Btrfs: move data checksumming into a dedicated tree
Btrfs stores checksums for each data block. Until now, they have
been stored in the subvolume trees, indexed by the inode that is
referencing the data block. This means that when we read the inode,
we've probably read in at least some checksums as well.
But, this has a few problems:
* The checksums are indexed by logical offset in the file. When
compression is on, this means we have to do the expensive checksumming
on the uncompressed data. It would be faster if we could checksum
the compressed data instead.
* If we implement encryption, we'll be checksumming the plain text and
storing that on disk. This is significantly less secure.
* For either compression or encryption, we have to get the plain text
back before we can verify the checksum as correct. This makes the raid
layer balancing and extent moving much more expensive.
* It makes the front end caching code more complex, as we have touch
the subvolume and inodes as we cache extents.
* There is potentitally one copy of the checksum in each subvolume
referencing an extent.
The solution used here is to store the extent checksums in a dedicated
tree. This allows us to index the checksums by phyiscal extent
start and length. It means:
* The checksum is against the data stored on disk, after any compression
or encryption is done.
* The checksum is stored in a central location, and can be verified without
following back references, or reading inodes.
This makes compression significantly faster by reducing the amount of
data that needs to be checksummed. It will also allow much faster
raid management code in general.
The checksums are indexed by a key with a fixed objectid (a magic value
in ctree.h) and offset set to the starting byte of the extent. This
allows us to copy the checksum items into the fsync log tree directly (or
any other tree), without having to invent a second format for them.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-12-08 14:58:54 -07:00
|
|
|
if (PageChecked(page)) {
|
|
|
|
ClearPageChecked(page);
|
|
|
|
goto good;
|
|
|
|
}
|
2009-04-17 02:37:41 -06:00
|
|
|
|
|
|
|
if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
|
2011-05-04 08:18:50 -06:00
|
|
|
goto good;
|
2008-12-12 08:03:38 -07:00
|
|
|
|
|
|
|
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
|
2009-09-02 13:22:30 -06:00
|
|
|
test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
|
2008-12-12 08:03:38 -07:00
|
|
|
clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
|
|
|
|
GFP_NOFS);
|
2007-12-14 13:30:32 -07:00
|
|
|
return 0;
|
2008-12-12 08:03:38 -07:00
|
|
|
}
|
Btrfs: move data checksumming into a dedicated tree
Btrfs stores checksums for each data block. Until now, they have
been stored in the subvolume trees, indexed by the inode that is
referencing the data block. This means that when we read the inode,
we've probably read in at least some checksums as well.
But, this has a few problems:
* The checksums are indexed by logical offset in the file. When
compression is on, this means we have to do the expensive checksumming
on the uncompressed data. It would be faster if we could checksum
the compressed data instead.
* If we implement encryption, we'll be checksumming the plain text and
storing that on disk. This is significantly less secure.
* For either compression or encryption, we have to get the plain text
back before we can verify the checksum as correct. This makes the raid
layer balancing and extent moving much more expensive.
* It makes the front end caching code more complex, as we have touch
the subvolume and inodes as we cache extents.
* There is potentitally one copy of the checksum in each subvolume
referencing an extent.
The solution used here is to store the extent checksums in a dedicated
tree. This allows us to index the checksums by phyiscal extent
start and length. It means:
* The checksum is against the data stored on disk, after any compression
or encryption is done.
* The checksum is stored in a central location, and can be verified without
following back references, or reading inodes.
This makes compression significantly faster by reducing the amount of
data that needs to be checksummed. It will also allow much faster
raid management code in general.
The checksums are indexed by a key with a fixed objectid (a magic value
in ctree.h) and offset set to the starting byte of the extent. This
allows us to copy the checksum items into the fsync log tree directly (or
any other tree), without having to invent a second format for them.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-12-08 14:58:54 -07:00
|
|
|
|
2008-02-04 06:57:25 -07:00
|
|
|
if (state && state->start == start) {
|
2008-01-29 07:59:12 -07:00
|
|
|
private = state->private;
|
|
|
|
ret = 0;
|
|
|
|
} else {
|
|
|
|
ret = get_state_private(io_tree, start, &private);
|
|
|
|
}
|
2009-01-07 07:48:51 -07:00
|
|
|
kaddr = kmap_atomic(page, KM_USER0);
|
2009-01-05 19:25:51 -07:00
|
|
|
if (ret)
|
2007-08-30 06:50:51 -06:00
|
|
|
goto zeroit;
|
2009-01-05 19:25:51 -07:00
|
|
|
|
2007-10-15 14:22:25 -06:00
|
|
|
csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1);
|
|
|
|
btrfs_csum_final(csum, (char *)&csum);
|
2009-01-05 19:25:51 -07:00
|
|
|
if (csum != private)
|
2007-08-30 06:50:51 -06:00
|
|
|
goto zeroit;
|
2009-01-05 19:25:51 -07:00
|
|
|
|
2009-01-07 07:48:51 -07:00
|
|
|
kunmap_atomic(kaddr, KM_USER0);
|
Btrfs: move data checksumming into a dedicated tree
Btrfs stores checksums for each data block. Until now, they have
been stored in the subvolume trees, indexed by the inode that is
referencing the data block. This means that when we read the inode,
we've probably read in at least some checksums as well.
But, this has a few problems:
* The checksums are indexed by logical offset in the file. When
compression is on, this means we have to do the expensive checksumming
on the uncompressed data. It would be faster if we could checksum
the compressed data instead.
* If we implement encryption, we'll be checksumming the plain text and
storing that on disk. This is significantly less secure.
* For either compression or encryption, we have to get the plain text
back before we can verify the checksum as correct. This makes the raid
layer balancing and extent moving much more expensive.
* It makes the front end caching code more complex, as we have touch
the subvolume and inodes as we cache extents.
* There is potentitally one copy of the checksum in each subvolume
referencing an extent.
The solution used here is to store the extent checksums in a dedicated
tree. This allows us to index the checksums by phyiscal extent
start and length. It means:
* The checksum is against the data stored on disk, after any compression
or encryption is done.
* The checksum is stored in a central location, and can be verified without
following back references, or reading inodes.
This makes compression significantly faster by reducing the amount of
data that needs to be checksummed. It will also allow much faster
raid management code in general.
The checksums are indexed by a key with a fixed objectid (a magic value
in ctree.h) and offset set to the starting byte of the extent. This
allows us to copy the checksum items into the fsync log tree directly (or
any other tree), without having to invent a second format for them.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-12-08 14:58:54 -07:00
|
|
|
good:
|
2008-04-09 14:28:12 -06:00
|
|
|
/* if the io failure tree for this inode is non-empty,
|
|
|
|
* check to see if we've recovered from a failed IO
|
|
|
|
*/
|
2008-05-12 11:39:03 -06:00
|
|
|
btrfs_clean_io_failures(inode, start);
|
2007-08-30 06:50:51 -06:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
zeroit:
|
2011-05-22 10:33:42 -06:00
|
|
|
printk_ratelimited(KERN_INFO "btrfs csum failed ino %llu off %llu csum %u "
|
2011-04-19 20:31:50 -06:00
|
|
|
"private %llu\n",
|
|
|
|
(unsigned long long)btrfs_ino(page->mapping->host),
|
2009-04-27 05:29:05 -06:00
|
|
|
(unsigned long long)start, csum,
|
|
|
|
(unsigned long long)private);
|
2007-10-15 14:15:53 -06:00
|
|
|
memset(kaddr + offset, 1, end - start + 1);
|
|
|
|
flush_dcache_page(page);
|
2009-01-07 07:48:51 -07:00
|
|
|
kunmap_atomic(kaddr, KM_USER0);
|
2008-04-17 09:29:12 -06:00
|
|
|
if (private == 0)
|
|
|
|
return 0;
|
2008-04-09 14:28:12 -06:00
|
|
|
return -EIO;
|
2007-08-30 06:50:51 -06:00
|
|
|
}
|
2007-08-27 14:49:44 -06:00
|
|
|
|
2009-11-12 02:36:34 -07:00
|
|
|
struct delayed_iput {
|
|
|
|
struct list_head list;
|
|
|
|
struct inode *inode;
|
|
|
|
};
|
|
|
|
|
|
|
|
void btrfs_add_delayed_iput(struct inode *inode)
|
|
|
|
{
|
|
|
|
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
|
|
|
|
struct delayed_iput *delayed;
|
|
|
|
|
|
|
|
if (atomic_add_unless(&inode->i_count, -1, 1))
|
|
|
|
return;
|
|
|
|
|
|
|
|
delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
|
|
|
|
delayed->inode = inode;
|
|
|
|
|
|
|
|
spin_lock(&fs_info->delayed_iput_lock);
|
|
|
|
list_add_tail(&delayed->list, &fs_info->delayed_iputs);
|
|
|
|
spin_unlock(&fs_info->delayed_iput_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
void btrfs_run_delayed_iputs(struct btrfs_root *root)
|
|
|
|
{
|
|
|
|
LIST_HEAD(list);
|
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
|
|
|
struct delayed_iput *delayed;
|
|
|
|
int empty;
|
|
|
|
|
|
|
|
spin_lock(&fs_info->delayed_iput_lock);
|
|
|
|
empty = list_empty(&fs_info->delayed_iputs);
|
|
|
|
spin_unlock(&fs_info->delayed_iput_lock);
|
|
|
|
if (empty)
|
|
|
|
return;
|
|
|
|
|
|
|
|
down_read(&root->fs_info->cleanup_work_sem);
|
|
|
|
spin_lock(&fs_info->delayed_iput_lock);
|
|
|
|
list_splice_init(&fs_info->delayed_iputs, &list);
|
|
|
|
spin_unlock(&fs_info->delayed_iput_lock);
|
|
|
|
|
|
|
|
while (!list_empty(&list)) {
|
|
|
|
delayed = list_entry(list.next, struct delayed_iput, list);
|
|
|
|
list_del(&delayed->list);
|
|
|
|
iput(delayed->inode);
|
|
|
|
kfree(delayed);
|
|
|
|
}
|
|
|
|
up_read(&root->fs_info->cleanup_work_sem);
|
|
|
|
}
|
|
|
|
|
2010-05-16 08:49:58 -06:00
|
|
|
/*
|
|
|
|
* calculate extra metadata reservation when snapshotting a subvolume
|
|
|
|
* contains orphan files.
|
|
|
|
*/
|
|
|
|
void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_pending_snapshot *pending,
|
|
|
|
u64 *bytes_to_reserve)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root;
|
|
|
|
struct btrfs_block_rsv *block_rsv;
|
|
|
|
u64 num_bytes;
|
|
|
|
int index;
|
|
|
|
|
|
|
|
root = pending->root;
|
|
|
|
if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
|
|
|
|
return;
|
|
|
|
|
|
|
|
block_rsv = root->orphan_block_rsv;
|
|
|
|
|
|
|
|
/* orphan block reservation for the snapshot */
|
|
|
|
num_bytes = block_rsv->size;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* after the snapshot is created, COWing tree blocks may use more
|
|
|
|
* space than it frees. So we should make sure there is enough
|
|
|
|
* reserved space.
|
|
|
|
*/
|
|
|
|
index = trans->transid & 0x1;
|
|
|
|
if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
|
|
|
|
num_bytes += block_rsv->size -
|
|
|
|
(block_rsv->reserved + block_rsv->freed[index]);
|
|
|
|
}
|
|
|
|
|
|
|
|
*bytes_to_reserve += num_bytes;
|
|
|
|
}
|
|
|
|
|
|
|
|
void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_pending_snapshot *pending)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root = pending->root;
|
|
|
|
struct btrfs_root *snap = pending->snap;
|
|
|
|
struct btrfs_block_rsv *block_rsv;
|
|
|
|
u64 num_bytes;
|
|
|
|
int index;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
|
|
|
|
return;
|
|
|
|
|
|
|
|
/* refill source subvolume's orphan block reservation */
|
|
|
|
block_rsv = root->orphan_block_rsv;
|
|
|
|
index = trans->transid & 0x1;
|
|
|
|
if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
|
|
|
|
num_bytes = block_rsv->size -
|
|
|
|
(block_rsv->reserved + block_rsv->freed[index]);
|
|
|
|
ret = btrfs_block_rsv_migrate(&pending->block_rsv,
|
|
|
|
root->orphan_block_rsv,
|
|
|
|
num_bytes);
|
|
|
|
BUG_ON(ret);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* setup orphan block reservation for the snapshot */
|
|
|
|
block_rsv = btrfs_alloc_block_rsv(snap);
|
|
|
|
BUG_ON(!block_rsv);
|
|
|
|
|
|
|
|
btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
|
|
|
|
snap->orphan_block_rsv = block_rsv;
|
|
|
|
|
|
|
|
num_bytes = root->orphan_block_rsv->size;
|
|
|
|
ret = btrfs_block_rsv_migrate(&pending->block_rsv,
|
|
|
|
block_rsv, num_bytes);
|
|
|
|
BUG_ON(ret);
|
|
|
|
|
|
|
|
#if 0
|
|
|
|
/* insert orphan item for the snapshot */
|
|
|
|
WARN_ON(!root->orphan_item_inserted);
|
|
|
|
ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
|
|
|
|
snap->root_key.objectid);
|
|
|
|
BUG_ON(ret);
|
|
|
|
snap->orphan_item_inserted = 1;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Values stored in root->orphan_cleanup_state.  The field starts out as 0;
 * btrfs_orphan_cleanup() moves it from 0 to STARTED and sets it to DONE
 * after its scan loop has finished.
 */
enum btrfs_orphan_cleanup_state {
	ORPHAN_CLEANUP_STARTED	= 1,	/* cleanup has been claimed by a caller */
	ORPHAN_CLEANUP_DONE	= 2,	/* cleanup scan ran to completion */
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is called in transaction commmit time. If there are no orphan
|
|
|
|
* files in the subvolume, it removes orphan item and frees block_rsv
|
|
|
|
* structure.
|
|
|
|
*/
|
|
|
|
void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (!list_empty(&root->orphan_list) ||
|
|
|
|
root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (root->orphan_item_inserted &&
|
|
|
|
btrfs_root_refs(&root->root_item) > 0) {
|
|
|
|
ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
|
|
|
|
root->root_key.objectid);
|
|
|
|
BUG_ON(ret);
|
|
|
|
root->orphan_item_inserted = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (root->orphan_block_rsv) {
|
|
|
|
WARN_ON(root->orphan_block_rsv->size > 0);
|
|
|
|
btrfs_free_block_rsv(root, root->orphan_block_rsv);
|
|
|
|
root->orphan_block_rsv = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-07-24 10:17:14 -06:00
|
|
|
/*
|
|
|
|
* This creates an orphan entry for the given inode in case something goes
|
|
|
|
* wrong in the middle of an unlink/truncate.
|
2010-05-16 08:49:58 -06:00
|
|
|
*
|
|
|
|
* NOTE: caller of this function should reserve 5 units of metadata for
|
|
|
|
* this function.
|
2008-07-24 10:17:14 -06:00
|
|
|
*/
|
|
|
|
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2010-05-16 08:49:58 -06:00
|
|
|
struct btrfs_block_rsv *block_rsv = NULL;
|
|
|
|
int reserve = 0;
|
|
|
|
int insert = 0;
|
|
|
|
int ret;
|
2008-07-24 10:17:14 -06:00
|
|
|
|
2010-05-16 08:49:58 -06:00
|
|
|
if (!root->orphan_block_rsv) {
|
|
|
|
block_rsv = btrfs_alloc_block_rsv(root);
|
|
|
|
BUG_ON(!block_rsv);
|
|
|
|
}
|
2008-07-24 10:17:14 -06:00
|
|
|
|
2010-05-16 08:49:58 -06:00
|
|
|
spin_lock(&root->orphan_lock);
|
|
|
|
if (!root->orphan_block_rsv) {
|
|
|
|
root->orphan_block_rsv = block_rsv;
|
|
|
|
} else if (block_rsv) {
|
|
|
|
btrfs_free_block_rsv(root, block_rsv);
|
|
|
|
block_rsv = NULL;
|
2008-07-24 10:17:14 -06:00
|
|
|
}
|
|
|
|
|
2010-05-16 08:49:58 -06:00
|
|
|
if (list_empty(&BTRFS_I(inode)->i_orphan)) {
|
|
|
|
list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
|
|
|
|
#if 0
|
|
|
|
/*
|
|
|
|
* For proper ENOSPC handling, we should do orphan
|
|
|
|
* cleanup when mounting. But this introduces backward
|
|
|
|
* compatibility issue.
|
|
|
|
*/
|
|
|
|
if (!xchg(&root->orphan_item_inserted, 1))
|
|
|
|
insert = 2;
|
|
|
|
else
|
|
|
|
insert = 1;
|
|
|
|
#endif
|
|
|
|
insert = 1;
|
2008-07-24 10:17:14 -06:00
|
|
|
}
|
|
|
|
|
2010-05-16 08:49:58 -06:00
|
|
|
if (!BTRFS_I(inode)->orphan_meta_reserved) {
|
|
|
|
BTRFS_I(inode)->orphan_meta_reserved = 1;
|
|
|
|
reserve = 1;
|
|
|
|
}
|
|
|
|
spin_unlock(&root->orphan_lock);
|
2008-07-24 10:17:14 -06:00
|
|
|
|
2010-05-16 08:49:58 -06:00
|
|
|
if (block_rsv)
|
|
|
|
btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
|
2008-07-24 10:17:14 -06:00
|
|
|
|
2010-05-16 08:49:58 -06:00
|
|
|
/* grab metadata reservation from transaction handle */
|
|
|
|
if (reserve) {
|
|
|
|
ret = btrfs_orphan_reserve_metadata(trans, inode);
|
|
|
|
BUG_ON(ret);
|
|
|
|
}
|
2008-07-24 10:17:14 -06:00
|
|
|
|
2010-05-16 08:49:58 -06:00
|
|
|
/* insert an orphan item to track this unlinked/truncated file */
|
|
|
|
if (insert >= 1) {
|
2011-04-19 20:31:50 -06:00
|
|
|
ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
|
2010-05-16 08:49:58 -06:00
|
|
|
BUG_ON(ret);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* insert an orphan item to track subvolume contains orphan files */
|
|
|
|
if (insert >= 2) {
|
|
|
|
ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
|
|
|
|
root->root_key.objectid);
|
|
|
|
BUG_ON(ret);
|
|
|
|
}
|
|
|
|
return 0;
|
2008-07-24 10:17:14 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We have done the truncate/delete so we can go ahead and remove the orphan
|
|
|
|
* item for this particular inode.
|
|
|
|
*/
|
|
|
|
int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2010-05-16 08:49:58 -06:00
|
|
|
int delete_item = 0;
|
|
|
|
int release_rsv = 0;
|
2008-07-24 10:17:14 -06:00
|
|
|
int ret = 0;
|
|
|
|
|
2010-05-16 08:49:58 -06:00
|
|
|
spin_lock(&root->orphan_lock);
|
|
|
|
if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
|
|
|
|
list_del_init(&BTRFS_I(inode)->i_orphan);
|
|
|
|
delete_item = 1;
|
2008-07-24 10:17:14 -06:00
|
|
|
}
|
|
|
|
|
2010-05-16 08:49:58 -06:00
|
|
|
if (BTRFS_I(inode)->orphan_meta_reserved) {
|
|
|
|
BTRFS_I(inode)->orphan_meta_reserved = 0;
|
|
|
|
release_rsv = 1;
|
2008-07-24 10:17:14 -06:00
|
|
|
}
|
2010-05-16 08:49:58 -06:00
|
|
|
spin_unlock(&root->orphan_lock);
|
2008-07-24 10:17:14 -06:00
|
|
|
|
2010-05-16 08:49:58 -06:00
|
|
|
if (trans && delete_item) {
|
2011-04-19 20:31:50 -06:00
|
|
|
ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
|
2010-05-16 08:49:58 -06:00
|
|
|
BUG_ON(ret);
|
|
|
|
}
|
2008-07-24 10:17:14 -06:00
|
|
|
|
2010-05-16 08:49:58 -06:00
|
|
|
if (release_rsv)
|
|
|
|
btrfs_orphan_release_metadata(inode);
|
2008-07-24 10:17:14 -06:00
|
|
|
|
2010-05-16 08:49:58 -06:00
|
|
|
return 0;
|
2008-07-24 10:17:14 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* this cleans up any orphans that may be left on the list from the last use
|
|
|
|
* of this root.
|
|
|
|
*/
|
2011-01-31 14:22:42 -07:00
|
|
|
int btrfs_orphan_cleanup(struct btrfs_root *root)
|
2008-07-24 10:17:14 -06:00
|
|
|
{
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct btrfs_key key, found_key;
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct inode *inode;
|
|
|
|
int ret = 0, nr_unlink = 0, nr_truncate = 0;
|
|
|
|
|
2010-05-16 08:49:58 -06:00
|
|
|
if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
|
2011-01-31 14:22:42 -07:00
|
|
|
return 0;
|
2009-11-12 02:34:40 -07:00
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
2011-01-31 14:22:42 -07:00
|
|
|
if (!path) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
2008-07-24 10:17:14 -06:00
|
|
|
path->reada = -1;
|
|
|
|
|
|
|
|
key.objectid = BTRFS_ORPHAN_OBJECTID;
|
|
|
|
btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
|
|
|
|
key.offset = (u64)-1;
|
|
|
|
|
|
|
|
while (1) {
|
|
|
|
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
2011-01-31 14:22:42 -07:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2008-07-24 10:17:14 -06:00
|
|
|
|
|
|
|
/*
|
|
|
|
* if ret == 0 means we found what we were searching for, which
|
2011-03-30 19:57:33 -06:00
|
|
|
* is weird, but possible, so only screw with path if we didn't
|
2008-07-24 10:17:14 -06:00
|
|
|
* find the key and see if we have stuff that matches
|
|
|
|
*/
|
|
|
|
if (ret > 0) {
|
2011-01-31 14:22:42 -07:00
|
|
|
ret = 0;
|
2008-07-24 10:17:14 -06:00
|
|
|
if (path->slots[0] == 0)
|
|
|
|
break;
|
|
|
|
path->slots[0]--;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* pull out the item */
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
|
|
|
|
|
|
|
|
/* make sure the item matches what we want */
|
|
|
|
if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
|
|
|
|
break;
|
|
|
|
if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* release the path since we're done with it */
|
2011-04-20 17:20:15 -06:00
|
|
|
btrfs_release_path(path);
|
2008-07-24 10:17:14 -06:00
|
|
|
|
|
|
|
/*
|
|
|
|
* this is where we are basically btrfs_lookup, without the
|
|
|
|
* crossing root thing. we store the inode number in the
|
|
|
|
* offset of the orphan item.
|
|
|
|
*/
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 08:45:14 -06:00
|
|
|
found_key.objectid = found_key.offset;
|
|
|
|
found_key.type = BTRFS_INODE_ITEM_KEY;
|
|
|
|
found_key.offset = 0;
|
Btrfs: change how we mount subvolumes
This work is in preperation for being able to set a different root as the
default mounting root.
There is currently a problem with how we mount subvolumes. We cannot currently
mount a subvolume of a subvolume, you can only mount subvolumes/snapshots of the
default subvolume. So say you take a snapshot of the default subvolume and call
it snap1, and then take a snapshot of snap1 and call it snap2, so now you have
/
/snap1
/snap1/snap2
as your available volumes. Currently you can only mount / and /snap1,
you cannot mount /snap1/snap2. To fix this problem instead of passing
subvolid=<name> you must pass in subvolid=<treeid>, where <treeid> is
the tree id that gets spit out via the subvolume listing you get from
the subvolume listing patches (btrfs filesystem list). This allows us
to mount /, /snap1 and /snap1/snap2 as the root volume.
In addition to the above, we also now read the default dir item in the
tree root to get the root key that it points to. For now this just
points at what has always been the default subvolme, but later on I plan
to change it to point at whatever root you want to be the new default
root, so you can just set the default mount and not have to mount with
-o subvolid=<treeid>. I tested this out with the above scenario and it
worked perfectly. Thanks,
mount -o subvol operates inside the selected subvolid. For example:
mount -o subvol=snap1,subvolid=256 /dev/xxx /mnt
/mnt will have the snap1 directory for the subvolume with id
256.
mount -o subvol=snap /dev/xxx /mnt
/mnt will be the snap directory of whatever the default subvolume
is.
Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-12-04 10:38:27 -07:00
|
|
|
inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
|
2011-01-31 14:22:42 -07:00
|
|
|
if (IS_ERR(inode)) {
|
|
|
|
ret = PTR_ERR(inode);
|
|
|
|
goto out;
|
|
|
|
}
|
2008-07-24 10:17:14 -06:00
|
|
|
|
|
|
|
/*
|
|
|
|
* add this inode to the orphan list so btrfs_orphan_del does
|
|
|
|
* the proper thing when we hit it
|
|
|
|
*/
|
2010-05-16 08:49:58 -06:00
|
|
|
spin_lock(&root->orphan_lock);
|
2008-07-24 10:17:14 -06:00
|
|
|
list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
|
2010-05-16 08:49:58 -06:00
|
|
|
spin_unlock(&root->orphan_lock);
|
2008-07-24 10:17:14 -06:00
|
|
|
|
|
|
|
/*
|
|
|
|
* if this is a bad inode, means we actually succeeded in
|
|
|
|
* removing the inode, but not the orphan record, which means
|
|
|
|
* we need to manually delete the orphan since iput will just
|
|
|
|
* do a destroy_inode
|
|
|
|
*/
|
|
|
|
if (is_bad_inode(inode)) {
|
2010-05-16 08:48:46 -06:00
|
|
|
trans = btrfs_start_transaction(root, 0);
|
2011-01-31 14:22:42 -07:00
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
goto out;
|
|
|
|
}
|
2008-07-24 10:17:14 -06:00
|
|
|
btrfs_orphan_del(trans, inode);
|
2008-09-26 08:05:38 -06:00
|
|
|
btrfs_end_transaction(trans, root);
|
2008-07-24 10:17:14 -06:00
|
|
|
iput(inode);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* if we have links, this was a truncate, lets do that */
|
|
|
|
if (inode->i_nlink) {
|
2011-01-31 13:30:16 -07:00
|
|
|
if (!S_ISREG(inode->i_mode)) {
|
|
|
|
WARN_ON(1);
|
|
|
|
iput(inode);
|
|
|
|
continue;
|
|
|
|
}
|
2008-07-24 10:17:14 -06:00
|
|
|
nr_truncate++;
|
2011-01-31 14:22:42 -07:00
|
|
|
ret = btrfs_truncate(inode);
|
2008-07-24 10:17:14 -06:00
|
|
|
} else {
|
|
|
|
nr_unlink++;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* this will do delete_inode and everything for us */
|
|
|
|
iput(inode);
|
2011-01-31 14:22:42 -07:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
2008-07-24 10:17:14 -06:00
|
|
|
}
|
2010-05-16 08:49:58 -06:00
|
|
|
root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
|
|
|
|
|
|
|
|
if (root->orphan_block_rsv)
|
|
|
|
btrfs_block_rsv_release(root, root->orphan_block_rsv,
|
|
|
|
(u64)-1);
|
|
|
|
|
|
|
|
if (root->orphan_block_rsv || root->orphan_item_inserted) {
|
2011-04-13 10:54:33 -06:00
|
|
|
trans = btrfs_join_transaction(root);
|
2011-01-31 14:22:42 -07:00
|
|
|
if (!IS_ERR(trans))
|
|
|
|
btrfs_end_transaction(trans, root);
|
2010-05-16 08:49:58 -06:00
|
|
|
}
|
2008-07-24 10:17:14 -06:00
|
|
|
|
|
|
|
if (nr_unlink)
|
|
|
|
printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
|
|
|
|
if (nr_truncate)
|
|
|
|
printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
|
2011-01-31 14:22:42 -07:00
|
|
|
|
|
|
|
out:
|
|
|
|
if (ret)
|
|
|
|
printk(KERN_CRIT "btrfs: could not do orphan cleanup %d\n", ret);
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
2008-07-24 10:17:14 -06:00
|
|
|
}
|
|
|
|
|
2009-04-27 09:47:50 -06:00
|
|
|
/*
|
|
|
|
* very simple check to peek ahead in the leaf looking for xattrs. If we
|
|
|
|
* don't find any xattrs, we know there can't be any acls.
|
|
|
|
*
|
|
|
|
* slot is the slot the inode is in, objectid is the objectid of the inode
|
|
|
|
*/
|
|
|
|
static noinline int acls_after_inode_item(struct extent_buffer *leaf,
|
|
|
|
int slot, u64 objectid)
|
|
|
|
{
|
|
|
|
u32 nritems = btrfs_header_nritems(leaf);
|
|
|
|
struct btrfs_key found_key;
|
|
|
|
int scanned = 0;
|
|
|
|
|
|
|
|
slot++;
|
|
|
|
while (slot < nritems) {
|
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, slot);
|
|
|
|
|
|
|
|
/* we found a different objectid, there must not be acls */
|
|
|
|
if (found_key.objectid != objectid)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/* we found an xattr, assume we've got an acl */
|
|
|
|
if (found_key.type == BTRFS_XATTR_ITEM_KEY)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* we found a key greater than an xattr key, there can't
|
|
|
|
* be any acls later on
|
|
|
|
*/
|
|
|
|
if (found_key.type > BTRFS_XATTR_ITEM_KEY)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
slot++;
|
|
|
|
scanned++;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* it goes inode, inode backrefs, xattrs, extents,
|
|
|
|
* so if there are a ton of hard links to an inode there can
|
|
|
|
* be a lot of backrefs. Don't waste time searching too hard,
|
|
|
|
* this is just an optimization
|
|
|
|
*/
|
|
|
|
if (scanned >= 8)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
/* we hit the end of the leaf before we found an xattr or
|
|
|
|
* something larger than an xattr. We have to assume the inode
|
|
|
|
* has acls
|
|
|
|
*/
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2008-09-29 13:18:18 -06:00
|
|
|
/*
|
|
|
|
* read an inode from the btree into the in-memory inode
|
|
|
|
*/
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per-subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers are within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 08:45:14 -06:00
|
|
|
static void btrfs_read_locked_inode(struct inode *inode)
|
2007-06-12 04:35:45 -06:00
|
|
|
{
|
|
|
|
struct btrfs_path *path;
|
2007-10-15 14:14:19 -06:00
|
|
|
struct extent_buffer *leaf;
|
2007-06-12 04:35:45 -06:00
|
|
|
struct btrfs_inode_item *inode_item;
|
2008-03-24 13:01:56 -06:00
|
|
|
struct btrfs_timespec *tspec;
|
2007-06-12 04:35:45 -06:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_key location;
|
2009-04-27 09:47:50 -06:00
|
|
|
int maybe_acls;
|
2007-07-11 08:18:17 -06:00
|
|
|
u32 rdev;
|
2007-06-12 04:35:45 -06:00
|
|
|
int ret;
|
2011-06-23 01:27:13 -06:00
|
|
|
bool filled = false;
|
|
|
|
|
|
|
|
ret = btrfs_fill_inode(inode, &rdev);
|
|
|
|
if (!ret)
|
|
|
|
filled = true;
|
2007-06-12 04:35:45 -06:00
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
BUG_ON(!path);
|
2011-05-17 07:50:54 -06:00
|
|
|
path->leave_spinning = 1;
|
2007-06-12 04:35:45 -06:00
|
|
|
memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
|
2008-01-08 13:46:30 -07:00
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
|
2007-10-15 14:14:19 -06:00
|
|
|
if (ret)
|
2007-06-12 04:35:45 -06:00
|
|
|
goto make_bad;
|
|
|
|
|
2007-10-15 14:14:19 -06:00
|
|
|
leaf = path->nodes[0];
|
2011-06-23 01:27:13 -06:00
|
|
|
|
|
|
|
if (filled)
|
|
|
|
goto cache_acl;
|
|
|
|
|
2007-10-15 14:14:19 -06:00
|
|
|
inode_item = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_inode_item);
|
2011-05-17 07:50:54 -06:00
|
|
|
if (!leaf->map_token)
|
|
|
|
map_private_extent_buffer(leaf, (unsigned long)inode_item,
|
|
|
|
sizeof(struct btrfs_inode_item),
|
|
|
|
&leaf->map_token, &leaf->kaddr,
|
|
|
|
&leaf->map_start, &leaf->map_len,
|
|
|
|
KM_USER1);
|
2007-10-15 14:14:19 -06:00
|
|
|
|
|
|
|
inode->i_mode = btrfs_inode_mode(leaf, inode_item);
|
|
|
|
inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
|
|
|
|
inode->i_uid = btrfs_inode_uid(leaf, inode_item);
|
|
|
|
inode->i_gid = btrfs_inode_gid(leaf, inode_item);
|
2008-07-17 10:54:05 -06:00
|
|
|
btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
|
2007-10-15 14:14:19 -06:00
|
|
|
|
|
|
|
tspec = btrfs_inode_atime(inode_item);
|
|
|
|
inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
|
|
|
|
inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
|
|
|
|
|
|
|
|
tspec = btrfs_inode_mtime(inode_item);
|
|
|
|
inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
|
|
|
|
inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
|
|
|
|
|
|
|
|
tspec = btrfs_inode_ctime(inode_item);
|
|
|
|
inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
|
|
|
|
inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
|
|
|
|
|
2008-10-09 09:46:29 -06:00
|
|
|
inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
|
2008-09-05 14:13:11 -06:00
|
|
|
BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
|
2008-12-08 14:40:21 -07:00
|
|
|
BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item);
|
2008-09-05 14:13:11 -06:00
|
|
|
inode->i_generation = BTRFS_I(inode)->generation;
|
2007-07-11 08:18:17 -06:00
|
|
|
inode->i_rdev = 0;
|
2007-10-15 14:14:19 -06:00
|
|
|
rdev = btrfs_inode_rdev(leaf, inode_item);
|
|
|
|
|
2008-07-24 10:12:38 -06:00
|
|
|
BTRFS_I(inode)->index_cnt = (u64)-1;
|
2008-12-11 14:30:39 -07:00
|
|
|
BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
|
2011-06-23 01:27:13 -06:00
|
|
|
cache_acl:
|
2009-04-27 09:47:50 -06:00
|
|
|
/*
|
|
|
|
* try to precache a NULL acl entry for files that don't have
|
|
|
|
* any xattrs or acls
|
|
|
|
*/
|
2011-04-19 20:31:50 -06:00
|
|
|
maybe_acls = acls_after_inode_item(leaf, path->slots[0],
|
|
|
|
btrfs_ino(inode));
|
2009-06-24 14:58:48 -06:00
|
|
|
if (!maybe_acls)
|
|
|
|
cache_no_acl(inode);
|
2009-04-27 09:47:50 -06:00
|
|
|
|
2011-05-17 07:50:54 -06:00
|
|
|
if (leaf->map_token) {
|
|
|
|
unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
|
|
|
|
leaf->map_token = NULL;
|
|
|
|
}
|
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
btrfs_free_path(path);
|
|
|
|
|
|
|
|
switch (inode->i_mode & S_IFMT) {
|
|
|
|
case S_IFREG:
|
|
|
|
inode->i_mapping->a_ops = &btrfs_aops;
|
2008-03-26 08:28:07 -06:00
|
|
|
inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
|
2008-01-24 14:13:08 -07:00
|
|
|
BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
|
2007-06-12 04:35:45 -06:00
|
|
|
inode->i_fop = &btrfs_file_operations;
|
|
|
|
inode->i_op = &btrfs_file_inode_operations;
|
|
|
|
break;
|
|
|
|
case S_IFDIR:
|
|
|
|
inode->i_fop = &btrfs_dir_file_operations;
|
|
|
|
if (root == root->fs_info->tree_root)
|
|
|
|
inode->i_op = &btrfs_dir_ro_inode_operations;
|
|
|
|
else
|
|
|
|
inode->i_op = &btrfs_dir_inode_operations;
|
|
|
|
break;
|
|
|
|
case S_IFLNK:
|
|
|
|
inode->i_op = &btrfs_symlink_inode_operations;
|
|
|
|
inode->i_mapping->a_ops = &btrfs_symlink_aops;
|
2008-03-26 08:28:07 -06:00
|
|
|
inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
|
2007-06-12 04:35:45 -06:00
|
|
|
break;
|
2007-07-11 08:18:17 -06:00
|
|
|
default:
|
2009-02-04 07:29:13 -07:00
|
|
|
inode->i_op = &btrfs_special_inode_operations;
|
2007-07-11 08:18:17 -06:00
|
|
|
init_special_inode(inode, inode->i_mode, rdev);
|
|
|
|
break;
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
2009-04-17 02:37:41 -06:00
|
|
|
|
|
|
|
btrfs_update_iflags(inode);
|
2007-06-12 04:35:45 -06:00
|
|
|
return;
|
|
|
|
|
|
|
|
make_bad:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
make_bad_inode(inode);
|
|
|
|
}
|
|
|
|
|
2008-09-29 13:18:18 -06:00
|
|
|
/*
|
|
|
|
* given a leaf and an inode, copy the inode fields into the leaf
|
|
|
|
*/
|
2008-09-05 14:13:11 -06:00
|
|
|
static void fill_inode_item(struct btrfs_trans_handle *trans,
|
|
|
|
struct extent_buffer *leaf,
|
2007-10-15 14:14:19 -06:00
|
|
|
struct btrfs_inode_item *item,
|
2007-06-12 04:35:45 -06:00
|
|
|
struct inode *inode)
|
|
|
|
{
|
2011-04-05 11:02:27 -06:00
|
|
|
if (!leaf->map_token)
|
|
|
|
map_private_extent_buffer(leaf, (unsigned long)item,
|
|
|
|
sizeof(struct btrfs_inode_item),
|
|
|
|
&leaf->map_token, &leaf->kaddr,
|
|
|
|
&leaf->map_start, &leaf->map_len,
|
|
|
|
KM_USER1);
|
|
|
|
|
2007-10-15 14:14:19 -06:00
|
|
|
btrfs_set_inode_uid(leaf, item, inode->i_uid);
|
|
|
|
btrfs_set_inode_gid(leaf, item, inode->i_gid);
|
2008-07-17 10:54:05 -06:00
|
|
|
btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
|
2007-10-15 14:14:19 -06:00
|
|
|
btrfs_set_inode_mode(leaf, item, inode->i_mode);
|
|
|
|
btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
|
|
|
|
|
|
|
|
btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
|
|
|
|
inode->i_atime.tv_sec);
|
|
|
|
btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
|
|
|
|
inode->i_atime.tv_nsec);
|
|
|
|
|
|
|
|
btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
|
|
|
|
inode->i_mtime.tv_sec);
|
|
|
|
btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
|
|
|
|
inode->i_mtime.tv_nsec);
|
|
|
|
|
|
|
|
btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
|
|
|
|
inode->i_ctime.tv_sec);
|
|
|
|
btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
|
|
|
|
inode->i_ctime.tv_nsec);
|
|
|
|
|
2008-10-09 09:46:29 -06:00
|
|
|
btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
|
2008-09-05 14:13:11 -06:00
|
|
|
btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
|
2008-12-08 14:40:21 -07:00
|
|
|
btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence);
|
2008-09-05 14:13:11 -06:00
|
|
|
btrfs_set_inode_transid(leaf, item, trans->transid);
|
2007-10-15 14:14:19 -06:00
|
|
|
btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
|
2008-01-08 13:54:37 -07:00
|
|
|
btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
|
2011-05-11 13:26:06 -06:00
|
|
|
btrfs_set_inode_block_group(leaf, item, 0);
|
2011-04-05 11:02:27 -06:00
|
|
|
|
|
|
|
if (leaf->map_token) {
|
|
|
|
unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
|
|
|
|
leaf->map_token = NULL;
|
|
|
|
}
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
|
|
|
|
2008-09-29 13:18:18 -06:00
|
|
|
/*
|
|
|
|
* copy everything in the in-memory inode into the btree.
|
|
|
|
*/
|
2009-01-05 19:25:51 -07:00
|
|
|
noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root, struct inode *inode)
|
2007-06-12 04:35:45 -06:00
|
|
|
{
|
|
|
|
struct btrfs_inode_item *inode_item;
|
|
|
|
struct btrfs_path *path;
|
2007-10-15 14:14:19 -06:00
|
|
|
struct extent_buffer *leaf;
|
2007-06-12 04:35:45 -06:00
|
|
|
int ret;
|
|
|
|
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch to address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the number of delayed items exceeds the upper bound, we create works for all
the delayed nodes that haven't been dealt with, insert them into the work
queue of the worker, and then wait until the number of untreated items drops
below some threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we first search
for it in the inserting rb-tree. If we find it there, we just drop it. If not,
we add its key to the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 04:12:22 -06:00
|
|
|
/*
|
2011-07-06 16:51:53 -06:00
|
|
|
* If the inode is a free space inode, we can deadlock during commit
|
|
|
|
* if we put it into the delayed code.
|
|
|
|
*
|
|
|
|
* The data relocation inode should also be directly updated
|
|
|
|
* without delay
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch to address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the number of delayed items exceeds the upper bound, we create works for all
the delayed nodes that haven't been dealt with, insert them into the work
queue of the worker, and then wait until the number of untreated items drops
below some threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we first search
for it in the inserting rb-tree. If we find it there, we just drop it. If not,
we add its key to the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 04:12:22 -06:00
|
|
|
*/
|
2011-07-06 16:51:53 -06:00
|
|
|
if (!is_free_space_inode(root, inode)
|
|
|
|
&& root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch to address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the number of delayed items exceeds the upper bound, we create works for all
the delayed nodes that haven't been dealt with, insert them into the work
queue of the worker, and then wait until the number of untreated items drops
below some threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we first search
for it in the inserting rb-tree. If we find it there, we just drop it. If not,
we add its key to the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 04:12:22 -06:00
|
|
|
ret = btrfs_delayed_update_inode(trans, root, inode);
|
|
|
|
if (!ret)
|
|
|
|
btrfs_set_inode_last_trans(trans, inode);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
path = btrfs_alloc_path();
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch to address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the number of delayed items exceeds the upper bound, we create works for all
the delayed nodes that haven't been dealt with, insert them into the work
queue of the worker, and then wait until the number of untreated items drops
below some threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we first search
for it in the inserting rb-tree. If we find it there, we just drop it. If not,
we add its key to the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 04:12:22 -06:00
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2009-03-13 09:00:37 -06:00
|
|
|
path->leave_spinning = 1;
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch to address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the number of delayed items exceeds the upper bound, we create works for all
the delayed nodes that haven't been dealt with, insert them into the work
queue of the worker, and then wait until the number of untreated items drops
below some threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we first search
for it in the inserting rb-tree. If we find it there, we just drop it. If not,
we add its key to the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 04:12:22 -06:00
|
|
|
ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
|
|
|
|
1);
|
2007-06-12 04:35:45 -06:00
|
|
|
if (ret) {
|
|
|
|
if (ret > 0)
|
|
|
|
ret = -ENOENT;
|
|
|
|
goto failed;
|
|
|
|
}
|
|
|
|
|
Btrfs: Change btree locking to use explicit blocking points
Most of the btrfs metadata operations can be protected by a spinlock,
but some operations still need to schedule.
So far, btrfs has been using a mutex along with a trylock loop,
most of the time it is able to avoid going for the full mutex, so
the trylock loop is a big performance gain.
This commit is step one for getting rid of the blocking locks entirely.
btrfs_tree_lock takes a spinlock, and the code explicitly switches
to a blocking lock when it starts an operation that can schedule.
We'll be able get rid of the blocking locks in smaller pieces over time.
Tracing allows us to find the most common cause of blocking, so we
can start with the hot spots first.
The basic idea is:
btrfs_tree_lock() returns with the spin lock held
btrfs_set_lock_blocking() sets the EXTENT_BUFFER_BLOCKING bit in
the extent buffer flags, and then drops the spin lock. The buffer is
still considered locked by all of the btrfs code.
If btrfs_tree_lock gets the spinlock but finds the blocking bit set, it drops
the spin lock and waits on a wait queue for the blocking bit to go away.
Much of the code that needs to set the blocking bit finishes without actually
blocking a good percentage of the time. So, an adaptive spin is still
used against the blocking bit to avoid very high context switch rates.
btrfs_clear_lock_blocking() clears the blocking bit and returns
with the spinlock held again.
btrfs_tree_unlock() can be called on either blocking or spinning locks,
it does the right thing based on the blocking bit.
ctree.c has a helper function to set/clear all the locked buffers in a
path as blocking.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-02-04 07:25:08 -07:00
|
|
|
btrfs_unlock_up_safe(path, 1);
|
2007-10-15 14:14:19 -06:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
inode_item = btrfs_item_ptr(leaf, path->slots[0],
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 04:12:22 -06:00
|
|
|
struct btrfs_inode_item);
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2008-09-05 14:13:11 -06:00
|
|
|
fill_inode_item(trans, leaf, inode_item, inode);
|
2007-10-15 14:14:19 -06:00
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
2007-08-10 14:22:09 -06:00
|
|
|
btrfs_set_inode_last_trans(trans, inode);
|
2007-06-12 04:35:45 -06:00
|
|
|
ret = 0;
|
|
|
|
failed:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2008-09-29 13:18:18 -06:00
|
|
|
/*
|
|
|
|
* unlink helper that gets used here in inode.c and in the tree logging
|
|
|
|
* recovery code. It remove a link in a directory with a given name, and
|
|
|
|
* also drops the back refs in the inode to the directory
|
|
|
|
*/
|
2011-03-04 10:14:37 -07:00
|
|
|
static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
|
|
|
struct inode *dir, struct inode *inode,
|
|
|
|
const char *name, int name_len)
|
2007-06-12 04:35:45 -06:00
|
|
|
{
|
|
|
|
struct btrfs_path *path;
|
|
|
|
int ret = 0;
|
2007-10-15 14:14:19 -06:00
|
|
|
struct extent_buffer *leaf;
|
2007-06-12 04:35:45 -06:00
|
|
|
struct btrfs_dir_item *di;
|
2007-10-15 14:14:19 -06:00
|
|
|
struct btrfs_key key;
|
2008-07-24 10:12:38 -06:00
|
|
|
u64 index;
|
2011-04-19 20:31:50 -06:00
|
|
|
u64 ino = btrfs_ino(inode);
|
|
|
|
u64 dir_ino = btrfs_ino(dir);
|
2007-06-12 04:35:45 -06:00
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
2007-06-22 12:16:25 -06:00
|
|
|
if (!path) {
|
|
|
|
ret = -ENOMEM;
|
2011-02-02 20:16:25 -07:00
|
|
|
goto out;
|
2007-06-22 12:16:25 -06:00
|
|
|
}
|
|
|
|
|
2009-03-13 09:00:37 -06:00
|
|
|
path->leave_spinning = 1;
|
2011-04-19 20:31:50 -06:00
|
|
|
di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
|
2007-06-12 04:35:45 -06:00
|
|
|
name, name_len, -1);
|
|
|
|
if (IS_ERR(di)) {
|
|
|
|
ret = PTR_ERR(di);
|
|
|
|
goto err;
|
|
|
|
}
|
|
|
|
if (!di) {
|
|
|
|
ret = -ENOENT;
|
|
|
|
goto err;
|
|
|
|
}
|
2007-10-15 14:14:19 -06:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
btrfs_dir_item_key_to_cpu(leaf, di, &key);
|
2007-06-12 04:35:45 -06:00
|
|
|
ret = btrfs_delete_one_dir_name(trans, root, path, di);
|
2007-06-22 12:16:25 -06:00
|
|
|
if (ret)
|
|
|
|
goto err;
|
2011-04-20 17:20:15 -06:00
|
|
|
btrfs_release_path(path);
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2011-04-19 20:31:50 -06:00
|
|
|
ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
|
|
|
|
dir_ino, &index);
|
2008-07-24 10:12:38 -06:00
|
|
|
if (ret) {
|
2009-01-05 19:25:51 -07:00
|
|
|
printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
|
2011-04-19 20:31:50 -06:00
|
|
|
"inode %llu parent %llu\n", name_len, name,
|
|
|
|
(unsigned long long)ino, (unsigned long long)dir_ino);
|
2008-07-24 10:12:38 -06:00
|
|
|
goto err;
|
|
|
|
}
|
|
|
|
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 04:12:22 -06:00
|
|
|
ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
|
|
|
|
if (ret)
|
2007-06-12 04:35:45 -06:00
|
|
|
goto err;
|
|
|
|
|
2008-09-05 14:13:11 -06:00
|
|
|
ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
|
2011-04-19 20:31:50 -06:00
|
|
|
inode, dir_ino);
|
2008-09-11 13:53:12 -06:00
|
|
|
BUG_ON(ret != 0 && ret != -ENOENT);
|
2008-09-05 14:13:11 -06:00
|
|
|
|
|
|
|
ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
|
|
|
|
dir, index);
|
2010-10-30 05:34:24 -06:00
|
|
|
if (ret == -ENOENT)
|
|
|
|
ret = 0;
|
2007-06-12 04:35:45 -06:00
|
|
|
err:
|
|
|
|
btrfs_free_path(path);
|
2008-09-05 14:13:11 -06:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
btrfs_i_size_write(dir, dir->i_size - name_len * 2);
|
|
|
|
inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
|
|
|
|
btrfs_update_inode(trans, root, dir);
|
|
|
|
out:
|
2007-06-12 04:35:45 -06:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2011-03-04 10:14:37 -07:00
|
|
|
/*
 * Remove the link @name in @dir that points at @inode and, when that
 * succeeded, drop one link from @inode and write its inode back.
 *
 * Returns the error from __btrfs_unlink_inode() if the link could not be
 * removed, otherwise the result of btrfs_update_inode() on @inode.
 */
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root,
		       struct inode *dir, struct inode *inode,
		       const char *name, int name_len)
{
	int err = __btrfs_unlink_inode(trans, root, dir, inode,
				       name, name_len);

	/* nothing was unlinked, leave the link count alone */
	if (err)
		return err;

	btrfs_drop_nlink(inode);
	return btrfs_update_inode(trans, root, inode);
}
|
|
|
|
|
|
|
|
|
2010-05-16 08:48:46 -06:00
|
|
|
/* helper to check if there is any shared block in the path */
|
|
|
|
static int check_path_shared(struct btrfs_root *root,
|
|
|
|
struct btrfs_path *path)
|
2007-06-12 04:35:45 -06:00
|
|
|
{
|
2010-05-16 08:48:46 -06:00
|
|
|
struct extent_buffer *eb;
|
|
|
|
int level;
|
2010-06-01 02:23:11 -06:00
|
|
|
u64 refs = 1;
|
2009-11-10 19:23:48 -07:00
|
|
|
|
2010-05-16 08:48:46 -06:00
|
|
|
for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
|
2011-01-24 14:43:18 -07:00
|
|
|
int ret;
|
|
|
|
|
2010-05-16 08:48:46 -06:00
|
|
|
if (!path->nodes[level])
|
|
|
|
break;
|
|
|
|
eb = path->nodes[level];
|
|
|
|
if (!btrfs_block_can_be_shared(root, eb))
|
|
|
|
continue;
|
|
|
|
ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len,
|
|
|
|
&refs, NULL);
|
|
|
|
if (refs > 1)
|
|
|
|
return 1;
|
2009-11-10 19:23:48 -07:00
|
|
|
}
|
2011-01-24 14:43:18 -07:00
|
|
|
return 0;
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
|
|
|
|
2010-05-16 08:48:46 -06:00
|
|
|
/*
|
|
|
|
* helper to start transaction for unlink and rmdir.
|
|
|
|
*
|
|
|
|
* unlink and rmdir are special in btrfs, they do not always free space.
|
|
|
|
* so in enospc case, we should make sure they will free space before
|
|
|
|
* allowing them to use the global metadata reservation.
|
|
|
|
*/
|
|
|
|
static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
|
|
|
|
struct dentry *dentry)
|
2009-09-21 13:56:00 -06:00
|
|
|
{
|
2007-06-12 04:35:45 -06:00
|
|
|
struct btrfs_trans_handle *trans;
|
2010-05-16 08:48:46 -06:00
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
2009-09-21 13:56:00 -06:00
|
|
|
struct btrfs_path *path;
|
2010-05-16 08:48:46 -06:00
|
|
|
struct btrfs_inode_ref *ref;
|
2009-09-21 13:56:00 -06:00
|
|
|
struct btrfs_dir_item *di;
|
2008-07-24 10:17:14 -06:00
|
|
|
struct inode *inode = dentry->d_inode;
|
2009-09-21 13:56:00 -06:00
|
|
|
u64 index;
|
2010-05-16 08:48:46 -06:00
|
|
|
int check_link = 1;
|
|
|
|
int err = -ENOSPC;
|
2009-09-21 13:56:00 -06:00
|
|
|
int ret;
|
2011-04-19 20:31:50 -06:00
|
|
|
u64 ino = btrfs_ino(inode);
|
|
|
|
u64 dir_ino = btrfs_ino(dir);
|
2009-09-21 13:56:00 -06:00
|
|
|
|
2010-05-16 08:48:46 -06:00
|
|
|
trans = btrfs_start_transaction(root, 10);
|
|
|
|
if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
|
|
|
|
return trans;
|
2009-09-21 13:56:00 -06:00
|
|
|
|
2011-04-19 20:31:50 -06:00
|
|
|
if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
|
2010-05-16 08:48:46 -06:00
|
|
|
return ERR_PTR(-ENOSPC);
|
2009-09-21 13:56:00 -06:00
|
|
|
|
2010-05-16 08:48:46 -06:00
|
|
|
/* check if there is someone else holds reference */
|
|
|
|
if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
|
|
|
|
return ERR_PTR(-ENOSPC);
|
2009-09-21 13:56:00 -06:00
|
|
|
|
2010-05-16 08:48:46 -06:00
|
|
|
if (atomic_read(&inode->i_count) > 2)
|
|
|
|
return ERR_PTR(-ENOSPC);
|
2009-09-21 13:56:00 -06:00
|
|
|
|
2010-05-16 08:48:46 -06:00
|
|
|
if (xchg(&root->fs_info->enospc_unlink, 1))
|
|
|
|
return ERR_PTR(-ENOSPC);
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path) {
|
|
|
|
root->fs_info->enospc_unlink = 0;
|
|
|
|
return ERR_PTR(-ENOMEM);
|
2009-09-21 13:56:00 -06:00
|
|
|
}
|
|
|
|
|
2010-05-16 08:48:46 -06:00
|
|
|
trans = btrfs_start_transaction(root, 0);
|
2009-11-10 19:23:48 -07:00
|
|
|
if (IS_ERR(trans)) {
|
2010-05-16 08:48:46 -06:00
|
|
|
btrfs_free_path(path);
|
|
|
|
root->fs_info->enospc_unlink = 0;
|
|
|
|
return trans;
|
|
|
|
}
|
2009-09-21 13:56:00 -06:00
|
|
|
|
2010-05-16 08:48:46 -06:00
|
|
|
path->skip_locking = 1;
|
|
|
|
path->search_commit_root = 1;
|
2009-09-21 13:56:00 -06:00
|
|
|
|
2010-05-16 08:48:46 -06:00
|
|
|
ret = btrfs_lookup_inode(trans, root, path,
|
|
|
|
&BTRFS_I(dir)->location, 0);
|
|
|
|
if (ret < 0) {
|
|
|
|
err = ret;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (ret == 0) {
|
|
|
|
if (check_path_shared(root, path))
|
|
|
|
goto out;
|
|
|
|
} else {
|
|
|
|
check_link = 0;
|
2009-11-10 19:23:48 -07:00
|
|
|
}
|
2011-04-20 17:20:15 -06:00
|
|
|
btrfs_release_path(path);
|
2010-05-16 08:48:46 -06:00
|
|
|
|
|
|
|
ret = btrfs_lookup_inode(trans, root, path,
|
|
|
|
&BTRFS_I(inode)->location, 0);
|
|
|
|
if (ret < 0) {
|
|
|
|
err = ret;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (ret == 0) {
|
|
|
|
if (check_path_shared(root, path))
|
|
|
|
goto out;
|
|
|
|
} else {
|
|
|
|
check_link = 0;
|
|
|
|
}
|
2011-04-20 17:20:15 -06:00
|
|
|
btrfs_release_path(path);
|
2010-05-16 08:48:46 -06:00
|
|
|
|
|
|
|
if (ret == 0 && S_ISREG(inode->i_mode)) {
|
|
|
|
ret = btrfs_lookup_file_extent(trans, root, path,
|
2011-04-19 20:31:50 -06:00
|
|
|
ino, (u64)-1, 0);
|
2010-05-16 08:48:46 -06:00
|
|
|
if (ret < 0) {
|
|
|
|
err = ret;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
BUG_ON(ret == 0);
|
|
|
|
if (check_path_shared(root, path))
|
|
|
|
goto out;
|
2011-04-20 17:20:15 -06:00
|
|
|
btrfs_release_path(path);
|
2010-05-16 08:48:46 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
if (!check_link) {
|
|
|
|
err = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2011-04-19 20:31:50 -06:00
|
|
|
di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
|
2010-05-16 08:48:46 -06:00
|
|
|
dentry->d_name.name, dentry->d_name.len, 0);
|
|
|
|
if (IS_ERR(di)) {
|
|
|
|
err = PTR_ERR(di);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (di) {
|
|
|
|
if (check_path_shared(root, path))
|
|
|
|
goto out;
|
|
|
|
} else {
|
|
|
|
err = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
2011-04-20 17:20:15 -06:00
|
|
|
btrfs_release_path(path);
|
2010-05-16 08:48:46 -06:00
|
|
|
|
|
|
|
ref = btrfs_lookup_inode_ref(trans, root, path,
|
|
|
|
dentry->d_name.name, dentry->d_name.len,
|
2011-04-19 20:31:50 -06:00
|
|
|
ino, dir_ino, 0);
|
2010-05-16 08:48:46 -06:00
|
|
|
if (IS_ERR(ref)) {
|
|
|
|
err = PTR_ERR(ref);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
BUG_ON(!ref);
|
|
|
|
if (check_path_shared(root, path))
|
|
|
|
goto out;
|
|
|
|
index = btrfs_inode_ref_index(path->nodes[0], ref);
|
2011-04-20 17:20:15 -06:00
|
|
|
btrfs_release_path(path);
|
2010-05-16 08:48:46 -06:00
|
|
|
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 04:12:22 -06:00
|
|
|
/*
|
|
|
|
* This is a commit root search, if we can lookup inode item and other
|
|
|
|
* relative items in the commit root, it means the transaction of
|
|
|
|
* dir/file creation has been committed, and the dir index item that we
|
|
|
|
* delay to insert has also been inserted into the commit root. So
|
|
|
|
* we needn't worry about the delayed insertion of the dir index item
|
|
|
|
* here.
|
|
|
|
*/
|
2011-04-19 20:31:50 -06:00
|
|
|
di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index,
|
2010-05-16 08:48:46 -06:00
|
|
|
dentry->d_name.name, dentry->d_name.len, 0);
|
|
|
|
if (IS_ERR(di)) {
|
|
|
|
err = PTR_ERR(di);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
BUG_ON(ret == -ENOENT);
|
|
|
|
if (check_path_shared(root, path))
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
err = 0;
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
if (err) {
|
|
|
|
btrfs_end_transaction(trans, root);
|
|
|
|
root->fs_info->enospc_unlink = 0;
|
|
|
|
return ERR_PTR(err);
|
|
|
|
}
|
|
|
|
|
|
|
|
trans->block_rsv = &root->fs_info->global_block_rsv;
|
|
|
|
return trans;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void __unlink_end_trans(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root)
|
|
|
|
{
|
|
|
|
if (trans->block_rsv == &root->fs_info->global_block_rsv) {
|
|
|
|
BUG_ON(!root->fs_info->enospc_unlink);
|
|
|
|
root->fs_info->enospc_unlink = 0;
|
|
|
|
}
|
|
|
|
btrfs_end_transaction_throttle(trans, root);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct inode *inode = dentry->d_inode;
|
|
|
|
int ret;
|
|
|
|
unsigned long nr = 0;
|
|
|
|
|
|
|
|
trans = __unlink_start_trans(dir, dentry);
|
|
|
|
if (IS_ERR(trans))
|
|
|
|
return PTR_ERR(trans);
|
2007-10-15 14:14:19 -06:00
|
|
|
|
2009-03-24 08:24:20 -06:00
|
|
|
btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
|
|
|
|
|
2008-09-05 14:13:11 -06:00
|
|
|
ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
|
|
|
|
dentry->d_name.name, dentry->d_name.len);
|
2010-05-16 08:48:46 -06:00
|
|
|
BUG_ON(ret);
|
2008-07-24 10:17:14 -06:00
|
|
|
|
2010-05-16 08:48:46 -06:00
|
|
|
if (inode->i_nlink == 0) {
|
2008-07-24 10:17:14 -06:00
|
|
|
ret = btrfs_orphan_add(trans, inode);
|
2010-05-16 08:48:46 -06:00
|
|
|
BUG_ON(ret);
|
|
|
|
}
|
2008-07-24 10:17:14 -06:00
|
|
|
|
2007-09-17 08:58:06 -06:00
|
|
|
nr = trans->blocks_used;
|
2010-05-16 08:48:46 -06:00
|
|
|
__unlink_end_trans(trans, root);
|
2007-09-17 08:58:06 -06:00
|
|
|
btrfs_btree_balance_dirty(root, nr);
|
2007-06-12 04:35:45 -06:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2009-09-21 13:56:00 -06:00
|
|
|
/*
 * Remove the entry @name/@name_len in directory @dir that refers to the
 * subvolume root @objectid.
 *
 * Steps, in order:
 *  - look up the dir item for the name and delete it; a WARN_ON fires if
 *    the item's key is not a BTRFS_ROOT_ITEM_KEY for @objectid,
 *  - call btrfs_del_root_ref() on the tree root, which is expected to
 *    report the dir index through its &index argument; if it fails with
 *    -ENOENT the index is taken from the key offset of the item found by
 *    btrfs_search_dir_index_item() instead,
 *  - hand the index to btrfs_delete_delayed_dir_index(),
 *  - reduce the directory's i_size by 2 * name_len, set its mtime and
 *    ctime to CURRENT_TIME and update the inode.
 *
 * Returns -ENOMEM when the path cannot be allocated and 0 otherwise; all
 * other failures hit BUG_ON.
 */
int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
			struct btrfs_root *root,
			struct inode *dir, u64 objectid,
			const char *name, int name_len)
{
	u64 dir_ino = btrfs_ino(dir);
	u64 dir_index;
	struct btrfs_key location;
	struct btrfs_dir_item *dir_item;
	struct extent_buffer *eb;
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Find the dir item for this name and remove it. */
	dir_item = btrfs_lookup_dir_item(trans, root, path, dir_ino,
					 name, name_len, -1);
	BUG_ON(IS_ERR_OR_NULL(dir_item));

	eb = path->nodes[0];
	btrfs_dir_item_key_to_cpu(eb, dir_item, &location);
	WARN_ON(location.type != BTRFS_ROOT_ITEM_KEY ||
		location.objectid != objectid);
	ret = btrfs_delete_one_dir_name(trans, root, path, dir_item);
	BUG_ON(ret);
	btrfs_release_path(path);

	/* Drop the root reference held in the tree root. */
	ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
				 objectid, root->root_key.objectid,
				 dir_ino, &dir_index, name, name_len);
	if (ret < 0) {
		/*
		 * Only -ENOENT is tolerated here; in that case read the
		 * index from the key offset of the dir index item.
		 */
		BUG_ON(ret != -ENOENT);
		dir_item = btrfs_search_dir_index_item(root, path, dir_ino,
						       name, name_len);
		BUG_ON(IS_ERR_OR_NULL(dir_item));

		eb = path->nodes[0];
		btrfs_item_key_to_cpu(eb, &location, path->slots[0]);
		btrfs_release_path(path);
		dir_index = location.offset;
	}
	btrfs_release_path(path);

	ret = btrfs_delete_delayed_dir_index(trans, root, dir, dir_index);
	BUG_ON(ret);

	/* Account for the removed name in the directory inode. */
	btrfs_i_size_write(dir, dir->i_size - name_len * 2);
	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
	ret = btrfs_update_inode(trans, root, dir);
	BUG_ON(ret);

	btrfs_free_path(path);
	return 0;
}
|
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
|
|
|
|
{
|
|
|
|
struct inode *inode = dentry->d_inode;
|
2007-12-21 14:27:21 -07:00
|
|
|
int err = 0;
|
2007-06-12 04:35:45 -06:00
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
|
|
|
struct btrfs_trans_handle *trans;
|
2007-12-21 14:27:21 -07:00
|
|
|
unsigned long nr = 0;
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2008-11-17 18:42:26 -07:00
|
|
|
if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
|
2011-04-19 20:31:50 -06:00
|
|
|
btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
|
2007-10-25 13:49:25 -06:00
|
|
|
return -ENOTEMPTY;
|
|
|
|
|
2010-05-16 08:48:46 -06:00
|
|
|
trans = __unlink_start_trans(dir, dentry);
|
|
|
|
if (IS_ERR(trans))
|
2009-11-10 19:23:48 -07:00
|
|
|
return PTR_ERR(trans);
|
|
|
|
|
2011-04-19 20:31:50 -06:00
|
|
|
if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
|
2009-09-21 13:56:00 -06:00
|
|
|
err = btrfs_unlink_subvol(trans, root, dir,
|
|
|
|
BTRFS_I(inode)->location.objectid,
|
|
|
|
dentry->d_name.name,
|
|
|
|
dentry->d_name.len);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2008-07-24 10:17:14 -06:00
|
|
|
err = btrfs_orphan_add(trans, inode);
|
|
|
|
if (err)
|
2009-09-21 13:56:00 -06:00
|
|
|
goto out;
|
2008-07-24 10:17:14 -06:00
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
/* now the directory is empty */
|
2008-09-05 14:13:11 -06:00
|
|
|
err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
|
|
|
|
dentry->d_name.name, dentry->d_name.len);
|
2009-01-05 19:25:51 -07:00
|
|
|
if (!err)
|
2008-07-17 10:54:05 -06:00
|
|
|
btrfs_i_size_write(inode, 0);
|
2009-09-21 13:56:00 -06:00
|
|
|
out:
|
2007-09-17 08:58:06 -06:00
|
|
|
nr = trans->blocks_used;
|
2010-05-16 08:48:46 -06:00
|
|
|
__unlink_end_trans(trans, root);
|
2007-09-17 08:58:06 -06:00
|
|
|
btrfs_btree_balance_dirty(root, nr);
|
2007-12-12 12:38:19 -07:00
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* this can truncate away extent items, csum items and directory items.
|
|
|
|
* It starts at a high offset and removes keys until it can't find
|
2008-09-29 13:18:18 -06:00
|
|
|
* any higher than new_size
|
2007-06-12 04:35:45 -06:00
|
|
|
*
|
|
|
|
* csum items that cross the new i_size are truncated to the new size
|
|
|
|
* as well.
|
2008-07-24 10:17:14 -06:00
|
|
|
*
|
|
|
|
* min_type is the minimum key type to truncate down to. If set to 0, this
|
|
|
|
* will kill all the items on this inode, including the INODE_ITEM_KEY.
|
2007-06-12 04:35:45 -06:00
|
|
|
*/
|
2009-11-12 02:35:36 -07:00
|
|
|
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
|
|
|
struct inode *inode,
|
|
|
|
u64 new_size, u32 min_type)
|
2007-06-12 04:35:45 -06:00
|
|
|
{
|
|
|
|
struct btrfs_path *path;
|
2007-10-15 14:14:19 -06:00
|
|
|
struct extent_buffer *leaf;
|
2007-06-12 04:35:45 -06:00
|
|
|
struct btrfs_file_extent_item *fi;
|
2009-11-12 02:35:36 -07:00
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_key found_key;
|
2007-06-12 04:35:45 -06:00
|
|
|
u64 extent_start = 0;
|
2007-10-15 14:15:53 -06:00
|
|
|
u64 extent_num_bytes = 0;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 08:45:14 -06:00
|
|
|
u64 extent_offset = 0;
|
2007-06-12 04:35:45 -06:00
|
|
|
u64 item_end = 0;
|
2009-11-12 02:35:36 -07:00
|
|
|
u64 mask = root->sectorsize - 1;
|
|
|
|
u32 found_type = (u8)-1;
|
2007-06-12 04:35:45 -06:00
|
|
|
int found_extent;
|
|
|
|
int del_item;
|
2008-01-29 13:11:36 -07:00
|
|
|
int pending_del_nr = 0;
|
|
|
|
int pending_del_slot = 0;
|
2007-11-01 09:28:41 -06:00
|
|
|
int extent_type = -1;
|
2008-11-06 20:02:51 -07:00
|
|
|
int encoding;
|
2009-11-12 02:35:36 -07:00
|
|
|
int ret;
|
|
|
|
int err = 0;
|
2011-04-19 20:31:50 -06:00
|
|
|
u64 ino = btrfs_ino(inode);
|
2009-11-12 02:35:36 -07:00
|
|
|
|
|
|
|
BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2010-06-21 12:48:16 -06:00
|
|
|
if (root->ref_cows || root == root->fs_info->tree_root)
|
2008-09-26 08:05:38 -06:00
|
|
|
btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
|
2009-11-12 02:35:36 -07:00
|
|
|
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 04:12:22 -06:00
|
|
|
/*
|
|
|
|
* This function is also used to drop the items in the log tree before
|
|
|
|
* we relog the inode, so if root != BTRFS_I(inode)->root, it means
|
|
|
|
* it is used to drop the loged items. So we shouldn't kill the delayed
|
|
|
|
* items.
|
|
|
|
*/
|
|
|
|
if (min_type == 0 && root == BTRFS_I(inode)->root)
|
|
|
|
btrfs_kill_delayed_inode_items(inode);
|
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
path = btrfs_alloc_path();
|
|
|
|
BUG_ON(!path);
|
2009-07-22 14:49:01 -06:00
|
|
|
path->reada = -1;
|
2007-10-15 14:14:19 -06:00
|
|
|
|
2011-04-19 20:31:50 -06:00
|
|
|
key.objectid = ino;
|
2007-06-12 04:35:45 -06:00
|
|
|
key.offset = (u64)-1;
|
2007-10-15 14:14:19 -06:00
|
|
|
key.type = (u8)-1;
|
|
|
|
|
2008-01-29 13:11:36 -07:00
|
|
|
search_again:
|
2009-03-13 09:00:37 -06:00
|
|
|
path->leave_spinning = 1;
|
2008-01-29 13:11:36 -07:00
|
|
|
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
|
2009-11-12 02:35:36 -07:00
|
|
|
if (ret < 0) {
|
|
|
|
err = ret;
|
|
|
|
goto out;
|
|
|
|
}
|
2009-01-05 19:25:51 -07:00
|
|
|
|
2008-01-29 13:11:36 -07:00
|
|
|
if (ret > 0) {
|
2008-09-05 14:13:11 -06:00
|
|
|
/* there are no items in the tree for us to truncate, we're
|
|
|
|
* done
|
|
|
|
*/
|
2009-11-12 02:35:36 -07:00
|
|
|
if (path->slots[0] == 0)
|
|
|
|
goto out;
|
2008-01-29 13:11:36 -07:00
|
|
|
path->slots[0]--;
|
|
|
|
}
|
|
|
|
|
2009-01-05 19:25:51 -07:00
|
|
|
while (1) {
|
2007-06-12 04:35:45 -06:00
|
|
|
fi = NULL;
|
2007-10-15 14:14:19 -06:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
|
|
|
|
found_type = btrfs_key_type(&found_key);
|
2008-11-06 20:02:51 -07:00
|
|
|
encoding = 0;
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2011-04-19 20:31:50 -06:00
|
|
|
if (found_key.objectid != ino)
|
2007-06-12 04:35:45 -06:00
|
|
|
break;
|
2007-10-15 14:14:19 -06:00
|
|
|
|
2008-01-29 13:11:36 -07:00
|
|
|
if (found_type < min_type)
|
2007-06-12 04:35:45 -06:00
|
|
|
break;
|
|
|
|
|
2007-10-15 14:14:19 -06:00
|
|
|
item_end = found_key.offset;
|
2007-06-12 04:35:45 -06:00
|
|
|
if (found_type == BTRFS_EXTENT_DATA_KEY) {
|
2007-10-15 14:14:19 -06:00
|
|
|
fi = btrfs_item_ptr(leaf, path->slots[0],
|
2007-06-12 04:35:45 -06:00
|
|
|
struct btrfs_file_extent_item);
|
2007-11-01 09:28:41 -06:00
|
|
|
extent_type = btrfs_file_extent_type(leaf, fi);
|
2008-11-06 20:02:51 -07:00
|
|
|
encoding = btrfs_file_extent_compression(leaf, fi);
|
|
|
|
encoding |= btrfs_file_extent_encryption(leaf, fi);
|
|
|
|
encoding |= btrfs_file_extent_other_encoding(leaf, fi);
|
|
|
|
|
2007-11-01 09:28:41 -06:00
|
|
|
if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
|
2007-10-15 14:14:19 -06:00
|
|
|
item_end +=
|
2007-10-15 14:15:53 -06:00
|
|
|
btrfs_file_extent_num_bytes(leaf, fi);
|
2007-11-01 09:28:41 -06:00
|
|
|
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
|
|
|
|
item_end += btrfs_file_extent_inline_len(leaf,
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
fi);
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
2007-11-07 11:31:09 -07:00
|
|
|
item_end--;
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
2009-11-12 02:35:36 -07:00
|
|
|
if (found_type > min_type) {
|
|
|
|
del_item = 1;
|
|
|
|
} else {
|
|
|
|
if (item_end < new_size)
|
2007-08-27 14:49:44 -06:00
|
|
|
break;
|
2009-11-12 02:35:36 -07:00
|
|
|
if (found_key.offset >= new_size)
|
|
|
|
del_item = 1;
|
|
|
|
else
|
|
|
|
del_item = 0;
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
|
|
|
found_extent = 0;
|
|
|
|
/* FIXME, shrink the extent if the ref count is only 1 */
|
2007-11-01 09:28:41 -06:00
|
|
|
if (found_type != BTRFS_EXTENT_DATA_KEY)
|
|
|
|
goto delete;
|
|
|
|
|
|
|
|
if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
|
2007-06-12 04:35:45 -06:00
|
|
|
u64 num_dec;
|
2007-10-15 14:15:53 -06:00
|
|
|
extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
|
2008-11-06 20:02:51 -07:00
|
|
|
if (!del_item && !encoding) {
|
2007-10-15 14:15:53 -06:00
|
|
|
u64 orig_num_bytes =
|
|
|
|
btrfs_file_extent_num_bytes(leaf, fi);
|
2008-09-05 14:13:11 -06:00
|
|
|
extent_num_bytes = new_size -
|
2007-10-15 14:14:19 -06:00
|
|
|
found_key.offset + root->sectorsize - 1;
|
2008-01-30 09:54:04 -07:00
|
|
|
extent_num_bytes = extent_num_bytes &
|
|
|
|
~((u64)root->sectorsize - 1);
|
2007-10-15 14:15:53 -06:00
|
|
|
btrfs_set_file_extent_num_bytes(leaf, fi,
|
|
|
|
extent_num_bytes);
|
|
|
|
num_dec = (orig_num_bytes -
|
2008-02-08 11:49:28 -07:00
|
|
|
extent_num_bytes);
|
2008-09-05 14:13:11 -06:00
|
|
|
if (root->ref_cows && extent_start != 0)
|
2008-10-09 09:46:29 -06:00
|
|
|
inode_sub_bytes(inode, num_dec);
|
2007-10-15 14:14:19 -06:00
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
2007-06-12 04:35:45 -06:00
|
|
|
} else {
|
2007-10-15 14:15:53 -06:00
|
|
|
extent_num_bytes =
|
|
|
|
btrfs_file_extent_disk_num_bytes(leaf,
|
|
|
|
fi);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 08:45:14 -06:00
|
|
|
extent_offset = found_key.offset -
|
|
|
|
btrfs_file_extent_offset(leaf, fi);
|
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
/* FIXME blocksize != 4096 */
|
2008-02-08 11:49:28 -07:00
|
|
|
num_dec = btrfs_file_extent_num_bytes(leaf, fi);
|
2007-06-12 04:35:45 -06:00
|
|
|
if (extent_start != 0) {
|
|
|
|
found_extent = 1;
|
2008-09-05 14:13:11 -06:00
|
|
|
if (root->ref_cows)
|
2008-10-09 09:46:29 -06:00
|
|
|
inode_sub_bytes(inode, num_dec);
|
2008-09-05 14:13:11 -06:00
|
|
|
}
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
2008-02-08 11:49:28 -07:00
|
|
|
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
/*
|
|
|
|
* we can't truncate inline items that have had
|
|
|
|
* special encodings
|
|
|
|
*/
|
|
|
|
if (!del_item &&
|
|
|
|
btrfs_file_extent_compression(leaf, fi) == 0 &&
|
|
|
|
btrfs_file_extent_encryption(leaf, fi) == 0 &&
|
|
|
|
btrfs_file_extent_other_encoding(leaf, fi) == 0) {
|
2008-09-05 14:13:11 -06:00
|
|
|
u32 size = new_size - found_key.offset;
|
|
|
|
|
|
|
|
if (root->ref_cows) {
|
2008-10-09 09:46:29 -06:00
|
|
|
inode_sub_bytes(inode, item_end + 1 -
|
|
|
|
new_size);
|
2008-09-05 14:13:11 -06:00
|
|
|
}
|
|
|
|
size =
|
|
|
|
btrfs_file_extent_calc_inline_size(size);
|
2008-02-08 11:49:28 -07:00
|
|
|
ret = btrfs_truncate_item(trans, root, path,
|
2008-09-05 14:13:11 -06:00
|
|
|
size, 1);
|
|
|
|
} else if (root->ref_cows) {
|
2008-10-09 09:46:29 -06:00
|
|
|
inode_sub_bytes(inode, item_end + 1 -
|
|
|
|
found_key.offset);
|
2008-02-08 11:49:28 -07:00
|
|
|
}
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
2007-11-01 09:28:41 -06:00
|
|
|
delete:
|
2007-06-12 04:35:45 -06:00
|
|
|
if (del_item) {
|
2008-01-29 13:11:36 -07:00
|
|
|
if (!pending_del_nr) {
|
|
|
|
/* no pending yet, add ourselves */
|
|
|
|
pending_del_slot = path->slots[0];
|
|
|
|
pending_del_nr = 1;
|
|
|
|
} else if (pending_del_nr &&
|
|
|
|
path->slots[0] + 1 == pending_del_slot) {
|
|
|
|
/* hop on the pending chunk */
|
|
|
|
pending_del_nr++;
|
|
|
|
pending_del_slot = path->slots[0];
|
|
|
|
} else {
|
2009-01-05 19:25:51 -07:00
|
|
|
BUG();
|
2008-01-29 13:11:36 -07:00
|
|
|
}
|
2007-06-12 04:35:45 -06:00
|
|
|
} else {
|
|
|
|
break;
|
|
|
|
}
|
2010-06-21 12:48:16 -06:00
|
|
|
if (found_extent && (root->ref_cows ||
|
|
|
|
root == root->fs_info->tree_root)) {
|
2009-03-13 09:00:37 -06:00
|
|
|
btrfs_set_path_blocking(path);
|
2007-06-12 04:35:45 -06:00
|
|
|
ret = btrfs_free_extent(trans, root, extent_start,
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per-subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers fall within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back refs.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 08:45:14 -06:00
|
|
|
extent_num_bytes, 0,
|
|
|
|
btrfs_header_owner(leaf),
|
2011-04-19 20:31:50 -06:00
|
|
|
ino, extent_offset);
|
2007-06-12 04:35:45 -06:00
|
|
|
BUG_ON(ret);
|
|
|
|
}
|
2008-01-29 13:11:36 -07:00
|
|
|
|
2009-11-12 02:35:36 -07:00
|
|
|
if (found_type == BTRFS_INODE_ITEM_KEY)
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (path->slots[0] == 0 ||
|
|
|
|
path->slots[0] != pending_del_slot) {
|
2011-04-19 20:33:24 -06:00
|
|
|
if (root->ref_cows &&
|
|
|
|
BTRFS_I(inode)->location.objectid !=
|
|
|
|
BTRFS_FREE_INO_OBJECTID) {
|
2009-11-12 02:35:36 -07:00
|
|
|
err = -EAGAIN;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (pending_del_nr) {
|
|
|
|
ret = btrfs_del_items(trans, root, path,
|
|
|
|
pending_del_slot,
|
|
|
|
pending_del_nr);
|
|
|
|
BUG_ON(ret);
|
|
|
|
pending_del_nr = 0;
|
|
|
|
}
|
2011-04-20 17:20:15 -06:00
|
|
|
btrfs_release_path(path);
|
2008-01-29 13:11:36 -07:00
|
|
|
goto search_again;
|
2009-11-12 02:35:36 -07:00
|
|
|
} else {
|
|
|
|
path->slots[0]--;
|
2008-01-29 13:11:36 -07:00
|
|
|
}
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
2009-11-12 02:35:36 -07:00
|
|
|
out:
|
2008-01-29 13:11:36 -07:00
|
|
|
if (pending_del_nr) {
|
|
|
|
ret = btrfs_del_items(trans, root, path, pending_del_slot,
|
|
|
|
pending_del_nr);
|
2010-05-16 08:49:58 -06:00
|
|
|
BUG_ON(ret);
|
2008-01-29 13:11:36 -07:00
|
|
|
}
|
2007-06-12 04:35:45 -06:00
|
|
|
btrfs_free_path(path);
|
2009-11-12 02:35:36 -07:00
|
|
|
return err;
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* taken from block_truncate_page, but does cow as it zeros out
|
|
|
|
* any bytes left in the last page in the file.
|
|
|
|
*/
|
|
|
|
static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
|
|
|
|
{
|
|
|
|
struct inode *inode = mapping->host;
|
2007-10-15 14:15:53 -06:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2008-07-17 10:53:50 -06:00
|
|
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
|
|
|
struct btrfs_ordered_extent *ordered;
|
2010-02-03 12:33:23 -07:00
|
|
|
struct extent_state *cached_state = NULL;
|
2008-07-17 10:53:50 -06:00
|
|
|
char *kaddr;
|
2007-10-15 14:15:53 -06:00
|
|
|
u32 blocksize = root->sectorsize;
|
2007-06-12 04:35:45 -06:00
|
|
|
pgoff_t index = from >> PAGE_CACHE_SHIFT;
|
|
|
|
unsigned offset = from & (PAGE_CACHE_SIZE-1);
|
|
|
|
struct page *page;
|
|
|
|
int ret = 0;
|
2007-08-27 14:49:44 -06:00
|
|
|
u64 page_start;
|
2008-07-17 10:53:50 -06:00
|
|
|
u64 page_end;
|
2007-06-12 04:35:45 -06:00
|
|
|
|
|
|
|
if ((offset & (blocksize - 1)) == 0)
|
|
|
|
goto out;
|
2010-05-16 08:48:47 -06:00
|
|
|
ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
|
2009-10-13 14:46:49 -06:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
2007-06-12 04:35:45 -06:00
|
|
|
|
|
|
|
ret = -ENOMEM;
|
2008-05-15 07:13:45 -06:00
|
|
|
again:
|
2007-06-12 04:35:45 -06:00
|
|
|
page = grab_cache_page(mapping, index);
|
2009-10-13 14:46:49 -06:00
|
|
|
if (!page) {
|
2010-05-16 08:48:47 -06:00
|
|
|
btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
|
2007-06-12 04:35:45 -06:00
|
|
|
goto out;
|
2009-10-13 14:46:49 -06:00
|
|
|
}
|
2008-07-17 10:53:50 -06:00
|
|
|
|
|
|
|
page_start = page_offset(page);
|
|
|
|
page_end = page_start + PAGE_CACHE_SIZE - 1;
|
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
if (!PageUptodate(page)) {
|
2007-06-15 11:50:00 -06:00
|
|
|
ret = btrfs_readpage(NULL, page);
|
2007-06-12 04:35:45 -06:00
|
|
|
lock_page(page);
|
2008-05-15 07:13:45 -06:00
|
|
|
if (page->mapping != mapping) {
|
|
|
|
unlock_page(page);
|
|
|
|
page_cache_release(page);
|
|
|
|
goto again;
|
|
|
|
}
|
2007-06-12 04:35:45 -06:00
|
|
|
if (!PageUptodate(page)) {
|
|
|
|
ret = -EIO;
|
2008-07-24 07:41:53 -06:00
|
|
|
goto out_unlock;
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
|
|
|
}
|
2008-05-15 07:13:45 -06:00
|
|
|
wait_on_page_writeback(page);
|
2008-07-17 10:53:50 -06:00
|
|
|
|
2010-02-03 12:33:23 -07:00
|
|
|
lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
|
|
|
|
GFP_NOFS);
|
2008-07-17 10:53:50 -06:00
|
|
|
set_page_extent_mapped(page);
|
|
|
|
|
|
|
|
ordered = btrfs_lookup_ordered_extent(inode, page_start);
|
|
|
|
if (ordered) {
|
2010-02-03 12:33:23 -07:00
|
|
|
unlock_extent_cached(io_tree, page_start, page_end,
|
|
|
|
&cached_state, GFP_NOFS);
|
2008-07-17 10:53:50 -06:00
|
|
|
unlock_page(page);
|
|
|
|
page_cache_release(page);
|
2008-07-17 11:53:27 -06:00
|
|
|
btrfs_start_ordered_extent(inode, ordered, 1);
|
2008-07-17 10:53:50 -06:00
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
|
2010-02-03 12:33:23 -07:00
|
|
|
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
|
2009-10-13 14:46:49 -06:00
|
|
|
EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
|
2010-02-03 12:33:23 -07:00
|
|
|
0, 0, &cached_state, GFP_NOFS);
|
2009-10-13 14:46:49 -06:00
|
|
|
|
2010-02-03 12:33:23 -07:00
|
|
|
ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
|
|
|
|
&cached_state);
|
2009-09-11 14:12:44 -06:00
|
|
|
if (ret) {
|
2010-02-03 12:33:23 -07:00
|
|
|
unlock_extent_cached(io_tree, page_start, page_end,
|
|
|
|
&cached_state, GFP_NOFS);
|
2009-09-11 14:12:44 -06:00
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
|
2008-07-17 10:53:50 -06:00
|
|
|
ret = 0;
|
|
|
|
if (offset != PAGE_CACHE_SIZE) {
|
|
|
|
kaddr = kmap(page);
|
|
|
|
memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
|
|
|
|
flush_dcache_page(page);
|
|
|
|
kunmap(page);
|
|
|
|
}
|
2008-07-17 10:53:51 -06:00
|
|
|
ClearPageChecked(page);
|
2008-07-17 10:53:50 -06:00
|
|
|
set_page_dirty(page);
|
2010-02-03 12:33:23 -07:00
|
|
|
unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
|
|
|
|
GFP_NOFS);
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2008-07-24 07:41:53 -06:00
|
|
|
out_unlock:
|
2009-10-13 14:46:49 -06:00
|
|
|
if (ret)
|
2010-05-16 08:48:47 -06:00
|
|
|
btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
|
2007-06-12 04:35:45 -06:00
|
|
|
unlock_page(page);
|
|
|
|
page_cache_release(page);
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2011-03-04 13:46:53 -07:00
|
|
|
/*
|
|
|
|
* This function puts in dummy file extents for the area we're creating a hole
|
|
|
|
* for. So if we are truncating this file to a larger size we need to insert
|
|
|
|
* these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
|
|
|
|
* the range between oldsize and size
|
|
|
|
*/
|
2011-01-31 13:30:16 -07:00
|
|
|
int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
|
2007-06-12 04:35:45 -06:00
|
|
|
{
|
2008-10-30 12:19:41 -06:00
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
2010-05-16 08:48:46 -06:00
|
|
|
struct extent_map *em = NULL;
|
2010-02-03 12:33:23 -07:00
|
|
|
struct extent_state *cached_state = NULL;
|
2008-10-30 12:19:41 -06:00
|
|
|
u64 mask = root->sectorsize - 1;
|
2011-01-31 13:30:16 -07:00
|
|
|
u64 hole_start = (oldsize + mask) & ~mask;
|
2008-10-30 12:19:41 -06:00
|
|
|
u64 block_end = (size + mask) & ~mask;
|
|
|
|
u64 last_byte;
|
|
|
|
u64 cur_offset;
|
|
|
|
u64 hole_size;
|
2009-09-11 14:12:44 -06:00
|
|
|
int err = 0;
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2008-10-30 12:19:41 -06:00
|
|
|
if (size <= hole_start)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
while (1) {
|
|
|
|
struct btrfs_ordered_extent *ordered;
|
|
|
|
btrfs_wait_ordered_range(inode, hole_start,
|
|
|
|
block_end - hole_start);
|
2010-02-03 12:33:23 -07:00
|
|
|
lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
|
|
|
|
&cached_state, GFP_NOFS);
|
2008-10-30 12:19:41 -06:00
|
|
|
ordered = btrfs_lookup_ordered_extent(inode, hole_start);
|
|
|
|
if (!ordered)
|
|
|
|
break;
|
2010-02-03 12:33:23 -07:00
|
|
|
unlock_extent_cached(io_tree, hole_start, block_end - 1,
|
|
|
|
&cached_state, GFP_NOFS);
|
2008-10-30 12:19:41 -06:00
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
}
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2008-10-30 12:19:41 -06:00
|
|
|
cur_offset = hole_start;
|
|
|
|
while (1) {
|
|
|
|
em = btrfs_get_extent(inode, NULL, 0, cur_offset,
|
|
|
|
block_end - cur_offset, 0);
|
2011-04-19 10:00:01 -06:00
|
|
|
BUG_ON(IS_ERR_OR_NULL(em));
|
2008-10-30 12:19:41 -06:00
|
|
|
last_byte = min(extent_map_end(em), block_end);
|
|
|
|
last_byte = (last_byte + mask) & ~mask;
|
2009-11-12 02:35:36 -07:00
|
|
|
if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
|
2008-11-06 20:02:51 -07:00
|
|
|
u64 hint_byte = 0;
|
2008-10-30 12:19:41 -06:00
|
|
|
hole_size = last_byte - cur_offset;
|
2009-09-11 14:12:44 -06:00
|
|
|
|
2010-05-16 08:48:46 -06:00
|
|
|
trans = btrfs_start_transaction(root, 2);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
err = PTR_ERR(trans);
|
2009-09-11 14:12:44 -06:00
|
|
|
break;
|
2010-05-16 08:48:46 -06:00
|
|
|
}
|
2009-11-12 02:35:36 -07:00
|
|
|
|
|
|
|
err = btrfs_drop_extents(trans, inode, cur_offset,
|
|
|
|
cur_offset + hole_size,
|
|
|
|
&hint_byte, 1);
|
2011-01-31 14:03:11 -07:00
|
|
|
if (err)
|
|
|
|
break;
|
2009-11-12 02:35:36 -07:00
|
|
|
|
2008-10-30 12:19:41 -06:00
|
|
|
err = btrfs_insert_file_extent(trans, root,
|
2011-04-19 20:31:50 -06:00
|
|
|
btrfs_ino(inode), cur_offset, 0,
|
2008-10-30 12:19:41 -06:00
|
|
|
0, hole_size, 0, hole_size,
|
|
|
|
0, 0, 0);
|
2011-01-31 14:03:11 -07:00
|
|
|
if (err)
|
|
|
|
break;
|
2009-11-12 02:35:36 -07:00
|
|
|
|
2008-10-30 12:19:41 -06:00
|
|
|
btrfs_drop_extent_cache(inode, hole_start,
|
|
|
|
last_byte - 1, 0);
|
2009-11-12 02:35:36 -07:00
|
|
|
|
|
|
|
btrfs_end_transaction(trans, root);
|
2008-10-30 12:19:41 -06:00
|
|
|
}
|
|
|
|
free_extent_map(em);
|
2010-05-16 08:48:46 -06:00
|
|
|
em = NULL;
|
2008-10-30 12:19:41 -06:00
|
|
|
cur_offset = last_byte;
|
2009-11-12 02:35:36 -07:00
|
|
|
if (cur_offset >= block_end)
|
2008-10-30 12:19:41 -06:00
|
|
|
break;
|
|
|
|
}
|
2007-12-21 14:27:21 -07:00
|
|
|
|
2010-05-16 08:48:46 -06:00
|
|
|
free_extent_map(em);
|
2010-02-03 12:33:23 -07:00
|
|
|
unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
|
|
|
|
GFP_NOFS);
|
2008-10-30 12:19:41 -06:00
|
|
|
return err;
|
|
|
|
}
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2011-01-31 13:30:16 -07:00
|
|
|
/*
 * Change the size of @inode to @newsize.
 *
 * Growing updates i_size, trims the page cache and fills the new range with
 * hole extents via btrfs_cont_expand(); if that fails the old size is
 * restored and the error returned.  Shrinking goes through
 * truncate_setsize() and btrfs_truncate().  Returns 0 when the size is
 * unchanged or the operation succeeded.
 */
static int btrfs_setsize(struct inode *inode, loff_t newsize)
{
	loff_t oldsize = i_size_read(inode);
	int ret;

	if (oldsize == newsize)
		return 0;

	if (newsize < oldsize) {
		/*
		 * We're truncating a file that used to have good data down to
		 * zero. Make sure it gets into the ordered flush list so that
		 * any new writes get down to disk quickly.
		 */
		if (!newsize)
			BTRFS_I(inode)->ordered_data_close = 1;

		/* we don't support swapfiles, so vmtruncate shouldn't fail */
		truncate_setsize(inode, newsize);
		return btrfs_truncate(inode);
	}

	/* the file is growing */
	i_size_write(inode, newsize);
	btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
	truncate_pagecache(inode, oldsize, newsize);

	ret = btrfs_cont_expand(inode, oldsize, newsize);
	if (ret) {
		/* roll the size back to what it was before we started */
		btrfs_setsize(inode, oldsize);
		return ret;
	}

	mark_inode_dirty(inode);
	return ret;
}
|
|
|
|
|
2008-10-30 12:19:41 -06:00
|
|
|
static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
|
|
|
|
{
|
|
|
|
struct inode *inode = dentry->d_inode;
|
2010-12-20 01:04:08 -07:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2008-10-30 12:19:41 -06:00
|
|
|
int err;
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2010-12-20 01:04:08 -07:00
|
|
|
if (btrfs_root_readonly(root))
|
|
|
|
return -EROFS;
|
|
|
|
|
2008-10-30 12:19:41 -06:00
|
|
|
err = inode_change_ok(inode, attr);
|
|
|
|
if (err)
|
|
|
|
return err;
|
2007-08-30 09:54:02 -06:00
|
|
|
|
2009-03-31 11:27:11 -06:00
|
|
|
if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
|
2011-01-31 13:30:16 -07:00
|
|
|
err = btrfs_setsize(inode, attr->ia_size);
|
2009-11-12 02:35:36 -07:00
|
|
|
if (err)
|
|
|
|
return err;
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
2008-10-30 12:19:41 -06:00
|
|
|
|
2010-06-04 03:30:02 -06:00
|
|
|
if (attr->ia_valid) {
|
|
|
|
setattr_copy(inode, attr);
|
|
|
|
mark_inode_dirty(inode);
|
|
|
|
|
|
|
|
if (attr->ia_valid & ATTR_MODE)
|
|
|
|
err = btrfs_acl_chmod(inode);
|
|
|
|
}
|
2008-07-24 10:16:36 -06:00
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
return err;
|
|
|
|
}
|
2008-01-14 14:24:38 -07:00
|
|
|
|
2010-06-07 09:35:40 -06:00
|
|
|
void btrfs_evict_inode(struct inode *inode)
|
2007-06-12 04:35:45 -06:00
|
|
|
{
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2007-09-17 08:58:06 -06:00
|
|
|
unsigned long nr;
|
2007-06-12 04:35:45 -06:00
|
|
|
int ret;
|
|
|
|
|
Btrfs: add initial tracepoint support for btrfs
Tracepoints can provide insight into why btrfs hits bugs and be greatly
helpful for debugging, e.g
dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0
dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0
btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8
flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA
flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0)
flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0)
flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0)
Here is what I have added:
1) ordere_extent:
btrfs_ordered_extent_add
btrfs_ordered_extent_remove
btrfs_ordered_extent_start
btrfs_ordered_extent_put
These provide critical information to understand how ordered_extents are
updated.
2) extent_map:
btrfs_get_extent
extent_map is used in both read and write cases, and it is useful for tracking
how btrfs specific IO is running.
3) writepage:
__extent_writepage
btrfs_writepage_end_io_hook
Pages are cirtical resourses and produce a lot of corner cases during writeback,
so it is valuable to know how page is written to disk.
4) inode:
btrfs_inode_new
btrfs_inode_request
btrfs_inode_evict
These can show where and when a inode is created, when a inode is evicted.
5) sync:
btrfs_sync_file
btrfs_sync_fs
These show sync arguments.
6) transaction:
btrfs_transaction_commit
In transaction based filesystem, it will be useful to know the generation and
who does commit.
7) back reference and cow:
btrfs_delayed_tree_ref
btrfs_delayed_data_ref
btrfs_delayed_ref_head
btrfs_cow_block
Btrfs natively supports back references, these tracepoints are helpful on
understanding btrfs's COW mechanism.
8) chunk:
btrfs_chunk_alloc
btrfs_chunk_free
Chunk is a link between physical offset and logical offset, and stands for space
infomation in btrfs, and these are helpful on tracing space things.
9) reserved_extent:
btrfs_reserved_extent_alloc
btrfs_reserved_extent_free
These can show how btrfs uses its space.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 05:18:59 -06:00
|
|
|
trace_btrfs_inode_evict(inode);
|
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
truncate_inode_pages(&inode->i_data, 0);
|
2010-06-21 12:48:16 -06:00
|
|
|
if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
|
2011-04-19 20:33:24 -06:00
|
|
|
is_free_space_inode(root, inode)))
|
2010-06-07 09:35:40 -06:00
|
|
|
goto no_delete;
|
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
if (is_bad_inode(inode)) {
|
2008-07-24 10:17:14 -06:00
|
|
|
btrfs_orphan_del(NULL, inode);
|
2007-06-12 04:35:45 -06:00
|
|
|
goto no_delete;
|
|
|
|
}
|
2010-06-07 09:35:40 -06:00
|
|
|
/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
|
2008-07-21 08:29:44 -06:00
|
|
|
btrfs_wait_ordered_range(inode, 0, (u64)-1);
|
2007-10-15 14:14:19 -06:00
|
|
|
|
2009-11-12 02:34:40 -07:00
|
|
|
if (root->fs_info->log_root_recovering) {
|
|
|
|
BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan));
|
|
|
|
goto no_delete;
|
|
|
|
}
|
|
|
|
|
2009-09-21 14:00:26 -06:00
|
|
|
if (inode->i_nlink > 0) {
|
|
|
|
BUG_ON(btrfs_root_refs(&root->root_item) != 0);
|
|
|
|
goto no_delete;
|
|
|
|
}
|
|
|
|
|
2008-07-17 10:54:05 -06:00
|
|
|
btrfs_i_size_write(inode, 0);
|
2007-10-15 14:14:19 -06:00
|
|
|
|
2009-11-12 02:35:36 -07:00
|
|
|
while (1) {
|
2011-06-07 21:56:44 -06:00
|
|
|
trans = btrfs_join_transaction(root);
|
2010-05-16 08:49:58 -06:00
|
|
|
BUG_ON(IS_ERR(trans));
|
|
|
|
trans->block_rsv = root->orphan_block_rsv;
|
|
|
|
|
|
|
|
ret = btrfs_block_rsv_check(trans, root,
|
|
|
|
root->orphan_block_rsv, 0, 5);
|
|
|
|
if (ret) {
|
|
|
|
BUG_ON(ret != -EAGAIN);
|
|
|
|
ret = btrfs_commit_transaction(trans, root);
|
|
|
|
BUG_ON(ret);
|
|
|
|
continue;
|
|
|
|
}
|
2008-07-24 10:17:14 -06:00
|
|
|
|
2010-05-16 08:49:58 -06:00
|
|
|
ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
|
2009-11-12 02:35:36 -07:00
|
|
|
if (ret != -EAGAIN)
|
|
|
|
break;
|
2008-01-29 13:11:36 -07:00
|
|
|
|
2009-11-12 02:35:36 -07:00
|
|
|
nr = trans->blocks_used;
|
|
|
|
btrfs_end_transaction(trans, root);
|
|
|
|
trans = NULL;
|
|
|
|
btrfs_btree_balance_dirty(root, nr);
|
2010-05-16 08:49:58 -06:00
|
|
|
|
2009-11-12 02:35:36 -07:00
|
|
|
}
|
2007-10-15 14:14:19 -06:00
|
|
|
|
2009-11-12 02:35:36 -07:00
|
|
|
if (ret == 0) {
|
|
|
|
ret = btrfs_orphan_del(trans, inode);
|
|
|
|
BUG_ON(ret);
|
|
|
|
}
|
2007-06-22 12:16:25 -06:00
|
|
|
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-19 20:06:11 -06:00
|
|
|
if (!(root == root->fs_info->tree_root ||
|
|
|
|
root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
|
2011-04-19 20:31:50 -06:00
|
|
|
btrfs_return_ino(root, btrfs_ino(inode));
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-19 20:06:11 -06:00
|
|
|
|
2007-09-17 08:58:06 -06:00
|
|
|
nr = trans->blocks_used;
|
2007-06-22 12:16:25 -06:00
|
|
|
btrfs_end_transaction(trans, root);
|
2007-09-17 08:58:06 -06:00
|
|
|
btrfs_btree_balance_dirty(root, nr);
|
2007-06-12 04:35:45 -06:00
|
|
|
no_delete:
|
2010-06-07 09:35:40 -06:00
|
|
|
end_writeback(inode);
|
2009-11-12 02:35:36 -07:00
|
|
|
return;
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* this returns the key found in the dir entry in the location pointer.
|
|
|
|
* If no dir entries were found, location->objectid is 0.
|
|
|
|
*/
|
|
|
|
static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
|
|
|
|
struct btrfs_key *location)
|
|
|
|
{
|
|
|
|
const char *name = dentry->d_name.name;
|
|
|
|
int namelen = dentry->d_name.len;
|
|
|
|
struct btrfs_dir_item *di;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
2007-10-25 13:48:28 -06:00
|
|
|
int ret = 0;
|
2007-06-12 04:35:45 -06:00
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
btrfs: don't BUG_ON btrfs_alloc_path() errors
This patch fixes many callers of btrfs_alloc_path() which BUG_ON allocation
failure. All the sites that are fixed in this patch were checked by me to
be fairly trivial to fix because of at least one of two criteria:
- Callers of the function catch errors from it already so bubbling the
error up will be handled.
- Callers of the function might BUG_ON any nonzero return code in which
case there is no behavior changed (but we still got to remove a BUG_ON)
The following functions were updated:
btrfs_lookup_extent, alloc_reserved_tree_block, btrfs_remove_block_group,
btrfs_lookup_csums_range, btrfs_csum_file_blocks, btrfs_mark_extent_written,
btrfs_inode_by_name, btrfs_new_inode, btrfs_symlink,
insert_reserved_file_extent, and run_delalloc_nocow
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
2011-07-13 11:38:47 -06:00
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
2007-12-12 12:38:19 -07:00
|
|
|
|
2011-04-19 20:31:50 -06:00
|
|
|
di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
|
2007-06-12 04:35:45 -06:00
|
|
|
namelen, 0);
|
2007-10-25 13:48:28 -06:00
|
|
|
if (IS_ERR(di))
|
|
|
|
ret = PTR_ERR(di);
|
2009-01-05 19:25:51 -07:00
|
|
|
|
2011-04-19 10:00:01 -06:00
|
|
|
if (IS_ERR_OR_NULL(di))
|
2007-12-12 12:38:19 -07:00
|
|
|
goto out_err;
|
2009-01-05 19:25:51 -07:00
|
|
|
|
2007-10-15 14:14:19 -06:00
|
|
|
btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
|
2007-06-12 04:35:45 -06:00
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
2007-12-12 12:38:19 -07:00
|
|
|
out_err:
|
|
|
|
location->objectid = 0;
|
|
|
|
goto out;
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* when we hit a tree root in a directory, the btrfs part of the inode
|
|
|
|
* needs to be changed to reflect the root directory of the tree root. This
|
|
|
|
* is kind of like crossing a mount point.
|
|
|
|
*/
|
|
|
|
static int fixup_tree_root_location(struct btrfs_root *root,
|
2009-09-21 13:56:00 -06:00
|
|
|
struct inode *dir,
|
|
|
|
struct dentry *dentry,
|
|
|
|
struct btrfs_key *location,
|
|
|
|
struct btrfs_root **sub_root)
|
2007-06-12 04:35:45 -06:00
|
|
|
{
|
2009-09-21 13:56:00 -06:00
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_root *new_root;
|
|
|
|
struct btrfs_root_ref *ref;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
int ret;
|
|
|
|
int err = 0;
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2009-09-21 13:56:00 -06:00
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path) {
|
|
|
|
err = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2009-09-21 13:56:00 -06:00
|
|
|
err = -ENOENT;
|
|
|
|
ret = btrfs_find_root_ref(root->fs_info->tree_root, path,
|
|
|
|
BTRFS_I(dir)->root->root_key.objectid,
|
|
|
|
location->objectid);
|
|
|
|
if (ret) {
|
|
|
|
if (ret < 0)
|
|
|
|
err = ret;
|
|
|
|
goto out;
|
|
|
|
}
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2009-09-21 13:56:00 -06:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
|
2011-04-19 20:31:50 -06:00
|
|
|
if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
|
2009-09-21 13:56:00 -06:00
|
|
|
btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
|
|
|
|
goto out;
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2009-09-21 13:56:00 -06:00
|
|
|
ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
|
|
|
|
(unsigned long)(ref + 1),
|
|
|
|
dentry->d_name.len);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
2011-04-20 17:20:15 -06:00
|
|
|
btrfs_release_path(path);
|
2009-09-21 13:56:00 -06:00
|
|
|
|
|
|
|
new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
|
|
|
|
if (IS_ERR(new_root)) {
|
|
|
|
err = PTR_ERR(new_root);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (btrfs_root_refs(&new_root->root_item) == 0) {
|
|
|
|
err = -ENOENT;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
*sub_root = new_root;
|
|
|
|
location->objectid = btrfs_root_dirid(&new_root->root_item);
|
|
|
|
location->type = BTRFS_INODE_ITEM_KEY;
|
|
|
|
location->offset = 0;
|
|
|
|
err = 0;
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return err;
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per-subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers fall within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back refs.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 08:45:14 -06:00
|
|
|
static void inode_tree_add(struct inode *inode)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_inode *entry;
|
2009-08-21 02:09:44 -06:00
|
|
|
struct rb_node **p;
|
|
|
|
struct rb_node *parent;
|
2011-04-19 20:31:50 -06:00
|
|
|
u64 ino = btrfs_ino(inode);
|
2009-08-21 02:09:44 -06:00
|
|
|
again:
|
|
|
|
p = &root->inode_tree.rb_node;
|
|
|
|
parent = NULL;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 08:45:14 -06:00
|
|
|
|
2010-10-23 13:19:20 -06:00
|
|
|
if (inode_unhashed(inode))
|
2009-09-21 14:00:26 -06:00
|
|
|
return;
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 08:45:14 -06:00
|
|
|
spin_lock(&root->inode_lock);
|
|
|
|
while (*p) {
|
|
|
|
parent = *p;
|
|
|
|
entry = rb_entry(parent, struct btrfs_inode, rb_node);
|
|
|
|
|
2011-04-19 20:31:50 -06:00
|
|
|
if (ino < btrfs_ino(&entry->vfs_inode))
|
2009-08-21 02:09:44 -06:00
|
|
|
p = &parent->rb_left;
|
2011-04-19 20:31:50 -06:00
|
|
|
else if (ino > btrfs_ino(&entry->vfs_inode))
|
2009-08-21 02:09:44 -06:00
|
|
|
p = &parent->rb_right;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 08:45:14 -06:00
|
|
|
else {
|
|
|
|
WARN_ON(!(entry->vfs_inode.i_state &
|
2010-06-02 15:38:30 -06:00
|
|
|
(I_WILL_FREE | I_FREEING)));
|
2009-08-21 02:09:44 -06:00
|
|
|
rb_erase(parent, &root->inode_tree);
|
|
|
|
RB_CLEAR_NODE(parent);
|
|
|
|
spin_unlock(&root->inode_lock);
|
|
|
|
goto again;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 08:45:14 -06:00
|
|
|
}
|
|
|
|
}
|
|
|
|
rb_link_node(&BTRFS_I(inode)->rb_node, parent, p);
|
|
|
|
rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree);
|
|
|
|
spin_unlock(&root->inode_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void inode_tree_del(struct inode *inode)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2009-09-21 14:00:26 -06:00
|
|
|
int empty = 0;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 08:45:14 -06:00
|
|
|
|
2009-08-21 02:09:44 -06:00
|
|
|
spin_lock(&root->inode_lock);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 08:45:14 -06:00
|
|
|
if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
|
|
|
|
rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
|
|
|
|
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
|
2009-09-21 14:00:26 -06:00
|
|
|
empty = RB_EMPTY_ROOT(&root->inode_tree);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 08:45:14 -06:00
|
|
|
}
|
2009-08-21 02:09:44 -06:00
|
|
|
spin_unlock(&root->inode_lock);
|
2009-09-21 14:00:26 -06:00
|
|
|
|
2010-06-21 12:48:16 -06:00
|
|
|
/*
|
|
|
|
* Free space cache has inodes in the tree root, but the tree root has a
|
|
|
|
* root_refs of 0, so this could end up dropping the tree root as a
|
|
|
|
* snapshot, so we need the extra !root->fs_info->tree_root check to
|
|
|
|
* make sure we don't drop it.
|
|
|
|
*/
|
|
|
|
if (empty && btrfs_root_refs(&root->root_item) == 0 &&
|
|
|
|
root != root->fs_info->tree_root) {
|
2009-09-21 14:00:26 -06:00
|
|
|
synchronize_srcu(&root->fs_info->subvol_srcu);
|
|
|
|
spin_lock(&root->inode_lock);
|
|
|
|
empty = RB_EMPTY_ROOT(&root->inode_tree);
|
|
|
|
spin_unlock(&root->inode_lock);
|
|
|
|
if (empty)
|
|
|
|
btrfs_add_dead_root(root);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
int btrfs_invalidate_inodes(struct btrfs_root *root)
|
|
|
|
{
|
|
|
|
struct rb_node *node;
|
|
|
|
struct rb_node *prev;
|
|
|
|
struct btrfs_inode *entry;
|
|
|
|
struct inode *inode;
|
|
|
|
u64 objectid = 0;
|
|
|
|
|
|
|
|
WARN_ON(btrfs_root_refs(&root->root_item) != 0);
|
|
|
|
|
|
|
|
spin_lock(&root->inode_lock);
|
|
|
|
again:
|
|
|
|
node = root->inode_tree.rb_node;
|
|
|
|
prev = NULL;
|
|
|
|
while (node) {
|
|
|
|
prev = node;
|
|
|
|
entry = rb_entry(node, struct btrfs_inode, rb_node);
|
|
|
|
|
2011-04-19 20:31:50 -06:00
|
|
|
if (objectid < btrfs_ino(&entry->vfs_inode))
|
2009-09-21 14:00:26 -06:00
|
|
|
node = node->rb_left;
|
2011-04-19 20:31:50 -06:00
|
|
|
else if (objectid > btrfs_ino(&entry->vfs_inode))
|
2009-09-21 14:00:26 -06:00
|
|
|
node = node->rb_right;
|
|
|
|
else
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (!node) {
|
|
|
|
while (prev) {
|
|
|
|
entry = rb_entry(prev, struct btrfs_inode, rb_node);
|
2011-04-19 20:31:50 -06:00
|
|
|
if (objectid <= btrfs_ino(&entry->vfs_inode)) {
|
2009-09-21 14:00:26 -06:00
|
|
|
node = prev;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
prev = rb_next(prev);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
while (node) {
|
|
|
|
entry = rb_entry(node, struct btrfs_inode, rb_node);
|
2011-04-19 20:31:50 -06:00
|
|
|
objectid = btrfs_ino(&entry->vfs_inode) + 1;
|
2009-09-21 14:00:26 -06:00
|
|
|
inode = igrab(&entry->vfs_inode);
|
|
|
|
if (inode) {
|
|
|
|
spin_unlock(&root->inode_lock);
|
|
|
|
if (atomic_read(&inode->i_count) > 1)
|
|
|
|
d_prune_aliases(inode);
|
|
|
|
/*
|
2010-06-07 11:43:19 -06:00
|
|
|
* btrfs_drop_inode will have it removed from
|
2009-09-21 14:00:26 -06:00
|
|
|
* the inode cache when its usage count
|
|
|
|
* hits zero.
|
|
|
|
*/
|
|
|
|
iput(inode);
|
|
|
|
cond_resched();
|
|
|
|
spin_lock(&root->inode_lock);
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (cond_resched_lock(&root->inode_lock))
|
|
|
|
goto again;
|
|
|
|
|
|
|
|
node = rb_next(node);
|
|
|
|
}
|
|
|
|
spin_unlock(&root->inode_lock);
|
|
|
|
return 0;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 08:45:14 -06:00
|
|
|
}
|
|
|
|
|
2008-09-05 14:13:11 -06:00
|
|
|
static int btrfs_init_locked_inode(struct inode *inode, void *p)
|
|
|
|
{
|
|
|
|
struct btrfs_iget_args *args = p;
|
|
|
|
inode->i_ino = args->ino;
|
|
|
|
BTRFS_I(inode)->root = args->root;
|
2009-02-20 09:00:09 -07:00
|
|
|
btrfs_set_inode_space_info(args->root, inode);
|
2007-06-12 04:35:45 -06:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int btrfs_find_actor(struct inode *inode, void *opaque)
|
|
|
|
{
|
|
|
|
struct btrfs_iget_args *args = opaque;
|
2011-04-19 20:31:50 -06:00
|
|
|
return args->ino == btrfs_ino(inode) &&
|
2009-01-05 19:25:51 -07:00
|
|
|
args->root == BTRFS_I(inode)->root;
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per-subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code find cached
inodes whose inode numbers fall within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back refs.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 08:45:14 -06:00
|
|
|
static struct inode *btrfs_iget_locked(struct super_block *s,
|
|
|
|
u64 objectid,
|
|
|
|
struct btrfs_root *root)
|
2007-06-12 04:35:45 -06:00
|
|
|
{
|
|
|
|
struct inode *inode;
|
|
|
|
struct btrfs_iget_args args;
|
|
|
|
args.ino = objectid;
|
|
|
|
args.root = root;
|
|
|
|
|
|
|
|
inode = iget5_locked(s, objectid, btrfs_find_actor,
|
|
|
|
btrfs_init_locked_inode,
|
|
|
|
(void *)&args);
|
|
|
|
return inode;
|
|
|
|
}
|
|
|
|
|
2008-07-20 14:31:04 -06:00
|
|
|
/* Get an inode object given its location and corresponding root.
|
|
|
|
* Returns in *is_new if the inode was read from disk
|
|
|
|
*/
|
|
|
|
struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
|
Btrfs: change how we mount subvolumes
This work is in preperation for being able to set a different root as the
default mounting root.
There is currently a problem with how we mount subvolumes. We cannot currently
mount a subvolume of a subvolume, you can only mount subvolumes/snapshots of the
default subvolume. So say you take a snapshot of the default subvolume and call
it snap1, and then take a snapshot of snap1 and call it snap2, so now you have
/
/snap1
/snap1/snap2
as your available volumes. Currently you can only mount / and /snap1,
you cannot mount /snap1/snap2. To fix this problem instead of passing
subvolid=<name> you must pass in subvolid=<treeid>, where <treeid> is
the tree id that gets spit out via the subvolume listing you get from
the subvolume listing patches (btrfs filesystem list). This allows us
to mount /, /snap1 and /snap1/snap2 as the root volume.
In addition to the above, we also now read the default dir item in the
tree root to get the root key that it points to. For now this just
points at what has always been the default subvolme, but later on I plan
to change it to point at whatever root you want to be the new default
root, so you can just set the default mount and not have to mount with
-o subvolid=<treeid>. I tested this out with the above scenario and it
worked perfectly. Thanks,
mount -o subvol operates inside the selected subvolid. For example:
mount -o subvol=snap1,subvolid=256 /dev/xxx /mnt
/mnt will have the snap1 directory for the subvolume with id
256.
mount -o subvol=snap /dev/xxx /mnt
/mnt will be the snap directory of whatever the default subvolume
is.
Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-12-04 10:38:27 -07:00
|
|
|
struct btrfs_root *root, int *new)
|
2008-07-20 14:31:04 -06:00
|
|
|
{
|
|
|
|
struct inode *inode;
|
|
|
|
|
|
|
|
inode = btrfs_iget_locked(s, location->objectid, root);
|
|
|
|
if (!inode)
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 08:45:14 -06:00
|
|
|
return ERR_PTR(-ENOMEM);
|
2008-07-20 14:31:04 -06:00
|
|
|
|
|
|
|
if (inode->i_state & I_NEW) {
|
|
|
|
BTRFS_I(inode)->root = root;
|
|
|
|
memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
|
|
|
|
btrfs_read_locked_inode(inode);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 08:45:14 -06:00
|
|
|
inode_tree_add(inode);
|
2008-07-20 14:31:04 -06:00
|
|
|
unlock_new_inode(inode);
|
Btrfs: change how we mount subvolumes
This work is in preperation for being able to set a different root as the
default mounting root.
There is currently a problem with how we mount subvolumes. We cannot currently
mount a subvolume of a subvolume, you can only mount subvolumes/snapshots of the
default subvolume. So say you take a snapshot of the default subvolume and call
it snap1, and then take a snapshot of snap1 and call it snap2, so now you have
/
/snap1
/snap1/snap2
as your available volumes. Currently you can only mount / and /snap1,
you cannot mount /snap1/snap2. To fix this problem instead of passing
subvolid=<name> you must pass in subvolid=<treeid>, where <treeid> is
the tree id that gets spit out via the subvolume listing you get from
the subvolume listing patches (btrfs filesystem list). This allows us
to mount /, /snap1 and /snap1/snap2 as the root volume.
In addition to the above, we also now read the default dir item in the
tree root to get the root key that it points to. For now this just
points at what has always been the default subvolme, but later on I plan
to change it to point at whatever root you want to be the new default
root, so you can just set the default mount and not have to mount with
-o subvolid=<treeid>. I tested this out with the above scenario and it
worked perfectly. Thanks,
mount -o subvol operates inside the selected subvolid. For example:
mount -o subvol=snap1,subvolid=256 /dev/xxx /mnt
/mnt will have the snap1 directory for the subvolume with id
256.
mount -o subvol=snap /dev/xxx /mnt
/mnt will be the snap directory of whatever the default subvolume
is.
Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-12-04 10:38:27 -07:00
|
|
|
if (new)
|
|
|
|
*new = 1;
|
2008-07-20 14:31:04 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
return inode;
|
|
|
|
}
|
|
|
|
|
2009-09-21 13:56:00 -06:00
|
|
|
static struct inode *new_simple_dir(struct super_block *s,
|
|
|
|
struct btrfs_key *key,
|
|
|
|
struct btrfs_root *root)
|
|
|
|
{
|
|
|
|
struct inode *inode = new_inode(s);
|
|
|
|
|
|
|
|
if (!inode)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
BTRFS_I(inode)->root = root;
|
|
|
|
memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
|
|
|
|
BTRFS_I(inode)->dummy_inode = 1;
|
|
|
|
|
|
|
|
inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
|
|
|
|
inode->i_op = &simple_dir_inode_operations;
|
|
|
|
inode->i_fop = &simple_dir_operations;
|
|
|
|
inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
|
|
|
|
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
|
|
|
|
|
|
|
|
return inode;
|
|
|
|
}
|
|
|
|
|
2008-11-17 19:02:50 -07:00
|
|
|
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
|
2007-06-12 04:35:45 -06:00
|
|
|
{
|
2009-01-05 19:25:51 -07:00
|
|
|
struct inode *inode;
|
2009-09-21 13:56:00 -06:00
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
2007-06-12 04:35:45 -06:00
|
|
|
struct btrfs_root *sub_root = root;
|
|
|
|
struct btrfs_key location;
|
2009-09-21 14:00:26 -06:00
|
|
|
int index;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 08:45:14 -06:00
|
|
|
int ret;
|
2007-06-12 04:35:45 -06:00
|
|
|
|
|
|
|
if (dentry->d_name.len > BTRFS_NAME_LEN)
|
|
|
|
return ERR_PTR(-ENAMETOOLONG);
|
2007-10-15 14:14:19 -06:00
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
ret = btrfs_inode_by_name(dir, dentry, &location);
|
2007-10-15 14:14:19 -06:00
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
if (ret < 0)
|
|
|
|
return ERR_PTR(ret);
|
2007-10-15 14:14:19 -06:00
|
|
|
|
2009-09-21 13:56:00 -06:00
|
|
|
if (location.objectid == 0)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
if (location.type == BTRFS_INODE_ITEM_KEY) {
|
Btrfs: change how we mount subvolumes
This work is in preperation for being able to set a different root as the
default mounting root.
There is currently a problem with how we mount subvolumes. We cannot currently
mount a subvolume of a subvolume, you can only mount subvolumes/snapshots of the
default subvolume. So say you take a snapshot of the default subvolume and call
it snap1, and then take a snapshot of snap1 and call it snap2, so now you have
/
/snap1
/snap1/snap2
as your available volumes. Currently you can only mount / and /snap1,
you cannot mount /snap1/snap2. To fix this problem instead of passing
subvolid=<name> you must pass in subvolid=<treeid>, where <treeid> is
the tree id that gets spit out via the subvolume listing you get from
the subvolume listing patches (btrfs filesystem list). This allows us
to mount /, /snap1 and /snap1/snap2 as the root volume.
In addition to the above, we also now read the default dir item in the
tree root to get the root key that it points to. For now this just
points at what has always been the default subvolme, but later on I plan
to change it to point at whatever root you want to be the new default
root, so you can just set the default mount and not have to mount with
-o subvolid=<treeid>. I tested this out with the above scenario and it
worked perfectly. Thanks,
mount -o subvol operates inside the selected subvolid. For example:
mount -o subvol=snap1,subvolid=256 /dev/xxx /mnt
/mnt will have the snap1 directory for the subvolume with id
256.
mount -o subvol=snap /dev/xxx /mnt
/mnt will be the snap directory of whatever the default subvolume
is.
Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-12-04 10:38:27 -07:00
|
|
|
inode = btrfs_iget(dir->i_sb, &location, root, NULL);
|
2009-09-21 13:56:00 -06:00
|
|
|
return inode;
|
|
|
|
}
|
|
|
|
|
|
|
|
BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
|
|
|
|
|
2009-09-21 14:00:26 -06:00
|
|
|
index = srcu_read_lock(&root->fs_info->subvol_srcu);
|
2009-09-21 13:56:00 -06:00
|
|
|
ret = fixup_tree_root_location(root, dir, dentry,
|
|
|
|
&location, &sub_root);
|
|
|
|
if (ret < 0) {
|
|
|
|
if (ret != -ENOENT)
|
|
|
|
inode = ERR_PTR(ret);
|
|
|
|
else
|
|
|
|
inode = new_simple_dir(dir->i_sb, &location, sub_root);
|
|
|
|
} else {
|
Btrfs: change how we mount subvolumes
This work is in preperation for being able to set a different root as the
default mounting root.
There is currently a problem with how we mount subvolumes. We cannot currently
mount a subvolume of a subvolume, you can only mount subvolumes/snapshots of the
default subvolume. So say you take a snapshot of the default subvolume and call
it snap1, and then take a snapshot of snap1 and call it snap2, so now you have
/
/snap1
/snap1/snap2
as your available volumes. Currently you can only mount / and /snap1,
you cannot mount /snap1/snap2. To fix this problem instead of passing
subvolid=<name> you must pass in subvolid=<treeid>, where <treeid> is
the tree id that gets spit out via the subvolume listing you get from
the subvolume listing patches (btrfs filesystem list). This allows us
to mount /, /snap1 and /snap1/snap2 as the root volume.
In addition to the above, we also now read the default dir item in the
tree root to get the root key that it points to. For now this just
points at what has always been the default subvolme, but later on I plan
to change it to point at whatever root you want to be the new default
root, so you can just set the default mount and not have to mount with
-o subvolid=<treeid>. I tested this out with the above scenario and it
worked perfectly. Thanks,
mount -o subvol operates inside the selected subvolid. For example:
mount -o subvol=snap1,subvolid=256 /dev/xxx /mnt
/mnt will have the snap1 directory for the subvolume with id
256.
mount -o subvol=snap /dev/xxx /mnt
/mnt will be the snap directory of whatever the default subvolume
is.
Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-12-04 10:38:27 -07:00
|
|
|
inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
2009-09-21 14:00:26 -06:00
|
|
|
srcu_read_unlock(&root->fs_info->subvol_srcu, index);
|
|
|
|
|
2011-01-24 12:55:19 -07:00
|
|
|
if (!IS_ERR(inode) && root != sub_root) {
|
2009-11-12 02:34:40 -07:00
|
|
|
down_read(&root->fs_info->cleanup_work_sem);
|
|
|
|
if (!(inode->i_sb->s_flags & MS_RDONLY))
|
2011-01-31 14:22:42 -07:00
|
|
|
ret = btrfs_orphan_cleanup(sub_root);
|
2009-11-12 02:34:40 -07:00
|
|
|
up_read(&root->fs_info->cleanup_work_sem);
|
2011-01-31 14:22:42 -07:00
|
|
|
if (ret)
|
|
|
|
inode = ERR_PTR(ret);
|
2009-11-12 02:34:40 -07:00
|
|
|
}
|
|
|
|
|
2008-11-17 19:02:50 -07:00
|
|
|
return inode;
|
|
|
|
}
|
|
|
|
|
2011-01-06 23:49:23 -07:00
|
|
|
static int btrfs_dentry_delete(const struct dentry *dentry)
|
2009-09-21 14:00:26 -06:00
|
|
|
{
|
|
|
|
struct btrfs_root *root;
|
|
|
|
|
2009-10-09 07:25:16 -06:00
|
|
|
if (!dentry->d_inode && !IS_ROOT(dentry))
|
|
|
|
dentry = dentry->d_parent;
|
2009-09-21 14:00:26 -06:00
|
|
|
|
2009-10-09 07:25:16 -06:00
|
|
|
if (dentry->d_inode) {
|
|
|
|
root = BTRFS_I(dentry->d_inode)->root;
|
|
|
|
if (btrfs_root_refs(&root->root_item) == 0)
|
|
|
|
return 1;
|
|
|
|
}
|
2009-09-21 14:00:26 -06:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-11-17 19:02:50 -07:00
|
|
|
/*
 * Lookup entry point: resolve @dentry within @dir using
 * btrfs_lookup_dentry() and attach the result with d_splice_alias().
 *
 * @nd is not used. An error pointer from the inode lookup is passed back
 * as a dentry error pointer; otherwise the return value is that of
 * d_splice_alias().
 */
static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
				   struct nameidata *nd)
{
	struct inode *found = btrfs_lookup_dentry(dir, dentry);

	return IS_ERR(found) ? ERR_CAST(found)
			     : d_splice_alias(found, dentry);
}
|
|
|
|
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compared with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. The reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 04:12:22 -06:00
|
|
|
/*
 * Table of directory-entry type codes (DT_*).
 * NOTE(review): presumably indexed by the file type stored in a btrfs
 * directory item - confirm against the readdir code that consumes it.
 */
unsigned char btrfs_filetype_table[] = {
	DT_UNKNOWN,
	DT_REG,
	DT_DIR,
	DT_CHR,
	DT_BLK,
	DT_FIFO,
	DT_SOCK,
	DT_LNK
};
|
|
|
|
|
2008-08-06 12:42:33 -06:00
|
|
|
static int btrfs_real_readdir(struct file *filp, void *dirent,
|
|
|
|
filldir_t filldir)
|
2007-06-12 04:35:45 -06:00
|
|
|
{
|
2007-12-18 14:15:09 -07:00
|
|
|
struct inode *inode = filp->f_dentry->d_inode;
|
2007-06-12 04:35:45 -06:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_item *item;
|
|
|
|
struct btrfs_dir_item *di;
|
|
|
|
struct btrfs_key key;
|
2007-10-15 14:14:19 -06:00
|
|
|
struct btrfs_key found_key;
|
2007-06-12 04:35:45 -06:00
|
|
|
struct btrfs_path *path;
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 04:12:22 -06:00
|
|
|
struct list_head ins_list;
|
|
|
|
struct list_head del_list;
|
2007-06-12 04:35:45 -06:00
|
|
|
int ret;
|
2007-10-15 14:14:19 -06:00
|
|
|
struct extent_buffer *leaf;
|
2007-06-12 04:35:45 -06:00
|
|
|
int slot;
|
|
|
|
unsigned char d_type;
|
|
|
|
int over = 0;
|
|
|
|
u32 di_cur;
|
|
|
|
u32 di_total;
|
|
|
|
u32 di_len;
|
|
|
|
int key_type = BTRFS_DIR_INDEX_KEY;
|
2007-10-15 14:14:19 -06:00
|
|
|
char tmp_name[32];
|
|
|
|
char *name_ptr;
|
|
|
|
int name_len;
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch to address David Sterba's comments.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compared with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. The reason is that btrfs must do a lot of b+ tree insertions,
such as the inode item, directory name item, directory name index and so on.
If we can delay some b+ tree insertions or deletions, we can improve the
performance, so we made this patch, which implements delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, which uses two lists to
manage the delayed nodes that are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items, and the
other is used to manage the delayed nodes that are waiting to be dealt with
by the worker thread.
- Every delayed node has two rb-trees: one is used to manage the directory name
index items that are going to be inserted into the b+ tree, and the other is
used to manage the directory name index items that are going to be deleted
from the b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the number of delayed items is beyond the lower limit, we create works
for some delayed nodes and insert them into the work queue of the worker, and
then return.
When the number of delayed items is beyond the upper bound, we create works
for all the delayed nodes that haven't been dealt with, insert them into the
work queue of the worker, and then wait until the number of untreated items is
below some threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we first
search for it in the inserting rb-tree. If we find it, we just drop it. If not,
we add its key to the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cache the data of the
inode in the delayed node. The worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We move the delayed node to the tail of the list after we access the
delayed node. This way, we can cache more delayed items and merge more
inode updates.
- When we want to commit a transaction, we deal with all the delayed nodes.
- The delayed node is freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 04:12:22 -06:00
|
|
|
int is_curr = 0; /* filp->f_pos points to the current index? */
|
2007-06-12 04:35:45 -06:00
|
|
|
|
|
|
|
/* FIXME, use a real flag for deciding about the key type */
|
|
|
|
if (root->fs_info->tree_root == root)
|
|
|
|
key_type = BTRFS_DIR_ITEM_KEY;
|
2007-10-15 14:14:19 -06:00
|
|
|
|
2007-12-12 12:38:19 -07:00
|
|
|
/* special case for "." */
|
|
|
|
if (filp->f_pos == 0) {
|
2011-04-19 20:31:50 -06:00
|
|
|
over = filldir(dirent, ".", 1, 1, btrfs_ino(inode), DT_DIR);
|
2007-12-12 12:38:19 -07:00
|
|
|
if (over)
|
|
|
|
return 0;
|
|
|
|
filp->f_pos = 1;
|
|
|
|
}
|
|
|
|
/* special case for .., just use the back ref */
|
|
|
|
if (filp->f_pos == 1) {
|
2008-08-17 08:14:48 -06:00
|
|
|
u64 pino = parent_ino(filp->f_path.dentry);
|
2007-12-12 12:38:19 -07:00
|
|
|
over = filldir(dirent, "..", 2,
|
2008-08-17 08:14:48 -06:00
|
|
|
2, pino, DT_DIR);
|
2007-12-12 12:38:19 -07:00
|
|
|
if (over)
|
2008-08-17 10:08:36 -06:00
|
|
|
return 0;
|
2007-12-12 12:38:19 -07:00
|
|
|
filp->f_pos = 2;
|
|
|
|
}
|
2008-08-17 10:08:36 -06:00
|
|
|
path = btrfs_alloc_path();
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 04:12:22 -06:00
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
2011-05-28 05:00:39 -06:00
|
|
|
|
2011-05-13 08:32:11 -06:00
|
|
|
path->reada = 1;
|
2008-08-17 10:08:36 -06:00
|
|
|
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 04:12:22 -06:00
|
|
|
if (key_type == BTRFS_DIR_INDEX_KEY) {
|
|
|
|
INIT_LIST_HEAD(&ins_list);
|
|
|
|
INIT_LIST_HEAD(&del_list);
|
|
|
|
btrfs_get_delayed_items(inode, &ins_list, &del_list);
|
|
|
|
}
|
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
btrfs_set_key_type(&key, key_type);
|
|
|
|
key.offset = filp->f_pos;
|
2011-04-19 20:31:50 -06:00
|
|
|
key.objectid = btrfs_ino(inode);
|
2007-10-15 14:14:19 -06:00
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto err;
|
2008-08-17 10:08:36 -06:00
|
|
|
|
|
|
|
while (1) {
|
2007-10-15 14:14:19 -06:00
|
|
|
leaf = path->nodes[0];
|
2007-06-12 04:35:45 -06:00
|
|
|
slot = path->slots[0];
|
2011-03-22 20:43:58 -06:00
|
|
|
if (slot >= btrfs_header_nritems(leaf)) {
|
|
|
|
ret = btrfs_next_leaf(root, path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto err;
|
|
|
|
else if (ret > 0)
|
|
|
|
break;
|
|
|
|
continue;
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
2008-11-17 19:02:50 -07:00
|
|
|
|
2007-10-15 14:14:19 -06:00
|
|
|
item = btrfs_item_nr(leaf, slot);
|
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, slot);
|
|
|
|
|
|
|
|
if (found_key.objectid != key.objectid)
|
2007-06-12 04:35:45 -06:00
|
|
|
break;
|
2007-10-15 14:14:19 -06:00
|
|
|
if (btrfs_key_type(&found_key) != key_type)
|
2007-06-12 04:35:45 -06:00
|
|
|
break;
|
2007-10-15 14:14:19 -06:00
|
|
|
if (found_key.offset < filp->f_pos)
|
2011-03-22 20:43:58 -06:00
|
|
|
goto next;
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 04:12:22 -06:00
|
|
|
if (key_type == BTRFS_DIR_INDEX_KEY &&
|
|
|
|
btrfs_should_delete_dir_index(&del_list,
|
|
|
|
found_key.offset))
|
|
|
|
goto next;
|
2007-10-15 14:14:19 -06:00
|
|
|
|
|
|
|
filp->f_pos = found_key.offset;
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 04:12:22 -06:00
|
|
|
is_curr = 1;
|
2008-08-17 10:08:36 -06:00
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
|
|
|
|
di_cur = 0;
|
2007-10-15 14:14:19 -06:00
|
|
|
di_total = btrfs_item_size(leaf, item);
|
2008-08-17 10:08:36 -06:00
|
|
|
|
|
|
|
while (di_cur < di_total) {
|
2007-10-15 14:14:19 -06:00
|
|
|
struct btrfs_key location;
|
|
|
|
|
2011-03-16 14:47:17 -06:00
|
|
|
if (verify_dir_item(root, leaf, di))
|
|
|
|
break;
|
|
|
|
|
2007-10-15 14:14:19 -06:00
|
|
|
name_len = btrfs_dir_name_len(leaf, di);
|
2008-08-17 10:08:36 -06:00
|
|
|
if (name_len <= sizeof(tmp_name)) {
|
2007-10-15 14:14:19 -06:00
|
|
|
name_ptr = tmp_name;
|
|
|
|
} else {
|
|
|
|
name_ptr = kmalloc(name_len, GFP_NOFS);
|
2008-08-17 10:08:36 -06:00
|
|
|
if (!name_ptr) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto err;
|
|
|
|
}
|
2007-10-15 14:14:19 -06:00
|
|
|
}
|
|
|
|
read_extent_buffer(leaf, name_ptr,
|
|
|
|
(unsigned long)(di + 1), name_len);
|
|
|
|
|
|
|
|
d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
|
|
|
|
btrfs_dir_item_key_to_cpu(leaf, di, &location);
|
2008-11-17 19:02:50 -07:00
|
|
|
|
|
|
|
/* is this a reference to our own snapshot? If so
|
|
|
|
* skip it
|
|
|
|
*/
|
|
|
|
if (location.type == BTRFS_ROOT_ITEM_KEY &&
|
|
|
|
location.objectid == root->root_key.objectid) {
|
|
|
|
over = 0;
|
|
|
|
goto skip;
|
|
|
|
}
|
2007-10-15 14:14:19 -06:00
|
|
|
over = filldir(dirent, name_ptr, name_len,
|
2008-08-17 10:08:36 -06:00
|
|
|
found_key.offset, location.objectid,
|
2007-06-12 04:35:45 -06:00
|
|
|
d_type);
|
2007-10-15 14:14:19 -06:00
|
|
|
|
2008-11-17 19:02:50 -07:00
|
|
|
skip:
|
2007-10-15 14:14:19 -06:00
|
|
|
if (name_ptr != tmp_name)
|
|
|
|
kfree(name_ptr);
|
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
if (over)
|
|
|
|
goto nopos;
|
2007-11-16 09:45:54 -07:00
|
|
|
di_len = btrfs_dir_name_len(leaf, di) +
|
2008-08-17 10:08:36 -06:00
|
|
|
btrfs_dir_data_len(leaf, di) + sizeof(*di);
|
2007-06-12 04:35:45 -06:00
|
|
|
di_cur += di_len;
|
|
|
|
di = (struct btrfs_dir_item *)((char *)di + di_len);
|
|
|
|
}
|
2011-03-22 20:43:58 -06:00
|
|
|
next:
|
|
|
|
path->slots[0]++;
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
2008-08-17 10:08:36 -06:00
|
|
|
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the number of delayed items exceeds the lower limit, we create works for
some delayed nodes, insert them into the work queue of the worker, and then
return.
When the number of delayed items exceeds the upper bound, we create works for
all the delayed nodes that haven't been dealt with, insert them into the work
queue of the worker, and then wait until the number of untreated items drops
below some threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search for
it in the inserting rb-tree first. If we find it there, we just drop it. If
not, we add its key to the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 04:12:22 -06:00
|
|
|
if (key_type == BTRFS_DIR_INDEX_KEY) {
|
|
|
|
if (is_curr)
|
|
|
|
filp->f_pos++;
|
|
|
|
ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir,
|
|
|
|
&ins_list);
|
|
|
|
if (ret)
|
|
|
|
goto nopos;
|
|
|
|
}
|
|
|
|
|
2008-08-17 10:08:36 -06:00
|
|
|
/* Reached end of directory/root. Bump pos past the last item. */
|
2008-02-19 09:41:02 -07:00
|
|
|
if (key_type == BTRFS_DIR_INDEX_KEY)
|
2009-12-09 15:00:38 -07:00
|
|
|
/*
|
|
|
|
* 32-bit glibc will use getdents64, but then strtol -
|
|
|
|
* so the last number we can serve is this.
|
|
|
|
*/
|
|
|
|
filp->f_pos = 0x7fffffff;
|
2008-02-19 09:41:02 -07:00
|
|
|
else
|
|
|
|
filp->f_pos++;
|
2007-06-12 04:35:45 -06:00
|
|
|
nopos:
|
|
|
|
ret = 0;
|
|
|
|
err:
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compared with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. The reason is that btrfs must do a lot of b+ tree insertions,
such as the inode item, directory name item, directory name index and so on.
If we delay some of these b+ tree insertions and deletions, we can improve
performance, so this patch implements delayed directory name index
insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 04:12:22 -06:00
|
|
|
if (key_type == BTRFS_DIR_INDEX_KEY)
|
|
|
|
btrfs_put_delayed_items(&ins_list, &del_list);
|
2007-06-12 04:35:45 -06:00
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2010-03-05 01:21:37 -07:00
|
|
|
int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
|
2007-06-12 04:35:45 -06:00
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
int ret = 0;
|
2010-06-21 12:48:16 -06:00
|
|
|
bool nolock = false;
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2010-05-16 08:49:58 -06:00
|
|
|
if (BTRFS_I(inode)->dummy_inode)
|
2008-08-05 11:30:48 -06:00
|
|
|
return 0;
|
|
|
|
|
2011-05-31 10:07:27 -06:00
|
|
|
if (btrfs_fs_closing(root->fs_info) && is_free_space_inode(root, inode))
|
2011-04-19 20:33:24 -06:00
|
|
|
nolock = true;
|
2010-06-21 12:48:16 -06:00
|
|
|
|
2010-03-05 01:21:37 -07:00
|
|
|
if (wbc->sync_mode == WB_SYNC_ALL) {
|
2010-06-21 12:48:16 -06:00
|
|
|
if (nolock)
|
2011-04-13 10:54:33 -06:00
|
|
|
trans = btrfs_join_transaction_nolock(root);
|
2010-06-21 12:48:16 -06:00
|
|
|
else
|
2011-04-13 10:54:33 -06:00
|
|
|
trans = btrfs_join_transaction(root);
|
2011-01-24 19:51:38 -07:00
|
|
|
if (IS_ERR(trans))
|
|
|
|
return PTR_ERR(trans);
|
2010-06-21 12:48:16 -06:00
|
|
|
if (nolock)
|
|
|
|
ret = btrfs_end_transaction_nolock(trans, root);
|
|
|
|
else
|
|
|
|
ret = btrfs_commit_transaction(trans, root);
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2007-06-22 12:16:25 -06:00
|
|
|
* This is somewhat expensive, updating the tree every time the
|
2007-06-12 04:35:45 -06:00
|
|
|
* inode changes. But, it is most likely to find the inode in cache.
|
|
|
|
* FIXME, needs more benchmarking...there are no reasons other than performance
|
|
|
|
* to keep or drop this code.
|
|
|
|
*/
|
2011-05-27 04:53:02 -06:00
|
|
|
void btrfs_dirty_inode(struct inode *inode, int flags)
|
2007-06-12 04:35:45 -06:00
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_trans_handle *trans;
|
2010-05-16 08:49:58 -06:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (BTRFS_I(inode)->dummy_inode)
|
|
|
|
return;
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2011-04-13 10:54:33 -06:00
|
|
|
trans = btrfs_join_transaction(root);
|
2011-01-24 19:51:38 -07:00
|
|
|
BUG_ON(IS_ERR(trans));
|
2010-05-16 08:49:58 -06:00
|
|
|
|
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
2010-05-26 09:02:00 -06:00
|
|
|
if (ret && ret == -ENOSPC) {
|
|
|
|
/* whoops, lets try again with the full transaction */
|
|
|
|
btrfs_end_transaction(trans, root);
|
|
|
|
trans = btrfs_start_transaction(root, 1);
|
2010-05-27 08:23:00 -06:00
|
|
|
if (IS_ERR(trans)) {
|
2011-05-06 07:33:15 -06:00
|
|
|
printk_ratelimited(KERN_ERR "btrfs: fail to "
|
2011-04-19 20:31:50 -06:00
|
|
|
"dirty inode %llu error %ld\n",
|
|
|
|
(unsigned long long)btrfs_ino(inode),
|
|
|
|
PTR_ERR(trans));
|
2010-05-27 08:23:00 -06:00
|
|
|
return;
|
|
|
|
}
|
2010-05-16 08:49:58 -06:00
|
|
|
|
2010-05-26 09:02:00 -06:00
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
|
|
|
if (ret) {
|
2011-05-06 07:33:15 -06:00
|
|
|
printk_ratelimited(KERN_ERR "btrfs: fail to "
|
2011-04-19 20:31:50 -06:00
|
|
|
"dirty inode %llu error %d\n",
|
|
|
|
(unsigned long long)btrfs_ino(inode),
|
|
|
|
ret);
|
2010-05-26 09:02:00 -06:00
|
|
|
}
|
|
|
|
}
|
2007-06-12 04:35:45 -06:00
|
|
|
btrfs_end_transaction(trans, root);
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 04:12:22 -06:00
|
|
|
if (BTRFS_I(inode)->delayed_node)
|
|
|
|
btrfs_balance_delayed_items(root);
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
|
|
|
|
2008-09-29 13:18:18 -06:00
|
|
|
/*
|
|
|
|
* find the highest existing sequence number in a directory
|
|
|
|
* and then set the in-memory index_cnt variable to reflect
|
|
|
|
* free sequence numbers
|
|
|
|
*/
|
2008-07-24 10:12:38 -06:00
|
|
|
static int btrfs_set_inode_index_count(struct inode *inode)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_key key, found_key;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
int ret;
|
|
|
|
|
2011-04-19 20:31:50 -06:00
|
|
|
key.objectid = btrfs_ino(inode);
|
2008-07-24 10:12:38 -06:00
|
|
|
btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
|
|
|
|
key.offset = (u64)-1;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
/* FIXME: we should be able to handle this */
|
|
|
|
if (ret == 0)
|
|
|
|
goto out;
|
|
|
|
ret = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* MAGIC NUMBER EXPLANATION:
|
|
|
|
* since we search a directory based on f_pos we have to start at 2
|
|
|
|
* since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
|
|
|
|
* else has to start at 2
|
|
|
|
*/
|
|
|
|
if (path->slots[0] == 0) {
|
|
|
|
BTRFS_I(inode)->index_cnt = 2;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
path->slots[0]--;
|
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
|
|
|
|
|
2011-04-19 20:31:50 -06:00
|
|
|
if (found_key.objectid != btrfs_ino(inode) ||
|
2008-07-24 10:12:38 -06:00
|
|
|
btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
|
|
|
|
BTRFS_I(inode)->index_cnt = 2;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
BTRFS_I(inode)->index_cnt = found_key.offset + 1;
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2008-09-29 13:18:18 -06:00
|
|
|
/*
|
|
|
|
* helper to find a free sequence number in a given directory. This current
|
|
|
|
* code is very simple, later versions will do smarter things in the btree
|
|
|
|
*/
|
2008-11-17 19:02:50 -07:00
|
|
|
int btrfs_set_inode_index(struct inode *dir, u64 *index)
|
2008-07-24 10:12:38 -06:00
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
if (BTRFS_I(dir)->index_cnt == (u64)-1) {
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 04:12:22 -06:00
|
|
|
ret = btrfs_inode_delayed_dir_index_count(dir);
|
|
|
|
if (ret) {
|
|
|
|
ret = btrfs_set_inode_index_count(dir);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
}
|
2008-07-24 10:12:38 -06:00
|
|
|
}
|
|
|
|
|
2008-08-05 09:18:09 -06:00
|
|
|
*index = BTRFS_I(dir)->index_cnt;
|
2008-07-24 10:12:38 -06:00
|
|
|
BTRFS_I(dir)->index_cnt++;
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root,
|
2008-07-24 10:12:38 -06:00
|
|
|
struct inode *dir,
|
2008-01-29 13:15:18 -07:00
|
|
|
const char *name, int name_len,
|
2011-05-11 13:26:06 -06:00
|
|
|
u64 ref_objectid, u64 objectid, int mode,
|
|
|
|
u64 *index)
|
2007-06-12 04:35:45 -06:00
|
|
|
{
|
|
|
|
struct inode *inode;
|
2007-10-15 14:14:19 -06:00
|
|
|
struct btrfs_inode_item *inode_item;
|
2007-06-12 04:35:45 -06:00
|
|
|
struct btrfs_key *location;
|
2007-10-15 14:14:19 -06:00
|
|
|
struct btrfs_path *path;
|
2008-01-29 13:15:18 -07:00
|
|
|
struct btrfs_inode_ref *ref;
|
|
|
|
struct btrfs_key key[2];
|
|
|
|
u32 sizes[2];
|
|
|
|
unsigned long ptr;
|
2007-06-12 04:35:45 -06:00
|
|
|
int ret;
|
|
|
|
int owner;
|
|
|
|
|
2007-10-15 14:14:19 -06:00
|
|
|
path = btrfs_alloc_path();
|
btrfs: don't BUG_ON btrfs_alloc_path() errors
This patch fixes many callers of btrfs_alloc_path() which BUG_ON allocation
failure. All the sites that are fixed in this patch were checked by me to
be fairly trivial to fix because of at least one of two criteria:
- Callers of the function catch errors from it already so bubbling the
error up will be handled.
- Callers of the function might BUG_ON any nonzero return code in which
case there is no behavior changed (but we still got to remove a BUG_ON)
The following functions were updated:
btrfs_lookup_extent, alloc_reserved_tree_block, btrfs_remove_block_group,
btrfs_lookup_csums_range, btrfs_csum_file_blocks, btrfs_mark_extent_written,
btrfs_inode_by_name, btrfs_new_inode, btrfs_symlink,
insert_reserved_file_extent, and run_delalloc_nocow
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
2011-07-13 11:38:47 -06:00
|
|
|
if (!path)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
2007-10-15 14:14:19 -06:00
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
inode = new_inode(root->fs_info->sb);
|
2011-04-08 20:30:07 -06:00
|
|
|
if (!inode) {
|
|
|
|
btrfs_free_path(path);
|
2007-06-12 04:35:45 -06:00
|
|
|
return ERR_PTR(-ENOMEM);
|
2011-04-08 20:30:07 -06:00
|
|
|
}
|
2007-06-12 04:35:45 -06:00
|
|
|
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree and always
returns (highest+1) as the inode number when we create a file. Inode numbers
are therefore never reclaimed when we delete files, so we will run out of
inode numbers if we keep creating and deleting files on 32-bit machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-19 20:06:11 -06:00
|
|
|
/*
|
|
|
|
* we have to initialize this early, so we can reclaim the inode
|
|
|
|
* number if we fail afterwards in this function.
|
|
|
|
*/
|
|
|
|
inode->i_ino = objectid;
|
|
|
|
|
2008-07-24 10:12:38 -06:00
|
|
|
if (dir) {
|
Btrfs: add initial tracepoint support for btrfs
Tracepoints can provide insight into why btrfs hits bugs and be greatly
helpful for debugging, e.g
dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0
dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0
btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8
flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA
flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0)
flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0)
flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0)
Here is what I have added:
1) ordered_extent:
btrfs_ordered_extent_add
btrfs_ordered_extent_remove
btrfs_ordered_extent_start
btrfs_ordered_extent_put
These provide critical information to understand how ordered_extents are
updated.
2) extent_map:
btrfs_get_extent
extent_map is used in both read and write cases, and it is useful for tracking
how btrfs specific IO is running.
3) writepage:
__extent_writepage
btrfs_writepage_end_io_hook
Pages are critical resources and produce a lot of corner cases during writeback,
so it is valuable to know how a page is written to disk.
4) inode:
btrfs_inode_new
btrfs_inode_request
btrfs_inode_evict
These can show where and when an inode is created, and when an inode is evicted.
5) sync:
btrfs_sync_file
btrfs_sync_fs
These show sync arguments.
6) transaction:
btrfs_transaction_commit
In transaction based filesystem, it will be useful to know the generation and
who does commit.
7) back reference and cow:
btrfs_delayed_tree_ref
btrfs_delayed_data_ref
btrfs_delayed_ref_head
btrfs_cow_block
Btrfs natively supports back references, these tracepoints are helpful on
understanding btrfs's COW mechanism.
8) chunk:
btrfs_chunk_alloc
btrfs_chunk_free
A chunk is the link between a physical offset and a logical offset, and stands
for space information in btrfs; these are helpful for tracing space usage.
9) reserved_extent:
btrfs_reserved_extent_alloc
btrfs_reserved_extent_free
These can show how btrfs uses its space.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 05:18:59 -06:00
|
|
|
trace_btrfs_inode_request(dir);
|
|
|
|
|
2008-11-17 19:02:50 -07:00
|
|
|
ret = btrfs_set_inode_index(dir, index);
|
2009-04-02 14:46:06 -06:00
|
|
|
if (ret) {
|
2011-04-08 20:30:07 -06:00
|
|
|
btrfs_free_path(path);
|
2009-04-02 14:46:06 -06:00
|
|
|
iput(inode);
|
2008-07-24 10:12:38 -06:00
|
|
|
return ERR_PTR(ret);
|
2009-04-02 14:46:06 -06:00
|
|
|
}
|
2008-07-24 10:12:38 -06:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* index_cnt is ignored for everything but a dir,
|
|
|
|
* btrfs_set_inode_index_count has an explanation for the magic
|
|
|
|
* number
|
|
|
|
*/
|
|
|
|
BTRFS_I(inode)->index_cnt = 2;
|
2007-06-12 04:35:45 -06:00
|
|
|
BTRFS_I(inode)->root = root;
|
2008-09-05 14:13:11 -06:00
|
|
|
BTRFS_I(inode)->generation = trans->transid;
|
2010-11-18 19:18:02 -07:00
|
|
|
inode->i_generation = BTRFS_I(inode)->generation;
|
2009-02-20 09:00:09 -07:00
|
|
|
btrfs_set_inode_space_info(root, inode);
|
2007-08-27 14:49:44 -06:00
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
if (mode & S_IFDIR)
|
|
|
|
owner = 0;
|
|
|
|
else
|
|
|
|
owner = 1;
|
2008-01-29 13:15:18 -07:00
|
|
|
|
|
|
|
key[0].objectid = objectid;
|
|
|
|
btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
|
|
|
|
key[0].offset = 0;
|
|
|
|
|
|
|
|
key[1].objectid = objectid;
|
|
|
|
btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
|
|
|
|
key[1].offset = ref_objectid;
|
|
|
|
|
|
|
|
sizes[0] = sizeof(struct btrfs_inode_item);
|
|
|
|
sizes[1] = name_len + sizeof(*ref);
|
|
|
|
|
2009-03-13 09:00:37 -06:00
|
|
|
path->leave_spinning = 1;
|
2008-01-29 13:15:18 -07:00
|
|
|
ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
|
|
|
|
if (ret != 0)
|
2007-10-15 14:14:19 -06:00
|
|
|
goto fail;
|
|
|
|
|
2010-03-04 07:31:47 -07:00
|
|
|
inode_init_owner(inode, dir, mode);
|
2008-10-09 09:46:29 -06:00
|
|
|
inode_set_bytes(inode, 0);
|
2007-06-12 04:35:45 -06:00
|
|
|
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
|
2007-10-15 14:14:19 -06:00
|
|
|
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
|
|
|
|
struct btrfs_inode_item);
|
2008-09-05 14:13:11 -06:00
|
|
|
fill_inode_item(trans, path->nodes[0], inode_item, inode);
|
2008-01-29 13:15:18 -07:00
|
|
|
|
|
|
|
ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
|
|
|
|
struct btrfs_inode_ref);
|
|
|
|
btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
|
2008-08-05 09:18:09 -06:00
|
|
|
btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
|
2008-01-29 13:15:18 -07:00
|
|
|
ptr = (unsigned long)(ref + 1);
|
|
|
|
write_extent_buffer(path->nodes[0], name, ptr, name_len);
|
|
|
|
|
2007-10-15 14:14:19 -06:00
|
|
|
btrfs_mark_buffer_dirty(path->nodes[0]);
|
|
|
|
btrfs_free_path(path);
|
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
location = &BTRFS_I(inode)->location;
|
|
|
|
location->objectid = objectid;
|
|
|
|
location->offset = 0;
|
|
|
|
btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
|
|
|
|
|
2009-04-17 02:37:41 -06:00
|
|
|
btrfs_inherit_iflags(inode, dir);
|
|
|
|
|
2009-07-02 10:26:06 -06:00
|
|
|
if ((mode & S_IFREG)) {
|
|
|
|
if (btrfs_test_opt(root, NODATASUM))
|
|
|
|
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
|
Btrfs: Per file/directory controls for COW and compression
Data compression and data cow are controlled across the entire FS by mount
options right now. ioctls are needed to set this on a per file or per
directory basis. This has been proposed previously, but VFS developers
wanted us to use generic ioctls rather than btrfs-specific ones.
According to Chris's comment, there should be just one true compression
method(probably LZO) stored in the super. However, before this, we would
wait for that one method is stable enough to be adopted into the super.
So I list it as a long term goal, and just store it in ram today.
After applying this patch, we can use the generic "FS_IOC_SETFLAGS" ioctl to
control file and directory's datacow and compression attribute.
NOTE:
- The compression type is selected by such rules:
If we mount btrfs with compress options, ie, zlib/lzo, the type is it.
Otherwise, we'll use the default compress type (zlib today).
v1->v2:
- rebase to the latest btrfs.
v2->v3:
- fix a problem, i.e. when a file is set NOCOW via mount option, then this NOCOW
will be screwed by inheritance from parent directory.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-22 04:12:20 -06:00
|
|
|
if (btrfs_test_opt(root, NODATACOW) ||
|
|
|
|
(BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW))
|
2009-07-02 10:26:06 -06:00
|
|
|
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
|
|
|
|
}
|
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
insert_inode_hash(inode);
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 08:45:14 -06:00
|
|
|
inode_tree_add(inode);
|
Btrfs: add initial tracepoint support for btrfs
Tracepoints can provide insight into why btrfs hits bugs and be greatly
helpful for debugging, e.g
dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0
dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0
btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8
flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA
flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0)
flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0)
flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0)
Here is what I have added:
1) ordered_extent:
btrfs_ordered_extent_add
btrfs_ordered_extent_remove
btrfs_ordered_extent_start
btrfs_ordered_extent_put
These provide critical information to understand how ordered_extents are
updated.
2) extent_map:
btrfs_get_extent
extent_map is used in both read and write cases, and it is useful for tracking
how btrfs specific IO is running.
3) writepage:
__extent_writepage
btrfs_writepage_end_io_hook
Pages are critical resources and produce a lot of corner cases during writeback,
so it is valuable to know how page is written to disk.
4) inode:
btrfs_inode_new
btrfs_inode_request
btrfs_inode_evict
These can show where and when a inode is created, when a inode is evicted.
5) sync:
btrfs_sync_file
btrfs_sync_fs
These show sync arguments.
6) transaction:
btrfs_transaction_commit
In transaction based filesystem, it will be useful to know the generation and
who does commit.
7) back reference and cow:
btrfs_delayed_tree_ref
btrfs_delayed_data_ref
btrfs_delayed_ref_head
btrfs_cow_block
Btrfs natively supports back references, these tracepoints are helpful on
understanding btrfs's COW mechanism.
8) chunk:
btrfs_chunk_alloc
btrfs_chunk_free
Chunk is a link between physical offset and logical offset, and stands for space
information in btrfs, and these are helpful on tracing space things.
9) reserved_extent:
btrfs_reserved_extent_alloc
btrfs_reserved_extent_free
These can show how btrfs uses its space.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 05:18:59 -06:00
|
|
|
|
|
|
|
trace_btrfs_inode_new(inode);
|
2011-06-24 11:13:29 -06:00
|
|
|
btrfs_set_inode_last_trans(trans, inode);
|
Btrfs: add initial tracepoint support for btrfs
Tracepoints can provide insight into why btrfs hits bugs and be greatly
helpful for debugging, e.g
dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0
dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0
btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8
flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA
flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0)
flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0)
flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0)
Here is what I have added:
1) ordered_extent:
btrfs_ordered_extent_add
btrfs_ordered_extent_remove
btrfs_ordered_extent_start
btrfs_ordered_extent_put
These provide critical information to understand how ordered_extents are
updated.
2) extent_map:
btrfs_get_extent
extent_map is used in both read and write cases, and it is useful for tracking
how btrfs specific IO is running.
3) writepage:
__extent_writepage
btrfs_writepage_end_io_hook
Pages are critical resources and produce a lot of corner cases during writeback,
so it is valuable to know how page is written to disk.
4) inode:
btrfs_inode_new
btrfs_inode_request
btrfs_inode_evict
These can show where and when a inode is created, when a inode is evicted.
5) sync:
btrfs_sync_file
btrfs_sync_fs
These show sync arguments.
6) transaction:
btrfs_transaction_commit
In transaction based filesystem, it will be useful to know the generation and
who does commit.
7) back reference and cow:
btrfs_delayed_tree_ref
btrfs_delayed_data_ref
btrfs_delayed_ref_head
btrfs_cow_block
Btrfs natively supports back references, these tracepoints are helpful on
understanding btrfs's COW mechanism.
8) chunk:
btrfs_chunk_alloc
btrfs_chunk_free
Chunk is a link between physical offset and logical offset, and stands for space
information in btrfs, and these are helpful on tracing space things.
9) reserved_extent:
btrfs_reserved_extent_alloc
btrfs_reserved_extent_free
These can show how btrfs uses its space.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 05:18:59 -06:00
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
return inode;
|
2007-10-15 14:14:19 -06:00
|
|
|
fail:
|
2008-07-24 10:12:38 -06:00
|
|
|
if (dir)
|
|
|
|
BTRFS_I(dir)->index_cnt--;
|
2007-10-15 14:14:19 -06:00
|
|
|
btrfs_free_path(path);
|
2009-04-02 14:46:06 -06:00
|
|
|
iput(inode);
|
2007-10-15 14:14:19 -06:00
|
|
|
return ERR_PTR(ret);
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline u8 btrfs_inode_type(struct inode *inode)
|
|
|
|
{
|
|
|
|
return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
|
|
|
|
}
|
|
|
|
|
2008-09-29 13:18:18 -06:00
|
|
|
/*
|
|
|
|
* utility function to add 'inode' into 'parent_inode' with
|
|
|
|
* a give name and a given sequence number.
|
|
|
|
* if 'add_backref' is true, also insert a backref from the
|
|
|
|
* inode to the parent directory.
|
|
|
|
*/
|
2008-09-05 14:13:11 -06:00
|
|
|
int btrfs_add_link(struct btrfs_trans_handle *trans,
|
|
|
|
struct inode *parent_inode, struct inode *inode,
|
|
|
|
const char *name, int name_len, int add_backref, u64 index)
|
2007-06-12 04:35:45 -06:00
|
|
|
{
|
2009-09-21 13:56:00 -06:00
|
|
|
int ret = 0;
|
2007-06-12 04:35:45 -06:00
|
|
|
struct btrfs_key key;
|
2008-09-05 14:13:11 -06:00
|
|
|
struct btrfs_root *root = BTRFS_I(parent_inode)->root;
|
2011-04-19 20:31:50 -06:00
|
|
|
u64 ino = btrfs_ino(inode);
|
|
|
|
u64 parent_ino = btrfs_ino(parent_inode);
|
2007-10-15 14:14:19 -06:00
|
|
|
|
2011-04-19 20:31:50 -06:00
|
|
|
if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
|
2009-09-21 13:56:00 -06:00
|
|
|
memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
|
|
|
|
} else {
|
2011-04-19 20:31:50 -06:00
|
|
|
key.objectid = ino;
|
2009-09-21 13:56:00 -06:00
|
|
|
btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
|
|
|
|
key.offset = 0;
|
|
|
|
}
|
|
|
|
|
2011-04-19 20:31:50 -06:00
|
|
|
if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
|
2009-09-21 13:56:00 -06:00
|
|
|
ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
|
|
|
|
key.objectid, root->root_key.objectid,
|
2011-04-19 20:31:50 -06:00
|
|
|
parent_ino, index, name, name_len);
|
2009-09-21 13:56:00 -06:00
|
|
|
} else if (add_backref) {
|
2011-04-19 20:31:50 -06:00
|
|
|
ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
|
|
|
|
parent_ino, index);
|
2009-09-21 13:56:00 -06:00
|
|
|
}
|
2007-06-12 04:35:45 -06:00
|
|
|
|
|
|
|
if (ret == 0) {
|
2009-09-21 13:56:00 -06:00
|
|
|
ret = btrfs_insert_dir_item(trans, root, name, name_len,
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 04:12:22 -06:00
|
|
|
parent_inode, &key,
|
2009-09-21 13:56:00 -06:00
|
|
|
btrfs_inode_type(inode), index);
|
|
|
|
BUG_ON(ret);
|
|
|
|
|
2008-07-17 10:54:05 -06:00
|
|
|
btrfs_i_size_write(parent_inode, parent_inode->i_size +
|
2008-09-05 14:13:11 -06:00
|
|
|
name_len * 2);
|
2007-06-25 08:09:33 -06:00
|
|
|
parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
|
2008-09-05 14:13:11 -06:00
|
|
|
ret = btrfs_update_inode(trans, root, parent_inode);
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Link 'inode' into 'dir' under the name carried by 'dentry' and, on
 * success, attach the inode to the dentry with d_instantiate().
 *
 * 'backref' and 'index' are handed straight to btrfs_add_link().
 *
 * Returns 0 on success.  A positive result from btrfs_add_link() is
 * reported as -EEXIST; a negative one is returned unchanged.
 */
static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
			    struct inode *dir, struct dentry *dentry,
			    struct inode *inode, int backref, u64 index)
{
	int err;

	err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
			     dentry->d_name.len, backref, index);
	if (err > 0)
		return -EEXIST;
	if (err)
		return err;

	d_instantiate(dentry, inode);
	return 0;
}
|
|
|
|
|
2007-07-11 08:18:17 -06:00
|
|
|
/*
 * Create a special inode (device node etc.) named by 'dentry' in 'dir'.
 *
 * Returns -EINVAL when 'rdev' fails new_valid_dev(), the error from
 * starting the transaction, or the first error hit while allocating the
 * inode number, creating the inode, initialising its security xattr or
 * linking it into the directory; 0 otherwise.
 */
static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
			int mode, dev_t rdev)
{
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_trans_handle *trans;
	struct inode *inode = NULL;
	unsigned long nr = 0;
	u64 objectid;
	u64 index = 0;
	int drop_inode = 0;
	int err;

	if (!new_valid_dev(rdev))
		return -EINVAL;

	/*
	 * 2 for inode item and ref
	 * 2 for dir items
	 * 1 for xattr if selinux is on
	 */
	trans = btrfs_start_transaction(root, 5);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	err = btrfs_find_free_ino(root, &objectid);
	if (err)
		goto out_unlock;

	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
				dentry->d_name.len, btrfs_ino(dir), objectid,
				mode, &index);
	if (IS_ERR(inode)) {
		/* no inode was handed back, so there is nothing to drop */
		err = PTR_ERR(inode);
		goto out_unlock;
	}

	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
	if (err) {
		drop_inode = 1;
		goto out_unlock;
	}

	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
	if (!err) {
		inode->i_op = &btrfs_special_inode_operations;
		init_special_inode(inode, inode->i_mode, rdev);
		/*
		 * NOTE(review): the return value of btrfs_update_inode()
		 * is not checked here.
		 */
		btrfs_update_inode(trans, root, inode);
	} else {
		drop_inode = 1;
	}
out_unlock:
	/* sample blocks_used before the handle is released */
	nr = trans->blocks_used;
	btrfs_end_transaction_throttle(trans, root);
	btrfs_btree_balance_dirty(root, nr);
	if (drop_inode) {
		inode_dec_link_count(inode);
		iput(inode);
	}
	return err;
}
|
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
static int btrfs_create(struct inode *dir, struct dentry *dentry,
|
|
|
|
int mode, struct nameidata *nd)
|
|
|
|
{
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
2007-12-21 14:27:21 -07:00
|
|
|
struct inode *inode = NULL;
|
2007-06-12 04:35:45 -06:00
|
|
|
int drop_inode = 0;
|
2010-05-16 08:48:46 -06:00
|
|
|
int err;
|
2007-12-21 14:27:21 -07:00
|
|
|
unsigned long nr = 0;
|
2007-06-12 04:35:45 -06:00
|
|
|
u64 objectid;
|
2008-08-05 09:18:09 -06:00
|
|
|
u64 index = 0;
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2009-09-11 14:12:44 -06:00
|
|
|
/*
|
|
|
|
* 2 for inode item and ref
|
|
|
|
* 2 for dir items
|
|
|
|
* 1 for xattr if selinux is on
|
|
|
|
*/
|
2010-05-16 08:48:46 -06:00
|
|
|
trans = btrfs_start_transaction(root, 5);
|
|
|
|
if (IS_ERR(trans))
|
|
|
|
return PTR_ERR(trans);
|
2009-09-11 14:12:44 -06:00
|
|
|
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-19 20:06:11 -06:00
|
|
|
err = btrfs_find_free_ino(root, &objectid);
|
|
|
|
if (err)
|
|
|
|
goto out_unlock;
|
|
|
|
|
2008-07-24 10:12:38 -06:00
|
|
|
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
|
2011-04-19 20:31:50 -06:00
|
|
|
dentry->d_name.len, btrfs_ino(dir), objectid,
|
2011-05-11 13:26:06 -06:00
|
|
|
mode, &index);
|
2011-04-25 17:43:53 -06:00
|
|
|
if (IS_ERR(inode)) {
|
|
|
|
err = PTR_ERR(inode);
|
2007-06-12 04:35:45 -06:00
|
|
|
goto out_unlock;
|
2011-04-25 17:43:53 -06:00
|
|
|
}
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2011-02-01 09:05:39 -07:00
|
|
|
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
|
2008-07-24 10:16:36 -06:00
|
|
|
if (err) {
|
|
|
|
drop_inode = 1;
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
|
2010-11-19 13:36:11 -07:00
|
|
|
err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
|
2007-06-12 04:35:45 -06:00
|
|
|
if (err)
|
|
|
|
drop_inode = 1;
|
|
|
|
else {
|
|
|
|
inode->i_mapping->a_ops = &btrfs_aops;
|
2008-03-26 08:28:07 -06:00
|
|
|
inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
|
2007-06-12 04:35:45 -06:00
|
|
|
inode->i_fop = &btrfs_file_operations;
|
|
|
|
inode->i_op = &btrfs_file_inode_operations;
|
2008-01-24 14:13:08 -07:00
|
|
|
BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
|
|
|
out_unlock:
|
2007-09-17 08:58:06 -06:00
|
|
|
nr = trans->blocks_used;
|
2008-07-29 14:15:18 -06:00
|
|
|
btrfs_end_transaction_throttle(trans, root);
|
2007-06-12 04:35:45 -06:00
|
|
|
if (drop_inode) {
|
|
|
|
inode_dec_link_count(inode);
|
|
|
|
iput(inode);
|
|
|
|
}
|
2007-09-17 08:58:06 -06:00
|
|
|
btrfs_btree_balance_dirty(root, nr);
|
2007-06-12 04:35:45 -06:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
|
|
|
|
struct dentry *dentry)
|
|
|
|
{
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
|
|
|
struct inode *inode = old_dentry->d_inode;
|
2008-08-05 09:18:09 -06:00
|
|
|
u64 index;
|
2007-12-21 14:27:21 -07:00
|
|
|
unsigned long nr = 0;
|
2007-06-12 04:35:45 -06:00
|
|
|
int err;
|
|
|
|
int drop_inode = 0;
|
|
|
|
|
2009-11-12 00:14:26 -07:00
|
|
|
/* do not allow sys_link's with other subvols of the same device */
|
|
|
|
if (root->objectid != BTRFS_I(inode)->root->objectid)
|
2011-03-22 11:20:26 -06:00
|
|
|
return -EXDEV;
|
2009-11-12 00:14:26 -07:00
|
|
|
|
2011-03-04 10:15:18 -07:00
|
|
|
if (inode->i_nlink == ~0U)
|
|
|
|
return -EMLINK;
|
2009-11-12 00:14:26 -07:00
|
|
|
|
2008-11-17 19:02:50 -07:00
|
|
|
err = btrfs_set_inode_index(dir, &index);
|
2008-07-24 10:12:38 -06:00
|
|
|
if (err)
|
|
|
|
goto fail;
|
|
|
|
|
2010-05-16 08:48:46 -06:00
|
|
|
/*
|
2011-02-18 02:21:17 -07:00
|
|
|
* 2 items for inode and inode ref
|
2010-05-16 08:48:46 -06:00
|
|
|
* 2 items for dir items
|
2011-02-18 02:21:17 -07:00
|
|
|
* 1 item for parent inode
|
2010-05-16 08:48:46 -06:00
|
|
|
*/
|
2011-02-18 02:21:17 -07:00
|
|
|
trans = btrfs_start_transaction(root, 5);
|
2010-05-16 08:48:46 -06:00
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
err = PTR_ERR(trans);
|
|
|
|
goto fail;
|
|
|
|
}
|
2007-10-15 14:14:19 -06:00
|
|
|
|
2011-04-12 23:19:21 -06:00
|
|
|
btrfs_inc_nlink(inode);
|
|
|
|
inode->i_ctime = CURRENT_TIME;
|
2010-10-23 09:11:40 -06:00
|
|
|
ihold(inode);
|
2008-07-24 10:12:38 -06:00
|
|
|
|
2010-11-19 13:36:11 -07:00
|
|
|
err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
|
2007-10-15 14:14:19 -06:00
|
|
|
|
2009-09-24 07:17:31 -06:00
|
|
|
if (err) {
|
2007-06-22 12:16:25 -06:00
|
|
|
drop_inode = 1;
|
2009-09-24 07:17:31 -06:00
|
|
|
} else {
|
2010-11-20 02:48:00 -07:00
|
|
|
struct dentry *parent = dget_parent(dentry);
|
2009-09-24 07:17:31 -06:00
|
|
|
err = btrfs_update_inode(trans, root, inode);
|
|
|
|
BUG_ON(err);
|
2010-11-20 02:48:00 -07:00
|
|
|
btrfs_log_new_name(trans, inode, NULL, parent);
|
|
|
|
dput(parent);
|
2009-09-24 07:17:31 -06:00
|
|
|
}
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2007-09-17 08:58:06 -06:00
|
|
|
nr = trans->blocks_used;
|
2008-07-29 14:15:18 -06:00
|
|
|
btrfs_end_transaction_throttle(trans, root);
|
2007-12-21 14:27:21 -07:00
|
|
|
fail:
|
2007-06-12 04:35:45 -06:00
|
|
|
if (drop_inode) {
|
|
|
|
inode_dec_link_count(inode);
|
|
|
|
iput(inode);
|
|
|
|
}
|
2007-09-17 08:58:06 -06:00
|
|
|
btrfs_btree_balance_dirty(root, nr);
|
2007-06-12 04:35:45 -06:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
|
|
|
|
{
|
2008-05-02 14:13:49 -06:00
|
|
|
struct inode *inode = NULL;
|
2007-06-12 04:35:45 -06:00
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
|
|
|
int err = 0;
|
|
|
|
int drop_on_err = 0;
|
2008-05-02 14:13:49 -06:00
|
|
|
u64 objectid = 0;
|
2008-08-05 09:18:09 -06:00
|
|
|
u64 index = 0;
|
2007-09-17 08:58:06 -06:00
|
|
|
unsigned long nr = 1;
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2009-09-11 14:12:44 -06:00
|
|
|
/*
|
|
|
|
* 2 items for inode and ref
|
|
|
|
* 2 items for dir items
|
|
|
|
* 1 for xattr if selinux is on
|
|
|
|
*/
|
2010-05-16 08:48:46 -06:00
|
|
|
trans = btrfs_start_transaction(root, 5);
|
|
|
|
if (IS_ERR(trans))
|
|
|
|
return PTR_ERR(trans);
|
2007-06-12 04:35:45 -06:00
|
|
|
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns (highest+1) inode number when we create a file, so inode numbers
won't be reclaimed when we delete files, so we'll run out of inode numbers
as we keep create/delete files in 32bits machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-19 20:06:11 -06:00
|
|
|
err = btrfs_find_free_ino(root, &objectid);
|
|
|
|
if (err)
|
|
|
|
goto out_fail;
|
|
|
|
|
2008-07-24 10:12:38 -06:00
|
|
|
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
|
2011-04-19 20:31:50 -06:00
|
|
|
dentry->d_name.len, btrfs_ino(dir), objectid,
|
2011-05-11 13:26:06 -06:00
|
|
|
S_IFDIR | mode, &index);
|
2007-06-12 04:35:45 -06:00
|
|
|
if (IS_ERR(inode)) {
|
|
|
|
err = PTR_ERR(inode);
|
|
|
|
goto out_fail;
|
|
|
|
}
|
2007-10-15 14:14:19 -06:00
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
drop_on_err = 1;
|
2008-07-24 10:16:36 -06:00
|
|
|
|
2011-02-01 09:05:39 -07:00
|
|
|
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
|
2008-07-24 10:16:36 -06:00
|
|
|
if (err)
|
|
|
|
goto out_fail;
|
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
inode->i_op = &btrfs_dir_inode_operations;
|
|
|
|
inode->i_fop = &btrfs_dir_file_operations;
|
|
|
|
|
2008-07-17 10:54:05 -06:00
|
|
|
btrfs_i_size_write(inode, 0);
|
2007-06-12 04:35:45 -06:00
|
|
|
err = btrfs_update_inode(trans, root, inode);
|
|
|
|
if (err)
|
|
|
|
goto out_fail;
|
2007-10-15 14:14:19 -06:00
|
|
|
|
2010-11-19 13:36:11 -07:00
|
|
|
err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
|
|
|
|
dentry->d_name.len, 0, index);
|
2007-06-12 04:35:45 -06:00
|
|
|
if (err)
|
|
|
|
goto out_fail;
|
2007-10-15 14:14:19 -06:00
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
d_instantiate(dentry, inode);
|
|
|
|
drop_on_err = 0;
|
|
|
|
|
|
|
|
out_fail:
|
2007-09-17 08:58:06 -06:00
|
|
|
nr = trans->blocks_used;
|
2008-07-29 14:15:18 -06:00
|
|
|
btrfs_end_transaction_throttle(trans, root);
|
2007-06-12 04:35:45 -06:00
|
|
|
if (drop_on_err)
|
|
|
|
iput(inode);
|
2007-09-17 08:58:06 -06:00
|
|
|
btrfs_btree_balance_dirty(root, nr);
|
2007-06-12 04:35:45 -06:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2008-09-29 13:18:18 -06:00
|
|
|
/* helper for btfs_get_extent. Given an existing extent in the tree,
|
|
|
|
* and an extent that you want to insert, deal with overlap and insert
|
|
|
|
* the new extent into the tree.
|
|
|
|
*/
|
2008-04-17 09:29:12 -06:00
|
|
|
static int merge_extent_mapping(struct extent_map_tree *em_tree,
|
|
|
|
struct extent_map *existing,
|
2008-07-17 10:53:50 -06:00
|
|
|
struct extent_map *em,
|
|
|
|
u64 map_start, u64 map_len)
|
2008-04-17 09:29:12 -06:00
|
|
|
{
|
|
|
|
u64 start_diff;
|
|
|
|
|
2008-07-17 10:53:50 -06:00
|
|
|
BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
|
|
|
|
start_diff = map_start - em->start;
|
|
|
|
em->start = map_start;
|
|
|
|
em->len = map_len;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
if (em->block_start < EXTENT_MAP_LAST_BYTE &&
|
|
|
|
!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
|
2008-07-17 10:53:50 -06:00
|
|
|
em->block_start += start_diff;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
em->block_len -= start_diff;
|
|
|
|
}
|
2008-07-17 10:53:50 -06:00
|
|
|
return add_extent_mapping(em_tree, em);
|
2008-04-17 09:29:12 -06:00
|
|
|
}
|
|
|
|
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
static noinline int uncompress_inline(struct btrfs_path *path,
|
|
|
|
struct inode *inode, struct page *page,
|
|
|
|
size_t pg_offset, u64 extent_offset,
|
|
|
|
struct btrfs_file_extent_item *item)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct extent_buffer *leaf = path->nodes[0];
|
|
|
|
char *tmp;
|
|
|
|
size_t max_size;
|
|
|
|
unsigned long inline_size;
|
|
|
|
unsigned long ptr;
|
2010-12-16 23:21:50 -07:00
|
|
|
int compress_type;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
|
|
|
|
WARN_ON(pg_offset != 0);
|
2010-12-16 23:21:50 -07:00
|
|
|
compress_type = btrfs_file_extent_compression(leaf, item);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
max_size = btrfs_file_extent_ram_bytes(leaf, item);
|
|
|
|
inline_size = btrfs_file_extent_inline_item_len(leaf,
|
|
|
|
btrfs_item_nr(leaf, path->slots[0]));
|
|
|
|
tmp = kmalloc(inline_size, GFP_NOFS);
|
2011-04-25 17:43:52 -06:00
|
|
|
if (!tmp)
|
|
|
|
return -ENOMEM;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
ptr = btrfs_file_extent_inline_start(item);
|
|
|
|
|
|
|
|
read_extent_buffer(leaf, tmp, ptr, inline_size);
|
|
|
|
|
2008-11-11 07:34:41 -07:00
|
|
|
max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
|
2010-12-16 23:21:50 -07:00
|
|
|
ret = btrfs_decompress(compress_type, tmp, page,
|
|
|
|
extent_offset, inline_size, max_size);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
if (ret) {
|
|
|
|
char *kaddr = kmap_atomic(page, KM_USER0);
|
|
|
|
unsigned long copy_size = min_t(u64,
|
|
|
|
PAGE_CACHE_SIZE - pg_offset,
|
|
|
|
max_size - extent_offset);
|
|
|
|
memset(kaddr + pg_offset, 0, copy_size);
|
|
|
|
kunmap_atomic(kaddr, KM_USER0);
|
|
|
|
}
|
|
|
|
kfree(tmp);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-09-29 13:18:18 -06:00
|
|
|
/*
|
|
|
|
* a bit scary, this does extent mapping from logical file offset to the disk.
|
2009-01-05 19:25:51 -07:00
|
|
|
* the ugly parts come from merging extents from the disk with the in-ram
|
|
|
|
* representation. This gets more complex because of the data=ordered code,
|
2008-09-29 13:18:18 -06:00
|
|
|
* where the in-ram extents might be locked pending data=ordered completion.
|
|
|
|
*
|
|
|
|
* This also copies inline extents directly into the page.
|
|
|
|
*/
|
2009-01-05 19:25:51 -07:00
|
|
|
|
2007-08-27 14:49:44 -06:00
|
|
|
struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
|
2008-01-29 07:59:12 -07:00
|
|
|
size_t pg_offset, u64 start, u64 len,
|
2007-08-27 14:49:44 -06:00
|
|
|
int create)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
int err = 0;
|
2007-10-15 14:15:53 -06:00
|
|
|
u64 bytenr;
|
2007-08-27 14:49:44 -06:00
|
|
|
u64 extent_start = 0;
|
|
|
|
u64 extent_end = 0;
|
2011-04-19 20:31:50 -06:00
|
|
|
u64 objectid = btrfs_ino(inode);
|
2007-08-27 14:49:44 -06:00
|
|
|
u32 found_type;
|
2008-07-22 09:18:09 -06:00
|
|
|
struct btrfs_path *path = NULL;
|
2007-08-27 14:49:44 -06:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_file_extent_item *item;
|
2007-10-15 14:14:19 -06:00
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct btrfs_key found_key;
|
2007-08-27 14:49:44 -06:00
|
|
|
struct extent_map *em = NULL;
|
|
|
|
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
|
2008-01-24 14:13:08 -07:00
|
|
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
2007-08-27 14:49:44 -06:00
|
|
|
struct btrfs_trans_handle *trans = NULL;
|
2010-12-16 23:21:50 -07:00
|
|
|
int compress_type;
|
2007-08-27 14:49:44 -06:00
|
|
|
|
|
|
|
again:
|
2009-09-02 14:24:52 -06:00
|
|
|
read_lock(&em_tree->lock);
|
2008-01-24 14:13:08 -07:00
|
|
|
em = lookup_extent_mapping(em_tree, start, len);
|
2008-05-07 09:43:44 -06:00
|
|
|
if (em)
|
|
|
|
em->bdev = root->fs_info->fs_devices->latest_bdev;
|
2009-09-02 14:24:52 -06:00
|
|
|
read_unlock(&em_tree->lock);
|
2008-01-24 14:13:08 -07:00
|
|
|
|
2007-08-27 14:49:44 -06:00
|
|
|
if (em) {
|
2008-04-22 11:26:46 -06:00
|
|
|
if (em->start > start || em->start + em->len <= start)
|
|
|
|
free_extent_map(em);
|
|
|
|
else if (em->block_start == EXTENT_MAP_INLINE && page)
|
2008-01-29 07:59:12 -07:00
|
|
|
free_extent_map(em);
|
|
|
|
else
|
|
|
|
goto out;
|
2007-08-27 14:49:44 -06:00
|
|
|
}
|
2011-04-20 16:48:27 -06:00
|
|
|
em = alloc_extent_map();
|
2007-08-27 14:49:44 -06:00
|
|
|
if (!em) {
|
2008-01-24 14:13:08 -07:00
|
|
|
err = -ENOMEM;
|
|
|
|
goto out;
|
2007-08-27 14:49:44 -06:00
|
|
|
}
|
2008-07-17 10:53:50 -06:00
|
|
|
em->bdev = root->fs_info->fs_devices->latest_bdev;
|
2008-01-24 14:13:08 -07:00
|
|
|
em->start = EXTENT_MAP_HOLE;
|
2008-11-10 09:53:33 -07:00
|
|
|
em->orig_start = EXTENT_MAP_HOLE;
|
2008-01-24 14:13:08 -07:00
|
|
|
em->len = (u64)-1;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
em->block_len = (u64)-1;
|
2008-07-22 09:18:09 -06:00
|
|
|
|
|
|
|
if (!path) {
|
|
|
|
path = btrfs_alloc_path();
|
2011-05-13 08:32:11 -06:00
|
|
|
if (!path) {
|
|
|
|
err = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Chances are we'll be called again, so go ahead and do
|
|
|
|
* readahead
|
|
|
|
*/
|
|
|
|
path->reada = 1;
|
2008-07-22 09:18:09 -06:00
|
|
|
}
|
|
|
|
|
2007-11-01 09:28:41 -06:00
|
|
|
ret = btrfs_lookup_file_extent(trans, root, path,
|
|
|
|
objectid, start, trans != NULL);
|
2007-08-27 14:49:44 -06:00
|
|
|
if (ret < 0) {
|
|
|
|
err = ret;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ret != 0) {
|
|
|
|
if (path->slots[0] == 0)
|
|
|
|
goto not_found;
|
|
|
|
path->slots[0]--;
|
|
|
|
}
|
|
|
|
|
2007-10-15 14:14:19 -06:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
item = btrfs_item_ptr(leaf, path->slots[0],
|
2007-08-27 14:49:44 -06:00
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
/* are we inside the extent that was found? */
|
2007-10-15 14:14:19 -06:00
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
|
|
|
|
found_type = btrfs_key_type(&found_key);
|
|
|
|
if (found_key.objectid != objectid ||
|
2007-08-27 14:49:44 -06:00
|
|
|
found_type != BTRFS_EXTENT_DATA_KEY) {
|
|
|
|
goto not_found;
|
|
|
|
}
|
|
|
|
|
2007-10-15 14:14:19 -06:00
|
|
|
found_type = btrfs_file_extent_type(leaf, item);
|
|
|
|
extent_start = found_key.offset;
|
2010-12-16 23:21:50 -07:00
|
|
|
compress_type = btrfs_file_extent_compression(leaf, item);
|
2008-10-30 12:25:28 -06:00
|
|
|
if (found_type == BTRFS_FILE_EXTENT_REG ||
|
|
|
|
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
|
2007-08-27 14:49:44 -06:00
|
|
|
extent_end = extent_start +
|
2007-10-15 14:15:53 -06:00
|
|
|
btrfs_file_extent_num_bytes(leaf, item);
|
2008-10-30 12:19:41 -06:00
|
|
|
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
|
|
|
|
size_t size;
|
|
|
|
size = btrfs_file_extent_inline_len(leaf, item);
|
|
|
|
extent_end = (extent_start + size + root->sectorsize - 1) &
|
|
|
|
~((u64)root->sectorsize - 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (start >= extent_end) {
|
|
|
|
path->slots[0]++;
|
|
|
|
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
|
|
|
|
ret = btrfs_next_leaf(root, path);
|
|
|
|
if (ret < 0) {
|
|
|
|
err = ret;
|
|
|
|
goto out;
|
2007-08-27 14:49:44 -06:00
|
|
|
}
|
2008-10-30 12:19:41 -06:00
|
|
|
if (ret > 0)
|
|
|
|
goto not_found;
|
|
|
|
leaf = path->nodes[0];
|
2007-08-27 14:49:44 -06:00
|
|
|
}
|
2008-10-30 12:19:41 -06:00
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
|
|
|
|
if (found_key.objectid != objectid ||
|
|
|
|
found_key.type != BTRFS_EXTENT_DATA_KEY)
|
|
|
|
goto not_found;
|
|
|
|
if (start + len <= found_key.offset)
|
|
|
|
goto not_found;
|
|
|
|
em->start = start;
|
|
|
|
em->len = found_key.offset - start;
|
|
|
|
goto not_found_em;
|
|
|
|
}
|
|
|
|
|
2008-10-30 12:25:28 -06:00
|
|
|
if (found_type == BTRFS_FILE_EXTENT_REG ||
|
|
|
|
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
|
2008-10-30 12:19:41 -06:00
|
|
|
em->start = extent_start;
|
|
|
|
em->len = extent_end - extent_start;
|
2008-11-10 05:34:43 -07:00
|
|
|
em->orig_start = extent_start -
|
|
|
|
btrfs_file_extent_offset(leaf, item);
|
2007-10-15 14:15:53 -06:00
|
|
|
bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
|
|
|
|
if (bytenr == 0) {
|
2007-10-15 14:14:19 -06:00
|
|
|
em->block_start = EXTENT_MAP_HOLE;
|
2007-08-27 14:49:44 -06:00
|
|
|
goto insert;
|
|
|
|
}
|
2010-12-16 23:21:50 -07:00
|
|
|
if (compress_type != BTRFS_COMPRESS_NONE) {
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
|
2010-12-16 23:21:50 -07:00
|
|
|
em->compress_type = compress_type;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
em->block_start = bytenr;
|
|
|
|
em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
|
|
|
|
item);
|
|
|
|
} else {
|
|
|
|
bytenr += btrfs_file_extent_offset(leaf, item);
|
|
|
|
em->block_start = bytenr;
|
|
|
|
em->block_len = em->len;
|
2008-10-30 12:25:28 -06:00
|
|
|
if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
|
|
|
|
set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
}
|
2007-08-27 14:49:44 -06:00
|
|
|
goto insert;
|
|
|
|
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
|
2007-10-15 14:14:19 -06:00
|
|
|
unsigned long ptr;
|
2007-08-27 14:49:44 -06:00
|
|
|
char *map;
|
2007-10-15 14:18:25 -06:00
|
|
|
size_t size;
|
|
|
|
size_t extent_offset;
|
|
|
|
size_t copy_size;
|
2007-08-27 14:49:44 -06:00
|
|
|
|
2007-10-29 09:41:07 -06:00
|
|
|
em->block_start = EXTENT_MAP_INLINE;
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
if (!page || create) {
|
2007-10-29 09:41:07 -06:00
|
|
|
em->start = extent_start;
|
2008-10-30 12:19:41 -06:00
|
|
|
em->len = extent_end - extent_start;
|
2007-10-29 09:41:07 -06:00
|
|
|
goto out;
|
|
|
|
}
|
2007-10-15 14:14:19 -06:00
|
|
|
|
2008-10-30 12:19:41 -06:00
|
|
|
size = btrfs_file_extent_inline_len(leaf, item);
|
|
|
|
extent_offset = page_offset(page) + pg_offset - extent_start;
|
2008-01-29 07:59:12 -07:00
|
|
|
copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
|
2007-10-15 14:18:25 -06:00
|
|
|
size - extent_offset);
|
|
|
|
em->start = extent_start + extent_offset;
|
2008-01-29 07:59:12 -07:00
|
|
|
em->len = (copy_size + root->sectorsize - 1) &
|
|
|
|
~((u64)root->sectorsize - 1);
|
2008-11-10 05:34:43 -07:00
|
|
|
em->orig_start = EXTENT_MAP_INLINE;
|
2010-12-16 23:21:50 -07:00
|
|
|
if (compress_type) {
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
|
2010-12-16 23:21:50 -07:00
|
|
|
em->compress_type = compress_type;
|
|
|
|
}
|
2007-10-29 09:41:07 -06:00
|
|
|
ptr = btrfs_file_extent_inline_start(item) + extent_offset;
|
2007-11-01 09:28:41 -06:00
|
|
|
if (create == 0 && !PageUptodate(page)) {
|
2010-12-16 23:21:50 -07:00
|
|
|
if (btrfs_file_extent_compression(leaf, item) !=
|
|
|
|
BTRFS_COMPRESS_NONE) {
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
ret = uncompress_inline(path, inode, page,
|
|
|
|
pg_offset,
|
|
|
|
extent_offset, item);
|
|
|
|
BUG_ON(ret);
|
|
|
|
} else {
|
|
|
|
map = kmap(page);
|
|
|
|
read_extent_buffer(leaf, map + pg_offset, ptr,
|
|
|
|
copy_size);
|
2009-09-11 10:36:29 -06:00
|
|
|
if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
|
|
|
|
memset(map + pg_offset + copy_size, 0,
|
|
|
|
PAGE_CACHE_SIZE - pg_offset -
|
|
|
|
copy_size);
|
|
|
|
}
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
kunmap(page);
|
|
|
|
}
|
2007-11-01 09:28:41 -06:00
|
|
|
flush_dcache_page(page);
|
|
|
|
} else if (create && PageUptodate(page)) {
|
2010-05-16 08:48:47 -06:00
|
|
|
WARN_ON(1);
|
2007-11-01 09:28:41 -06:00
|
|
|
if (!trans) {
|
|
|
|
kunmap(page);
|
|
|
|
free_extent_map(em);
|
|
|
|
em = NULL;
|
2011-05-28 05:00:39 -06:00
|
|
|
|
2011-04-20 17:20:15 -06:00
|
|
|
btrfs_release_path(path);
|
2011-04-13 10:54:33 -06:00
|
|
|
trans = btrfs_join_transaction(root);
|
2011-05-28 05:00:39 -06:00
|
|
|
|
2011-01-24 19:51:38 -07:00
|
|
|
if (IS_ERR(trans))
|
|
|
|
return ERR_CAST(trans);
|
2007-11-01 09:28:41 -06:00
|
|
|
goto again;
|
|
|
|
}
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
map = kmap(page);
|
2008-01-29 07:59:12 -07:00
|
|
|
write_extent_buffer(leaf, map + pg_offset, ptr,
|
2007-11-01 09:28:41 -06:00
|
|
|
copy_size);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
kunmap(page);
|
2007-11-01 09:28:41 -06:00
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
2007-08-27 14:49:44 -06:00
|
|
|
}
|
2008-01-24 14:13:08 -07:00
|
|
|
set_extent_uptodate(io_tree, em->start,
|
2011-04-06 04:02:20 -06:00
|
|
|
extent_map_end(em) - 1, NULL, GFP_NOFS);
|
2007-08-27 14:49:44 -06:00
|
|
|
goto insert;
|
|
|
|
} else {
|
2009-01-05 19:25:51 -07:00
|
|
|
printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
|
2007-08-27 14:49:44 -06:00
|
|
|
WARN_ON(1);
|
|
|
|
}
|
|
|
|
not_found:
|
|
|
|
em->start = start;
|
2008-01-24 14:13:08 -07:00
|
|
|
em->len = len;
|
2007-08-27 14:49:44 -06:00
|
|
|
not_found_em:
|
2007-10-15 14:14:19 -06:00
|
|
|
em->block_start = EXTENT_MAP_HOLE;
|
2008-10-30 12:19:41 -06:00
|
|
|
set_bit(EXTENT_FLAG_VACANCY, &em->flags);
|
2007-08-27 14:49:44 -06:00
|
|
|
insert:
|
2011-04-20 17:20:15 -06:00
|
|
|
btrfs_release_path(path);
|
2008-01-24 14:13:08 -07:00
|
|
|
if (em->start > start || extent_map_end(em) <= start) {
|
2009-01-05 19:25:51 -07:00
|
|
|
printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed "
|
|
|
|
"[%llu %llu]\n", (unsigned long long)em->start,
|
|
|
|
(unsigned long long)em->len,
|
|
|
|
(unsigned long long)start,
|
|
|
|
(unsigned long long)len);
|
2007-08-27 14:49:44 -06:00
|
|
|
err = -EIO;
|
|
|
|
goto out;
|
|
|
|
}
|
2008-01-24 14:13:08 -07:00
|
|
|
|
|
|
|
err = 0;
|
2009-09-02 14:24:52 -06:00
|
|
|
write_lock(&em_tree->lock);
|
2007-08-27 14:49:44 -06:00
|
|
|
ret = add_extent_mapping(em_tree, em);
|
2008-04-17 09:29:12 -06:00
|
|
|
/* it is possible that someone inserted the extent into the tree
|
|
|
|
* while we had the lock dropped. It is also possible that
|
|
|
|
* an overlapping map exists in the tree
|
|
|
|
*/
|
2007-08-27 14:49:44 -06:00
|
|
|
if (ret == -EEXIST) {
|
2008-04-17 09:29:12 -06:00
|
|
|
struct extent_map *existing;
|
2008-07-17 10:53:50 -06:00
|
|
|
|
|
|
|
ret = 0;
|
|
|
|
|
2008-04-17 09:29:12 -06:00
|
|
|
existing = lookup_extent_mapping(em_tree, start, len);
|
2008-04-22 11:26:46 -06:00
|
|
|
if (existing && (existing->start > start ||
|
|
|
|
existing->start + existing->len <= start)) {
|
|
|
|
free_extent_map(existing);
|
|
|
|
existing = NULL;
|
|
|
|
}
|
2008-04-17 09:29:12 -06:00
|
|
|
if (!existing) {
|
|
|
|
existing = lookup_extent_mapping(em_tree, em->start,
|
|
|
|
em->len);
|
|
|
|
if (existing) {
|
|
|
|
err = merge_extent_mapping(em_tree, existing,
|
2008-07-17 10:53:50 -06:00
|
|
|
em, start,
|
|
|
|
root->sectorsize);
|
2008-04-17 09:29:12 -06:00
|
|
|
free_extent_map(existing);
|
|
|
|
if (err) {
|
|
|
|
free_extent_map(em);
|
|
|
|
em = NULL;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
err = -EIO;
|
|
|
|
free_extent_map(em);
|
|
|
|
em = NULL;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
free_extent_map(em);
|
|
|
|
em = existing;
|
2008-07-17 10:53:50 -06:00
|
|
|
err = 0;
|
2007-08-27 14:49:44 -06:00
|
|
|
}
|
|
|
|
}
|
2009-09-02 14:24:52 -06:00
|
|
|
write_unlock(&em_tree->lock);
|
2007-08-27 14:49:44 -06:00
|
|
|
out:
|
Btrfs: add initial tracepoint support for btrfs
Tracepoints can provide insight into why btrfs hits bugs and be greatly
helpful for debugging, e.g
dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0
dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0
btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8
flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA
flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0)
flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0)
flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0)
Here is what I have added:
1) ordered_extent:
btrfs_ordered_extent_add
btrfs_ordered_extent_remove
btrfs_ordered_extent_start
btrfs_ordered_extent_put
These provide critical information to understand how ordered_extents are
updated.
2) extent_map:
btrfs_get_extent
extent_map is used in both read and write cases, and it is useful for tracking
how btrfs specific IO is running.
3) writepage:
__extent_writepage
btrfs_writepage_end_io_hook
Pages are critical resources and produce a lot of corner cases during writeback,
so it is valuable to know how a page is written to disk.
4) inode:
btrfs_inode_new
btrfs_inode_request
btrfs_inode_evict
These can show where and when an inode is created, and when an inode is evicted.
5) sync:
btrfs_sync_file
btrfs_sync_fs
These show sync arguments.
6) transaction:
btrfs_transaction_commit
In transaction based filesystem, it will be useful to know the generation and
who does commit.
7) back reference and cow:
btrfs_delayed_tree_ref
btrfs_delayed_data_ref
btrfs_delayed_ref_head
btrfs_cow_block
Btrfs natively supports back references, these tracepoints are helpful on
understanding btrfs's COW mechanism.
8) chunk:
btrfs_chunk_alloc
btrfs_chunk_free
Chunk is a link between physical offset and logical offset, and stands for space
information in btrfs, and these are helpful for tracing space things.
9) reserved_extent:
btrfs_reserved_extent_alloc
btrfs_reserved_extent_free
These can show how btrfs uses its space.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 05:18:59 -06:00
|
|
|
|
|
|
|
trace_btrfs_get_extent(root, em);
|
|
|
|
|
2008-07-22 09:18:09 -06:00
|
|
|
if (path)
|
|
|
|
btrfs_free_path(path);
|
2007-08-27 14:49:44 -06:00
|
|
|
if (trans) {
|
|
|
|
ret = btrfs_end_transaction(trans, root);
|
2009-01-05 19:25:51 -07:00
|
|
|
if (!err)
|
2007-08-27 14:49:44 -06:00
|
|
|
err = ret;
|
|
|
|
}
|
|
|
|
if (err) {
|
|
|
|
free_extent_map(em);
|
|
|
|
return ERR_PTR(err);
|
|
|
|
}
|
|
|
|
return em;
|
|
|
|
}
|
|
|
|
|
2011-02-23 14:23:20 -07:00
|
|
|
struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
|
|
|
|
size_t pg_offset, u64 start, u64 len,
|
|
|
|
int create)
|
|
|
|
{
|
|
|
|
struct extent_map *em;
|
|
|
|
struct extent_map *hole_em = NULL;
|
|
|
|
u64 range_start = start;
|
|
|
|
u64 end;
|
|
|
|
u64 found;
|
|
|
|
u64 found_end;
|
|
|
|
int err = 0;
|
|
|
|
|
|
|
|
em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
|
|
|
|
if (IS_ERR(em))
|
|
|
|
return em;
|
|
|
|
if (em) {
|
|
|
|
/*
|
|
|
|
* if our em maps to a hole, there might
|
|
|
|
* actually be delalloc bytes behind it
|
|
|
|
*/
|
|
|
|
if (em->block_start != EXTENT_MAP_HOLE)
|
|
|
|
return em;
|
|
|
|
else
|
|
|
|
hole_em = em;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* check to see if we've wrapped (len == -1 or similar) */
|
|
|
|
end = start + len;
|
|
|
|
if (end < start)
|
|
|
|
end = (u64)-1;
|
|
|
|
else
|
|
|
|
end -= 1;
|
|
|
|
|
|
|
|
em = NULL;
|
|
|
|
|
|
|
|
/* ok, we didn't find anything, lets look for delalloc */
|
|
|
|
found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
|
|
|
|
end, len, EXTENT_DELALLOC, 1);
|
|
|
|
found_end = range_start + found;
|
|
|
|
if (found_end < range_start)
|
|
|
|
found_end = (u64)-1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* we didn't find anything useful, return
|
|
|
|
* the original results from get_extent()
|
|
|
|
*/
|
|
|
|
if (range_start > end || found_end <= start) {
|
|
|
|
em = hole_em;
|
|
|
|
hole_em = NULL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* adjust the range_start to make sure it doesn't
|
|
|
|
* go backwards from the start they passed in
|
|
|
|
*/
|
|
|
|
range_start = max(start,range_start);
|
|
|
|
found = found_end - range_start;
|
|
|
|
|
|
|
|
if (found > 0) {
|
|
|
|
u64 hole_start = start;
|
|
|
|
u64 hole_len = len;
|
|
|
|
|
2011-04-20 16:48:27 -06:00
|
|
|
em = alloc_extent_map();
|
2011-02-23 14:23:20 -07:00
|
|
|
if (!em) {
|
|
|
|
err = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* when btrfs_get_extent can't find anything it
|
|
|
|
* returns one huge hole
|
|
|
|
*
|
|
|
|
* make sure what it found really fits our range, and
|
|
|
|
* adjust to make sure it is based on the start from
|
|
|
|
* the caller
|
|
|
|
*/
|
|
|
|
if (hole_em) {
|
|
|
|
u64 calc_end = extent_map_end(hole_em);
|
|
|
|
|
|
|
|
if (calc_end <= start || (hole_em->start > end)) {
|
|
|
|
free_extent_map(hole_em);
|
|
|
|
hole_em = NULL;
|
|
|
|
} else {
|
|
|
|
hole_start = max(hole_em->start, start);
|
|
|
|
hole_len = calc_end - hole_start;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
em->bdev = NULL;
|
|
|
|
if (hole_em && range_start > hole_start) {
|
|
|
|
/* our hole starts before our delalloc, so we
|
|
|
|
* have to return just the parts of the hole
|
|
|
|
* that go until the delalloc starts
|
|
|
|
*/
|
|
|
|
em->len = min(hole_len,
|
|
|
|
range_start - hole_start);
|
|
|
|
em->start = hole_start;
|
|
|
|
em->orig_start = hole_start;
|
|
|
|
/*
|
|
|
|
* don't adjust block start at all,
|
|
|
|
* it is fixed at EXTENT_MAP_HOLE
|
|
|
|
*/
|
|
|
|
em->block_start = hole_em->block_start;
|
|
|
|
em->block_len = hole_len;
|
|
|
|
} else {
|
|
|
|
em->start = range_start;
|
|
|
|
em->len = found;
|
|
|
|
em->orig_start = range_start;
|
|
|
|
em->block_start = EXTENT_MAP_DELALLOC;
|
|
|
|
em->block_len = found;
|
|
|
|
}
|
|
|
|
} else if (hole_em) {
|
|
|
|
return hole_em;
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
|
|
|
|
free_extent_map(hole_em);
|
|
|
|
if (err) {
|
|
|
|
free_extent_map(em);
|
|
|
|
return ERR_PTR(err);
|
|
|
|
}
|
|
|
|
return em;
|
|
|
|
}
|
|
|
|
|
2010-05-23 09:00:55 -06:00
|
|
|
/*
 * Reserve a new on-disk extent for a direct IO write at [start, start + len)
 * and return a pinned extent map describing it, with a matching ordered
 * extent queued.
 *
 * @em is the mapping the caller looked up for this range.  This function
 * takes over the caller's reference: @em is either reused (when it is a hole
 * covering exactly the requested range), or released.  On failure an
 * ERR_PTR() is returned and no reference is left for the caller to drop.
 */
static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
						  struct extent_map *em,
						  u64 start, u64 len)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct btrfs_key ins;
	u64 alloc_hint;
	int ret;
	bool insert = false;

	/*
	 * Ok if the extent map we looked up is a hole and is for the exact
	 * range we want, there is no reason to allocate a new one, however if
	 * it is not right then we need to free this one and drop the cache for
	 * our range.
	 */
	if (em->block_start != EXTENT_MAP_HOLE || em->start != start ||
	    em->len != len) {
		free_extent_map(em);
		em = NULL;
		insert = true;
		btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
	}

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		/* don't leak the reference we were handed */
		free_extent_map(em);
		return ERR_CAST(trans);
	}

	if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024)
		btrfs_add_inode_defrag(trans, inode);

	trans->block_rsv = &root->fs_info->delalloc_block_rsv;

	alloc_hint = get_extent_allocation_hint(inode, start, len);
	ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
				   alloc_hint, (u64)-1, &ins, 1);
	if (ret)
		goto out_free_em;

	if (!em) {
		em = alloc_extent_map();
		if (!em) {
			ret = -ENOMEM;
			goto out_free_reserved;
		}
	}

	em->start = start;
	em->orig_start = em->start;
	em->len = ins.offset;

	em->block_start = ins.objectid;
	em->block_len = ins.offset;
	em->bdev = root->fs_info->fs_devices->latest_bdev;

	/*
	 * We need to do this because if we're using the original em we searched
	 * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that.
	 */
	em->flags = 0;
	set_bit(EXTENT_FLAG_PINNED, &em->flags);

	while (insert) {
		write_lock(&em_tree->lock);
		ret = add_extent_mapping(em_tree, em);
		write_unlock(&em_tree->lock);
		if (ret != -EEXIST)
			break;
		btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
	}

	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
					   ins.offset, ins.offset, 0);
	if (ret) {
		/*
		 * The mapping now points at an extent we are about to give
		 * back, so make sure it does not stay cached.
		 */
		btrfs_drop_extent_cache(inode, start,
					start + ins.offset - 1, 0);
		goto out_free_reserved;
	}

	btrfs_end_transaction(trans, root);
	return em;

out_free_reserved:
	btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
out_free_em:
	/* free_extent_map() accepts NULL */
	free_extent_map(em);
	btrfs_end_transaction(trans, root);
	return ERR_PTR(ret);
}
|
|
|
|
|
2010-05-26 09:04:10 -06:00
|
|
|
/*
|
|
|
|
* returns 1 when the nocow is safe, < 1 on error, 0 if the
|
|
|
|
* block must be cow'd
|
|
|
|
*/
|
|
|
|
static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
|
|
|
|
struct inode *inode, u64 offset, u64 len)
|
|
|
|
{
|
|
|
|
struct btrfs_path *path;
|
|
|
|
int ret;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_file_extent_item *fi;
|
|
|
|
struct btrfs_key key;
|
|
|
|
u64 disk_bytenr;
|
|
|
|
u64 backref_offset;
|
|
|
|
u64 extent_end;
|
|
|
|
u64 num_bytes;
|
|
|
|
int slot;
|
|
|
|
int found_type;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2011-04-19 20:31:50 -06:00
|
|
|
ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
|
2010-05-26 09:04:10 -06:00
|
|
|
offset, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
slot = path->slots[0];
|
|
|
|
if (ret == 1) {
|
|
|
|
if (slot == 0) {
|
|
|
|
/* can't find the item, must cow */
|
|
|
|
ret = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
slot--;
|
|
|
|
}
|
|
|
|
ret = 0;
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
btrfs_item_key_to_cpu(leaf, &key, slot);
|
2011-04-19 20:31:50 -06:00
|
|
|
if (key.objectid != btrfs_ino(inode) ||
|
2010-05-26 09:04:10 -06:00
|
|
|
key.type != BTRFS_EXTENT_DATA_KEY) {
|
|
|
|
/* not our file or wrong item type, must cow */
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (key.offset > offset) {
|
|
|
|
/* Wrong offset, must cow */
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
|
|
|
|
found_type = btrfs_file_extent_type(leaf, fi);
|
|
|
|
if (found_type != BTRFS_FILE_EXTENT_REG &&
|
|
|
|
found_type != BTRFS_FILE_EXTENT_PREALLOC) {
|
|
|
|
/* not a regular extent, must cow */
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
|
|
|
|
backref_offset = btrfs_file_extent_offset(leaf, fi);
|
|
|
|
|
|
|
|
extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
|
|
|
|
if (extent_end < offset + len) {
|
|
|
|
/* extent doesn't include our full range, must cow */
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (btrfs_extent_readonly(root, disk_bytenr))
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* look for other files referencing this extent, if we
|
|
|
|
* find any we must cow
|
|
|
|
*/
|
2011-04-19 20:31:50 -06:00
|
|
|
if (btrfs_cross_ref_exist(trans, root, btrfs_ino(inode),
|
2010-05-26 09:04:10 -06:00
|
|
|
key.offset - backref_offset, disk_bytenr))
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* adjust disk_bytenr and num_bytes to cover just the bytes
|
|
|
|
* in this extent we are about to write. If there
|
|
|
|
* are any csums in that range we have to cow in order
|
|
|
|
* to keep the csums correct
|
|
|
|
*/
|
|
|
|
disk_bytenr += backref_offset;
|
|
|
|
disk_bytenr += offset - key.offset;
|
|
|
|
num_bytes = min(offset + len, extent_end) - offset;
|
|
|
|
if (csum_exist_in_range(root, disk_bytenr, num_bytes))
|
|
|
|
goto out;
|
|
|
|
/*
|
|
|
|
* all of the above have passed, it is safe to overwrite this extent
|
|
|
|
* without cow
|
|
|
|
*/
|
|
|
|
ret = 1;
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2010-05-23 09:00:55 -06:00
|
|
|
/*
 * get_block callback for direct IO: map the file block @iblock (for up to
 * bh_result->b_size bytes) into @bh_result.
 *
 * For reads (@create == 0) the existing mapping is used.  For writes the
 * existing extent is reused when it is preallocated or when the inode is
 * NODATACOW and can_nocow_odirect() approves; otherwise a new extent is
 * allocated through btrfs_new_extent_direct().
 *
 * Returns 0 on success, -ENOTBLK to make the generic code fall back to
 * buffered IO, or another negative errno.
 */
static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
				   struct buffer_head *bh_result, int create)
{
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	/* widen before shifting so a narrow sector_t cannot truncate */
	u64 start = (u64)iblock << inode->i_blkbits;
	u64 len = bh_result->b_size;
	struct btrfs_trans_handle *trans;

	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
	if (IS_ERR(em))
		return PTR_ERR(em);

	/*
	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
	 * io.  INLINE is special, and we could probably kludge it in here, but
	 * it's still buffered so for safety lets just fall back to the generic
	 * buffered path.
	 *
	 * For COMPRESSED we _have_ to read the entire extent in so we can
	 * decompress it, so there will be buffering required no matter what we
	 * do, so go ahead and fallback to buffered.
	 *
	 * We return -ENOTBLK because thats what makes DIO go ahead and go back
	 * to buffered IO.  Don't blame me, this is the price we pay for using
	 * the generic code.
	 */
	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
	    em->block_start == EXTENT_MAP_INLINE) {
		free_extent_map(em);
		return -ENOTBLK;
	}

	/* Just a good old fashioned hole, return */
	if (!create && (em->block_start == EXTENT_MAP_HOLE ||
			test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
		free_extent_map(em);
		/* DIO will do one hole at a time, so just unlock a sector */
		unlock_extent(&BTRFS_I(inode)->io_tree, start,
			      start + root->sectorsize - 1, GFP_NOFS);
		return 0;
	}

	/* reads just map whatever is left of the existing extent */
	if (!create) {
		len = em->len - (start - em->start);
		goto map;
	}

	/*
	 * We don't allocate a new extent in the following cases
	 *
	 * 1) The inode is marked as NODATACOW.  In this case we'll just use the
	 * existing extent.
	 * 2) The extent is marked as PREALLOC.  We're good to go here and can
	 * just use the extent.
	 */
	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
	     em->block_start != EXTENT_MAP_HOLE)) {
		int type;
		int ret;
		u64 block_start;

		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			type = BTRFS_ORDERED_PREALLOC;
		else
			type = BTRFS_ORDERED_NOCOW;
		len = min(len, em->len - (start - em->start));
		block_start = em->block_start + (start - em->start);

		/*
		 * we're not going to log anything, but we do need
		 * to make sure the current transaction stays open
		 * while we look for nocow cross refs
		 */
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans))
			goto must_cow;

		if (can_nocow_odirect(trans, inode, start, len) == 1) {
			ret = btrfs_add_ordered_extent_dio(inode, start,
					   block_start, len, len, type);
			btrfs_end_transaction(trans, root);
			if (ret) {
				free_extent_map(em);
				return ret;
			}
			goto unlock;
		}
		btrfs_end_transaction(trans, root);
	}
must_cow:
	/*
	 * this will cow the extent, reset the len in case we changed
	 * it above
	 */
	len = bh_result->b_size;
	/* btrfs_new_extent_direct() takes over our reference on em */
	em = btrfs_new_extent_direct(inode, em, start, len);
	if (IS_ERR(em))
		return PTR_ERR(em);
	len = min(len, em->len - (start - em->start));
unlock:
	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
			  EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
			  0, NULL, GFP_NOFS);
map:
	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
		inode->i_blkbits;
	bh_result->b_size = len;
	bh_result->b_bdev = em->bdev;
	set_buffer_mapped(bh_result);
	if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
		set_buffer_new(bh_result);

	free_extent_map(em);

	return 0;
}
|
|
|
|
|
|
|
|
struct btrfs_dio_private {
|
|
|
|
struct inode *inode;
|
|
|
|
u64 logical_offset;
|
|
|
|
u64 disk_bytenr;
|
|
|
|
u64 bytes;
|
|
|
|
u32 *csums;
|
|
|
|
void *private;
|
2010-11-21 20:04:43 -07:00
|
|
|
|
|
|
|
/* number of bios pending for this dio */
|
|
|
|
atomic_t pending_bios;
|
|
|
|
|
|
|
|
/* IO errors */
|
|
|
|
int errors;
|
|
|
|
|
|
|
|
struct bio *orig_bio;
|
2010-05-23 09:00:55 -06:00
|
|
|
};
|
|
|
|
|
|
|
|
static void btrfs_endio_direct_read(struct bio *bio, int err)
|
|
|
|
{
|
2010-11-21 20:04:43 -07:00
|
|
|
struct btrfs_dio_private *dip = bio->bi_private;
|
2010-05-23 09:00:55 -06:00
|
|
|
struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
|
|
|
|
struct bio_vec *bvec = bio->bi_io_vec;
|
|
|
|
struct inode *inode = dip->inode;
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
u64 start;
|
|
|
|
u32 *private = dip->csums;
|
|
|
|
|
|
|
|
start = dip->logical_offset;
|
|
|
|
do {
|
|
|
|
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
|
|
|
|
struct page *page = bvec->bv_page;
|
|
|
|
char *kaddr;
|
|
|
|
u32 csum = ~(u32)0;
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
local_irq_save(flags);
|
|
|
|
kaddr = kmap_atomic(page, KM_IRQ0);
|
|
|
|
csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
|
|
|
|
csum, bvec->bv_len);
|
|
|
|
btrfs_csum_final(csum, (char *)&csum);
|
|
|
|
kunmap_atomic(kaddr, KM_IRQ0);
|
|
|
|
local_irq_restore(flags);
|
|
|
|
|
|
|
|
flush_dcache_page(bvec->bv_page);
|
|
|
|
if (csum != *private) {
|
2011-04-19 20:31:50 -06:00
|
|
|
printk(KERN_ERR "btrfs csum failed ino %llu off"
|
2010-05-23 09:00:55 -06:00
|
|
|
" %llu csum %u private %u\n",
|
2011-04-19 20:31:50 -06:00
|
|
|
(unsigned long long)btrfs_ino(inode),
|
|
|
|
(unsigned long long)start,
|
2010-05-23 09:00:55 -06:00
|
|
|
csum, *private);
|
|
|
|
err = -EIO;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
start += bvec->bv_len;
|
|
|
|
private++;
|
|
|
|
bvec++;
|
|
|
|
} while (bvec <= bvec_end);
|
|
|
|
|
|
|
|
unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
|
|
|
|
dip->logical_offset + dip->bytes - 1, GFP_NOFS);
|
|
|
|
bio->bi_private = dip->private;
|
|
|
|
|
|
|
|
kfree(dip->csums);
|
|
|
|
kfree(dip);
|
2011-03-22 09:05:07 -06:00
|
|
|
|
|
|
|
/* If we had a csum failure make sure to clear the uptodate flag */
|
|
|
|
if (err)
|
|
|
|
clear_bit(BIO_UPTODATE, &bio->bi_flags);
|
2010-05-23 09:00:55 -06:00
|
|
|
dio_end_io(bio, err);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void btrfs_endio_direct_write(struct bio *bio, int err)
|
|
|
|
{
|
|
|
|
struct btrfs_dio_private *dip = bio->bi_private;
|
|
|
|
struct inode *inode = dip->inode;
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct btrfs_ordered_extent *ordered = NULL;
|
|
|
|
struct extent_state *cached_state = NULL;
|
2010-11-28 17:56:33 -07:00
|
|
|
u64 ordered_offset = dip->logical_offset;
|
|
|
|
u64 ordered_bytes = dip->bytes;
|
2010-05-23 09:00:55 -06:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (err)
|
|
|
|
goto out_done;
|
2010-11-28 17:56:33 -07:00
|
|
|
again:
|
|
|
|
ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
|
|
|
|
&ordered_offset,
|
|
|
|
ordered_bytes);
|
2010-05-23 09:00:55 -06:00
|
|
|
if (!ret)
|
2010-11-28 17:56:33 -07:00
|
|
|
goto out_test;
|
2010-05-23 09:00:55 -06:00
|
|
|
|
|
|
|
BUG_ON(!ordered);
|
|
|
|
|
2011-04-13 10:54:33 -06:00
|
|
|
trans = btrfs_join_transaction(root);
|
2011-01-24 19:51:38 -07:00
|
|
|
if (IS_ERR(trans)) {
|
2010-05-23 09:00:55 -06:00
|
|
|
err = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
|
|
|
|
|
|
|
|
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
|
|
|
|
ret = btrfs_ordered_update_i_size(inode, 0, ordered);
|
|
|
|
if (!ret)
|
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
|
|
|
err = ret;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
|
|
|
|
ordered->file_offset + ordered->len - 1, 0,
|
|
|
|
&cached_state, GFP_NOFS);
|
|
|
|
|
|
|
|
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
|
|
|
|
ret = btrfs_mark_extent_written(trans, inode,
|
|
|
|
ordered->file_offset,
|
|
|
|
ordered->file_offset +
|
|
|
|
ordered->len);
|
|
|
|
if (ret) {
|
|
|
|
err = ret;
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
ret = insert_reserved_file_extent(trans, inode,
|
|
|
|
ordered->file_offset,
|
|
|
|
ordered->start,
|
|
|
|
ordered->disk_len,
|
|
|
|
ordered->len,
|
|
|
|
ordered->len,
|
|
|
|
0, 0, 0,
|
|
|
|
BTRFS_FILE_EXTENT_REG);
|
|
|
|
unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
|
|
|
|
ordered->file_offset, ordered->len);
|
|
|
|
if (ret) {
|
|
|
|
err = ret;
|
|
|
|
WARN_ON(1);
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
|
2011-04-05 17:25:36 -06:00
|
|
|
ret = btrfs_ordered_update_i_size(inode, 0, ordered);
|
|
|
|
if (!ret)
|
|
|
|
btrfs_update_inode(trans, root, inode);
|
|
|
|
ret = 0;
|
2010-05-23 09:00:55 -06:00
|
|
|
out_unlock:
|
|
|
|
unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
|
|
|
|
ordered->file_offset + ordered->len - 1,
|
|
|
|
&cached_state, GFP_NOFS);
|
|
|
|
out:
|
|
|
|
btrfs_delalloc_release_metadata(inode, ordered->len);
|
|
|
|
btrfs_end_transaction(trans, root);
|
2010-11-28 17:56:33 -07:00
|
|
|
ordered_offset = ordered->file_offset + ordered->len;
|
2010-05-23 09:00:55 -06:00
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
btrfs_put_ordered_extent(ordered);
|
2010-11-28 17:56:33 -07:00
|
|
|
|
|
|
|
out_test:
|
|
|
|
/*
|
|
|
|
* our bio might span multiple ordered extents. If we haven't
|
|
|
|
* completed the accounting for the whole dio, go back and try again
|
|
|
|
*/
|
|
|
|
if (ordered_offset < dip->logical_offset + dip->bytes) {
|
|
|
|
ordered_bytes = dip->logical_offset + dip->bytes -
|
|
|
|
ordered_offset;
|
|
|
|
goto again;
|
|
|
|
}
|
2010-05-23 09:00:55 -06:00
|
|
|
out_done:
|
|
|
|
bio->bi_private = dip->private;
|
|
|
|
|
|
|
|
kfree(dip->csums);
|
|
|
|
kfree(dip);
|
2011-03-22 09:05:07 -06:00
|
|
|
|
|
|
|
/* If we had an error make sure to clear the uptodate flag */
|
|
|
|
if (err)
|
|
|
|
clear_bit(BIO_UPTODATE, &bio->bi_flags);
|
2010-05-23 09:00:55 -06:00
|
|
|
dio_end_io(bio, err);
|
|
|
|
}
|
|
|
|
|
2010-05-25 07:48:28 -06:00
|
|
|
/*
 * Start hook passed to btrfs_wq_submit_bio() for direct IO writes: checksum
 * the bio at file offset @offset.  A checksumming failure is treated as
 * fatal (BUG_ON); otherwise 0 is returned.  @rw, @mirror_num and @bio_flags
 * are not used here.
 */
static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
				    struct bio *bio, int mirror_num,
				    unsigned long bio_flags, u64 offset)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);

	BUG_ON(ret);
	return 0;
}
|
|
|
|
|
2010-11-21 20:04:43 -07:00
|
|
|
static void btrfs_end_dio_bio(struct bio *bio, int err)
|
|
|
|
{
|
|
|
|
struct btrfs_dio_private *dip = bio->bi_private;
|
|
|
|
|
|
|
|
if (err) {
|
2011-04-19 20:31:50 -06:00
|
|
|
printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu "
|
2010-12-07 07:54:09 -07:00
|
|
|
"sector %#Lx len %u err no %d\n",
|
2011-04-19 20:31:50 -06:00
|
|
|
(unsigned long long)btrfs_ino(dip->inode), bio->bi_rw,
|
2010-12-07 07:54:09 -07:00
|
|
|
(unsigned long long)bio->bi_sector, bio->bi_size, err);
|
2010-11-21 20:04:43 -07:00
|
|
|
dip->errors = 1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* before atomic variable goto zero, we must make sure
|
|
|
|
* dip->errors is perceived to be set.
|
|
|
|
*/
|
|
|
|
smp_mb__before_atomic_dec();
|
|
|
|
}
|
|
|
|
|
|
|
|
/* if there are more bios still pending for this dio, just exit */
|
|
|
|
if (!atomic_dec_and_test(&dip->pending_bios))
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (dip->errors)
|
|
|
|
bio_io_error(dip->orig_bio);
|
|
|
|
else {
|
|
|
|
set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags);
|
|
|
|
bio_endio(dip->orig_bio, 0);
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
bio_put(bio);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
|
|
|
|
u64 first_sector, gfp_t gfp_flags)
|
|
|
|
{
|
|
|
|
int nr_vecs = bio_get_nr_vecs(bdev);
|
|
|
|
return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
|
|
|
|
int rw, u64 file_offset, int skip_sum,
|
2011-04-06 12:41:34 -06:00
|
|
|
u32 *csums, int async_submit)
|
2010-11-21 20:04:43 -07:00
|
|
|
{
|
|
|
|
int write = rw & REQ_WRITE;
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
bio_get(bio);
|
|
|
|
ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
|
|
|
|
if (ret)
|
|
|
|
goto err;
|
|
|
|
|
2011-04-06 12:41:34 -06:00
|
|
|
if (skip_sum)
|
|
|
|
goto map;
|
|
|
|
|
|
|
|
if (write && async_submit) {
|
2010-11-21 20:04:43 -07:00
|
|
|
ret = btrfs_wq_submit_bio(root->fs_info,
|
|
|
|
inode, rw, bio, 0, 0,
|
|
|
|
file_offset,
|
|
|
|
__btrfs_submit_bio_start_direct_io,
|
|
|
|
__btrfs_submit_bio_done);
|
|
|
|
goto err;
|
2011-04-06 12:41:34 -06:00
|
|
|
} else if (write) {
|
|
|
|
/*
|
|
|
|
* If we aren't doing async submit, calculate the csum of the
|
|
|
|
* bio now.
|
|
|
|
*/
|
|
|
|
ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
|
|
|
|
if (ret)
|
|
|
|
goto err;
|
2011-02-28 23:48:31 -07:00
|
|
|
} else if (!skip_sum) {
|
|
|
|
ret = btrfs_lookup_bio_sums_dio(root, inode, bio,
|
2010-11-21 20:04:43 -07:00
|
|
|
file_offset, csums);
|
2011-02-28 23:48:31 -07:00
|
|
|
if (ret)
|
|
|
|
goto err;
|
|
|
|
}
|
2010-11-21 20:04:43 -07:00
|
|
|
|
2011-04-06 12:41:34 -06:00
|
|
|
map:
|
|
|
|
ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
|
2010-11-21 20:04:43 -07:00
|
|
|
err:
|
|
|
|
bio_put(bio);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
|
|
|
|
int skip_sum)
|
|
|
|
{
|
|
|
|
struct inode *inode = dip->inode;
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
|
|
|
|
struct bio *bio;
|
|
|
|
struct bio *orig_bio = dip->orig_bio;
|
|
|
|
struct bio_vec *bvec = orig_bio->bi_io_vec;
|
|
|
|
u64 start_sector = orig_bio->bi_sector;
|
|
|
|
u64 file_offset = dip->logical_offset;
|
|
|
|
u64 submit_len = 0;
|
|
|
|
u64 map_length;
|
|
|
|
int nr_pages = 0;
|
|
|
|
u32 *csums = dip->csums;
|
|
|
|
int ret = 0;
|
2011-04-06 12:41:34 -06:00
|
|
|
int async_submit = 0;
|
2011-03-22 09:00:46 -06:00
|
|
|
int write = rw & REQ_WRITE;
|
2010-11-21 20:04:43 -07:00
|
|
|
|
|
|
|
map_length = orig_bio->bi_size;
|
|
|
|
ret = btrfs_map_block(map_tree, READ, start_sector << 9,
|
|
|
|
&map_length, NULL, 0);
|
|
|
|
if (ret) {
|
2011-04-25 17:43:52 -06:00
|
|
|
bio_put(orig_bio);
|
2010-11-21 20:04:43 -07:00
|
|
|
return -EIO;
|
|
|
|
}
|
|
|
|
|
2011-04-06 12:25:44 -06:00
|
|
|
if (map_length >= orig_bio->bi_size) {
|
|
|
|
bio = orig_bio;
|
|
|
|
goto submit;
|
|
|
|
}
|
|
|
|
|
2011-04-06 12:41:34 -06:00
|
|
|
async_submit = 1;
|
2011-04-06 12:25:44 -06:00
|
|
|
bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
|
|
|
|
if (!bio)
|
|
|
|
return -ENOMEM;
|
|
|
|
bio->bi_private = dip;
|
|
|
|
bio->bi_end_io = btrfs_end_dio_bio;
|
|
|
|
atomic_inc(&dip->pending_bios);
|
|
|
|
|
2010-11-21 20:04:43 -07:00
|
|
|
while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
|
|
|
|
if (unlikely(map_length < submit_len + bvec->bv_len ||
|
|
|
|
bio_add_page(bio, bvec->bv_page, bvec->bv_len,
|
|
|
|
bvec->bv_offset) < bvec->bv_len)) {
|
|
|
|
/*
|
|
|
|
* inc the count before we submit the bio so
|
|
|
|
* we know the end IO handler won't happen before
|
|
|
|
* we inc the count. Otherwise, the dip might get freed
|
|
|
|
* before we're done setting it up
|
|
|
|
*/
|
|
|
|
atomic_inc(&dip->pending_bios);
|
|
|
|
ret = __btrfs_submit_dio_bio(bio, inode, rw,
|
|
|
|
file_offset, skip_sum,
|
2011-04-06 12:41:34 -06:00
|
|
|
csums, async_submit);
|
2010-11-21 20:04:43 -07:00
|
|
|
if (ret) {
|
|
|
|
bio_put(bio);
|
|
|
|
atomic_dec(&dip->pending_bios);
|
|
|
|
goto out_err;
|
|
|
|
}
|
|
|
|
|
2011-03-22 09:00:46 -06:00
|
|
|
/* Write's use the ordered csums */
|
|
|
|
if (!write && !skip_sum)
|
2010-11-21 20:04:43 -07:00
|
|
|
csums = csums + nr_pages;
|
|
|
|
start_sector += submit_len >> 9;
|
|
|
|
file_offset += submit_len;
|
|
|
|
|
|
|
|
submit_len = 0;
|
|
|
|
nr_pages = 0;
|
|
|
|
|
|
|
|
bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
|
|
|
|
start_sector, GFP_NOFS);
|
|
|
|
if (!bio)
|
|
|
|
goto out_err;
|
|
|
|
bio->bi_private = dip;
|
|
|
|
bio->bi_end_io = btrfs_end_dio_bio;
|
|
|
|
|
|
|
|
map_length = orig_bio->bi_size;
|
|
|
|
ret = btrfs_map_block(map_tree, READ, start_sector << 9,
|
|
|
|
&map_length, NULL, 0);
|
|
|
|
if (ret) {
|
|
|
|
bio_put(bio);
|
|
|
|
goto out_err;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
submit_len += bvec->bv_len;
|
|
|
|
nr_pages ++;
|
|
|
|
bvec++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-04-06 12:25:44 -06:00
|
|
|
submit:
|
2010-11-21 20:04:43 -07:00
|
|
|
ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
|
2011-04-06 12:41:34 -06:00
|
|
|
csums, async_submit);
|
2010-11-21 20:04:43 -07:00
|
|
|
if (!ret)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
bio_put(bio);
|
|
|
|
out_err:
|
|
|
|
dip->errors = 1;
|
|
|
|
/*
|
|
|
|
* before atomic variable goto zero, we must
|
|
|
|
* make sure dip->errors is perceived to be set.
|
|
|
|
*/
|
|
|
|
smp_mb__before_atomic_dec();
|
|
|
|
if (atomic_dec_and_test(&dip->pending_bios))
|
|
|
|
bio_io_error(dip->orig_bio);
|
|
|
|
|
|
|
|
/* bio_end_io() will handle error, so we needn't return it */
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2010-05-23 09:00:55 -06:00
|
|
|
static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
|
|
|
|
loff_t file_offset)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_dio_private *dip;
|
|
|
|
struct bio_vec *bvec = bio->bi_io_vec;
|
|
|
|
int skip_sum;
|
2010-08-07 10:20:39 -06:00
|
|
|
int write = rw & REQ_WRITE;
|
2010-05-23 09:00:55 -06:00
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
|
|
|
|
|
|
|
|
dip = kmalloc(sizeof(*dip), GFP_NOFS);
|
|
|
|
if (!dip) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto free_ordered;
|
|
|
|
}
|
|
|
|
dip->csums = NULL;
|
|
|
|
|
2011-03-22 09:00:46 -06:00
|
|
|
/* Write's use the ordered csum stuff, so we don't need dip->csums */
|
|
|
|
if (!write && !skip_sum) {
|
2010-05-23 09:00:55 -06:00
|
|
|
dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
|
|
|
|
if (!dip->csums) {
|
2011-03-09 09:46:42 -07:00
|
|
|
kfree(dip);
|
2010-05-23 09:00:55 -06:00
|
|
|
ret = -ENOMEM;
|
|
|
|
goto free_ordered;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
dip->private = bio->bi_private;
|
|
|
|
dip->inode = inode;
|
|
|
|
dip->logical_offset = file_offset;
|
|
|
|
|
|
|
|
dip->bytes = 0;
|
|
|
|
do {
|
|
|
|
dip->bytes += bvec->bv_len;
|
|
|
|
bvec++;
|
|
|
|
} while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
|
|
|
|
|
2010-05-26 09:04:10 -06:00
|
|
|
dip->disk_bytenr = (u64)bio->bi_sector << 9;
|
2010-05-23 09:00:55 -06:00
|
|
|
bio->bi_private = dip;
|
2010-11-21 20:04:43 -07:00
|
|
|
dip->errors = 0;
|
|
|
|
dip->orig_bio = bio;
|
|
|
|
atomic_set(&dip->pending_bios, 0);
|
2010-05-23 09:00:55 -06:00
|
|
|
|
|
|
|
if (write)
|
|
|
|
bio->bi_end_io = btrfs_endio_direct_write;
|
|
|
|
else
|
|
|
|
bio->bi_end_io = btrfs_endio_direct_read;
|
|
|
|
|
2010-11-21 20:04:43 -07:00
|
|
|
ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
|
|
|
|
if (!ret)
|
2010-05-25 07:48:28 -06:00
|
|
|
return;
|
2010-05-23 09:00:55 -06:00
|
|
|
free_ordered:
|
|
|
|
/*
|
|
|
|
* If this is a write, we need to clean up the reserved space and kill
|
|
|
|
* the ordered extent.
|
|
|
|
*/
|
|
|
|
if (write) {
|
|
|
|
struct btrfs_ordered_extent *ordered;
|
2010-11-19 07:41:10 -07:00
|
|
|
ordered = btrfs_lookup_ordered_extent(inode, file_offset);
|
2010-05-23 09:00:55 -06:00
|
|
|
if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
|
|
|
|
!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
|
|
|
|
btrfs_free_reserved_extent(root, ordered->start,
|
|
|
|
ordered->disk_len);
|
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
}
|
|
|
|
bio_endio(bio, ret);
|
|
|
|
}
|
|
|
|
|
2010-05-26 19:33:37 -06:00
|
|
|
/*
 * Validate a direct IO request against the root's sector size.
 *
 * The file @offset and every segment's base address and length must be
 * sector aligned.  For reads, no two segments may share the same iov_base,
 * since that would produce csum errors when reading back.
 *
 * @iocb is not used here.
 *
 * Returns 0 when the request is acceptable, -EINVAL otherwise.
 */
static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
			const struct iovec *iov, loff_t offset,
			unsigned long nr_segs)
{
	unsigned blocksize_mask = root->sectorsize - 1;
	/* index with the same type as nr_segs to avoid mixed-sign compares */
	unsigned long seg;
	unsigned long i;

	if (offset & blocksize_mask)
		return -EINVAL;

	/* Check the memory alignment.  Blocks cannot straddle pages */
	for (seg = 0; seg < nr_segs; seg++) {
		unsigned long addr = (unsigned long)iov[seg].iov_base;
		size_t size = iov[seg].iov_len;

		if ((addr & blocksize_mask) || (size & blocksize_mask))
			return -EINVAL;

		/* If this is a write we don't need to check anymore */
		if (rw & WRITE)
			continue;

		/*
		 * Check to make sure we don't have duplicate iov_base's in this
		 * iovec, if so return EINVAL, otherwise we'll get csum errors
		 * when reading back.
		 */
		for (i = seg + 1; i < nr_segs; i++) {
			if (iov[seg].iov_base == iov[i].iov_base)
				return -EINVAL;
		}
	}

	return 0;
}
|
2008-04-10 08:23:21 -06:00
|
|
|
/*
 * Direct I/O entry point.
 *
 * Requests rejected by check_direct_IO() return 0 without performing any
 * I/O (presumably so the caller falls back to buffered I/O -- confirm
 * against the caller).  Otherwise the byte range of the request is locked in
 * the inode's io_tree, any ordered extents overlapping it are waited out,
 * and the I/O is handed to __blockdev_direct_IO().  For writes, delalloc
 * space is reserved first and the range is tagged EXTENT_DELALLOC.
 *
 * Returns whatever __blockdev_direct_IO() returns, or the error from the
 * space reservation / set_extent_bit() step.
 */
static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
			const struct iovec *iov, loff_t offset,
			unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	const int writing = rw & WRITE;
	const size_t count = iov_length(iov, nr_segs);
	const u64 lockstart = offset;
	const u64 lockend = offset + count - 1;
	int write_bits = 0;
	ssize_t ret;

	if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
			    offset, nr_segs))
		return 0;

	if (writing) {
		ret = btrfs_delalloc_reserve_space(inode, count);
		if (ret)
			goto out;
	}

	for (;;) {
		lock_extent_bits(io_tree, lockstart, lockend,
				 0, &cached_state, GFP_NOFS);
		/*
		 * The whole range we are about to do DIO on matters, so make
		 * sure no ordered extent overlaps any part of it.
		 */
		ordered = btrfs_lookup_ordered_range(inode, lockstart,
						     lockend - lockstart + 1);
		if (!ordered)
			break;

		/* drop the range lock, wait for the ordered extent, retry */
		unlock_extent_cached(io_tree, lockstart, lockend,
				     &cached_state, GFP_NOFS);
		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
		cond_resched();
	}

	/*
	 * btrfs_set_extent_delalloc is deliberately not used here because
	 * the dirty and uptodate bits are not wanted.
	 */
	if (writing) {
		write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
		ret = set_extent_bit(io_tree, lockstart, lockend,
				     EXTENT_DELALLOC, 0, NULL, &cached_state,
				     GFP_NOFS);
		if (ret) {
			clear_extent_bit(io_tree, lockstart, lockend,
					 EXTENT_LOCKED | write_bits,
					 1, 0, &cached_state, GFP_NOFS);
			goto out;
		}
	}

	free_extent_state(cached_state);
	cached_state = NULL;

	ret = __blockdev_direct_IO(rw, iocb, inode,
		   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
		   iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
		   btrfs_submit_direct, 0);

	if (ret < 0) {
		/* hard failure (not merely queued): release the whole range */
		if (ret != -EIOCBQUEUED)
			clear_extent_bit(io_tree, lockstart, lockend,
					 EXTENT_LOCKED | write_bits, 1, 0,
					 &cached_state, GFP_NOFS);
	} else if ((size_t)ret < count) {
		/*
		 * We're falling back to buffered, unlock the section we
		 * didn't do IO on.
		 */
		clear_extent_bit(io_tree, offset + ret, lockend,
				 EXTENT_LOCKED | write_bits, 1, 0,
				 &cached_state, GFP_NOFS);
	}
out:
	free_extent_state(cached_state);
	return ret;
}
|
|
|
|
|
2009-01-21 12:39:14 -07:00
|
|
|
/*
 * fiemap hook: forwards the request to extent_fiemap(), supplying
 * btrfs_get_extent_fiemap as the extent lookup callback.
 */
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		__u64 start, __u64 len)
{
	int ret;

	ret = extent_fiemap(inode, fieinfo, start, len,
			    btrfs_get_extent_fiemap);
	return ret;
}
|
|
|
|
|
2007-08-27 14:49:44 -06:00
|
|
|
int btrfs_readpage(struct file *file, struct page *page)
|
2007-06-15 11:50:00 -06:00
|
|
|
{
|
2008-01-24 14:13:08 -07:00
|
|
|
struct extent_io_tree *tree;
|
|
|
|
tree = &BTRFS_I(page->mapping->host)->io_tree;
|
2007-08-27 14:49:44 -06:00
|
|
|
return extent_read_full_page(tree, page, btrfs_get_extent);
|
2007-06-15 11:50:00 -06:00
|
|
|
}
|
2007-12-21 14:27:21 -07:00
|
|
|
|
2007-08-27 14:49:44 -06:00
|
|
|
static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
|
2007-06-12 04:35:45 -06:00
|
|
|
{
|
2008-01-24 14:13:08 -07:00
|
|
|
struct extent_io_tree *tree;
|
2007-08-27 14:49:44 -06:00
|
|
|
|
|
|
|
|
|
|
|
if (current->flags & PF_MEMALLOC) {
|
|
|
|
redirty_page_for_writepage(wbc, page);
|
|
|
|
unlock_page(page);
|
|
|
|
return 0;
|
|
|
|
}
|
2008-01-24 14:13:08 -07:00
|
|
|
tree = &BTRFS_I(page->mapping->host)->io_tree;
|
2007-08-27 14:49:44 -06:00
|
|
|
return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
|
2007-06-15 11:50:00 -06:00
|
|
|
}
|
|
|
|
|
2008-07-22 09:18:09 -06:00
|
|
|
int btrfs_writepages(struct address_space *mapping,
|
|
|
|
struct writeback_control *wbc)
|
2007-11-01 17:45:34 -06:00
|
|
|
{
|
2008-01-24 14:13:08 -07:00
|
|
|
struct extent_io_tree *tree;
|
2008-11-06 20:02:51 -07:00
|
|
|
|
2008-01-24 14:13:08 -07:00
|
|
|
tree = &BTRFS_I(mapping->host)->io_tree;
|
2007-11-01 17:45:34 -06:00
|
|
|
return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
|
|
|
|
}
|
|
|
|
|
2007-11-08 08:59:22 -07:00
|
|
|
static int
|
|
|
|
btrfs_readpages(struct file *file, struct address_space *mapping,
|
|
|
|
struct list_head *pages, unsigned nr_pages)
|
|
|
|
{
|
2008-01-24 14:13:08 -07:00
|
|
|
struct extent_io_tree *tree;
|
|
|
|
tree = &BTRFS_I(mapping->host)->io_tree;
|
2007-11-08 08:59:22 -07:00
|
|
|
return extent_readpages(tree, mapping, pages, nr_pages,
|
|
|
|
btrfs_get_extent);
|
|
|
|
}
|
2008-07-17 10:53:50 -06:00
|
|
|
/*
 * Try to release the extent mapping state attached to @page.
 *
 * Returns the result of try_release_extent_mapping().  When that result is
 * 1, the page's private flag and private data are cleared and one page
 * reference is dropped.
 */
static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
	struct btrfs_inode *binode = BTRFS_I(page->mapping->host);
	int released;

	released = try_release_extent_mapping(&binode->extent_tree,
					      &binode->io_tree, page,
					      gfp_flags);
	if (released != 1)
		return released;

	ClearPagePrivate(page);
	set_page_private(page, 0);
	page_cache_release(page);
	return released;
}
|
|
|
|
|
2008-07-17 10:53:50 -06:00
|
|
|
/*
 * releasepage hook: refuses (returns 0) while the page is under writeback
 * or dirty; otherwise defers to __btrfs_releasepage() with @gfp_flags
 * masked by GFP_NOFS.
 */
static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
	if (!PageWriteback(page) && !PageDirty(page))
		return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);

	return 0;
}
|
|
|
|
|
2007-08-27 14:49:44 -06:00
|
|
|
static void btrfs_invalidatepage(struct page *page, unsigned long offset)
|
2007-06-12 04:35:45 -06:00
|
|
|
{
|
2008-01-24 14:13:08 -07:00
|
|
|
struct extent_io_tree *tree;
|
2008-07-17 10:53:50 -06:00
|
|
|
struct btrfs_ordered_extent *ordered;
|
2010-02-03 12:33:23 -07:00
|
|
|
struct extent_state *cached_state = NULL;
|
2008-07-17 10:53:50 -06:00
|
|
|
u64 page_start = page_offset(page);
|
|
|
|
u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2009-09-02 14:53:46 -06:00
|
|
|
|
|
|
|
/*
|
|
|
|
* we have the page locked, so new writeback can't start,
|
|
|
|
* and the dirty bit won't be cleared while we are here.
|
|
|
|
*
|
|
|
|
* Wait for IO on this page so that we can safely clear
|
|
|
|
* the PagePrivate2 bit and do ordered accounting
|
|
|
|
*/
|
2008-07-17 10:53:50 -06:00
|
|
|
wait_on_page_writeback(page);
|
2009-09-02 14:53:46 -06:00
|
|
|
|
2008-01-24 14:13:08 -07:00
|
|
|
tree = &BTRFS_I(page->mapping->host)->io_tree;
|
2008-07-17 10:53:50 -06:00
|
|
|
if (offset) {
|
|
|
|
btrfs_releasepage(page, GFP_NOFS);
|
|
|
|
return;
|
|
|
|
}
|
2010-02-03 12:33:23 -07:00
|
|
|
lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
|
|
|
|
GFP_NOFS);
|
2008-07-17 10:53:50 -06:00
|
|
|
ordered = btrfs_lookup_ordered_extent(page->mapping->host,
|
|
|
|
page_offset(page));
|
|
|
|
if (ordered) {
|
2008-07-17 11:53:27 -06:00
|
|
|
/*
|
|
|
|
* IO on this page will never be started, so we need
|
|
|
|
* to account for any ordered extents now
|
|
|
|
*/
|
2008-07-17 10:53:50 -06:00
|
|
|
clear_extent_bit(tree, page_start, page_end,
|
|
|
|
EXTENT_DIRTY | EXTENT_DELALLOC |
|
2009-10-08 11:34:05 -06:00
|
|
|
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
|
2010-02-03 12:33:23 -07:00
|
|
|
&cached_state, GFP_NOFS);
|
2009-09-02 14:53:46 -06:00
|
|
|
/*
|
|
|
|
* whoever cleared the private bit is responsible
|
|
|
|
* for the finish_ordered_io
|
|
|
|
*/
|
|
|
|
if (TestClearPagePrivate2(page)) {
|
|
|
|
btrfs_finish_ordered_io(page->mapping->host,
|
|
|
|
page_start, page_end);
|
|
|
|
}
|
2008-07-17 10:53:50 -06:00
|
|
|
btrfs_put_ordered_extent(ordered);
|
2010-02-03 12:33:23 -07:00
|
|
|
cached_state = NULL;
|
|
|
|
lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
|
|
|
|
GFP_NOFS);
|
2008-07-17 10:53:50 -06:00
|
|
|
}
|
|
|
|
clear_extent_bit(tree, page_start, page_end,
|
2009-10-08 11:34:05 -06:00
|
|
|
EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
|
2010-02-03 12:33:23 -07:00
|
|
|
EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS);
|
2008-07-17 10:53:50 -06:00
|
|
|
__btrfs_releasepage(page, GFP_NOFS);
|
|
|
|
|
2008-07-21 08:29:44 -06:00
|
|
|
ClearPageChecked(page);
|
2008-04-18 14:11:30 -06:00
|
|
|
if (PagePrivate(page)) {
|
|
|
|
ClearPagePrivate(page);
|
|
|
|
set_page_private(page, 0);
|
|
|
|
page_cache_release(page);
|
|
|
|
}
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
|
|
|
|
2007-06-15 11:50:00 -06:00
|
|
|
/*
|
|
|
|
* btrfs_page_mkwrite() is not allowed to change the file size as it gets
|
|
|
|
* called from a page fault handler when a page is first dirtied. Hence we must
|
|
|
|
* be careful to check for EOF conditions here. We set the page up correctly
|
|
|
|
* for a written page which means we get ENOSPC checking when writing into
|
|
|
|
* holes and correct delalloc and unwritten extent mapping on filesystems that
|
|
|
|
* support these features.
|
|
|
|
*
|
|
|
|
* We are not allowed to take the i_mutex here so we have to play games to
|
|
|
|
* protect against truncate races as the page could now be beyond EOF. Because
|
|
|
|
* vmtruncate() writes the inode size before removing pages, once we have the
|
|
|
|
* page lock we can determine safely if the page is beyond EOF. If it is not
|
|
|
|
* beyond EOF, then the page is guaranteed safe against truncation until we
|
|
|
|
* unlock the page.
|
|
|
|
*/
|
2009-03-31 16:23:21 -06:00
|
|
|
int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
|
2007-06-15 11:50:00 -06:00
|
|
|
{
|
2009-03-31 16:23:21 -06:00
|
|
|
struct page *page = vmf->page;
|
2007-12-18 14:15:09 -07:00
|
|
|
struct inode *inode = fdentry(vma->vm_file)->d_inode;
|
2007-12-21 14:27:21 -07:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2008-07-17 10:53:50 -06:00
|
|
|
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
|
|
|
struct btrfs_ordered_extent *ordered;
|
2010-02-03 12:33:23 -07:00
|
|
|
struct extent_state *cached_state = NULL;
|
2008-07-17 10:53:50 -06:00
|
|
|
char *kaddr;
|
|
|
|
unsigned long zero_start;
|
2007-06-15 11:50:00 -06:00
|
|
|
loff_t size;
|
2007-12-21 14:27:21 -07:00
|
|
|
int ret;
|
2007-08-27 14:49:44 -06:00
|
|
|
u64 page_start;
|
2008-07-17 10:53:50 -06:00
|
|
|
u64 page_end;
|
2007-06-15 11:50:00 -06:00
|
|
|
|
2010-05-16 08:48:47 -06:00
|
|
|
ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
|
2009-03-31 16:23:23 -06:00
|
|
|
if (ret) {
|
|
|
|
if (ret == -ENOMEM)
|
|
|
|
ret = VM_FAULT_OOM;
|
|
|
|
else /* -ENOSPC, -EIO, etc */
|
|
|
|
ret = VM_FAULT_SIGBUS;
|
2007-12-21 14:27:21 -07:00
|
|
|
goto out;
|
2009-03-31 16:23:23 -06:00
|
|
|
}
|
2007-12-21 14:27:21 -07:00
|
|
|
|
2009-03-31 16:23:23 -06:00
|
|
|
ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
|
2008-07-17 10:53:50 -06:00
|
|
|
again:
|
2007-06-15 11:50:00 -06:00
|
|
|
lock_page(page);
|
|
|
|
size = i_size_read(inode);
|
2008-07-17 10:53:50 -06:00
|
|
|
page_start = page_offset(page);
|
|
|
|
page_end = page_start + PAGE_CACHE_SIZE - 1;
|
2007-08-27 14:49:44 -06:00
|
|
|
|
2007-06-15 11:50:00 -06:00
|
|
|
if ((page->mapping != inode->i_mapping) ||
|
2008-07-17 10:53:50 -06:00
|
|
|
(page_start >= size)) {
|
2007-06-15 11:50:00 -06:00
|
|
|
/* page got truncated out from underneath us */
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
2008-07-17 10:53:50 -06:00
|
|
|
wait_on_page_writeback(page);
|
|
|
|
|
2010-02-03 12:33:23 -07:00
|
|
|
lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
|
|
|
|
GFP_NOFS);
|
2008-07-17 10:53:50 -06:00
|
|
|
set_page_extent_mapped(page);
|
|
|
|
|
2008-07-17 11:53:27 -06:00
|
|
|
/*
|
|
|
|
* we can't set the delalloc bits if there are pending ordered
|
|
|
|
* extents. Drop our locks and wait for them to finish
|
|
|
|
*/
|
2008-07-17 10:53:50 -06:00
|
|
|
ordered = btrfs_lookup_ordered_extent(inode, page_start);
|
|
|
|
if (ordered) {
|
2010-02-03 12:33:23 -07:00
|
|
|
unlock_extent_cached(io_tree, page_start, page_end,
|
|
|
|
&cached_state, GFP_NOFS);
|
2008-07-17 10:53:50 -06:00
|
|
|
unlock_page(page);
|
2008-07-17 11:53:27 -06:00
|
|
|
btrfs_start_ordered_extent(inode, ordered, 1);
|
2008-07-17 10:53:50 -06:00
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
|
2009-10-01 15:10:23 -06:00
|
|
|
/*
|
|
|
|
* XXX - page_mkwrite gets called every time the page is dirtied, even
|
|
|
|
* if it was already dirty, so for space accounting reasons we need to
|
|
|
|
* clear any delalloc bits for the range we are fixing to save. There
|
|
|
|
* is probably a better way to do this, but for now keep consistent with
|
|
|
|
* prepare_pages in the normal write path.
|
|
|
|
*/
|
2010-02-03 12:33:23 -07:00
|
|
|
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
|
2009-10-08 11:34:05 -06:00
|
|
|
EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
|
2010-02-03 12:33:23 -07:00
|
|
|
0, 0, &cached_state, GFP_NOFS);
|
2009-10-01 15:10:23 -06:00
|
|
|
|
2010-02-03 12:33:23 -07:00
|
|
|
ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
|
|
|
|
&cached_state);
|
2009-09-11 14:12:44 -06:00
|
|
|
if (ret) {
|
2010-02-03 12:33:23 -07:00
|
|
|
unlock_extent_cached(io_tree, page_start, page_end,
|
|
|
|
&cached_state, GFP_NOFS);
|
2009-09-11 14:12:44 -06:00
|
|
|
ret = VM_FAULT_SIGBUS;
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
2008-07-17 10:53:50 -06:00
|
|
|
ret = 0;
|
2007-06-15 11:50:00 -06:00
|
|
|
|
|
|
|
/* page is wholly or partially inside EOF */
|
2007-08-27 14:49:44 -06:00
|
|
|
if (page_start + PAGE_CACHE_SIZE > size)
|
2008-07-17 10:53:50 -06:00
|
|
|
zero_start = size & ~PAGE_CACHE_MASK;
|
2007-06-15 11:50:00 -06:00
|
|
|
else
|
2008-07-17 10:53:50 -06:00
|
|
|
zero_start = PAGE_CACHE_SIZE;
|
2007-06-15 11:50:00 -06:00
|
|
|
|
2008-07-17 10:53:50 -06:00
|
|
|
if (zero_start != PAGE_CACHE_SIZE) {
|
|
|
|
kaddr = kmap(page);
|
|
|
|
memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
|
|
|
|
flush_dcache_page(page);
|
|
|
|
kunmap(page);
|
|
|
|
}
|
2008-07-17 10:53:51 -06:00
|
|
|
ClearPageChecked(page);
|
2008-07-17 10:53:50 -06:00
|
|
|
set_page_dirty(page);
|
2009-09-11 10:33:12 -06:00
|
|
|
SetPageUptodate(page);
|
2009-03-31 11:27:11 -06:00
|
|
|
|
2009-10-13 11:21:08 -06:00
|
|
|
BTRFS_I(inode)->last_trans = root->fs_info->generation;
|
|
|
|
BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
|
|
|
|
|
2010-02-03 12:33:23 -07:00
|
|
|
unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
|
2007-06-15 11:50:00 -06:00
|
|
|
|
|
|
|
out_unlock:
|
2009-09-11 10:33:12 -06:00
|
|
|
if (!ret)
|
|
|
|
return VM_FAULT_LOCKED;
|
2007-06-15 11:50:00 -06:00
|
|
|
unlock_page(page);
|
2010-05-16 08:48:47 -06:00
|
|
|
btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
|
2007-12-21 14:27:21 -07:00
|
|
|
out:
|
2007-06-15 11:50:00 -06:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2011-01-31 13:30:16 -07:00
|
|
|
static int btrfs_truncate(struct inode *inode)
|
2007-06-12 04:35:45 -06:00
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2011-05-03 08:40:22 -06:00
|
|
|
struct btrfs_block_rsv *rsv;
|
2007-06-12 04:35:45 -06:00
|
|
|
int ret;
|
2011-01-31 14:03:11 -07:00
|
|
|
int err = 0;
|
2007-06-12 04:35:45 -06:00
|
|
|
struct btrfs_trans_handle *trans;
|
2007-09-17 08:58:06 -06:00
|
|
|
unsigned long nr;
|
2008-07-17 10:54:05 -06:00
|
|
|
u64 mask = root->sectorsize - 1;
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2009-10-13 14:46:49 -06:00
|
|
|
ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
|
|
|
|
if (ret)
|
2011-01-31 13:30:16 -07:00
|
|
|
return ret;
|
2009-11-12 02:35:36 -07:00
|
|
|
|
2008-07-21 08:29:44 -06:00
|
|
|
btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
|
2009-11-12 02:35:36 -07:00
|
|
|
btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2011-05-03 08:40:22 -06:00
|
|
|
/*
|
|
|
|
* Yes ladies and gentelment, this is indeed ugly. The fact is we have
|
|
|
|
* 3 things going on here
|
|
|
|
*
|
|
|
|
* 1) We need to reserve space for our orphan item and the space to
|
|
|
|
* delete our orphan item. Lord knows we don't want to have a dangling
|
|
|
|
* orphan item because we didn't reserve space to remove it.
|
|
|
|
*
|
|
|
|
* 2) We need to reserve space to update our inode.
|
|
|
|
*
|
|
|
|
* 3) We need to have something to cache all the space that is going to
|
|
|
|
* be free'd up by the truncate operation, but also have some slack
|
|
|
|
* space reserved in case it uses space during the truncate (thank you
|
|
|
|
* very much snapshotting).
|
|
|
|
*
|
|
|
|
* And we need these to all be seperate. The fact is we can use alot of
|
|
|
|
* space doing the truncate, and we have no earthly idea how much space
|
|
|
|
* we will use, so we need the truncate reservation to be seperate so it
|
|
|
|
* doesn't end up using space reserved for updating the inode or
|
|
|
|
* removing the orphan item. We also need to be able to stop the
|
|
|
|
* transaction and start a new one, which means we need to be able to
|
|
|
|
* update the inode several times, and we have no idea of knowing how
|
|
|
|
* many times that will be, so we can't just reserve 1 item for the
|
|
|
|
* entirety of the opration, so that has to be done seperately as well.
|
|
|
|
* Then there is the orphan item, which does indeed need to be held on
|
|
|
|
* to for the whole operation, and we need nobody to touch this reserved
|
|
|
|
* space except the orphan code.
|
|
|
|
*
|
|
|
|
* So that leaves us with
|
|
|
|
*
|
|
|
|
* 1) root->orphan_block_rsv - for the orphan deletion.
|
|
|
|
* 2) rsv - for the truncate reservation, which we will steal from the
|
|
|
|
* transaction reservation.
|
|
|
|
* 3) fs_info->trans_block_rsv - this will have 1 items worth left for
|
|
|
|
* updating the inode.
|
|
|
|
*/
|
|
|
|
rsv = btrfs_alloc_block_rsv(root);
|
|
|
|
if (!rsv)
|
|
|
|
return -ENOMEM;
|
|
|
|
btrfs_add_durable_block_rsv(root->fs_info, rsv);
|
2011-03-04 12:37:08 -07:00
|
|
|
|
2011-05-03 08:40:22 -06:00
|
|
|
trans = btrfs_start_transaction(root, 4);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
err = PTR_ERR(trans);
|
|
|
|
goto out;
|
|
|
|
}
|
2011-03-04 12:37:08 -07:00
|
|
|
|
2011-05-03 08:40:22 -06:00
|
|
|
/*
|
|
|
|
* Reserve space for the truncate process. Truncate should be adding
|
|
|
|
* space, but if there are snapshots it may end up using space.
|
|
|
|
*/
|
|
|
|
ret = btrfs_truncate_reserve_metadata(trans, root, rsv);
|
|
|
|
BUG_ON(ret);
|
2011-03-04 12:37:08 -07:00
|
|
|
|
|
|
|
ret = btrfs_orphan_add(trans, inode);
|
|
|
|
if (ret) {
|
|
|
|
btrfs_end_transaction(trans, root);
|
2011-05-03 08:40:22 -06:00
|
|
|
goto out;
|
2011-03-04 12:37:08 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
nr = trans->blocks_used;
|
|
|
|
btrfs_end_transaction(trans, root);
|
|
|
|
btrfs_btree_balance_dirty(root, nr);
|
|
|
|
|
2011-05-03 08:40:22 -06:00
|
|
|
/*
|
|
|
|
* Ok so we've already migrated our bytes over for the truncate, so here
|
|
|
|
* just reserve the one slot we need for updating the inode.
|
|
|
|
*/
|
|
|
|
trans = btrfs_start_transaction(root, 1);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
err = PTR_ERR(trans);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
trans->block_rsv = rsv;
|
2009-03-31 11:27:11 -06:00
|
|
|
|
|
|
|
/*
|
|
|
|
* setattr is responsible for setting the ordered_data_close flag,
|
|
|
|
* but that is only tested during the last file release. That
|
|
|
|
* could happen well after the next commit, leaving a great big
|
|
|
|
* window where new writes may get lost if someone chooses to write
|
|
|
|
* to this file after truncating to zero
|
|
|
|
*
|
|
|
|
* The inode doesn't have any dirty data here, and so if we commit
|
|
|
|
* this is a noop. If someone immediately starts writing to the inode
|
|
|
|
* it is very likely we'll catch some of their writes in this
|
|
|
|
* transaction, and the commit will find this file on the ordered
|
|
|
|
* data list with good things to send down.
|
|
|
|
*
|
|
|
|
* This is a best effort solution, there is still a window where
|
|
|
|
* using truncate to replace the contents of the file will
|
|
|
|
* end up with a zero length file after a crash.
|
|
|
|
*/
|
|
|
|
if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
|
|
|
|
btrfs_add_ordered_operation(trans, root, inode);
|
|
|
|
|
2009-11-12 02:35:36 -07:00
|
|
|
while (1) {
|
2010-05-16 08:49:58 -06:00
|
|
|
if (!trans) {
|
2011-05-03 08:40:22 -06:00
|
|
|
trans = btrfs_start_transaction(root, 3);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
err = PTR_ERR(trans);
|
|
|
|
goto out;
|
|
|
|
}
|
2010-05-16 08:49:58 -06:00
|
|
|
|
2011-05-03 08:40:22 -06:00
|
|
|
ret = btrfs_truncate_reserve_metadata(trans, root,
|
|
|
|
rsv);
|
|
|
|
BUG_ON(ret);
|
|
|
|
|
|
|
|
trans->block_rsv = rsv;
|
2010-05-16 08:49:58 -06:00
|
|
|
}
|
|
|
|
|
2009-11-12 02:35:36 -07:00
|
|
|
ret = btrfs_truncate_inode_items(trans, root, inode,
|
|
|
|
inode->i_size,
|
|
|
|
BTRFS_EXTENT_DATA_KEY);
|
2011-01-31 14:03:11 -07:00
|
|
|
if (ret != -EAGAIN) {
|
|
|
|
err = ret;
|
2009-11-12 02:35:36 -07:00
|
|
|
break;
|
2011-01-31 14:03:11 -07:00
|
|
|
}
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2011-05-03 08:40:22 -06:00
|
|
|
trans->block_rsv = &root->fs_info->trans_block_rsv;
|
2009-11-12 02:35:36 -07:00
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
2011-01-31 14:03:11 -07:00
|
|
|
if (ret) {
|
|
|
|
err = ret;
|
|
|
|
break;
|
|
|
|
}
|
2007-10-15 14:14:19 -06:00
|
|
|
|
2009-11-12 02:35:36 -07:00
|
|
|
nr = trans->blocks_used;
|
|
|
|
btrfs_end_transaction(trans, root);
|
2010-05-16 08:49:58 -06:00
|
|
|
trans = NULL;
|
2009-11-12 02:35:36 -07:00
|
|
|
btrfs_btree_balance_dirty(root, nr);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ret == 0 && inode->i_nlink > 0) {
|
2011-05-03 08:40:22 -06:00
|
|
|
trans->block_rsv = root->orphan_block_rsv;
|
2009-11-12 02:35:36 -07:00
|
|
|
ret = btrfs_orphan_del(trans, inode);
|
2011-01-31 14:03:11 -07:00
|
|
|
if (ret)
|
|
|
|
err = ret;
|
2011-03-04 12:09:46 -07:00
|
|
|
} else if (ret && inode->i_nlink > 0) {
|
|
|
|
/*
|
|
|
|
* Failed to do the truncate, remove us from the in memory
|
|
|
|
* orphan list.
|
|
|
|
*/
|
|
|
|
ret = btrfs_orphan_del(NULL, inode);
|
2009-11-12 02:35:36 -07:00
|
|
|
}
|
|
|
|
|
2011-05-03 08:40:22 -06:00
|
|
|
trans->block_rsv = &root->fs_info->trans_block_rsv;
|
2009-11-12 02:35:36 -07:00
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
2011-01-31 14:03:11 -07:00
|
|
|
if (ret && !err)
|
|
|
|
err = ret;
|
2008-07-24 10:17:14 -06:00
|
|
|
|
|
|
|
nr = trans->blocks_used;
|
2008-06-25 14:01:31 -06:00
|
|
|
ret = btrfs_end_transaction_throttle(trans, root);
|
2011-05-03 08:40:22 -06:00
|
|
|
btrfs_btree_balance_dirty(root, nr);
|
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_block_rsv(root, rsv);
|
|
|
|
|
2011-01-31 14:03:11 -07:00
|
|
|
if (ret && !err)
|
|
|
|
err = ret;
|
2011-01-31 13:30:16 -07:00
|
|
|
|
2011-01-31 14:03:11 -07:00
|
|
|
return err;
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
|
|
|
|
2008-09-29 13:18:18 -06:00
|
|
|
/*
|
|
|
|
* create a new subvolume directory/inode (helper for the ioctl).
|
|
|
|
*/
|
2008-12-11 14:30:39 -07:00
|
|
|
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
|
2011-05-11 13:26:06 -06:00
|
|
|
struct btrfs_root *new_root, u64 new_dirid)
|
2007-06-12 04:35:45 -06:00
|
|
|
{
|
|
|
|
struct inode *inode;
|
2009-09-21 14:00:26 -06:00
|
|
|
int err;
|
2008-08-05 09:18:09 -06:00
|
|
|
u64 index = 0;
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2008-07-24 10:12:38 -06:00
|
|
|
inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
|
2011-05-11 13:26:06 -06:00
|
|
|
new_dirid, S_IFDIR | 0700, &index);
|
2007-06-22 12:16:25 -06:00
|
|
|
if (IS_ERR(inode))
|
2008-06-11 19:53:53 -06:00
|
|
|
return PTR_ERR(inode);
|
2007-06-12 04:35:45 -06:00
|
|
|
inode->i_op = &btrfs_dir_inode_operations;
|
|
|
|
inode->i_fop = &btrfs_dir_file_operations;
|
|
|
|
|
|
|
|
inode->i_nlink = 1;
|
2008-07-17 10:54:05 -06:00
|
|
|
btrfs_i_size_write(inode, 0);
|
2008-06-09 19:57:42 -06:00
|
|
|
|
2009-09-21 14:00:26 -06:00
|
|
|
err = btrfs_update_inode(trans, new_root, inode);
|
|
|
|
BUG_ON(err);
|
2008-10-09 11:39:39 -06:00
|
|
|
|
2009-09-21 14:00:26 -06:00
|
|
|
iput(inode);
|
2008-10-09 11:39:39 -06:00
|
|
|
return 0;
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
|
|
|
|
2008-09-29 13:18:18 -06:00
|
|
|
/* helper function for file defrag and space balancing. This
|
|
|
|
* forces readahead on a given range of bytes in an inode
|
|
|
|
*/
|
2007-12-21 14:27:24 -07:00
|
|
|
unsigned long btrfs_force_ra(struct address_space *mapping,
|
2007-09-10 17:58:16 -06:00
|
|
|
struct file_ra_state *ra, struct file *file,
|
|
|
|
pgoff_t offset, pgoff_t last_index)
|
|
|
|
{
|
2008-04-28 07:02:36 -06:00
|
|
|
pgoff_t req_size = last_index - offset + 1;
|
2007-09-10 17:58:16 -06:00
|
|
|
|
|
|
|
page_cache_sync_readahead(mapping, ra, file, offset, req_size);
|
|
|
|
return offset + req_size;
|
|
|
|
}
|
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
struct inode *btrfs_alloc_inode(struct super_block *sb)
|
|
|
|
{
|
|
|
|
struct btrfs_inode *ei;
|
2010-05-16 08:46:25 -06:00
|
|
|
struct inode *inode;
|
2007-06-12 04:35:45 -06:00
|
|
|
|
|
|
|
ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
|
|
|
|
if (!ei)
|
|
|
|
return NULL;
|
2010-05-16 08:46:25 -06:00
|
|
|
|
|
|
|
ei->root = NULL;
|
|
|
|
ei->space_info = NULL;
|
|
|
|
ei->generation = 0;
|
|
|
|
ei->sequence = 0;
|
2007-08-10 14:22:09 -06:00
|
|
|
ei->last_trans = 0;
|
2009-10-13 11:21:08 -06:00
|
|
|
ei->last_sub_trans = 0;
|
2008-09-05 14:13:11 -06:00
|
|
|
ei->logged_trans = 0;
|
2010-05-16 08:46:25 -06:00
|
|
|
ei->delalloc_bytes = 0;
|
|
|
|
ei->reserved_bytes = 0;
|
|
|
|
ei->disk_i_size = 0;
|
|
|
|
ei->flags = 0;
|
|
|
|
ei->index_cnt = (u64)-1;
|
|
|
|
ei->last_unlink_trans = 0;
|
|
|
|
|
2010-05-16 08:48:47 -06:00
|
|
|
atomic_set(&ei->outstanding_extents, 0);
|
2011-01-25 14:30:38 -07:00
|
|
|
atomic_set(&ei->reserved_extents, 0);
|
2010-05-16 08:46:25 -06:00
|
|
|
|
|
|
|
ei->ordered_data_close = 0;
|
2010-05-16 08:49:58 -06:00
|
|
|
ei->orphan_meta_reserved = 0;
|
2010-05-16 08:46:25 -06:00
|
|
|
ei->dummy_inode = 0;
|
2011-05-24 13:35:30 -06:00
|
|
|
ei->in_defrag = 0;
|
2010-12-16 23:21:50 -07:00
|
|
|
ei->force_compress = BTRFS_COMPRESS_NONE;
|
2010-05-16 08:46:25 -06:00
|
|
|
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 04:12:22 -06:00
|
|
|
ei->delayed_node = NULL;
|
|
|
|
|
2010-05-16 08:46:25 -06:00
|
|
|
inode = &ei->vfs_inode;
|
2011-04-20 16:34:43 -06:00
|
|
|
extent_map_tree_init(&ei->extent_tree);
|
2011-04-20 15:35:57 -06:00
|
|
|
extent_io_tree_init(&ei->io_tree, &inode->i_data);
|
|
|
|
extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
|
2010-05-16 08:46:25 -06:00
|
|
|
mutex_init(&ei->log_mutex);
|
2008-07-17 10:53:50 -06:00
|
|
|
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
|
2008-07-24 10:17:14 -06:00
|
|
|
INIT_LIST_HEAD(&ei->i_orphan);
|
2010-05-16 08:46:25 -06:00
|
|
|
INIT_LIST_HEAD(&ei->delalloc_inodes);
|
2009-03-31 11:27:11 -06:00
|
|
|
INIT_LIST_HEAD(&ei->ordered_operations);
|
2010-05-16 08:46:25 -06:00
|
|
|
RB_CLEAR_NODE(&ei->rb_node);
|
|
|
|
|
|
|
|
return inode;
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
|
|
|
|
2011-01-06 23:49:49 -07:00
|
|
|
static void btrfs_i_callback(struct rcu_head *head)
|
|
|
|
{
|
|
|
|
struct inode *inode = container_of(head, struct inode, i_rcu);
|
|
|
|
INIT_LIST_HEAD(&inode->i_dentry);
|
|
|
|
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
|
|
|
|
}
|
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
void btrfs_destroy_inode(struct inode *inode)
|
|
|
|
{
|
2008-07-17 10:53:50 -06:00
|
|
|
struct btrfs_ordered_extent *ordered;
|
2009-03-31 11:27:11 -06:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
WARN_ON(!list_empty(&inode->i_dentry));
|
|
|
|
WARN_ON(inode->i_data.nrpages);
|
2010-05-16 08:48:47 -06:00
|
|
|
WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
|
2011-01-25 14:30:38 -07:00
|
|
|
WARN_ON(atomic_read(&BTRFS_I(inode)->reserved_extents));
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2009-11-11 13:53:34 -07:00
|
|
|
/*
|
|
|
|
* This can happen where we create an inode, but somebody else also
|
|
|
|
* created the same inode and we need to destroy the one we already
|
|
|
|
* created.
|
|
|
|
*/
|
|
|
|
if (!root)
|
|
|
|
goto free;
|
|
|
|
|
2009-03-31 11:27:11 -06:00
|
|
|
/*
|
|
|
|
* Make sure we're properly removed from the ordered operation
|
|
|
|
* lists.
|
|
|
|
*/
|
|
|
|
smp_mb();
|
|
|
|
if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
|
|
|
|
spin_lock(&root->fs_info->ordered_extent_lock);
|
|
|
|
list_del_init(&BTRFS_I(inode)->ordered_operations);
|
|
|
|
spin_unlock(&root->fs_info->ordered_extent_lock);
|
|
|
|
}
|
|
|
|
|
2010-05-16 08:49:58 -06:00
|
|
|
spin_lock(&root->orphan_lock);
|
2008-07-24 10:17:14 -06:00
|
|
|
if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
|
2011-04-19 20:31:50 -06:00
|
|
|
printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n",
|
|
|
|
(unsigned long long)btrfs_ino(inode));
|
2009-11-12 02:35:36 -07:00
|
|
|
list_del_init(&BTRFS_I(inode)->i_orphan);
|
2008-07-24 10:17:14 -06:00
|
|
|
}
|
2010-05-16 08:49:58 -06:00
|
|
|
spin_unlock(&root->orphan_lock);
|
2008-07-24 10:17:14 -06:00
|
|
|
|
2009-01-05 19:25:51 -07:00
|
|
|
while (1) {
|
2008-07-17 10:53:50 -06:00
|
|
|
ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
|
|
|
|
if (!ordered)
|
|
|
|
break;
|
|
|
|
else {
|
2009-01-05 19:25:51 -07:00
|
|
|
printk(KERN_ERR "btrfs found ordered "
|
|
|
|
"extent %llu %llu on inode cleanup\n",
|
|
|
|
(unsigned long long)ordered->file_offset,
|
|
|
|
(unsigned long long)ordered->len);
|
2008-07-17 10:53:50 -06:00
|
|
|
btrfs_remove_ordered_extent(inode, ordered);
|
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
btrfs_put_ordered_extent(ordered);
|
|
|
|
}
|
|
|
|
}
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 08:45:14 -06:00
|
|
|
inode_tree_del(inode);
|
2008-09-26 08:05:38 -06:00
|
|
|
btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
|
2009-11-11 13:53:34 -07:00
|
|
|
free:
|
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compare with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. the reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implemented delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 04:12:22 -06:00
|
|
|
btrfs_remove_delayed_node(inode);
|
2011-01-06 23:49:49 -07:00
|
|
|
call_rcu(&inode->i_rcu, btrfs_i_callback);
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
|
|
|
|
2010-06-07 11:43:19 -06:00
|
|
|
int btrfs_drop_inode(struct inode *inode)
|
2009-09-21 14:00:26 -06:00
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
2010-06-07 11:43:19 -06:00
|
|
|
|
2010-06-21 12:48:16 -06:00
|
|
|
if (btrfs_root_refs(&root->root_item) == 0 &&
|
2011-04-19 20:33:24 -06:00
|
|
|
!is_free_space_inode(root, inode))
|
2010-06-07 11:43:19 -06:00
|
|
|
return 1;
|
2009-09-21 14:00:26 -06:00
|
|
|
else
|
2010-06-07 11:43:19 -06:00
|
|
|
return generic_drop_inode(inode);
|
2009-09-21 14:00:26 -06:00
|
|
|
}
|
|
|
|
|
2008-07-30 14:54:26 -06:00
|
|
|
static void init_once(void *foo)
|
2007-06-12 04:35:45 -06:00
|
|
|
{
|
|
|
|
struct btrfs_inode *ei = (struct btrfs_inode *) foo;
|
|
|
|
|
|
|
|
inode_init_once(&ei->vfs_inode);
|
|
|
|
}
|
|
|
|
|
|
|
|
void btrfs_destroy_cachep(void)
|
|
|
|
{
|
|
|
|
if (btrfs_inode_cachep)
|
|
|
|
kmem_cache_destroy(btrfs_inode_cachep);
|
|
|
|
if (btrfs_trans_handle_cachep)
|
|
|
|
kmem_cache_destroy(btrfs_trans_handle_cachep);
|
|
|
|
if (btrfs_transaction_cachep)
|
|
|
|
kmem_cache_destroy(btrfs_transaction_cachep);
|
|
|
|
if (btrfs_path_cachep)
|
|
|
|
kmem_cache_destroy(btrfs_path_cachep);
|
2011-01-28 15:05:48 -07:00
|
|
|
if (btrfs_free_space_cachep)
|
|
|
|
kmem_cache_destroy(btrfs_free_space_cachep);
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
int btrfs_init_cachep(void)
|
|
|
|
{
|
2009-04-13 07:33:09 -06:00
|
|
|
btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
|
|
|
|
sizeof(struct btrfs_inode), 0,
|
|
|
|
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
|
2007-06-12 04:35:45 -06:00
|
|
|
if (!btrfs_inode_cachep)
|
|
|
|
goto fail;
|
2009-04-13 07:33:09 -06:00
|
|
|
|
|
|
|
btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
|
|
|
|
sizeof(struct btrfs_trans_handle), 0,
|
|
|
|
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
|
2007-06-12 04:35:45 -06:00
|
|
|
if (!btrfs_trans_handle_cachep)
|
|
|
|
goto fail;
|
2009-04-13 07:33:09 -06:00
|
|
|
|
|
|
|
btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
|
|
|
|
sizeof(struct btrfs_transaction), 0,
|
|
|
|
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
|
2007-06-12 04:35:45 -06:00
|
|
|
if (!btrfs_transaction_cachep)
|
|
|
|
goto fail;
|
2009-04-13 07:33:09 -06:00
|
|
|
|
|
|
|
btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
|
|
|
|
sizeof(struct btrfs_path), 0,
|
|
|
|
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
|
2007-06-12 04:35:45 -06:00
|
|
|
if (!btrfs_path_cachep)
|
|
|
|
goto fail;
|
2009-04-13 07:33:09 -06:00
|
|
|
|
2011-01-28 15:05:48 -07:00
|
|
|
btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache",
|
|
|
|
sizeof(struct btrfs_free_space), 0,
|
|
|
|
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
|
|
|
|
if (!btrfs_free_space_cachep)
|
|
|
|
goto fail;
|
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
return 0;
|
|
|
|
fail:
|
|
|
|
btrfs_destroy_cachep();
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int btrfs_getattr(struct vfsmount *mnt,
|
|
|
|
struct dentry *dentry, struct kstat *stat)
|
|
|
|
{
|
|
|
|
struct inode *inode = dentry->d_inode;
|
|
|
|
generic_fillattr(inode, stat);
|
2008-11-17 18:42:26 -07:00
|
|
|
stat->dev = BTRFS_I(inode)->root->anon_super.s_dev;
|
2008-01-03 12:51:00 -07:00
|
|
|
stat->blksize = PAGE_CACHE_SIZE;
|
2008-10-09 09:46:29 -06:00
|
|
|
stat->blocks = (inode_get_bytes(inode) +
|
|
|
|
BTRFS_I(inode)->delalloc_bytes) >> 9;
|
2007-06-12 04:35:45 -06:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
Btrfs: Per file/directory controls for COW and compression
Data compression and data cow are controlled across the entire FS by mount
options right now. ioctls are needed to set this on a per file or per
directory basis. This has been proposed previously, but VFS developers
wanted us to use generic ioctls rather than btrfs-specific ones.
According to Chris's comment, there should be just one true compression
method (probably LZO) stored in the super. However, before doing this, we
should wait until that one method is stable enough to be adopted into the super.
So I list it as a long-term goal, and just store it in RAM today.
After applying this patch, we can use the generic "FS_IOC_SETFLAGS" ioctl to
control file and directory's datacow and compression attribute.
NOTE:
- The compression type is selected by such rules:
If we mount btrfs with compress options, ie, zlib/lzo, the type is it.
Otherwise, we'll use the default compress type (zlib today).
v1->v2:
- rebase to the latest btrfs.
v2->v3:
- fix a problem, i.e. when a file is set NOCOW via mount option, then this NOCOW
will be screwed by inheritance from parent directory.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-22 04:12:20 -06:00
|
|
|
/*
|
|
|
|
* If a file is moved, it will inherit the cow and compression flags of the new
|
|
|
|
* directory.
|
|
|
|
*/
|
|
|
|
static void fixup_inode_flags(struct inode *dir, struct inode *inode)
|
|
|
|
{
|
|
|
|
struct btrfs_inode *b_dir = BTRFS_I(dir);
|
|
|
|
struct btrfs_inode *b_inode = BTRFS_I(inode);
|
|
|
|
|
|
|
|
if (b_dir->flags & BTRFS_INODE_NODATACOW)
|
|
|
|
b_inode->flags |= BTRFS_INODE_NODATACOW;
|
|
|
|
else
|
|
|
|
b_inode->flags &= ~BTRFS_INODE_NODATACOW;
|
|
|
|
|
|
|
|
if (b_dir->flags & BTRFS_INODE_COMPRESS)
|
|
|
|
b_inode->flags |= BTRFS_INODE_COMPRESS;
|
|
|
|
else
|
|
|
|
b_inode->flags &= ~BTRFS_INODE_COMPRESS;
|
|
|
|
}
|
|
|
|
|
2009-01-05 19:25:51 -07:00
|
|
|
static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
|
|
|
|
struct inode *new_dir, struct dentry *new_dentry)
|
2007-06-12 04:35:45 -06:00
|
|
|
{
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct btrfs_root *root = BTRFS_I(old_dir)->root;
|
2009-09-21 13:56:00 -06:00
|
|
|
struct btrfs_root *dest = BTRFS_I(new_dir)->root;
|
2007-06-12 04:35:45 -06:00
|
|
|
struct inode *new_inode = new_dentry->d_inode;
|
|
|
|
struct inode *old_inode = old_dentry->d_inode;
|
|
|
|
struct timespec ctime = CURRENT_TIME;
|
2008-08-05 09:18:09 -06:00
|
|
|
u64 index = 0;
|
2009-09-21 13:56:00 -06:00
|
|
|
u64 root_objectid;
|
2007-06-12 04:35:45 -06:00
|
|
|
int ret;
|
2011-04-19 20:31:50 -06:00
|
|
|
u64 old_ino = btrfs_ino(old_inode);
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2011-04-19 20:31:50 -06:00
|
|
|
if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
|
2009-09-24 07:17:31 -06:00
|
|
|
return -EPERM;
|
|
|
|
|
2009-09-21 13:56:00 -06:00
|
|
|
/* we only allow rename subvolume link between subvolumes */
|
2011-04-19 20:31:50 -06:00
|
|
|
if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
|
2008-11-17 18:42:26 -07:00
|
|
|
return -EXDEV;
|
|
|
|
|
2011-04-19 20:31:50 -06:00
|
|
|
if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
|
|
|
|
(new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID))
|
2007-06-12 04:35:45 -06:00
|
|
|
return -ENOTEMPTY;
|
2007-10-15 14:14:19 -06:00
|
|
|
|
2009-09-21 13:56:00 -06:00
|
|
|
if (S_ISDIR(old_inode->i_mode) && new_inode &&
|
|
|
|
new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
|
|
|
|
return -ENOTEMPTY;
|
2009-03-31 11:27:11 -06:00
|
|
|
/*
|
|
|
|
* we're using rename to replace one file with another.
|
|
|
|
* and the replacement file is large. Start IO on it now so
|
|
|
|
* we don't add too much work to the end of the transaction
|
|
|
|
*/
|
2009-08-07 11:47:08 -06:00
|
|
|
if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
|
2009-03-31 11:27:11 -06:00
|
|
|
old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
|
|
|
|
filemap_flush(old_inode->i_mapping);
|
|
|
|
|
2009-09-21 14:00:26 -06:00
|
|
|
/* close the racy window with snapshot create/destroy ioctl */
|
2011-04-19 20:31:50 -06:00
|
|
|
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
|
2009-09-21 14:00:26 -06:00
|
|
|
down_read(&root->fs_info->subvol_sem);
|
2010-05-16 08:48:46 -06:00
|
|
|
/*
|
|
|
|
* We want to reserve the absolute worst case amount of items. So if
|
|
|
|
* both inodes are subvols and we need to unlink them then that would
|
|
|
|
* require 4 item modifications, but if they are both normal inodes it
|
|
|
|
* would require 5 item modifications, so we'll assume their normal
|
|
|
|
* inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
|
|
|
|
* should cover the worst case number of items we'll modify.
|
|
|
|
*/
|
|
|
|
trans = btrfs_start_transaction(root, 20);
|
2011-03-31 07:23:47 -06:00
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
goto out_notrans;
|
|
|
|
}
|
2009-09-21 14:00:26 -06:00
|
|
|
|
2009-09-21 13:56:00 -06:00
|
|
|
if (dest != root)
|
|
|
|
btrfs_record_root_in_trans(trans, dest);
|
2007-10-15 14:14:19 -06:00
|
|
|
|
2009-09-24 07:17:31 -06:00
|
|
|
ret = btrfs_set_inode_index(new_dir, &index);
|
|
|
|
if (ret)
|
|
|
|
goto out_fail;
|
2009-03-31 11:27:11 -06:00
|
|
|
|
2011-04-19 20:31:50 -06:00
|
|
|
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
|
2009-09-21 13:56:00 -06:00
|
|
|
/* force full log commit if subvolume involved. */
|
|
|
|
root->fs_info->last_trans_log_full_commit = trans->transid;
|
|
|
|
} else {
|
2009-09-24 07:17:31 -06:00
|
|
|
ret = btrfs_insert_inode_ref(trans, dest,
|
|
|
|
new_dentry->d_name.name,
|
|
|
|
new_dentry->d_name.len,
|
2011-04-19 20:31:50 -06:00
|
|
|
old_ino,
|
|
|
|
btrfs_ino(new_dir), index);
|
2009-09-24 07:17:31 -06:00
|
|
|
if (ret)
|
|
|
|
goto out_fail;
|
2009-09-21 13:56:00 -06:00
|
|
|
/*
|
|
|
|
* this is an ugly little race, but the rename is required
|
|
|
|
* to make sure that if we crash, the inode is either at the
|
|
|
|
* old name or the new one. pinning the log transaction lets
|
|
|
|
* us make sure we don't allow a log commit to come in after
|
|
|
|
* we unlink the name but before we add the new name back in.
|
|
|
|
*/
|
|
|
|
btrfs_pin_log_trans(root);
|
|
|
|
}
|
2009-03-31 11:27:11 -06:00
|
|
|
/*
|
|
|
|
* make sure the inode gets flushed if it is replacing
|
|
|
|
* something.
|
|
|
|
*/
|
2011-04-19 20:31:50 -06:00
|
|
|
if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
|
2009-03-31 11:27:11 -06:00
|
|
|
btrfs_add_ordered_operation(trans, root, old_inode);
|
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
old_dir->i_ctime = old_dir->i_mtime = ctime;
|
|
|
|
new_dir->i_ctime = new_dir->i_mtime = ctime;
|
|
|
|
old_inode->i_ctime = ctime;
|
2007-10-15 14:14:19 -06:00
|
|
|
|
2009-03-24 08:24:20 -06:00
|
|
|
if (old_dentry->d_parent != new_dentry->d_parent)
|
|
|
|
btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
|
|
|
|
|
2011-04-19 20:31:50 -06:00
|
|
|
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
|
2009-09-21 13:56:00 -06:00
|
|
|
root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
|
|
|
|
ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
|
|
|
|
old_dentry->d_name.name,
|
|
|
|
old_dentry->d_name.len);
|
|
|
|
} else {
|
2011-03-04 10:14:37 -07:00
|
|
|
ret = __btrfs_unlink_inode(trans, root, old_dir,
|
|
|
|
old_dentry->d_inode,
|
|
|
|
old_dentry->d_name.name,
|
|
|
|
old_dentry->d_name.len);
|
|
|
|
if (!ret)
|
|
|
|
ret = btrfs_update_inode(trans, root, old_inode);
|
2009-09-21 13:56:00 -06:00
|
|
|
}
|
|
|
|
BUG_ON(ret);
|
2007-06-12 04:35:45 -06:00
|
|
|
|
|
|
|
if (new_inode) {
|
|
|
|
new_inode->i_ctime = CURRENT_TIME;
|
2011-04-19 20:31:50 -06:00
|
|
|
if (unlikely(btrfs_ino(new_inode) ==
|
2009-09-21 13:56:00 -06:00
|
|
|
BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
|
|
|
|
root_objectid = BTRFS_I(new_inode)->location.objectid;
|
|
|
|
ret = btrfs_unlink_subvol(trans, dest, new_dir,
|
|
|
|
root_objectid,
|
|
|
|
new_dentry->d_name.name,
|
|
|
|
new_dentry->d_name.len);
|
|
|
|
BUG_ON(new_inode->i_nlink == 0);
|
|
|
|
} else {
|
|
|
|
ret = btrfs_unlink_inode(trans, dest, new_dir,
|
|
|
|
new_dentry->d_inode,
|
|
|
|
new_dentry->d_name.name,
|
|
|
|
new_dentry->d_name.len);
|
|
|
|
}
|
|
|
|
BUG_ON(ret);
|
2008-07-24 10:17:14 -06:00
|
|
|
if (new_inode->i_nlink == 0) {
|
2008-09-05 14:13:11 -06:00
|
|
|
ret = btrfs_orphan_add(trans, new_dentry->d_inode);
|
2009-09-21 13:56:00 -06:00
|
|
|
BUG_ON(ret);
|
2008-07-24 10:17:14 -06:00
|
|
|
}
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
2008-07-24 10:12:38 -06:00
|
|
|
|
Btrfs: Per file/directory controls for COW and compression
Data compression and data cow are controlled across the entire FS by mount
options right now. ioctls are needed to set this on a per file or per
directory basis. This has been proposed previously, but VFS developers
wanted us to use generic ioctls rather than btrfs-specific ones.
According to Chris's comment, there should be just one true compression
method(probably LZO) stored in the super. However, before this, we would
wait for that one method is stable enough to be adopted into the super.
So I list it as a long term goal, and just store it in ram today.
After applying this patch, we can use the generic "FS_IOC_SETFLAGS" ioctl to
control file and directory's datacow and compression attribute.
NOTE:
- The compression type is selected by such rules:
If we mount btrfs with compress options, ie, zlib/lzo, the type is it.
Otherwise, we'll use the default compress type (zlib today).
v1->v2:
- rebase to the latest btrfs.
v2->v3:
- fix a problem, i.e. when a file is set NOCOW via mount option, then this NOCOW
will be screwed by inheritance from parent directory.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-22 04:12:20 -06:00
|
|
|
fixup_inode_flags(new_dir, old_inode);
|
|
|
|
|
2009-09-21 13:56:00 -06:00
|
|
|
ret = btrfs_add_link(trans, new_dir, old_inode,
|
|
|
|
new_dentry->d_name.name,
|
2009-09-24 07:17:31 -06:00
|
|
|
new_dentry->d_name.len, 0, index);
|
2009-09-21 13:56:00 -06:00
|
|
|
BUG_ON(ret);
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2011-04-19 20:31:50 -06:00
|
|
|
if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
|
2010-11-20 02:48:00 -07:00
|
|
|
struct dentry *parent = dget_parent(new_dentry);
|
|
|
|
btrfs_log_new_name(trans, old_inode, old_dir, parent);
|
|
|
|
dput(parent);
|
2009-09-21 13:56:00 -06:00
|
|
|
btrfs_end_log_trans(root);
|
|
|
|
}
|
2007-06-12 04:35:45 -06:00
|
|
|
out_fail:
|
2008-07-29 14:15:18 -06:00
|
|
|
btrfs_end_transaction_throttle(trans, root);
|
2011-03-31 07:23:47 -06:00
|
|
|
out_notrans:
|
2011-04-19 20:31:50 -06:00
|
|
|
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
|
2009-09-21 14:00:26 -06:00
|
|
|
up_read(&root->fs_info->subvol_sem);
|
2009-09-11 14:12:44 -06:00
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2008-09-29 13:18:18 -06:00
|
|
|
/*
|
|
|
|
* some fairly slow code that needs optimization. This walks the list
|
|
|
|
* of all the inodes with pending delalloc and forces them to disk.
|
|
|
|
*/
|
2009-11-12 02:36:34 -07:00
|
|
|
int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
|
2008-08-04 21:17:27 -06:00
|
|
|
{
|
|
|
|
struct list_head *head = &root->fs_info->delalloc_inodes;
|
|
|
|
struct btrfs_inode *binode;
|
2008-09-26 08:05:38 -06:00
|
|
|
struct inode *inode;
|
2008-08-04 21:17:27 -06:00
|
|
|
|
2008-11-12 12:34:12 -07:00
|
|
|
if (root->fs_info->sb->s_flags & MS_RDONLY)
|
|
|
|
return -EROFS;
|
|
|
|
|
2008-12-15 13:54:40 -07:00
|
|
|
spin_lock(&root->fs_info->delalloc_lock);
|
2009-01-05 19:25:51 -07:00
|
|
|
while (!list_empty(head)) {
|
2008-08-04 21:17:27 -06:00
|
|
|
binode = list_entry(head->next, struct btrfs_inode,
|
|
|
|
delalloc_inodes);
|
2008-09-26 08:05:38 -06:00
|
|
|
inode = igrab(&binode->vfs_inode);
|
|
|
|
if (!inode)
|
|
|
|
list_del_init(&binode->delalloc_inodes);
|
2008-12-15 13:54:40 -07:00
|
|
|
spin_unlock(&root->fs_info->delalloc_lock);
|
2008-09-26 08:05:38 -06:00
|
|
|
if (inode) {
|
2008-09-29 09:19:10 -06:00
|
|
|
filemap_flush(inode->i_mapping);
|
2009-11-12 02:36:34 -07:00
|
|
|
if (delay_iput)
|
|
|
|
btrfs_add_delayed_iput(inode);
|
|
|
|
else
|
|
|
|
iput(inode);
|
2008-09-26 08:05:38 -06:00
|
|
|
}
|
|
|
|
cond_resched();
|
2008-12-15 13:54:40 -07:00
|
|
|
spin_lock(&root->fs_info->delalloc_lock);
|
2008-08-04 21:17:27 -06:00
|
|
|
}
|
2008-12-15 13:54:40 -07:00
|
|
|
spin_unlock(&root->fs_info->delalloc_lock);
|
2008-09-29 09:19:10 -06:00
|
|
|
|
|
|
|
/* the filemap_flush will queue IO into the worker threads, but
|
|
|
|
* we have to make sure the IO is actually started and that
|
|
|
|
* ordered extents get created before we return
|
|
|
|
*/
|
|
|
|
atomic_inc(&root->fs_info->async_submit_draining);
|
2009-01-05 19:25:51 -07:00
|
|
|
while (atomic_read(&root->fs_info->nr_async_submits) ||
|
2008-11-06 20:02:51 -07:00
|
|
|
atomic_read(&root->fs_info->async_delalloc_pages)) {
|
2008-09-29 09:19:10 -06:00
|
|
|
wait_event(root->fs_info->async_submit_wait,
|
2008-11-06 20:02:51 -07:00
|
|
|
(atomic_read(&root->fs_info->nr_async_submits) == 0 &&
|
|
|
|
atomic_read(&root->fs_info->async_delalloc_pages) == 0));
|
2008-09-29 09:19:10 -06:00
|
|
|
}
|
|
|
|
atomic_dec(&root->fs_info->async_submit_draining);
|
2008-08-04 21:17:27 -06:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
|
|
|
|
const char *symname)
|
|
|
|
{
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct btrfs_root *root = BTRFS_I(dir)->root;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
2007-12-21 14:27:21 -07:00
|
|
|
struct inode *inode = NULL;
|
2007-06-12 04:35:45 -06:00
|
|
|
int err;
|
|
|
|
int drop_inode = 0;
|
|
|
|
u64 objectid;
|
2008-08-05 09:18:09 -06:00
|
|
|
u64 index = 0 ;
|
2007-06-12 04:35:45 -06:00
|
|
|
int name_len;
|
|
|
|
int datasize;
|
2007-10-15 14:14:19 -06:00
|
|
|
unsigned long ptr;
|
2007-06-12 04:35:45 -06:00
|
|
|
struct btrfs_file_extent_item *ei;
|
2007-10-15 14:14:19 -06:00
|
|
|
struct extent_buffer *leaf;
|
2007-12-21 14:27:21 -07:00
|
|
|
unsigned long nr = 0;
|
2007-06-12 04:35:45 -06:00
|
|
|
|
|
|
|
name_len = strlen(symname) + 1;
|
|
|
|
if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
|
|
|
|
return -ENAMETOOLONG;
|
2007-12-21 14:27:21 -07:00
|
|
|
|
2009-09-11 14:12:44 -06:00
|
|
|
/*
|
|
|
|
* 2 items for inode item and ref
|
|
|
|
* 2 items for dir items
|
|
|
|
* 1 item for xattr if selinux is on
|
|
|
|
*/
|
2010-05-16 08:48:46 -06:00
|
|
|
trans = btrfs_start_transaction(root, 5);
|
|
|
|
if (IS_ERR(trans))
|
|
|
|
return PTR_ERR(trans);
|
2007-12-21 14:27:21 -07:00
|
|
|
|
Btrfs: Cache free inode numbers in memory
Currently btrfs stores the highest objectid of the fs tree, and it always
returns the (highest+1) inode number when we create a file. Inode numbers
are therefore not reclaimed when we delete files, so we'll run out of inode
numbers as we keep creating and deleting files on 32-bit machines.
This fixes it, and it works similarly to how we cache free space in block
cgroups.
We start a kernel thread to read the file tree. By scanning inode items,
we know which chunks of inode numbers are free, and we cache them in
an rb-tree.
Because we are searching the commit root, we have to carefully handle the
cross-transaction case.
The rb-tree is a hybrid extent+bitmap tree, so if we have too many small
chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram
of extents, and a bitmap will be used if we exceed this threshold. The
extents threshold is adjusted in runtime.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
2011-04-19 20:06:11 -06:00
|
|
|
err = btrfs_find_free_ino(root, &objectid);
|
|
|
|
if (err)
|
|
|
|
goto out_unlock;
|
|
|
|
|
2008-07-24 10:12:38 -06:00
|
|
|
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
|
2011-04-19 20:31:50 -06:00
|
|
|
dentry->d_name.len, btrfs_ino(dir), objectid,
|
2011-05-11 13:26:06 -06:00
|
|
|
S_IFLNK|S_IRWXUGO, &index);
|
2011-04-25 17:43:53 -06:00
|
|
|
if (IS_ERR(inode)) {
|
|
|
|
err = PTR_ERR(inode);
|
2007-06-12 04:35:45 -06:00
|
|
|
goto out_unlock;
|
2011-04-25 17:43:53 -06:00
|
|
|
}
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2011-02-01 09:05:39 -07:00
|
|
|
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
|
2008-07-24 10:16:36 -06:00
|
|
|
if (err) {
|
|
|
|
drop_inode = 1;
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
|
2010-11-19 13:36:11 -07:00
|
|
|
err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
|
2007-06-12 04:35:45 -06:00
|
|
|
if (err)
|
|
|
|
drop_inode = 1;
|
|
|
|
else {
|
|
|
|
inode->i_mapping->a_ops = &btrfs_aops;
|
2008-03-26 08:28:07 -06:00
|
|
|
inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
|
2007-06-12 04:35:45 -06:00
|
|
|
inode->i_fop = &btrfs_file_operations;
|
|
|
|
inode->i_op = &btrfs_file_inode_operations;
|
2008-01-24 14:13:08 -07:00
|
|
|
BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
|
2007-06-12 04:35:45 -06:00
|
|
|
}
|
|
|
|
if (drop_inode)
|
|
|
|
goto out_unlock;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
btrfs: don't BUG_ON btrfs_alloc_path() errors
This patch fixes many callers of btrfs_alloc_path() which BUG_ON allocation
failure. All the sites that are fixed in this patch were checked by me to
be fairly trivial to fix because of at least one of two criteria:
- Callers of the function catch errors from it already so bubbling the
error up will be handled.
- Callers of the function might BUG_ON any nonzero return code in which
case there is no behavior changed (but we still got to remove a BUG_ON)
The following functions were updated:
btrfs_lookup_extent, alloc_reserved_tree_block, btrfs_remove_block_group,
btrfs_lookup_csums_range, btrfs_csum_file_blocks, btrfs_mark_extent_written,
btrfs_inode_by_name, btrfs_new_inode, btrfs_symlink,
insert_reserved_file_extent, and run_delalloc_nocow
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
2011-07-13 11:38:47 -06:00
|
|
|
if (!path) {
|
|
|
|
err = -ENOMEM;
|
|
|
|
drop_inode = 1;
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
2011-04-19 20:31:50 -06:00
|
|
|
key.objectid = btrfs_ino(inode);
|
2007-06-12 04:35:45 -06:00
|
|
|
key.offset = 0;
|
|
|
|
btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
|
|
|
|
datasize = btrfs_file_extent_calc_inline_size(name_len);
|
|
|
|
err = btrfs_insert_empty_item(trans, root, path, &key,
|
|
|
|
datasize);
|
2007-06-22 12:16:25 -06:00
|
|
|
if (err) {
|
|
|
|
drop_inode = 1;
|
2011-05-14 01:10:51 -06:00
|
|
|
btrfs_free_path(path);
|
2007-06-22 12:16:25 -06:00
|
|
|
goto out_unlock;
|
|
|
|
}
|
2007-10-15 14:14:19 -06:00
|
|
|
leaf = path->nodes[0];
|
|
|
|
ei = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
btrfs_set_file_extent_generation(leaf, ei, trans->transid);
|
|
|
|
btrfs_set_file_extent_type(leaf, ei,
|
2007-06-12 04:35:45 -06:00
|
|
|
BTRFS_FILE_EXTENT_INLINE);
|
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 12:49:59 -06:00
|
|
|
btrfs_set_file_extent_encryption(leaf, ei, 0);
|
|
|
|
btrfs_set_file_extent_compression(leaf, ei, 0);
|
|
|
|
btrfs_set_file_extent_other_encoding(leaf, ei, 0);
|
|
|
|
btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
|
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
ptr = btrfs_file_extent_inline_start(ei);
|
2007-10-15 14:14:19 -06:00
|
|
|
write_extent_buffer(leaf, symname, ptr, name_len);
|
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
2007-06-12 04:35:45 -06:00
|
|
|
btrfs_free_path(path);
|
2007-10-15 14:14:19 -06:00
|
|
|
|
2007-06-12 04:35:45 -06:00
|
|
|
inode->i_op = &btrfs_symlink_inode_operations;
|
|
|
|
inode->i_mapping->a_ops = &btrfs_symlink_aops;
|
2008-03-26 08:28:07 -06:00
|
|
|
inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
|
2008-10-30 12:25:28 -06:00
|
|
|
inode_set_bytes(inode, name_len);
|
2008-07-17 10:54:05 -06:00
|
|
|
btrfs_i_size_write(inode, name_len - 1);
|
2007-06-22 12:16:25 -06:00
|
|
|
err = btrfs_update_inode(trans, root, inode);
|
|
|
|
if (err)
|
|
|
|
drop_inode = 1;
|
2007-06-12 04:35:45 -06:00
|
|
|
|
|
|
|
out_unlock:
|
2007-09-17 08:58:06 -06:00
|
|
|
nr = trans->blocks_used;
|
2008-07-29 14:15:18 -06:00
|
|
|
btrfs_end_transaction_throttle(trans, root);
|
2007-06-12 04:35:45 -06:00
|
|
|
if (drop_inode) {
|
|
|
|
inode_dec_link_count(inode);
|
|
|
|
iput(inode);
|
|
|
|
}
|
2007-09-17 08:58:06 -06:00
|
|
|
btrfs_btree_balance_dirty(root, nr);
|
2007-06-12 04:35:45 -06:00
|
|
|
return err;
|
|
|
|
}
|
2008-04-10 08:23:21 -06:00
|
|
|
|
2010-06-21 12:48:16 -06:00
|
|
|
static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
|
|
|
|
u64 start, u64 num_bytes, u64 min_size,
|
|
|
|
loff_t actual_len, u64 *alloc_hint,
|
|
|
|
struct btrfs_trans_handle *trans)
|
2008-10-30 12:25:28 -06:00
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct btrfs_key ins;
|
|
|
|
u64 cur_offset = start;
|
2010-11-22 11:50:32 -07:00
|
|
|
u64 i_size;
|
2008-10-30 12:25:28 -06:00
|
|
|
int ret = 0;
|
2010-06-21 12:48:16 -06:00
|
|
|
bool own_trans = true;
|
2008-10-30 12:25:28 -06:00
|
|
|
|
2010-06-21 12:48:16 -06:00
|
|
|
if (trans)
|
|
|
|
own_trans = false;
|
2008-10-30 12:25:28 -06:00
|
|
|
while (num_bytes > 0) {
|
2010-06-21 12:48:16 -06:00
|
|
|
if (own_trans) {
|
|
|
|
trans = btrfs_start_transaction(root, 3);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
break;
|
|
|
|
}
|
2009-11-12 02:34:52 -07:00
|
|
|
}
|
|
|
|
|
2010-05-16 08:49:59 -06:00
|
|
|
ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
|
|
|
|
0, *alloc_hint, (u64)-1, &ins, 1);
|
2009-11-12 02:34:52 -07:00
|
|
|
if (ret) {
|
2010-06-21 12:48:16 -06:00
|
|
|
if (own_trans)
|
|
|
|
btrfs_end_transaction(trans, root);
|
2010-05-16 08:48:46 -06:00
|
|
|
break;
|
2008-10-30 12:25:28 -06:00
|
|
|
}
|
2009-11-12 02:34:52 -07:00
|
|
|
|
2008-10-30 12:25:28 -06:00
|
|
|
ret = insert_reserved_file_extent(trans, inode,
|
|
|
|
cur_offset, ins.objectid,
|
|
|
|
ins.offset, ins.offset,
|
2009-11-12 02:34:08 -07:00
|
|
|
ins.offset, 0, 0, 0,
|
2008-10-30 12:25:28 -06:00
|
|
|
BTRFS_FILE_EXTENT_PREALLOC);
|
|
|
|
BUG_ON(ret);
|
2009-09-11 10:27:37 -06:00
|
|
|
btrfs_drop_extent_cache(inode, cur_offset,
|
|
|
|
cur_offset + ins.offset -1, 0);
|
2009-11-12 02:34:52 -07:00
|
|
|
|
2008-10-30 12:25:28 -06:00
|
|
|
num_bytes -= ins.offset;
|
|
|
|
cur_offset += ins.offset;
|
2010-05-16 08:49:59 -06:00
|
|
|
*alloc_hint = ins.objectid + ins.offset;
|
2009-11-12 02:34:52 -07:00
|
|
|
|
2008-10-30 12:25:28 -06:00
|
|
|
inode->i_ctime = CURRENT_TIME;
|
2009-04-17 02:37:41 -06:00
|
|
|
BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
|
2008-10-30 12:25:28 -06:00
|
|
|
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
|
2010-05-16 08:49:59 -06:00
|
|
|
(actual_len > inode->i_size) &&
|
|
|
|
(cur_offset > inode->i_size)) {
|
2010-01-20 00:28:54 -07:00
|
|
|
if (cur_offset > actual_len)
|
2010-11-22 11:50:32 -07:00
|
|
|
i_size = actual_len;
|
2010-01-20 00:28:54 -07:00
|
|
|
else
|
2010-11-22 11:50:32 -07:00
|
|
|
i_size = cur_offset;
|
|
|
|
i_size_write(inode, i_size);
|
|
|
|
btrfs_ordered_update_i_size(inode, i_size, NULL);
|
2009-11-12 02:34:52 -07:00
|
|
|
}
|
|
|
|
|
2008-10-30 12:25:28 -06:00
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
|
|
|
BUG_ON(ret);
|
|
|
|
|
2010-06-21 12:48:16 -06:00
|
|
|
if (own_trans)
|
|
|
|
btrfs_end_transaction(trans, root);
|
2009-11-12 02:34:52 -07:00
|
|
|
}
|
2008-10-30 12:25:28 -06:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2010-06-21 12:48:16 -06:00
|
|
|
int btrfs_prealloc_file_range(struct inode *inode, int mode,
|
|
|
|
u64 start, u64 num_bytes, u64 min_size,
|
|
|
|
loff_t actual_len, u64 *alloc_hint)
|
|
|
|
{
|
|
|
|
return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
|
|
|
|
min_size, actual_len, alloc_hint,
|
|
|
|
NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
int btrfs_prealloc_file_range_trans(struct inode *inode,
|
|
|
|
struct btrfs_trans_handle *trans, int mode,
|
|
|
|
u64 start, u64 num_bytes, u64 min_size,
|
|
|
|
loff_t actual_len, u64 *alloc_hint)
|
|
|
|
{
|
|
|
|
return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
|
|
|
|
min_size, actual_len, alloc_hint, trans);
|
|
|
|
}
|
|
|
|
|
2008-07-17 10:53:50 -06:00
|
|
|
/*
 * .set_page_dirty hook used by btrfs_aops: hands the page to
 * __set_page_dirty_nobuffers() and returns its result.
 */
static int btrfs_set_page_dirty(struct page *page)
{
	int ret = __set_page_dirty_nobuffers(page);

	return ret;
}
|
|
|
|
|
2011-01-06 23:49:58 -07:00
|
|
|
static int btrfs_permission(struct inode *inode, int mask, unsigned int flags)
|
2008-01-14 11:26:08 -07:00
|
|
|
{
|
2010-12-20 01:04:08 -07:00
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
|
|
|
|
if (btrfs_root_readonly(root) && (mask & MAY_WRITE))
|
|
|
|
return -EROFS;
|
2009-04-17 02:37:41 -06:00
|
|
|
if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
|
2008-01-14 11:26:08 -07:00
|
|
|
return -EACCES;
|
2011-01-06 23:49:58 -07:00
|
|
|
return generic_permission(inode, mask, flags, btrfs_check_acl);
|
2008-01-14 11:26:08 -07:00
|
|
|
}
|
2007-06-12 04:35:45 -06:00
|
|
|
|
2009-09-21 18:01:11 -06:00
|
|
|
static const struct inode_operations btrfs_dir_inode_operations = {
|
2008-11-17 18:42:26 -07:00
|
|
|
.getattr = btrfs_getattr,
|
2007-06-12 04:35:45 -06:00
|
|
|
.lookup = btrfs_lookup,
|
|
|
|
.create = btrfs_create,
|
|
|
|
.unlink = btrfs_unlink,
|
|
|
|
.link = btrfs_link,
|
|
|
|
.mkdir = btrfs_mkdir,
|
|
|
|
.rmdir = btrfs_rmdir,
|
|
|
|
.rename = btrfs_rename,
|
|
|
|
.symlink = btrfs_symlink,
|
|
|
|
.setattr = btrfs_setattr,
|
2007-07-11 08:18:17 -06:00
|
|
|
.mknod = btrfs_mknod,
|
2008-08-28 04:21:17 -06:00
|
|
|
.setxattr = btrfs_setxattr,
|
|
|
|
.getxattr = btrfs_getxattr,
|
2007-11-16 09:45:54 -07:00
|
|
|
.listxattr = btrfs_listxattr,
|
2008-08-28 04:21:17 -06:00
|
|
|
.removexattr = btrfs_removexattr,
|
2008-01-14 11:26:08 -07:00
|
|
|
.permission = btrfs_permission,
|
2007-06-12 04:35:45 -06:00
|
|
|
};
|
2009-09-21 18:01:11 -06:00
|
|
|
static const struct inode_operations btrfs_dir_ro_inode_operations = {
|
2007-06-12 04:35:45 -06:00
|
|
|
.lookup = btrfs_lookup,
|
2008-01-14 11:26:08 -07:00
|
|
|
.permission = btrfs_permission,
|
2007-06-12 04:35:45 -06:00
|
|
|
};
|
2009-09-21 14:00:26 -06:00
|
|
|
|
2009-10-01 16:43:56 -06:00
|
|
|
static const struct file_operations btrfs_dir_file_operations = {
|
2007-06-12 04:35:45 -06:00
|
|
|
.llseek = generic_file_llseek,
|
|
|
|
.read = generic_read_dir,
|
2008-08-06 12:42:33 -06:00
|
|
|
.readdir = btrfs_real_readdir,
|
2007-09-14 08:22:47 -06:00
|
|
|
.unlocked_ioctl = btrfs_ioctl,
|
2007-06-12 04:35:45 -06:00
|
|
|
#ifdef CONFIG_COMPAT
|
2007-09-14 08:22:47 -06:00
|
|
|
.compat_ioctl = btrfs_ioctl,
|
2007-06-12 04:35:45 -06:00
|
|
|
#endif
|
2008-06-10 08:07:39 -06:00
|
|
|
.release = btrfs_release_file,
|
2008-09-05 14:13:11 -06:00
|
|
|
.fsync = btrfs_sync_file,
|
2007-06-12 04:35:45 -06:00
|
|
|
};
|
|
|
|
|
2008-01-24 14:13:08 -07:00
|
|
|
static struct extent_io_ops btrfs_extent_io_ops = {
|
2007-08-30 06:50:51 -06:00
|
|
|
.fill_delalloc = run_delalloc_range,
|
2008-02-20 10:07:25 -07:00
|
|
|
.submit_bio_hook = btrfs_submit_bio_hook,
|
2008-03-24 13:02:07 -06:00
|
|
|
.merge_bio_hook = btrfs_merge_bio_hook,
|
2007-08-30 06:50:51 -06:00
|
|
|
.readpage_end_io_hook = btrfs_readpage_end_io_hook,
|
2008-07-17 10:53:50 -06:00
|
|
|
.writepage_end_io_hook = btrfs_writepage_end_io_hook,
|
2008-07-17 10:53:51 -06:00
|
|
|
.writepage_start_hook = btrfs_writepage_start_hook,
|
2008-05-12 11:39:03 -06:00
|
|
|
.readpage_io_failed_hook = btrfs_io_failed_hook,
|
2008-01-31 09:05:37 -07:00
|
|
|
.set_bit_hook = btrfs_set_bit_hook,
|
|
|
|
.clear_bit_hook = btrfs_clear_bit_hook,
|
2009-09-11 14:12:44 -06:00
|
|
|
.merge_extent_hook = btrfs_merge_extent_hook,
|
|
|
|
.split_extent_hook = btrfs_split_extent_hook,
|
2007-08-30 06:50:51 -06:00
|
|
|
};
|
|
|
|
|
2009-01-21 11:11:13 -07:00
|
|
|
/*
|
|
|
|
* btrfs doesn't support the bmap operation because swapfiles
|
|
|
|
* use bmap to make a mapping of extents in the file. They assume
|
|
|
|
* these extents won't change over the life of the file and they
|
|
|
|
* use the bmap result to do IO directly to the drive.
|
|
|
|
*
|
|
|
|
* the btrfs bmap call would return logical addresses that aren't
|
|
|
|
* suitable for IO and they also will change frequently as COW
|
|
|
|
* operations happen. So, swapfile + btrfs == corruption.
|
|
|
|
*
|
|
|
|
* For now we're avoiding this by dropping bmap.
|
|
|
|
*/
|
2009-09-21 18:01:10 -06:00
|
|
|
static const struct address_space_operations btrfs_aops = {
|
2007-06-12 04:35:45 -06:00
|
|
|
.readpage = btrfs_readpage,
|
|
|
|
.writepage = btrfs_writepage,
|
2007-11-01 17:45:34 -06:00
|
|
|
.writepages = btrfs_writepages,
|
2007-11-08 08:59:22 -07:00
|
|
|
.readpages = btrfs_readpages,
|
2008-04-10 08:23:21 -06:00
|
|
|
.direct_IO = btrfs_direct_IO,
|
2007-08-27 14:49:44 -06:00
|
|
|
.invalidatepage = btrfs_invalidatepage,
|
|
|
|
.releasepage = btrfs_releasepage,
|
2008-07-17 10:53:50 -06:00
|
|
|
.set_page_dirty = btrfs_set_page_dirty,
|
2009-09-16 03:50:18 -06:00
|
|
|
.error_remove_page = generic_error_remove_page,
|
2007-06-12 04:35:45 -06:00
|
|
|
};
|
|
|
|
|
2009-09-21 18:01:10 -06:00
|
|
|
static const struct address_space_operations btrfs_symlink_aops = {
|
2007-06-12 04:35:45 -06:00
|
|
|
.readpage = btrfs_readpage,
|
|
|
|
.writepage = btrfs_writepage,
|
2007-08-30 09:54:02 -06:00
|
|
|
.invalidatepage = btrfs_invalidatepage,
|
|
|
|
.releasepage = btrfs_releasepage,
|
2007-06-12 04:35:45 -06:00
|
|
|
};
|
|
|
|
|
2009-09-21 18:01:11 -06:00
|
|
|
static const struct inode_operations btrfs_file_inode_operations = {
|
2007-06-12 04:35:45 -06:00
|
|
|
.getattr = btrfs_getattr,
|
|
|
|
.setattr = btrfs_setattr,
|
2008-08-28 04:21:17 -06:00
|
|
|
.setxattr = btrfs_setxattr,
|
|
|
|
.getxattr = btrfs_getxattr,
|
2007-11-16 09:45:54 -07:00
|
|
|
.listxattr = btrfs_listxattr,
|
2008-08-28 04:21:17 -06:00
|
|
|
.removexattr = btrfs_removexattr,
|
2008-01-14 11:26:08 -07:00
|
|
|
.permission = btrfs_permission,
|
2009-01-21 12:39:14 -07:00
|
|
|
.fiemap = btrfs_fiemap,
|
2007-06-12 04:35:45 -06:00
|
|
|
};
|
2009-09-21 18:01:11 -06:00
|
|
|
static const struct inode_operations btrfs_special_inode_operations = {
|
2007-07-11 08:18:17 -06:00
|
|
|
.getattr = btrfs_getattr,
|
|
|
|
.setattr = btrfs_setattr,
|
2008-01-14 11:26:08 -07:00
|
|
|
.permission = btrfs_permission,
|
2008-08-28 04:21:17 -06:00
|
|
|
.setxattr = btrfs_setxattr,
|
|
|
|
.getxattr = btrfs_getxattr,
|
2008-07-24 10:16:36 -06:00
|
|
|
.listxattr = btrfs_listxattr,
|
2008-08-28 04:21:17 -06:00
|
|
|
.removexattr = btrfs_removexattr,
|
2007-07-11 08:18:17 -06:00
|
|
|
};
|
2009-09-21 18:01:11 -06:00
|
|
|
static const struct inode_operations btrfs_symlink_inode_operations = {
|
2007-06-12 04:35:45 -06:00
|
|
|
.readlink = generic_readlink,
|
|
|
|
.follow_link = page_follow_link_light,
|
|
|
|
.put_link = page_put_link,
|
2010-11-18 19:05:24 -07:00
|
|
|
.getattr = btrfs_getattr,
|
2008-01-14 11:26:08 -07:00
|
|
|
.permission = btrfs_permission,
|
2009-02-04 07:29:13 -07:00
|
|
|
.setxattr = btrfs_setxattr,
|
|
|
|
.getxattr = btrfs_getxattr,
|
|
|
|
.listxattr = btrfs_listxattr,
|
|
|
|
.removexattr = btrfs_removexattr,
|
2007-06-12 04:35:45 -06:00
|
|
|
};
|
2009-09-21 14:00:26 -06:00
|
|
|
|
2009-10-09 07:54:36 -06:00
|
|
|
const struct dentry_operations btrfs_dentry_operations = {
|
2009-09-21 14:00:26 -06:00
|
|
|
.d_delete = btrfs_dentry_delete,
|
|
|
|
};
|