Merge branch 'xfs-dax-support' into for-next

commit 66e8ac7bfa
12 changed files with 333 additions and 154 deletions

fs/dax.c | 34
@@ -309,14 +309,21 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 out:
 	i_mmap_unlock_read(mapping);
 
-	if (bh->b_end_io)
-		bh->b_end_io(bh, 1);
-
 	return error;
 }
 
-static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-			get_block_t get_block)
+/**
+ * __dax_fault - handle a page fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * When a page fault occurs, filesystems may call this helper in their
+ * fault handler for DAX files. __dax_fault() assumes the caller has done all
+ * the necessary locking for the page fault to proceed successfully.
+ */
+int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+			get_block_t get_block, dax_iodone_t complete_unwritten)
 {
 	struct file *file = vma->vm_file;
 	struct address_space *mapping = file->f_mapping;

@@ -417,7 +424,19 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		page_cache_release(page);
 	}
 
+	/*
+	 * If we successfully insert the new mapping over an unwritten extent,
+	 * we need to ensure we convert the unwritten extent. If there is an
+	 * error inserting the mapping, the filesystem needs to leave it as
+	 * unwritten to prevent exposure of the stale underlying data to
+	 * userspace, but we still need to call the completion function so
+	 * the private resources on the mapping buffer can be released. We
+	 * indicate what the callback should do via the uptodate variable, same
+	 * as for normal BH based IO completions.
+	 */
 	error = dax_insert_mapping(inode, &bh, vma, vmf);
+	if (buffer_unwritten(&bh))
+		complete_unwritten(&bh, !error);
 
  out:
 	if (error == -ENOMEM)

@@ -434,6 +453,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	}
 	goto out;
 }
+EXPORT_SYMBOL(__dax_fault);
 
 /**
  * dax_fault - handle a page fault on a DAX file

@@ -445,7 +465,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
  * fault handler for DAX files.
  */
 int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-			get_block_t get_block)
+			get_block_t get_block, dax_iodone_t complete_unwritten)
 {
 	int result;
 	struct super_block *sb = file_inode(vma->vm_file)->i_sb;

@@ -454,7 +474,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		sb_start_pagefault(sb);
 		file_update_time(vma->vm_file);
 	}
-	result = do_dax_fault(vma, vmf, get_block);
+	result = __dax_fault(vma, vmf, get_block, complete_unwritten);
 	if (vmf->flags & FAULT_FLAG_WRITE)
 		sb_end_pagefault(sb);
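The hunks above split the fault path into __dax_fault()/dax_fault() and add a dax_iodone_t completion argument. Purely for orientation (this is not part of the commit), a minimal sketch of how a filesystem wires the reworked entry points into its vm_operations_struct; the example_* identifiers are placeholders, and the pattern mirrors the ext2 and ext4 callers updated below:

#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/mm.h>

/*
 * Placeholders for a real filesystem's block mapping and unwritten-extent
 * conversion helpers; ext2 passes NULL for the latter, ext4 and XFS pass a
 * real converter.
 */
extern int example_get_block(struct inode *inode, sector_t iblock,
			     struct buffer_head *bh_result, int create);
extern void example_end_io_unwritten(struct buffer_head *bh, int uptodate);

static int example_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	/*
	 * dax_fault() takes sb_start_pagefault()/file_update_time() itself for
	 * write faults; callers that must take their own locks first (such as
	 * XFS) call __dax_fault() directly instead.
	 */
	return dax_fault(vma, vmf, example_get_block, example_end_io_unwritten);
}

static int example_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	return dax_mkwrite(vma, vmf, example_get_block, example_end_io_unwritten);
}

static const struct vm_operations_struct example_dax_vm_ops = {
	.fault		= example_dax_fault,
	.page_mkwrite	= example_dax_mkwrite,
};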
fs/ext2/file.c
@@ -28,12 +28,12 @@
 #ifdef CONFIG_FS_DAX
 static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	return dax_fault(vma, vmf, ext2_get_block);
+	return dax_fault(vma, vmf, ext2_get_block, NULL);
 }
 
 static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	return dax_mkwrite(vma, vmf, ext2_get_block);
+	return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
 }
 
 static const struct vm_operations_struct ext2_dax_vm_ops = {
fs/ext4/file.c
@@ -192,15 +192,27 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 }
 
 #ifdef CONFIG_FS_DAX
+static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
+{
+	struct inode *inode = bh->b_assoc_map->host;
+	/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
+	loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
+	int err;
+	if (!uptodate)
+		return;
+	WARN_ON(!buffer_unwritten(bh));
+	err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
+}
+
 static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	return dax_fault(vma, vmf, ext4_get_block);
+	return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
+					/* Is this the right get_block? */
 }
 
 static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	return dax_mkwrite(vma, vmf, ext4_get_block);
+	return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
 }
 
 static const struct vm_operations_struct ext4_dax_vm_ops = {
fs/ext4/inode.c
@@ -656,18 +656,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 	return retval;
 }
 
-static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
-{
-	struct inode *inode = bh->b_assoc_map->host;
-	/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
-	loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
-	int err;
-	if (!uptodate)
-		return;
-	WARN_ON(!buffer_unwritten(bh));
-	err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
-}
-
 /* Maximum number of blocks we map for direct IO at once. */
 #define DIO_MAX_BLOCKS 4096

@@ -705,10 +693,15 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
 
 		map_bh(bh, inode->i_sb, map.m_pblk);
 		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
-		if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
+		if (IS_DAX(inode) && buffer_unwritten(bh)) {
+			/*
+			 * dgc: I suspect unwritten conversion on ext4+DAX is
+			 * fundamentally broken here when there are concurrent
+			 * read/write in progress on this inode.
+			 */
+			WARN_ON_ONCE(io_end);
 			bh->b_assoc_map = inode->i_mapping;
 			bh->b_private = (void *)(unsigned long)iblock;
 			bh->b_end_io = ext4_end_io_unwritten;
 		}
 		if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
 			set_buffer_defer_completion(bh);
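The ext4 hunks above stash the logical block number in bh->b_private at map time and reconstruct the byte offset in the completion callback. A small sketch of that round trip, with hypothetical example_* helpers, which also shows where the 32-bit truncation flagged by the XXX comment creeps in:

#include <linux/types.h>

/*
 * Round-trip used by the ext4 DAX path in this diff: the logical block number
 * is stored in the pointer-sized b_private field when the buffer is mapped,
 * and turned back into a byte offset at completion time.  On 32-bit the cast
 * to unsigned long truncates large block numbers, which is what the
 * "breaks on 32-bit" comment warns about.
 */
static inline void *example_encode_iblock(sector_t iblock)
{
	return (void *)(unsigned long)iblock;
}

static inline loff_t example_decode_offset(void *priv, unsigned int blkbits)
{
	return (loff_t)(uintptr_t)priv << blkbits;
}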
fs/xfs/xfs_aops.c
@@ -1349,7 +1349,7 @@ __xfs_get_blocks(
 	sector_t		iblock,
 	struct buffer_head	*bh_result,
 	int			create,
-	int			direct)
+	bool			direct)
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;

@@ -1414,6 +1414,7 @@ __xfs_get_blocks(
 		if (error)
 			return error;
+		new = 1;
 
 	} else {
 		/*
 		 * Delalloc reservations do not require a transaction,

@@ -1508,49 +1509,29 @@ xfs_get_blocks(
 	struct buffer_head	*bh_result,
 	int			create)
 {
-	return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
+	return __xfs_get_blocks(inode, iblock, bh_result, create, false);
 }
 
-STATIC int
+int
 xfs_get_blocks_direct(
 	struct inode		*inode,
 	sector_t		iblock,
 	struct buffer_head	*bh_result,
 	int			create)
 {
-	return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
+	return __xfs_get_blocks(inode, iblock, bh_result, create, true);
 }
 
-/*
- * Complete a direct I/O write request.
- *
- * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
- * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
- * wholly within the EOF and so there is nothing for us to do. Note that in this
- * case the completion can be called in interrupt context, whereas if we have an
- * ioend we will always be called in task context (i.e. from a workqueue).
- */
-STATIC void
-xfs_end_io_direct_write(
-	struct kiocb		*iocb,
+static void
+__xfs_end_io_direct_write(
+	struct inode		*inode,
+	struct xfs_ioend	*ioend,
 	loff_t			offset,
-	ssize_t			size,
-	void			*private)
+	ssize_t			size)
 {
-	struct inode		*inode = file_inode(iocb->ki_filp);
-	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ioend	*ioend = private;
-
-	trace_xfs_gbmap_direct_endio(ip, offset, size,
-				     ioend ? ioend->io_type : 0, NULL);
-
-	if (!ioend) {
-		ASSERT(offset + size <= i_size_read(inode));
-		return;
-	}
+	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
 
-	if (XFS_FORCED_SHUTDOWN(mp))
+	if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error)
 		goto out_end_io;
 
 	/*

@@ -1587,10 +1568,10 @@ xfs_end_io_direct_write(
 	 * here can result in EOF moving backwards and Bad Things Happen when
 	 * that occurs.
 	 */
-	spin_lock(&ip->i_flags_lock);
+	spin_lock(&XFS_I(inode)->i_flags_lock);
 	if (offset + size > i_size_read(inode))
 		i_size_write(inode, offset + size);
-	spin_unlock(&ip->i_flags_lock);
+	spin_unlock(&XFS_I(inode)->i_flags_lock);
 
 	/*
 	 * If we are doing an append IO that needs to update the EOF on disk,

@@ -1607,6 +1588,98 @@ xfs_end_io_direct_write(
 	return;
 }
 
+/*
+ * Complete a direct I/O write request.
+ *
+ * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
+ * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
+ * wholly within the EOF and so there is nothing for us to do. Note that in this
+ * case the completion can be called in interrupt context, whereas if we have an
+ * ioend we will always be called in task context (i.e. from a workqueue).
+ */
+STATIC void
+xfs_end_io_direct_write(
+	struct kiocb		*iocb,
+	loff_t			offset,
+	ssize_t			size,
+	void			*private)
+{
+	struct inode		*inode = file_inode(iocb->ki_filp);
+	struct xfs_ioend	*ioend = private;
+
+	trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
+				     ioend ? ioend->io_type : 0, NULL);
+
+	if (!ioend) {
+		ASSERT(offset + size <= i_size_read(inode));
+		return;
+	}
+
+	__xfs_end_io_direct_write(inode, ioend, offset, size);
+}
+
+/*
+ * For DAX we need a mapping buffer callback for unwritten extent conversion
+ * when page faults allocate blocks and then zero them. Note that in this
+ * case the mapping indicated by the ioend may extend beyond EOF. We most
+ * definitely do not want to extend EOF here, so we trim back the ioend size to
+ * EOF.
+ */
+#ifdef CONFIG_FS_DAX
+void
+xfs_end_io_dax_write(
+	struct buffer_head	*bh,
+	int			uptodate)
+{
+	struct xfs_ioend	*ioend = bh->b_private;
+	struct inode		*inode = ioend->io_inode;
+	ssize_t			size = ioend->io_size;
+
+	ASSERT(IS_DAX(ioend->io_inode));
+
+	/* if there was an error zeroing, then don't convert it */
+	if (!uptodate)
+		ioend->io_error = -EIO;
+
+	/*
+	 * Trim update to EOF, so we don't extend EOF during unwritten extent
+	 * conversion of partial EOF blocks.
+	 */
+	spin_lock(&XFS_I(inode)->i_flags_lock);
+	if (ioend->io_offset + size > i_size_read(inode))
+		size = i_size_read(inode) - ioend->io_offset;
+	spin_unlock(&XFS_I(inode)->i_flags_lock);
+
+	__xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size);
+
+}
+#else
+void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { }
+#endif
+
+static inline ssize_t
+xfs_vm_do_dio(
+	struct inode		*inode,
+	struct kiocb		*iocb,
+	struct iov_iter		*iter,
+	loff_t			offset,
+	void			(*endio)(struct kiocb	*iocb,
+					 loff_t		offset,
+					 ssize_t	size,
+					 void		*private),
+	int			flags)
+{
+	struct block_device	*bdev;
+
+	if (IS_DAX(inode))
+		return dax_do_io(iocb, inode, iter, offset,
+				 xfs_get_blocks_direct, endio, 0);
+
+	bdev = xfs_find_bdev_for_inode(inode);
+	return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
+				    xfs_get_blocks_direct, endio, NULL, flags);
+}
+
 STATIC ssize_t
 xfs_vm_direct_IO(
 	struct kiocb		*iocb,

@@ -1614,16 +1687,11 @@ xfs_vm_direct_IO(
 	loff_t			offset)
 {
 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
-	struct block_device	*bdev = xfs_find_bdev_for_inode(inode);
 
-	if (iov_iter_rw(iter) == WRITE) {
-		return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
-					    xfs_get_blocks_direct,
-					    xfs_end_io_direct_write, NULL,
-					    DIO_ASYNC_EXTEND);
-	}
-	return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
-				    xfs_get_blocks_direct, NULL, NULL, 0);
+	if (iov_iter_rw(iter) == WRITE)
+		return xfs_vm_do_dio(inode, iocb, iter, offset,
+				     xfs_end_io_direct_write, DIO_ASYNC_EXTEND);
+	return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0);
 }
 
 /*
fs/xfs/xfs_aops.h
@@ -53,7 +53,12 @@ typedef struct xfs_ioend {
 } xfs_ioend_t;
 
 extern const struct address_space_operations xfs_address_space_operations;
-extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
+
+int	xfs_get_blocks(struct inode *inode, sector_t offset,
+		       struct buffer_head *map_bh, int create);
+int	xfs_get_blocks_direct(struct inode *inode, sector_t offset,
+			      struct buffer_head *map_bh, int create);
+void	xfs_end_io_dax_write(struct buffer_head *bh, int uptodate);
 
 extern void xfs_count_page_state(struct page *, int *, int *);
fs/xfs/xfs_bmap_util.c
@@ -1133,14 +1133,29 @@ xfs_zero_remaining_bytes(
 			break;
 		ASSERT(imap.br_blockcount >= 1);
 		ASSERT(imap.br_startoff == offset_fsb);
+		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+
+		if (imap.br_startblock == HOLESTARTBLOCK ||
+		    imap.br_state == XFS_EXT_UNWRITTEN) {
+			/* skip the entire extent */
+			lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff +
+						  imap.br_blockcount) - 1;
+			continue;
+		}
+
 		lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
 		if (lastoffset > endoff)
 			lastoffset = endoff;
-		if (imap.br_startblock == HOLESTARTBLOCK)
-			continue;
-		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-		if (imap.br_state == XFS_EXT_UNWRITTEN)
+
+		/* DAX can just zero the backing device directly */
+		if (IS_DAX(VFS_I(ip))) {
+			error = dax_zero_page_range(VFS_I(ip), offset,
+						    lastoffset - offset + 1,
+						    xfs_get_blocks_direct);
+			if (error)
+				return error;
 			continue;
+		}
 
 		error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ?
 				mp->m_rtdev_targp : mp->m_ddev_targp,
fs/xfs/xfs_file.c
@@ -79,14 +79,15 @@ xfs_rw_ilock_demote(
 }
 
 /*
- * xfs_iozero
+ * xfs_iozero clears the specified range supplied via the page cache (except in
+ * the DAX case). Writes through the page cache will allocate blocks over holes,
+ * though the callers usually map the holes first and avoid them. If a block is
+ * not completely zeroed, then it will be read from disk before being partially
+ * zeroed.
  *
- *	xfs_iozero clears the specified range of buffer supplied,
- *	and marks all the affected blocks as valid and modified.  If
- *	an affected block is not allocated, it will be allocated.  If
- *	an affected block is not completely overwritten, and is not
- *	valid before the operation, it will be read from disk before
- *	being partially zeroed.
+ * In the DAX case, we can just directly write to the underlying pages. This
+ * will not allocate blocks, but will avoid holes and unwritten extents and so
+ * not do unnecessary work.
  */
 int
 xfs_iozero(

@@ -96,7 +97,8 @@ xfs_iozero(
 {
 	struct page		*page;
 	struct address_space	*mapping;
-	int			status;
+	int			status = 0;
 
 	mapping = VFS_I(ip)->i_mapping;
 	do {

@@ -108,20 +110,27 @@ xfs_iozero(
 		if (bytes > count)
 			bytes = count;
 
-		status = pagecache_write_begin(NULL, mapping, pos, bytes,
-					AOP_FLAG_UNINTERRUPTIBLE,
-					&page, &fsdata);
-		if (status)
-			break;
+		if (IS_DAX(VFS_I(ip))) {
+			status = dax_zero_page_range(VFS_I(ip), pos, bytes,
+						     xfs_get_blocks_direct);
+			if (status)
+				break;
+		} else {
+			status = pagecache_write_begin(NULL, mapping, pos, bytes,
+						AOP_FLAG_UNINTERRUPTIBLE,
+						&page, &fsdata);
+			if (status)
+				break;
 
-		zero_user(page, offset, bytes);
+			zero_user(page, offset, bytes);
 
-		status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
-					page, fsdata);
-		WARN_ON(status <= 0); /* can't return less than zero! */
+			status = pagecache_write_end(NULL, mapping, pos, bytes,
+						bytes, page, fsdata);
+			WARN_ON(status <= 0); /* can't return less than zero! */
+			status = 0;
+		}
 		pos += bytes;
 		count -= bytes;
-		status = 0;
 	} while (count);
 
 	return status;

@@ -284,7 +293,7 @@ xfs_file_read_iter(
 	if (file->f_mode & FMODE_NOCMTIME)
 		ioflags |= XFS_IO_INVIS;
 
-	if (unlikely(ioflags & XFS_IO_ISDIRECT)) {
+	if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
 		xfs_buftarg_t	*target =
 			XFS_IS_REALTIME_INODE(ip) ?
 				mp->m_rtdev_targp : mp->m_ddev_targp;

@@ -378,7 +387,11 @@ xfs_file_splice_read(
 
 	trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
 
-	ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
+	/* for dax, we need to avoid the page cache */
+	if (IS_DAX(VFS_I(ip)))
+		ret = default_file_splice_read(infilp, ppos, pipe, count, flags);
+	else
+		ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
 	if (ret > 0)
 		XFS_STATS_ADD(xs_read_bytes, ret);

@@ -672,7 +685,7 @@ xfs_file_dio_aio_write(
 				mp->m_rtdev_targp : mp->m_ddev_targp;
 
 	/* DIO must be aligned to device logical sector size */
-	if ((pos | count) & target->bt_logical_sectormask)
+	if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask))
 		return -EINVAL;
 
 	/* "unaligned" here means not aligned to a filesystem block */

@@ -758,8 +771,11 @@ xfs_file_dio_aio_write(
 out:
 	xfs_rw_iunlock(ip, iolock);
 
-	/* No fallback to buffered IO on errors for XFS. */
-	ASSERT(ret < 0 || ret == count);
+	/*
+	 * No fallback to buffered IO on errors for XFS. DAX can result in
+	 * partial writes, but direct IO will either complete fully or fail.
+	 */
+	ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
 	return ret;
 }

@@ -842,7 +858,7 @@ xfs_file_write_iter(
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return -EIO;
 
-	if (unlikely(iocb->ki_flags & IOCB_DIRECT))
+	if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
 		ret = xfs_file_dio_aio_write(iocb, from);
 	else
 		ret = xfs_file_buffered_aio_write(iocb, from);

@@ -1063,17 +1079,6 @@ xfs_file_readdir(
 	return xfs_readdir(ip, ctx, bufsize);
 }
 
-STATIC int
-xfs_file_mmap(
-	struct file	*filp,
-	struct vm_area_struct *vma)
-{
-	vma->vm_ops = &xfs_file_vm_ops;
-
-	file_accessed(filp);
-	return 0;
-}
-
 /*
  * This type is designed to indicate the type of offset we would like
  * to search from page cache for xfs_seek_hole_data().

@@ -1454,26 +1459,11 @@ xfs_file_llseek(
  * ordering of:
  *
  * mmap_sem (MM)
- *   i_mmap_lock (XFS - truncate serialisation)
- *     page_lock (MM)
- *       i_lock (XFS - extent map serialisation)
+ *   sb_start_pagefault(vfs, freeze)
+ *     i_mmap_lock (XFS - truncate serialisation)
+ *       page_lock (MM)
+ *         i_lock (XFS - extent map serialisation)
  */
-STATIC int
-xfs_filemap_fault(
-	struct vm_area_struct	*vma,
-	struct vm_fault		*vmf)
-{
-	struct xfs_inode	*ip = XFS_I(vma->vm_file->f_mapping->host);
-	int			error;
-
-	trace_xfs_filemap_fault(ip);
-
-	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
-	error = filemap_fault(vma, vmf);
-	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
-
-	return error;
-}
 
 /*
  * mmap()d file has taken write protection fault and is being made writable. We

@@ -1486,16 +1476,66 @@ xfs_filemap_page_mkwrite(
 	struct vm_area_struct	*vma,
 	struct vm_fault		*vmf)
 {
-	struct xfs_inode	*ip = XFS_I(vma->vm_file->f_mapping->host);
-	int			error;
+	struct inode		*inode = file_inode(vma->vm_file);
+	int			ret;
 
-	trace_xfs_filemap_page_mkwrite(ip);
+	trace_xfs_filemap_page_mkwrite(XFS_I(inode));
 
+	sb_start_pagefault(inode->i_sb);
+	file_update_time(vma->vm_file);
+	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+
+	if (IS_DAX(inode)) {
+		ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
+				    xfs_end_io_dax_write);
+	} else {
+		ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
+		ret = block_page_mkwrite_return(ret);
+	}
+
+	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+	sb_end_pagefault(inode->i_sb);
+
+	return ret;
+}
+
+STATIC int
+xfs_filemap_fault(
+	struct vm_area_struct	*vma,
+	struct vm_fault		*vmf)
+{
+	struct xfs_inode	*ip = XFS_I(file_inode(vma->vm_file));
+	int			ret;
+
+	trace_xfs_filemap_fault(ip);
+
+	/* DAX can shortcut the normal fault path on write faults! */
+	if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(VFS_I(ip)))
+		return xfs_filemap_page_mkwrite(vma, vmf);
 
 	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
-	error = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+	ret = filemap_fault(vma, vmf);
 	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
 
-	return error;
+	return ret;
+}
+
+static const struct vm_operations_struct xfs_file_vm_ops = {
+	.fault		= xfs_filemap_fault,
+	.map_pages	= filemap_map_pages,
+	.page_mkwrite	= xfs_filemap_page_mkwrite,
+};
+
+STATIC int
+xfs_file_mmap(
+	struct file	*filp,
+	struct vm_area_struct	*vma)
+{
+	file_accessed(filp);
+	vma->vm_ops = &xfs_file_vm_ops;
+	if (IS_DAX(file_inode(filp)))
+		vma->vm_flags |= VM_MIXEDMAP;
+	return 0;
 }
 
 const struct file_operations xfs_file_operations = {

@@ -1526,9 +1566,3 @@ const struct file_operations xfs_dir_file_operations = {
 #endif
 	.fsync		= xfs_dir_fsync,
 };
-
-static const struct vm_operations_struct xfs_file_vm_ops = {
-	.fault		= xfs_filemap_fault,
-	.map_pages	= filemap_map_pages,
-	.page_mkwrite	= xfs_filemap_page_mkwrite,
-};
fs/xfs/xfs_iops.c
@@ -851,7 +851,11 @@ xfs_setattr_size(
 	 * to hope that the caller sees ENOMEM and retries the truncate
 	 * operation.
 	 */
-	error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
+	if (IS_DAX(inode))
+		error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct);
+	else
+		error = block_truncate_page(inode->i_mapping, newsize,
+					    xfs_get_blocks);
 	if (error)
 		return error;
 	truncate_setsize(inode, newsize);

@@ -1191,22 +1195,22 @@ xfs_diflags_to_iflags(
 	struct inode		*inode,
 	struct xfs_inode	*ip)
 {
-	if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
+	uint16_t		flags = ip->i_d.di_flags;
+
+	inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC |
+			    S_NOATIME | S_DAX);
+
+	if (flags & XFS_DIFLAG_IMMUTABLE)
 		inode->i_flags |= S_IMMUTABLE;
-	else
-		inode->i_flags &= ~S_IMMUTABLE;
-	if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
+	if (flags & XFS_DIFLAG_APPEND)
 		inode->i_flags |= S_APPEND;
-	else
-		inode->i_flags &= ~S_APPEND;
-	if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)
+	if (flags & XFS_DIFLAG_SYNC)
 		inode->i_flags |= S_SYNC;
-	else
-		inode->i_flags &= ~S_SYNC;
-	if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)
+	if (flags & XFS_DIFLAG_NOATIME)
 		inode->i_flags |= S_NOATIME;
-	else
-		inode->i_flags &= ~S_NOATIME;
+	/* XXX: Also needs an on-disk per inode flag! */
+	if (ip->i_mount->m_flags & XFS_MOUNT_DAX)
+		inode->i_flags |= S_DAX;
 }
 
 /*
fs/xfs/xfs_mount.h
@@ -181,6 +181,8 @@ typedef struct xfs_mount {
 						   allocator */
 #define XFS_MOUNT_NOATTR2	(1ULL << 25)	/* disable use of attr2 format */
 
+#define XFS_MOUNT_DAX		(1ULL << 62)	/* TEST ONLY! */
+
 
 /*
  * Default minimum read and write sizes.
fs/xfs/xfs_super.c
@@ -112,6 +112,8 @@ static struct xfs_kobj xfs_dbg_kobj;	/* global debug sysfs attrs */
 #define MNTOPT_DISCARD	   "discard"	/* Discard unused blocks */
 #define MNTOPT_NODISCARD   "nodiscard"	/* Do not discard unused blocks */
 
+#define MNTOPT_DAX	"dax"		/* Enable direct access to bdev pages */
+
 /*
  * Table driven mount option parser.
  *

@@ -363,6 +365,10 @@ xfs_parseargs(
 			mp->m_flags |= XFS_MOUNT_DISCARD;
 		} else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
 			mp->m_flags &= ~XFS_MOUNT_DISCARD;
+#ifdef CONFIG_FS_DAX
+		} else if (!strcmp(this_char, MNTOPT_DAX)) {
+			mp->m_flags |= XFS_MOUNT_DAX;
+#endif
 		} else {
 			xfs_warn(mp, "unknown mount option [%s].", this_char);
 			return -EINVAL;

@@ -452,8 +458,8 @@ xfs_parseargs(
 }
 
 struct proc_xfs_info {
-	int	flag;
-	char	*str;
+	uint64_t	flag;
+	char		*str;
 };
 
 STATIC int

@@ -474,6 +480,7 @@ xfs_showargs(
 		{ XFS_MOUNT_GRPID,		"," MNTOPT_GRPID },
 		{ XFS_MOUNT_DISCARD,		"," MNTOPT_DISCARD },
 		{ XFS_MOUNT_SMALL_INUMS,	"," MNTOPT_32BITINODE },
+		{ XFS_MOUNT_DAX,		"," MNTOPT_DAX },
 		{ 0, NULL }
 	};
 	static struct proc_xfs_info xfs_info_unset[] = {

@@ -1507,6 +1514,20 @@ xfs_fs_fill_super(
 	if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
 		sb->s_flags |= MS_I_VERSION;
 
+	if (mp->m_flags & XFS_MOUNT_DAX) {
+		xfs_warn(mp,
+	"DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+		if (sb->s_blocksize != PAGE_SIZE) {
+			xfs_alert(mp,
+		"Filesystem block size invalid for DAX Turning DAX off.");
+			mp->m_flags &= ~XFS_MOUNT_DAX;
+		} else if (!sb->s_bdev->bd_disk->fops->direct_access) {
+			xfs_alert(mp,
+		"Block device does not support DAX Turning DAX off.");
+			mp->m_flags &= ~XFS_MOUNT_DAX;
+		}
+	}
+
 	error = xfs_mountfs(mp);
 	if (error)
 		goto out_filestream_unmount;
include/linux/fs.h
@@ -70,6 +70,7 @@ typedef int (get_block_t)(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh_result, int create);
 typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 			ssize_t bytes, void *private);
+typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate);
 
 #define MAY_EXEC		0x00000001
 #define MAY_WRITE		0x00000002

@@ -2627,9 +2628,13 @@ ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
 int dax_clear_blocks(struct inode *, sector_t block, long size);
 int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
 int dax_truncate_page(struct inode *, loff_t from, get_block_t);
-int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
+		dax_iodone_t);
+int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
+		dax_iodone_t);
 int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
-#define dax_mkwrite(vma, vmf, gb)	dax_fault(vma, vmf, gb)
+#define dax_mkwrite(vma, vmf, gb, iod)		dax_fault(vma, vmf, gb, iod)
+#define __dax_mkwrite(vma, vmf, gb, iod)	__dax_fault(vma, vmf, gb, iod)
 
 #ifdef CONFIG_BLOCK
 typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
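The dax_iodone_t typedef above is the completion hook the reworked fault path invokes; per the fs/dax.c comment earlier in this diff it is called even when the mapping insert fails, with uptodate following the usual b_end_io convention. A hedged sketch of a conforming callback, modelled on the ext4 helper in this commit (example_convert_range is a placeholder for the filesystem's unwritten-extent converter):

#include <linux/fs.h>
#include <linux/buffer_head.h>

/* Placeholder for a filesystem-specific unwritten extent converter. */
extern int example_convert_range(struct inode *inode, loff_t offset, size_t len);

/*
 * Conforms to dax_iodone_t.  The bh is the mapping buffer the filesystem
 * filled in from its get_block callback (which must have set b_assoc_map and
 * stashed the block number in b_private, as _ext4_get_block does above);
 * uptodate is non-zero only when the DAX insert/zeroing succeeded.
 */
static void example_dax_end_io(struct buffer_head *bh, int uptodate)
{
	struct inode *inode = bh->b_assoc_map->host;
	loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;

	if (!uptodate)
		return;	/* insert failed: leave the extent unwritten */

	example_convert_range(inode, offset, bh->b_size);
}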