4dc4199770
-----BEGIN PGP SIGNATURE----- iQIzBAABCAAdFiEEZH8oZUiU471FcZm+ONu9yGCSaT4FAl5TfLwACgkQONu9yGCS aT5wlRAAhZELK39c78NMCTZKHtKGLsGb2os2IiI7zIRbqNNwnvJi+jAc3kgbS9jP +W+wnhYFtFisDvqdCQ009I6A0NA1p3Nqy166JplW0iIg1e7rgUKKUfabCN9sJmjh HGK913cJlHwGmkSxq//sBucBwWhYYGaHec28pZ7uCFATjWrTaH3G4VrvLStuicYR YgS9MH261tWJKJm5+V2MxnOOI0103+Uey+xVqwSnLlV+qmasxwDCMU5ae+SK7e7f cXIkNZwvDph1zunekHg+jd64GN3GYswXVcRighWP0n7Lr+0tGPN7SY5pvZIjZLv/ sdroyrqAxytTYP32hypIUgsToVvJr7zXD09LGdsgOCKVwFVn8yl1e4zgGKH3L9Xu OK2krI90v1MVevibyaNndZ4UDKilF75oE2YYDOFW/BU1lorFAIzk4hh15CfKc8s1 KHRjePfcgQREs/SGK8k2BAmf/JwxFN1/Ro5dl7MvKn07ZYqx6QOwUoMhgxspIntN 9TlFw6elu1RSwu2BFts9wvoHO1tr7GZBa1cVkNF8qV1rzaGVY68aLDvvHGdffD6W JgX+BCfr6vcN7R4izak1RxzAoqDrRxS0vWoC1vVsPqeIIZydSxpYDquaFnbZm+Wc MRuh5gpQ2PzTXuMLeBB+ig6UnzsAO3x+3yIG/l5ZmmYxJbMFBKU= =zE/i -----END PGP SIGNATURE----- Merge 4.19.106 into android-4.19 Changes in 4.19.106 core: Don't skip generic XDP program execution for cloned SKBs enic: prevent waking up stopped tx queues over watchdog reset net/smc: fix leak of kernel memory to user space net: dsa: tag_qca: Make sure there is headroom for tag net/sched: matchall: add missing validation of TCA_MATCHALL_FLAGS net/sched: flower: add missing validation of TCA_FLOWER_FLAGS Revert "KVM: nVMX: Use correct root level for nested EPT shadow page tables" Revert "KVM: VMX: Add non-canonical check on writes to RTIT address MSRs" KVM: nVMX: Use correct root level for nested EPT shadow page tables drm/gma500: Fixup fbdev stolen size usage evaluation cpu/hotplug, stop_machine: Fix stop_machine vs hotplug order brcmfmac: Fix use after free in brcmf_sdio_readframes() leds: pca963x: Fix open-drain initialization ext4: fix ext4_dax_read/write inode locking sequence for IOCB_NOWAIT ALSA: ctl: allow TLV read operation for callback type of element in locked case gianfar: Fix TX timestamping with a stacked DSA driver pinctrl: sh-pfc: sh7264: Fix CAN function GPIOs pxa168fb: Fix the function used to release some memory in an error handling path media: i2c: mt9v032: fix enum mbus codes and frame sizes powerpc/powernv/iov: Ensure the pdn for VFs always contains a valid PE number gpio: gpio-grgpio: fix possible sleep-in-atomic-context bugs in grgpio_irq_map/unmap() iommu/vt-d: Fix off-by-one in PASID allocation char/random: silence a lockdep splat with printk() media: sti: bdisp: fix a possible sleep-in-atomic-context bug in bdisp_device_run() pinctrl: baytrail: Do not clear IRQ flags on direct-irq enabled pins efi/x86: Map the entire EFI vendor string before copying it MIPS: Loongson: Fix potential NULL dereference in loongson3_platform_init() sparc: Add .exit.data section. uio: fix a sleep-in-atomic-context bug in uio_dmem_genirq_irqcontrol() usb: gadget: udc: fix possible sleep-in-atomic-context bugs in gr_probe() usb: dwc2: Fix IN FIFO allocation clocksource/drivers/bcm2835_timer: Fix memory leak of timer kselftest: Minimise dependency of get_size on C library interfaces jbd2: clear JBD2_ABORT flag before journal_reset to update log tail info when load journal x86/sysfb: Fix check for bad VRAM size pwm: omap-dmtimer: Simplify error handling s390/pci: Fix possible deadlock in recover_store() powerpc/iov: Move VF pdev fixup into pcibios_fixup_iov() tracing: Fix tracing_stat return values in error handling paths tracing: Fix very unlikely race of registering two stat tracers ARM: 8952/1: Disable kmemleak on XIP kernels ext4, jbd2: ensure panic when aborting with zero errno ath10k: Correct the DMA direction for management tx buffers drm/amd/display: Retrain dongles when SINK_COUNT becomes non-zero nbd: add a flush_workqueue in nbd_start_device KVM: s390: ENOTSUPP -> EOPNOTSUPP fixups kconfig: fix broken dependency in randconfig-generated .config clk: qcom: rcg2: Don't crash if our parent can't be found; return an error drm/amdgpu: remove 4 set but not used variable in amdgpu_atombios_get_connector_info_from_object_table drm/amdgpu: Ensure ret is always initialized when using SOC15_WAIT_ON_RREG regulator: rk808: Lower log level on optional GPIOs being not available net/wan/fsl_ucc_hdlc: reject muram offsets above 64K NFC: port100: Convert cpu_to_le16(le16_to_cpu(E1) + E2) to use le16_add_cpu(). selinux: fall back to ref-walk if audit is required arm64: dts: allwinner: H6: Add PMU mode arm: dts: allwinner: H3: Add PMU node selinux: ensure we cleanup the internal AVC counters on error in avc_insert() arm64: dts: qcom: msm8996: Disable USB2 PHY suspend by core ARM: dts: imx6: rdu2: Disable WP for USDHC2 and USDHC3 ARM: dts: imx6: rdu2: Limit USBH1 to Full Speed PCI: iproc: Apply quirk_paxc_bridge() for module as well as built-in media: cx23885: Add support for AVerMedia CE310B PCI: Add generic quirk for increasing D3hot delay PCI: Increase D3 delay for AMD Ryzen5/7 XHCI controllers media: v4l2-device.h: Explicitly compare grp{id,mask} to zero in v4l2_device macros reiserfs: Fix spurious unlock in reiserfs_fill_super() error handling r8169: check that Realtek PHY driver module is loaded fore200e: Fix incorrect checks of NULL pointer dereference netfilter: nft_tunnel: add the missing ERSPAN_VERSION nla_policy ALSA: usx2y: Adjust indentation in snd_usX2Y_hwdep_dsp_status b43legacy: Fix -Wcast-function-type ipw2x00: Fix -Wcast-function-type iwlegacy: Fix -Wcast-function-type rtlwifi: rtl_pci: Fix -Wcast-function-type orinoco: avoid assertion in case of NULL pointer ACPICA: Disassembler: create buffer fields in ACPI_PARSE_LOAD_PASS1 scsi: ufs: Complete pending requests in host reset and restore path scsi: aic7xxx: Adjust indentation in ahc_find_syncrate drm/mediatek: handle events when enabling/disabling crtc ARM: dts: r8a7779: Add device node for ARM global timer selinux: ensure we cleanup the internal AVC counters on error in avc_update() dmaengine: Store module owner in dma_device struct dmaengine: imx-sdma: Fix memory leak crypto: chtls - Fixed memory leak x86/vdso: Provide missing include file PM / devfreq: rk3399_dmc: Add COMPILE_TEST and HAVE_ARM_SMCCC dependency pinctrl: sh-pfc: sh7269: Fix CAN function GPIOs reset: uniphier: Add SCSSI reset control for each channel RDMA/rxe: Fix error type of mmap_offset clk: sunxi-ng: add mux and pll notifiers for A64 CPU clock ALSA: sh: Fix unused variable warnings clk: uniphier: Add SCSSI clock gate for each channel ALSA: sh: Fix compile warning wrt const tools lib api fs: Fix gcc9 stringop-truncation compilation error ACPI: button: Add DMI quirk for Razer Blade Stealth 13 late 2019 lid switch mlx5: work around high stack usage with gcc drm: remove the newline for CRC source name. ARM: dts: stm32: Add power-supply for DSI panel on stm32f469-disco usbip: Fix unsafe unaligned pointer usage udf: Fix free space reporting for metadata and virtual partitions staging: rtl8188: avoid excessive stack usage IB/hfi1: Add software counter for ctxt0 seq drop soc/tegra: fuse: Correct straps' address for older Tegra124 device trees efi/x86: Don't panic or BUG() on non-critical error conditions rcu: Use WRITE_ONCE() for assignments to ->pprev for hlist_nulls Input: edt-ft5x06 - work around first register access error x86/nmi: Remove irq_work from the long duration NMI handler wan: ixp4xx_hss: fix compile-testing on 64-bit ASoC: atmel: fix build error with CONFIG_SND_ATMEL_SOC_DMA=m tty: synclinkmp: Adjust indentation in several functions tty: synclink_gt: Adjust indentation in several functions visorbus: fix uninitialized variable access driver core: platform: Prevent resouce overflow from causing infinite loops driver core: Print device when resources present in really_probe() bpf: Return -EBADRQC for invalid map type in __bpf_tx_xdp_map vme: bridges: reduce stack usage drm/nouveau/secboot/gm20b: initialize pointer in gm20b_secboot_new() drm/nouveau/gr/gk20a,gm200-: add terminators to method lists read from fw drm/nouveau: Fix copy-paste error in nouveau_fence_wait_uevent_handler drm/nouveau/drm/ttm: Remove set but not used variable 'mem' drm/nouveau/fault/gv100-: fix memory leak on module unload drm/vmwgfx: prevent memory leak in vmw_cmdbuf_res_add usb: musb: omap2430: Get rid of musb .set_vbus for omap2430 glue iommu/arm-smmu-v3: Use WRITE_ONCE() when changing validity of an STE f2fs: set I_LINKABLE early to avoid wrong access by vfs f2fs: free sysfs kobject scsi: iscsi: Don't destroy session if there are outstanding connections arm64: fix alternatives with LLVM's integrated assembler drm/amd/display: fixup DML dependencies watchdog/softlockup: Enforce that timestamp is valid on boot f2fs: fix memleak of kobject x86/mm: Fix NX bit clearing issue in kernel_map_pages_in_pgd pwm: omap-dmtimer: Remove PWM chip in .remove before making it unfunctional cmd64x: potential buffer overflow in cmd64x_program_timings() ide: serverworks: potential overflow in svwks_set_pio_mode() pwm: Remove set but not set variable 'pwm' btrfs: fix possible NULL-pointer dereference in integrity checks btrfs: safely advance counter when looking up bio csums btrfs: device stats, log when stats are zeroed module: avoid setting info->name early in case we can fall back to info->mod->name remoteproc: Initialize rproc_class before use irqchip/mbigen: Set driver .suppress_bind_attrs to avoid remove problems ALSA: hda/hdmi - add retry logic to parse_intel_hdmi() kbuild: use -S instead of -E for precise cc-option test in Kconfig x86/decoder: Add TEST opcode to Group3-2 s390: adjust -mpacked-stack support check for clang 10 s390/ftrace: generate traced function stack frame driver core: platform: fix u32 greater or equal to zero comparison ALSA: hda - Add docking station support for Lenovo Thinkpad T420s drm/nouveau/mmu: fix comptag memory leak powerpc/sriov: Remove VF eeh_dev state when disabling SR-IOV bcache: cached_dev_free needs to put the sb page iommu/vt-d: Remove unnecessary WARN_ON_ONCE() selftests: bpf: Reset global state between reuseport test runs jbd2: switch to use jbd2_journal_abort() when failed to submit the commit record jbd2: make sure ESHUTDOWN to be recorded in the journal superblock ARM: 8951/1: Fix Kexec compilation issue. hostap: Adjust indentation in prism2_hostapd_add_sta iwlegacy: ensure loop counter addr does not wrap and cause an infinite loop cifs: fix NULL dereference in match_prepath bpf: map_seq_next should always increase position index ceph: check availability of mds cluster on mount after wait timeout rbd: work around -Wuninitialized warning irqchip/gic-v3: Only provision redistributors that are enabled in ACPI drm/nouveau/disp/nv50-: prevent oops when no channel method map provided ftrace: fpid_next() should increase position index trigger_next should increase position index radeon: insert 10ms sleep in dce5_crtc_load_lut ocfs2: fix a NULL pointer dereference when call ocfs2_update_inode_fsync_trans() lib/scatterlist.c: adjust indentation in __sg_alloc_table reiserfs: prevent NULL pointer dereference in reiserfs_insert_item() bcache: explicity type cast in bset_bkey_last() irqchip/gic-v3-its: Reference to its_invall_cmd descriptor when building INVALL iwlwifi: mvm: Fix thermal zone registration microblaze: Prevent the overflow of the start brd: check and limit max_part par drm/amdgpu/smu10: fix smu10_get_clock_by_type_with_latency drm/amdgpu/smu10: fix smu10_get_clock_by_type_with_voltage NFS: Fix memory leaks help_next should increase position index cifs: log warning message (once) if out of disk space virtio_balloon: prevent pfn array overflow mlxsw: spectrum_dpipe: Add missing error path drm/amdgpu/display: handle multiple numbers of fclks in dcn_calcs.c (v2) Linux 4.19.106 Signed-off-by: Greg Kroah-Hartman <gregkh@google.com> Change-Id: Ia1032b50dd82b42e13973120dcbf94ae7b864648
541 lines
14 KiB
C
541 lines
14 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* linux/fs/ext4/file.c
|
|
*
|
|
* Copyright (C) 1992, 1993, 1994, 1995
|
|
* Remy Card (card@masi.ibp.fr)
|
|
* Laboratoire MASI - Institut Blaise Pascal
|
|
* Universite Pierre et Marie Curie (Paris VI)
|
|
*
|
|
* from
|
|
*
|
|
* linux/fs/minix/file.c
|
|
*
|
|
* Copyright (C) 1991, 1992 Linus Torvalds
|
|
*
|
|
* ext4 fs regular file handling primitives
|
|
*
|
|
* 64-bit file support on 64-bit platforms by Jakub Jelinek
|
|
* (jj@sunsite.ms.mff.cuni.cz)
|
|
*/
|
|
|
|
#include <linux/time.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/iomap.h>
|
|
#include <linux/mount.h>
|
|
#include <linux/path.h>
|
|
#include <linux/dax.h>
|
|
#include <linux/quotaops.h>
|
|
#include <linux/pagevec.h>
|
|
#include <linux/uio.h>
|
|
#include <linux/mman.h>
|
|
#include "ext4.h"
|
|
#include "ext4_jbd2.h"
|
|
#include "xattr.h"
|
|
#include "acl.h"
|
|
|
|
#ifdef CONFIG_FS_DAX
|
|
static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
|
|
{
|
|
struct inode *inode = file_inode(iocb->ki_filp);
|
|
ssize_t ret;
|
|
|
|
if (iocb->ki_flags & IOCB_NOWAIT) {
|
|
if (!inode_trylock_shared(inode))
|
|
return -EAGAIN;
|
|
} else {
|
|
inode_lock_shared(inode);
|
|
}
|
|
/*
|
|
* Recheck under inode lock - at this point we are sure it cannot
|
|
* change anymore
|
|
*/
|
|
if (!IS_DAX(inode)) {
|
|
inode_unlock_shared(inode);
|
|
/* Fallback to buffered IO in case we cannot support DAX */
|
|
return generic_file_read_iter(iocb, to);
|
|
}
|
|
ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops);
|
|
inode_unlock_shared(inode);
|
|
|
|
file_accessed(iocb->ki_filp);
|
|
return ret;
|
|
}
|
|
#endif
|
|
|
|
static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
|
|
{
|
|
if (unlikely(ext4_forced_shutdown(EXT4_SB(file_inode(iocb->ki_filp)->i_sb))))
|
|
return -EIO;
|
|
|
|
if (!iov_iter_count(to))
|
|
return 0; /* skip atime */
|
|
|
|
#ifdef CONFIG_FS_DAX
|
|
if (IS_DAX(file_inode(iocb->ki_filp)))
|
|
return ext4_dax_read_iter(iocb, to);
|
|
#endif
|
|
return generic_file_read_iter(iocb, to);
|
|
}
|
|
|
|
/*
|
|
* Called when an inode is released. Note that this is different
|
|
* from ext4_file_open: open gets called at every open, but release
|
|
* gets called only when /all/ the files are closed.
|
|
*/
|
|
static int ext4_release_file(struct inode *inode, struct file *filp)
|
|
{
|
|
if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
|
|
ext4_alloc_da_blocks(inode);
|
|
ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
|
|
}
|
|
/* if we are the last writer on the inode, drop the block reservation */
|
|
if ((filp->f_mode & FMODE_WRITE) &&
|
|
(atomic_read(&inode->i_writecount) == 1) &&
|
|
!EXT4_I(inode)->i_reserved_data_blocks)
|
|
{
|
|
down_write(&EXT4_I(inode)->i_data_sem);
|
|
ext4_discard_preallocations(inode);
|
|
up_write(&EXT4_I(inode)->i_data_sem);
|
|
}
|
|
if (is_dx(inode) && filp->private_data)
|
|
ext4_htree_free_dir_info(filp->private_data);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void ext4_unwritten_wait(struct inode *inode)
|
|
{
|
|
wait_queue_head_t *wq = ext4_ioend_wq(inode);
|
|
|
|
wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
|
|
}
|
|
|
|
/*
|
|
* This tests whether the IO in question is block-aligned or not.
|
|
* Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
|
|
* are converted to written only after the IO is complete. Until they are
|
|
* mapped, these blocks appear as holes, so dio_zero_block() will assume that
|
|
* it needs to zero out portions of the start and/or end block. If 2 AIO
|
|
* threads are at work on the same unwritten block, they must be synchronized
|
|
* or one thread will zero the other's data, causing corruption.
|
|
*/
|
|
static int
|
|
ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
|
|
{
|
|
struct super_block *sb = inode->i_sb;
|
|
int blockmask = sb->s_blocksize - 1;
|
|
|
|
if (pos >= ALIGN(i_size_read(inode), sb->s_blocksize))
|
|
return 0;
|
|
|
|
if ((pos | iov_iter_alignment(from)) & blockmask)
|
|
return 1;
|
|
|
|
return 0;
|
|
}
|
|
|
|
/* Is IO overwriting allocated and initialized blocks? */
|
|
static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
|
|
{
|
|
struct ext4_map_blocks map;
|
|
unsigned int blkbits = inode->i_blkbits;
|
|
int err, blklen;
|
|
|
|
if (pos + len > i_size_read(inode))
|
|
return false;
|
|
|
|
map.m_lblk = pos >> blkbits;
|
|
map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits);
|
|
blklen = map.m_len;
|
|
|
|
err = ext4_map_blocks(NULL, inode, &map, 0);
|
|
/*
|
|
* 'err==len' means that all of the blocks have been preallocated,
|
|
* regardless of whether they have been initialized or not. To exclude
|
|
* unwritten extents, we need to check m_flags.
|
|
*/
|
|
return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
|
|
}
|
|
|
|
static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
|
|
{
|
|
struct inode *inode = file_inode(iocb->ki_filp);
|
|
ssize_t ret;
|
|
|
|
ret = generic_write_checks(iocb, from);
|
|
if (ret <= 0)
|
|
return ret;
|
|
|
|
if (unlikely(IS_IMMUTABLE(inode)))
|
|
return -EPERM;
|
|
|
|
/*
|
|
* If we have encountered a bitmap-format file, the size limit
|
|
* is smaller than s_maxbytes, which is for extent-mapped files.
|
|
*/
|
|
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
|
|
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
|
|
|
if (iocb->ki_pos >= sbi->s_bitmap_maxbytes)
|
|
return -EFBIG;
|
|
iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
|
|
}
|
|
return iov_iter_count(from);
|
|
}
|
|
|
|
#ifdef CONFIG_FS_DAX
|
|
static ssize_t
|
|
ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
|
|
{
|
|
struct inode *inode = file_inode(iocb->ki_filp);
|
|
ssize_t ret;
|
|
|
|
if (iocb->ki_flags & IOCB_NOWAIT) {
|
|
if (!inode_trylock(inode))
|
|
return -EAGAIN;
|
|
} else {
|
|
inode_lock(inode);
|
|
}
|
|
ret = ext4_write_checks(iocb, from);
|
|
if (ret <= 0)
|
|
goto out;
|
|
ret = file_remove_privs(iocb->ki_filp);
|
|
if (ret)
|
|
goto out;
|
|
ret = file_update_time(iocb->ki_filp);
|
|
if (ret)
|
|
goto out;
|
|
|
|
ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
|
|
out:
|
|
inode_unlock(inode);
|
|
if (ret > 0)
|
|
ret = generic_write_sync(iocb, ret);
|
|
return ret;
|
|
}
|
|
#endif
|
|
|
|
static ssize_t
|
|
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
|
|
{
|
|
struct inode *inode = file_inode(iocb->ki_filp);
|
|
int o_direct = iocb->ki_flags & IOCB_DIRECT;
|
|
int unaligned_aio = 0;
|
|
int overwrite = 0;
|
|
ssize_t ret;
|
|
|
|
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
|
|
return -EIO;
|
|
|
|
#ifdef CONFIG_FS_DAX
|
|
if (IS_DAX(inode))
|
|
return ext4_dax_write_iter(iocb, from);
|
|
#endif
|
|
if (!o_direct && (iocb->ki_flags & IOCB_NOWAIT))
|
|
return -EOPNOTSUPP;
|
|
|
|
if (!inode_trylock(inode)) {
|
|
if (iocb->ki_flags & IOCB_NOWAIT)
|
|
return -EAGAIN;
|
|
inode_lock(inode);
|
|
}
|
|
|
|
ret = ext4_write_checks(iocb, from);
|
|
if (ret <= 0)
|
|
goto out;
|
|
|
|
/*
|
|
* Unaligned direct AIO must be serialized among each other as zeroing
|
|
* of partial blocks of two competing unaligned AIOs can result in data
|
|
* corruption.
|
|
*/
|
|
if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
|
|
!is_sync_kiocb(iocb) &&
|
|
ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
|
|
unaligned_aio = 1;
|
|
ext4_unwritten_wait(inode);
|
|
}
|
|
|
|
iocb->private = &overwrite;
|
|
/* Check whether we do a DIO overwrite or not */
|
|
if (o_direct && !unaligned_aio) {
|
|
if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
|
|
if (ext4_should_dioread_nolock(inode))
|
|
overwrite = 1;
|
|
} else if (iocb->ki_flags & IOCB_NOWAIT) {
|
|
ret = -EAGAIN;
|
|
goto out;
|
|
}
|
|
}
|
|
|
|
ret = __generic_file_write_iter(iocb, from);
|
|
/*
|
|
* Unaligned direct AIO must be the only IO in flight. Otherwise
|
|
* overlapping aligned IO after unaligned might result in data
|
|
* corruption.
|
|
*/
|
|
if (ret == -EIOCBQUEUED && unaligned_aio)
|
|
ext4_unwritten_wait(inode);
|
|
inode_unlock(inode);
|
|
|
|
if (ret > 0)
|
|
ret = generic_write_sync(iocb, ret);
|
|
|
|
return ret;
|
|
|
|
out:
|
|
inode_unlock(inode);
|
|
return ret;
|
|
}
|
|
|
|
#ifdef CONFIG_FS_DAX
|
|
static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
|
|
enum page_entry_size pe_size)
|
|
{
|
|
int error = 0;
|
|
vm_fault_t result;
|
|
int retries = 0;
|
|
handle_t *handle = NULL;
|
|
struct inode *inode = file_inode(vmf->vma->vm_file);
|
|
struct super_block *sb = inode->i_sb;
|
|
|
|
/*
|
|
* We have to distinguish real writes from writes which will result in a
|
|
* COW page; COW writes should *not* poke the journal (the file will not
|
|
* be changed). Doing so would cause unintended failures when mounted
|
|
* read-only.
|
|
*
|
|
* We check for VM_SHARED rather than vmf->cow_page since the latter is
|
|
* unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for
|
|
* other sizes, dax_iomap_fault will handle splitting / fallback so that
|
|
* we eventually come back with a COW page.
|
|
*/
|
|
bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
|
|
(vmf->vma->vm_flags & VM_SHARED);
|
|
pfn_t pfn;
|
|
|
|
if (write) {
|
|
sb_start_pagefault(sb);
|
|
file_update_time(vmf->vma->vm_file);
|
|
down_read(&EXT4_I(inode)->i_mmap_sem);
|
|
retry:
|
|
handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
|
|
EXT4_DATA_TRANS_BLOCKS(sb));
|
|
if (IS_ERR(handle)) {
|
|
up_read(&EXT4_I(inode)->i_mmap_sem);
|
|
sb_end_pagefault(sb);
|
|
return VM_FAULT_SIGBUS;
|
|
}
|
|
} else {
|
|
down_read(&EXT4_I(inode)->i_mmap_sem);
|
|
}
|
|
result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops);
|
|
if (write) {
|
|
ext4_journal_stop(handle);
|
|
|
|
if ((result & VM_FAULT_ERROR) && error == -ENOSPC &&
|
|
ext4_should_retry_alloc(sb, &retries))
|
|
goto retry;
|
|
/* Handling synchronous page fault? */
|
|
if (result & VM_FAULT_NEEDDSYNC)
|
|
result = dax_finish_sync_fault(vmf, pe_size, pfn);
|
|
up_read(&EXT4_I(inode)->i_mmap_sem);
|
|
sb_end_pagefault(sb);
|
|
} else {
|
|
up_read(&EXT4_I(inode)->i_mmap_sem);
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
static vm_fault_t ext4_dax_fault(struct vm_fault *vmf)
|
|
{
|
|
return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
|
|
}
|
|
|
|
static const struct vm_operations_struct ext4_dax_vm_ops = {
|
|
.fault = ext4_dax_fault,
|
|
.huge_fault = ext4_dax_huge_fault,
|
|
.page_mkwrite = ext4_dax_fault,
|
|
.pfn_mkwrite = ext4_dax_fault,
|
|
};
|
|
#else
|
|
#define ext4_dax_vm_ops ext4_file_vm_ops
|
|
#endif
|
|
|
|
static const struct vm_operations_struct ext4_file_vm_ops = {
|
|
.fault = ext4_filemap_fault,
|
|
.map_pages = filemap_map_pages,
|
|
.page_mkwrite = ext4_page_mkwrite,
|
|
};
|
|
|
|
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
|
|
{
|
|
struct inode *inode = file->f_mapping->host;
|
|
|
|
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
|
|
return -EIO;
|
|
|
|
/*
|
|
* We don't support synchronous mappings for non-DAX files. At least
|
|
* until someone comes with a sensible use case.
|
|
*/
|
|
if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC))
|
|
return -EOPNOTSUPP;
|
|
|
|
file_accessed(file);
|
|
if (IS_DAX(file_inode(file))) {
|
|
vma->vm_ops = &ext4_dax_vm_ops;
|
|
vma->vm_flags |= VM_HUGEPAGE;
|
|
} else {
|
|
vma->vm_ops = &ext4_file_vm_ops;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static int ext4_sample_last_mounted(struct super_block *sb,
|
|
struct vfsmount *mnt)
|
|
{
|
|
struct ext4_sb_info *sbi = EXT4_SB(sb);
|
|
struct path path;
|
|
char buf[64], *cp;
|
|
handle_t *handle;
|
|
int err;
|
|
|
|
if (likely(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED))
|
|
return 0;
|
|
|
|
if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb))
|
|
return 0;
|
|
|
|
sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
|
|
/*
|
|
* Sample where the filesystem has been mounted and
|
|
* store it in the superblock for sysadmin convenience
|
|
* when trying to sort through large numbers of block
|
|
* devices or filesystem images.
|
|
*/
|
|
memset(buf, 0, sizeof(buf));
|
|
path.mnt = mnt;
|
|
path.dentry = mnt->mnt_root;
|
|
cp = d_path(&path, buf, sizeof(buf));
|
|
err = 0;
|
|
if (IS_ERR(cp))
|
|
goto out;
|
|
|
|
handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
|
|
err = PTR_ERR(handle);
|
|
if (IS_ERR(handle))
|
|
goto out;
|
|
BUFFER_TRACE(sbi->s_sbh, "get_write_access");
|
|
err = ext4_journal_get_write_access(handle, sbi->s_sbh);
|
|
if (err)
|
|
goto out_journal;
|
|
strlcpy(sbi->s_es->s_last_mounted, cp,
|
|
sizeof(sbi->s_es->s_last_mounted));
|
|
ext4_handle_dirty_super(handle, sb);
|
|
out_journal:
|
|
ext4_journal_stop(handle);
|
|
out:
|
|
sb_end_intwrite(sb);
|
|
return err;
|
|
}
|
|
|
|
static int ext4_file_open(struct inode * inode, struct file * filp)
|
|
{
|
|
int ret;
|
|
|
|
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
|
|
return -EIO;
|
|
|
|
ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt);
|
|
if (ret)
|
|
return ret;
|
|
|
|
ret = fscrypt_file_open(inode, filp);
|
|
if (ret)
|
|
return ret;
|
|
|
|
ret = fsverity_file_open(inode, filp);
|
|
if (ret)
|
|
return ret;
|
|
|
|
/*
|
|
* Set up the jbd2_inode if we are opening the inode for
|
|
* writing and the journal is present
|
|
*/
|
|
if (filp->f_mode & FMODE_WRITE) {
|
|
ret = ext4_inode_attach_jinode(inode);
|
|
if (ret < 0)
|
|
return ret;
|
|
}
|
|
|
|
filp->f_mode |= FMODE_NOWAIT;
|
|
return dquot_file_open(inode, filp);
|
|
}
|
|
|
|
/*
|
|
* ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
|
|
* by calling generic_file_llseek_size() with the appropriate maxbytes
|
|
* value for each.
|
|
*/
|
|
loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
|
|
{
|
|
struct inode *inode = file->f_mapping->host;
|
|
loff_t maxbytes;
|
|
|
|
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
|
|
maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
|
|
else
|
|
maxbytes = inode->i_sb->s_maxbytes;
|
|
|
|
switch (whence) {
|
|
default:
|
|
return generic_file_llseek_size(file, offset, whence,
|
|
maxbytes, i_size_read(inode));
|
|
case SEEK_HOLE:
|
|
inode_lock_shared(inode);
|
|
offset = iomap_seek_hole(inode, offset, &ext4_iomap_ops);
|
|
inode_unlock_shared(inode);
|
|
break;
|
|
case SEEK_DATA:
|
|
inode_lock_shared(inode);
|
|
offset = iomap_seek_data(inode, offset, &ext4_iomap_ops);
|
|
inode_unlock_shared(inode);
|
|
break;
|
|
}
|
|
|
|
if (offset < 0)
|
|
return offset;
|
|
return vfs_setpos(file, offset, maxbytes);
|
|
}
|
|
|
|
const struct file_operations ext4_file_operations = {
|
|
.llseek = ext4_llseek,
|
|
.read_iter = ext4_file_read_iter,
|
|
.write_iter = ext4_file_write_iter,
|
|
.unlocked_ioctl = ext4_ioctl,
|
|
#ifdef CONFIG_COMPAT
|
|
.compat_ioctl = ext4_compat_ioctl,
|
|
#endif
|
|
.mmap = ext4_file_mmap,
|
|
.mmap_supported_flags = MAP_SYNC,
|
|
.open = ext4_file_open,
|
|
.release = ext4_release_file,
|
|
.fsync = ext4_sync_file,
|
|
.get_unmapped_area = thp_get_unmapped_area,
|
|
.splice_read = generic_file_splice_read,
|
|
.splice_write = iter_file_splice_write,
|
|
.fallocate = ext4_fallocate,
|
|
};
|
|
|
|
const struct inode_operations ext4_file_inode_operations = {
|
|
.setattr = ext4_setattr,
|
|
.getattr = ext4_file_getattr,
|
|
.listxattr = ext4_listxattr,
|
|
.get_acl = ext4_get_acl,
|
|
.set_acl = ext4_set_acl,
|
|
.fiemap = ext4_fiemap,
|
|
};
|
|
|