diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4a01ba315262..3b83cd604796 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -335,9 +335,9 @@ struct ext4_group_desc */ struct flex_groups { - atomic_t free_inodes; - atomic_t free_clusters; - atomic_t used_dirs; + atomic64_t free_clusters; + atomic_t free_inodes; + atomic_t used_dirs; }; #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ @@ -2617,7 +2617,7 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, extern int __init ext4_init_pageio(void); extern void ext4_add_complete_io(ext4_io_end_t *io_end); extern void ext4_exit_pageio(void); -extern void ext4_ioend_wait(struct inode *); +extern void ext4_ioend_shutdown(struct inode *); extern void ext4_free_io_end(ext4_io_end_t *io); extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); extern void ext4_end_io_work(struct work_struct *work); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 28dd8eeea6a9..56efcaadf848 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1584,10 +1584,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, unsigned short ext1_ee_len, ext2_ee_len, max_len; /* - * Make sure that either both extents are uninitialized, or - * both are _not_. + * Make sure that both extents are initialized. We don't merge + * uninitialized extents so that we can be sure that end_io code has + * the extent that was written properly split out and conversion to + * initialized is trivial. */ - if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2)) + if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2)) return 0; if (ext4_ext_is_uninitialized(ex1)) @@ -2923,7 +2925,7 @@ static int ext4_split_extent_at(handle_t *handle, { ext4_fsblk_t newblock; ext4_lblk_t ee_block; - struct ext4_extent *ex, newex, orig_ex; + struct ext4_extent *ex, newex, orig_ex, zero_ex; struct ext4_extent *ex2 = NULL; unsigned int ee_len, depth; int err = 0; @@ -2943,6 +2945,10 @@ static int ext4_split_extent_at(handle_t *handle, newblock = split - ee_block + ext4_ext_pblock(ex); BUG_ON(split < ee_block || split >= (ee_block + ee_len)); + BUG_ON(!ext4_ext_is_uninitialized(ex) && + split_flag & (EXT4_EXT_MAY_ZEROOUT | + EXT4_EXT_MARK_UNINIT1 | + EXT4_EXT_MARK_UNINIT2)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -2990,12 +2996,26 @@ static int ext4_split_extent_at(handle_t *handle, err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { - if (split_flag & EXT4_EXT_DATA_VALID1) + if (split_flag & EXT4_EXT_DATA_VALID1) { err = ext4_ext_zeroout(inode, ex2); - else + zero_ex.ee_block = ex2->ee_block; + zero_ex.ee_len = ext4_ext_get_actual_len(ex2); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex2)); + } else { err = ext4_ext_zeroout(inode, ex); - } else + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = ext4_ext_get_actual_len(ex); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex)); + } + } else { err = ext4_ext_zeroout(inode, &orig_ex); + zero_ex.ee_block = orig_ex.ee_block; + zero_ex.ee_len = ext4_ext_get_actual_len(&orig_ex); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(&orig_ex)); + } if (err) goto fix_extent_len; @@ -3003,6 +3023,12 @@ static int ext4_split_extent_at(handle_t *handle, ex->ee_len = cpu_to_le16(ee_len); ext4_ext_try_to_merge(handle, inode, path, ex); err = ext4_ext_dirty(handle, inode, path + path->p_depth); + if (err) + goto fix_extent_len; + + /* update extent status tree */ + err = ext4_es_zeroout(inode, &zero_ex); + goto out; } else if (err) goto fix_extent_len; @@ -3041,6 +3067,7 @@ static int ext4_split_extent(handle_t *handle, int err = 0; int uninitialized; int split_flag1, flags1; + int allocated = map->m_len; depth = ext_depth(inode); ex = path[depth].p_ext; @@ -3060,20 +3087,29 @@ static int ext4_split_extent(handle_t *handle, map->m_lblk + map->m_len, split_flag1, flags1); if (err) goto out; + } else { + allocated = ee_len - (map->m_lblk - ee_block); } - + /* + * Update path is required because previous ext4_split_extent_at() may + * result in split of original leaf or extent zeroout. + */ ext4_ext_drop_refs(path); path = ext4_ext_find_extent(inode, map->m_lblk, path); if (IS_ERR(path)) return PTR_ERR(path); + depth = ext_depth(inode); + ex = path[depth].p_ext; + uninitialized = ext4_ext_is_uninitialized(ex); + split_flag1 = 0; if (map->m_lblk >= ee_block) { - split_flag1 = split_flag & (EXT4_EXT_MAY_ZEROOUT | - EXT4_EXT_DATA_VALID2); - if (uninitialized) + split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; + if (uninitialized) { split_flag1 |= EXT4_EXT_MARK_UNINIT1; - if (split_flag & EXT4_EXT_MARK_UNINIT2) - split_flag1 |= EXT4_EXT_MARK_UNINIT2; + split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | + EXT4_EXT_MARK_UNINIT2); + } err = ext4_split_extent_at(handle, inode, path, map->m_lblk, split_flag1, flags); if (err) @@ -3082,7 +3118,7 @@ static int ext4_split_extent(handle_t *handle, ext4_ext_show_leaf(inode, path); out: - return err ? err : map->m_len; + return err ? err : allocated; } /* @@ -3137,6 +3173,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); allocated = ee_len - (map->m_lblk - ee_block); + zero_ex.ee_len = 0; trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); @@ -3227,13 +3264,16 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, if (EXT4_EXT_MAY_ZEROOUT & split_flag) max_zeroout = sbi->s_extent_max_zeroout_kb >> - inode->i_sb->s_blocksize_bits; + (inode->i_sb->s_blocksize_bits - 10); /* If extent is less than s_max_zeroout_kb, zeroout directly */ if (max_zeroout && (ee_len <= max_zeroout)) { err = ext4_ext_zeroout(inode, ex); if (err) goto out; + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = ext4_ext_get_actual_len(ex); + ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -3292,6 +3332,9 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, err = allocated; out: + /* If we have gotten a failure, don't zero out status tree */ + if (!err) + err = ext4_es_zeroout(inode, &zero_ex); return err ? err : allocated; } @@ -3374,8 +3417,19 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, "block %llu, max_blocks %u\n", inode->i_ino, (unsigned long long)ee_block, ee_len); - /* If extent is larger than requested then split is required */ + /* If extent is larger than requested it is a clear sign that we still + * have some extent state machine issues left. So extent_split is still + * required. + * TODO: Once all related issues will be fixed this situation should be + * illegal. + */ if (ee_block != map->m_lblk || ee_len > map->m_len) { +#ifdef EXT4_DEBUG + ext4_warning("Inode (%ld) finished: extent logical block %llu," + " len %u; IO logical block %llu, len %u\n", + inode->i_ino, (unsigned long long)ee_block, ee_len, + (unsigned long long)map->m_lblk, map->m_len); +#endif err = ext4_split_unwritten_extents(handle, inode, map, path, EXT4_GET_BLOCKS_CONVERT); if (err < 0) @@ -3626,6 +3680,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, path, map->m_len); } else err = ret; + map->m_flags |= EXT4_MAP_MAPPED; + if (allocated > map->m_len) + allocated = map->m_len; + map->m_len = allocated; goto out2; } /* buffered IO case */ @@ -3675,6 +3733,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, allocated - map->m_len); allocated = map->m_len; } + map->m_len = allocated; /* * If we have done fallocate with the offset that is already @@ -4106,9 +4165,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, } } else { BUG_ON(allocated_clusters < reserved_clusters); - /* We will claim quota for all newly allocated blocks.*/ - ext4_da_update_reserve_space(inode, allocated_clusters, - 1); if (reserved_clusters < allocated_clusters) { struct ext4_inode_info *ei = EXT4_I(inode); int reservation = allocated_clusters - @@ -4159,6 +4215,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ei->i_reserved_data_blocks += reservation; spin_unlock(&ei->i_block_reservation_lock); } + /* + * We will claim quota for all newly allocated blocks. + * We're updating the reserved space *after* the + * correction above so we do not accidentally free + * all the metadata reservation because we might + * actually need it later on. + */ + ext4_da_update_reserve_space(inode, allocated_clusters, + 1); } } @@ -4368,8 +4433,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (len <= EXT_UNINIT_MAX_LEN << blkbits) flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; - /* Prevent race condition between unwritten */ - ext4_flush_unwritten_io(inode); retry: while (ret >= 0 && ret < max_blocks) { map.m_lblk = map.m_lblk + ret; diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 95796a1b7522..fe3337a85ede 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -333,17 +333,27 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) static int ext4_es_can_be_merged(struct extent_status *es1, struct extent_status *es2) { - if (es1->es_lblk + es1->es_len != es2->es_lblk) - return 0; - if (ext4_es_status(es1) != ext4_es_status(es2)) return 0; - if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) && - (ext4_es_pblock(es1) + es1->es_len != ext4_es_pblock(es2))) + if (((__u64) es1->es_len) + es2->es_len > 0xFFFFFFFFULL) return 0; - return 1; + if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk) + return 0; + + if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) && + (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2))) + return 1; + + if (ext4_es_is_hole(es1)) + return 1; + + /* we need to check delayed extent is without unwritten status */ + if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1)) + return 1; + + return 0; } static struct extent_status * @@ -389,6 +399,179 @@ ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es) return es; } +#ifdef ES_AGGRESSIVE_TEST +static void ext4_es_insert_extent_ext_check(struct inode *inode, + struct extent_status *es) +{ + struct ext4_ext_path *path = NULL; + struct ext4_extent *ex; + ext4_lblk_t ee_block; + ext4_fsblk_t ee_start; + unsigned short ee_len; + int depth, ee_status, es_status; + + path = ext4_ext_find_extent(inode, es->es_lblk, NULL); + if (IS_ERR(path)) + return; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + + if (ex) { + + ee_block = le32_to_cpu(ex->ee_block); + ee_start = ext4_ext_pblock(ex); + ee_len = ext4_ext_get_actual_len(ex); + + ee_status = ext4_ext_is_uninitialized(ex) ? 1 : 0; + es_status = ext4_es_is_unwritten(es) ? 1 : 0; + + /* + * Make sure ex and es are not overlap when we try to insert + * a delayed/hole extent. + */ + if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) { + if (in_range(es->es_lblk, ee_block, ee_len)) { + pr_warn("ES insert assertation failed for " + "inode: %lu we can find an extent " + "at block [%d/%d/%llu/%c], but we " + "want to add an delayed/hole extent " + "[%d/%d/%llu/%llx]\n", + inode->i_ino, ee_block, ee_len, + ee_start, ee_status ? 'u' : 'w', + es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + } + goto out; + } + + /* + * We don't check ee_block == es->es_lblk, etc. because es + * might be a part of whole extent, vice versa. + */ + if (es->es_lblk < ee_block || + ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) { + pr_warn("ES insert assertation failed for inode: %lu " + "ex_status [%d/%d/%llu/%c] != " + "es_status [%d/%d/%llu/%c]\n", inode->i_ino, + ee_block, ee_len, ee_start, + ee_status ? 'u' : 'w', es->es_lblk, es->es_len, + ext4_es_pblock(es), es_status ? 'u' : 'w'); + goto out; + } + + if (ee_status ^ es_status) { + pr_warn("ES insert assertation failed for inode: %lu " + "ex_status [%d/%d/%llu/%c] != " + "es_status [%d/%d/%llu/%c]\n", inode->i_ino, + ee_block, ee_len, ee_start, + ee_status ? 'u' : 'w', es->es_lblk, es->es_len, + ext4_es_pblock(es), es_status ? 'u' : 'w'); + } + } else { + /* + * We can't find an extent on disk. So we need to make sure + * that we don't want to add an written/unwritten extent. + */ + if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { + pr_warn("ES insert assertation failed for inode: %lu " + "can't find an extent at block %d but we want " + "to add an written/unwritten extent " + "[%d/%d/%llu/%llx]\n", inode->i_ino, + es->es_lblk, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + } + } +out: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } +} + +static void ext4_es_insert_extent_ind_check(struct inode *inode, + struct extent_status *es) +{ + struct ext4_map_blocks map; + int retval; + + /* + * Here we call ext4_ind_map_blocks to lookup a block mapping because + * 'Indirect' structure is defined in indirect.c. So we couldn't + * access direct/indirect tree from outside. It is too dirty to define + * this function in indirect.c file. + */ + + map.m_lblk = es->es_lblk; + map.m_len = es->es_len; + + retval = ext4_ind_map_blocks(NULL, inode, &map, 0); + if (retval > 0) { + if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) { + /* + * We want to add a delayed/hole extent but this + * block has been allocated. + */ + pr_warn("ES insert assertation failed for inode: %lu " + "We can find blocks but we want to add a " + "delayed/hole extent [%d/%d/%llu/%llx]\n", + inode->i_ino, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + return; + } else if (ext4_es_is_written(es)) { + if (retval != es->es_len) { + pr_warn("ES insert assertation failed for " + "inode: %lu retval %d != es_len %d\n", + inode->i_ino, retval, es->es_len); + return; + } + if (map.m_pblk != ext4_es_pblock(es)) { + pr_warn("ES insert assertation failed for " + "inode: %lu m_pblk %llu != " + "es_pblk %llu\n", + inode->i_ino, map.m_pblk, + ext4_es_pblock(es)); + return; + } + } else { + /* + * We don't need to check unwritten extent because + * indirect-based file doesn't have it. + */ + BUG_ON(1); + } + } else if (retval == 0) { + if (ext4_es_is_written(es)) { + pr_warn("ES insert assertation failed for inode: %lu " + "We can't find the block but we want to add " + "an written extent [%d/%d/%llu/%llx]\n", + inode->i_ino, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + return; + } + } +} + +static inline void ext4_es_insert_extent_check(struct inode *inode, + struct extent_status *es) +{ + /* + * We don't need to worry about the race condition because + * caller takes i_data_sem locking. + */ + BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ext4_es_insert_extent_ext_check(inode, es); + else + ext4_es_insert_extent_ind_check(inode, es); +} +#else +static inline void ext4_es_insert_extent_check(struct inode *inode, + struct extent_status *es) +{ +} +#endif + static int __es_insert_extent(struct inode *inode, struct extent_status *newes) { struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; @@ -471,6 +654,8 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_es_store_status(&newes, status); trace_ext4_es_insert_extent(inode, &newes); + ext4_es_insert_extent_check(inode, &newes); + write_lock(&EXT4_I(inode)->i_es_lock); err = __es_remove_extent(inode, lblk, end); if (err != 0) @@ -669,6 +854,23 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, return err; } +int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex) +{ + ext4_lblk_t ee_block; + ext4_fsblk_t ee_pblock; + unsigned int ee_len; + + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + ee_pblock = ext4_ext_pblock(ex); + + if (ee_len == 0) + return 0; + + return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, + EXTENT_STATUS_WRITTEN); +} + static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) { struct ext4_sb_info *sbi = container_of(shrink, diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index f190dfe969da..d8e2d4dc311e 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -20,6 +20,12 @@ #define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif +/* + * With ES_AGGRESSIVE_TEST defined, the result of es caching will be + * checked with old map_block's result. + */ +#define ES_AGGRESSIVE_TEST__ + /* * These flags live in the high bits of extent_status.es_pblk */ @@ -33,6 +39,8 @@ EXTENT_STATUS_DELAYED | \ EXTENT_STATUS_HOLE) +struct ext4_extent; + struct extent_status { struct rb_node rb_node; ext4_lblk_t es_lblk; /* first logical block extent covers */ @@ -58,6 +66,7 @@ extern void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es); +extern int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex); static inline int ext4_es_is_written(struct extent_status *es) { diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 32fd2b9075dd..6c5bb8d993fe 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -324,8 +324,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) } struct orlov_stats { + __u64 free_clusters; __u32 free_inodes; - __u32 free_clusters; __u32 used_dirs; }; @@ -342,7 +342,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, if (flex_size > 1) { stats->free_inodes = atomic_read(&flex_group[g].free_inodes); - stats->free_clusters = atomic_read(&flex_group[g].free_clusters); + stats->free_clusters = atomic64_read(&flex_group[g].free_clusters); stats->used_dirs = atomic_read(&flex_group[g].used_dirs); return; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9ea0cde3fa9e..b3a5213bc73e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -185,8 +185,6 @@ void ext4_evict_inode(struct inode *inode) trace_ext4_evict_inode(inode); - ext4_ioend_wait(inode); - if (inode->i_nlink) { /* * When journalling data dirty buffers are tracked only in the @@ -207,7 +205,8 @@ void ext4_evict_inode(struct inode *inode) * don't use page cache. */ if (ext4_should_journal_data(inode) && - (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && + inode->i_ino != EXT4_JOURNAL_INO) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; @@ -216,6 +215,7 @@ void ext4_evict_inode(struct inode *inode) filemap_write_and_wait(&inode->i_data); } truncate_inode_pages(&inode->i_data, 0); + ext4_ioend_shutdown(inode); goto no_delete; } @@ -225,6 +225,7 @@ void ext4_evict_inode(struct inode *inode) if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages(&inode->i_data, 0); + ext4_ioend_shutdown(inode); if (is_bad_inode(inode)) goto no_delete; @@ -482,6 +483,58 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, return num; } +#ifdef ES_AGGRESSIVE_TEST +static void ext4_map_blocks_es_recheck(handle_t *handle, + struct inode *inode, + struct ext4_map_blocks *es_map, + struct ext4_map_blocks *map, + int flags) +{ + int retval; + + map->m_flags = 0; + /* + * There is a race window that the result is not the same. + * e.g. xfstests #223 when dioread_nolock enables. The reason + * is that we lookup a block mapping in extent status tree with + * out taking i_data_sem. So at the time the unwritten extent + * could be converted. + */ + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) + down_read((&EXT4_I(inode)->i_data_sem)); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); + } else { + retval = ext4_ind_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); + } + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) + up_read((&EXT4_I(inode)->i_data_sem)); + /* + * Clear EXT4_MAP_FROM_CLUSTER and EXT4_MAP_BOUNDARY flag + * because it shouldn't be marked in es_map->m_flags. + */ + map->m_flags &= ~(EXT4_MAP_FROM_CLUSTER | EXT4_MAP_BOUNDARY); + + /* + * We don't check m_len because extent will be collpased in status + * tree. So the m_len might not equal. + */ + if (es_map->m_lblk != map->m_lblk || + es_map->m_flags != map->m_flags || + es_map->m_pblk != map->m_pblk) { + printk("ES cache assertation failed for inode: %lu " + "es_cached ex [%d/%d/%llu/%x] != " + "found ex [%d/%d/%llu/%x] retval %d flags %x\n", + inode->i_ino, es_map->m_lblk, es_map->m_len, + es_map->m_pblk, es_map->m_flags, map->m_lblk, + map->m_len, map->m_pblk, map->m_flags, + retval, flags); + } +} +#endif /* ES_AGGRESSIVE_TEST */ + /* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. @@ -509,6 +562,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, { struct extent_status es; int retval; +#ifdef ES_AGGRESSIVE_TEST + struct ext4_map_blocks orig_map; + + memcpy(&orig_map, map, sizeof(*map)); +#endif map->m_flags = 0; ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," @@ -531,6 +589,10 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, } else { BUG_ON(1); } +#ifdef ES_AGGRESSIVE_TEST + ext4_map_blocks_es_recheck(handle, inode, map, + &orig_map, flags); +#endif goto found; } @@ -551,6 +613,15 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, int ret; unsigned long long status; +#ifdef ES_AGGRESSIVE_TEST + if (retval != map->m_len) { + printk("ES len assertation failed for inode: %lu " + "retval %d != map->m_len %d " + "in %s (lookup)\n", inode->i_ino, retval, + map->m_len, __func__); + } +#endif + status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && @@ -643,6 +714,24 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, int ret; unsigned long long status; +#ifdef ES_AGGRESSIVE_TEST + if (retval != map->m_len) { + printk("ES len assertation failed for inode: %lu " + "retval %d != map->m_len %d " + "in %s (allocation)\n", inode->i_ino, retval, + map->m_len, __func__); + } +#endif + + /* + * If the extent has been zeroed out, we don't need to update + * extent status tree. + */ + if ((flags & EXT4_GET_BLOCKS_PRE_IO) && + ext4_es_lookup_extent(inode, map->m_lblk, &es)) { + if (ext4_es_is_written(&es)) + goto has_zeroout; + } status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && @@ -655,6 +744,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, retval = ret; } +has_zeroout: up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { int ret = check_block_validity(inode, map); @@ -1215,6 +1305,55 @@ static int ext4_journalled_write_end(struct file *file, return ret ? ret : copied; } +/* + * Reserve a metadata for a single block located at lblock + */ +static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock) +{ + int retries = 0; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int md_needed; + ext4_lblk_t save_last_lblock; + int save_len; + + /* + * recalculate the amount of metadata blocks to reserve + * in order to allocate nrblocks + * worse case is one extent per block + */ +repeat: + spin_lock(&ei->i_block_reservation_lock); + /* + * ext4_calc_metadata_amount() has side effects, which we have + * to be prepared undo if we fail to claim space. + */ + save_len = ei->i_da_metadata_calc_len; + save_last_lblock = ei->i_da_metadata_calc_last_lblock; + md_needed = EXT4_NUM_B2C(sbi, + ext4_calc_metadata_amount(inode, lblock)); + trace_ext4_da_reserve_space(inode, md_needed); + + /* + * We do still charge estimated metadata to the sb though; + * we cannot afford to run out of free blocks. + */ + if (ext4_claim_free_clusters(sbi, md_needed, 0)) { + ei->i_da_metadata_calc_len = save_len; + ei->i_da_metadata_calc_last_lblock = save_last_lblock; + spin_unlock(&ei->i_block_reservation_lock); + if (ext4_should_retry_alloc(inode->i_sb, &retries)) { + cond_resched(); + goto repeat; + } + return -ENOSPC; + } + ei->i_reserved_meta_blocks += md_needed; + spin_unlock(&ei->i_block_reservation_lock); + + return 0; /* success */ +} + /* * Reserve a single cluster located at lblock */ @@ -1263,7 +1402,7 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) ei->i_da_metadata_calc_last_lblock = save_last_lblock; spin_unlock(&ei->i_block_reservation_lock); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { - yield(); + cond_resched(); goto repeat; } dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); @@ -1768,6 +1907,11 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, struct extent_status es; int retval; sector_t invalid_block = ~((sector_t) 0xffff); +#ifdef ES_AGGRESSIVE_TEST + struct ext4_map_blocks orig_map; + + memcpy(&orig_map, map, sizeof(*map)); +#endif if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) invalid_block = ~0; @@ -1809,6 +1953,9 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, else BUG_ON(1); +#ifdef ES_AGGRESSIVE_TEST + ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); +#endif return retval; } @@ -1843,8 +1990,11 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, * XXX: __block_prepare_write() unmaps passed block, * is it OK? */ - /* If the block was allocated from previously allocated cluster, - * then we dont need to reserve it again. */ + /* + * If the block was allocated from previously allocated cluster, + * then we don't need to reserve it again. However we still need + * to reserve metadata for every block we're going to write. + */ if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) { ret = ext4_da_reserve_space(inode, iblock); if (ret) { @@ -1852,6 +2002,13 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, retval = ret; goto out_unlock; } + } else { + ret = ext4_da_reserve_metadata(inode, iblock); + if (ret) { + /* not enough space to reserve */ + retval = ret; + goto out_unlock; + } } ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, @@ -1873,6 +2030,15 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, int ret; unsigned long long status; +#ifdef ES_AGGRESSIVE_TEST + if (retval != map->m_len) { + printk("ES len assertation failed for inode: %lu " + "retval %d != map->m_len %d " + "in %s (lookup)\n", inode->i_ino, retval, + map->m_len, __func__); + } +#endif + status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, @@ -2908,8 +3074,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait) trace_ext4_releasepage(page); - WARN_ON(PageChecked(page)); - if (!page_has_buffers(page)) + /* Page has dirty journalled data -> cannot release */ + if (PageChecked(page)) return 0; if (journal) return jbd2_journal_try_to_free_buffers(journal, page, wait); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 7bb713a46fe4..ee6614bdb639 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2804,8 +2804,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, ac->ac_b_ex.fe_group); - atomic_sub(ac->ac_b_ex.fe_len, - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_sub(ac->ac_b_ex.fe_len, + &sbi->s_flex_groups[flex_group].free_clusters); } err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); @@ -3692,11 +3692,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, if (free < needed && busy) { busy = 0; ext4_unlock_group(sb, group); - /* - * Yield the CPU here so that we don't get soft lockup - * in non preempt case. - */ - yield(); + cond_resched(); goto repeat; } @@ -4246,7 +4242,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { /* let others to free the space */ - yield(); + cond_resched(); ar->len = ar->len >> 1; } if (!ar->len) { @@ -4464,7 +4460,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bitmap_bh = NULL; struct super_block *sb = inode->i_sb; struct ext4_group_desc *gdp; - unsigned long freed = 0; unsigned int overflow; ext4_grpblk_t bit; struct buffer_head *gd_bh; @@ -4666,14 +4661,12 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(count_clusters, - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(count_clusters, + &sbi->s_flex_groups[flex_group].free_clusters); } ext4_mb_unload_buddy(&e4b); - freed += count; - if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); @@ -4811,8 +4804,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(EXT4_NUM_B2C(sbi, blocks_freed), - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed), + &sbi->s_flex_groups[flex_group].free_clusters); } ext4_mb_unload_buddy(&e4b); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 4e81d47aa8cb..33e1c086858b 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -32,16 +32,18 @@ */ static inline int get_ext_path(struct inode *inode, ext4_lblk_t lblock, - struct ext4_ext_path **path) + struct ext4_ext_path **orig_path) { int ret = 0; + struct ext4_ext_path *path; - *path = ext4_ext_find_extent(inode, lblock, *path); - if (IS_ERR(*path)) { - ret = PTR_ERR(*path); - *path = NULL; - } else if ((*path)[ext_depth(inode)].p_ext == NULL) + path = ext4_ext_find_extent(inode, lblock, *orig_path); + if (IS_ERR(path)) + ret = PTR_ERR(path); + else if (path[ext_depth(inode)].p_ext == NULL) ret = -ENODATA; + else + *orig_path = path; return ret; } @@ -611,24 +613,25 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, { struct ext4_ext_path *path = NULL; struct ext4_extent *ext; + int ret = 0; ext4_lblk_t last = from + count; while (from < last) { *err = get_ext_path(inode, from, &path); if (*err) - return 0; + goto out; ext = path[ext_depth(inode)].p_ext; - if (!ext) { - ext4_ext_drop_refs(path); - return 0; - } - if (uninit != ext4_ext_is_uninitialized(ext)) { - ext4_ext_drop_refs(path); - return 0; - } + if (uninit != ext4_ext_is_uninitialized(ext)) + goto out; from += ext4_ext_get_actual_len(ext); ext4_ext_drop_refs(path); } - return 1; + ret = 1; +out: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + return ret; } /** @@ -666,6 +669,14 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, int replaced_count = 0; int dext_alen; + *err = ext4_es_remove_extent(orig_inode, from, count); + if (*err) + goto out; + + *err = ext4_es_remove_extent(donor_inode, from, count); + if (*err) + goto out; + /* Get the original extent for the block "orig_off" */ *err = get_ext_path(orig_inode, orig_off, &orig_path); if (*err) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 809b31003ecc..047a6de04a0a 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -50,11 +50,21 @@ void ext4_exit_pageio(void) kmem_cache_destroy(io_page_cachep); } -void ext4_ioend_wait(struct inode *inode) +/* + * This function is called by ext4_evict_inode() to make sure there is + * no more pending I/O completion work left to do. + */ +void ext4_ioend_shutdown(struct inode *inode) { wait_queue_head_t *wq = ext4_ioend_wq(inode); wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); + /* + * We need to make sure the work structure is finished being + * used before we let the inode get destroyed. + */ + if (work_pending(&EXT4_I(inode)->i_unwritten_work)) + cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); } static void put_io_page(struct ext4_io_page *io_page) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index b2c8ee56eb98..c169477a62c9 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1360,8 +1360,8 @@ static void ext4_update_super(struct super_block *sb, sbi->s_log_groups_per_flex) { ext4_group_t flex_group; flex_group = ext4_flex_group(sbi, group_data[0].group); - atomic_add(EXT4_NUM_B2C(sbi, free_blocks), - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(EXT4_NUM_B2C(sbi, free_blocks), + &sbi->s_flex_groups[flex_group].free_clusters); atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count, &sbi->s_flex_groups[flex_group].free_inodes); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b3818b48f418..5d6d53578124 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1927,8 +1927,8 @@ static int ext4_fill_flex_info(struct super_block *sb) flex_group = ext4_flex_group(sbi, i); atomic_add(ext4_free_inodes_count(sb, gdp), &sbi->s_flex_groups[flex_group].free_inodes); - atomic_add(ext4_free_group_clusters(sb, gdp), - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(ext4_free_group_clusters(sb, gdp), + &sbi->s_flex_groups[flex_group].free_clusters); atomic_add(ext4_used_dirs_count(sb, gdp), &sbi->s_flex_groups[flex_group].used_dirs); } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index d6ee5aed56b1..325bc019ed88 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1065,9 +1065,12 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh) void jbd2_journal_set_triggers(struct buffer_head *bh, struct jbd2_buffer_trigger_type *type) { - struct journal_head *jh = bh2jh(bh); + struct journal_head *jh = jbd2_journal_grab_journal_head(bh); + if (WARN_ON(!jh)) + return; jh->b_triggers = type; + jbd2_journal_put_journal_head(jh); } void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, @@ -1119,17 +1122,18 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; - struct journal_head *jh = bh2jh(bh); + struct journal_head *jh; int ret = 0; - jbd_debug(5, "journal_head %p\n", jh); - JBUFFER_TRACE(jh, "entry"); if (is_handle_aborted(handle)) goto out; - if (!buffer_jbd(bh)) { + jh = jbd2_journal_grab_journal_head(bh); + if (!jh) { ret = -EUCLEAN; goto out; } + jbd_debug(5, "journal_head %p\n", jh); + JBUFFER_TRACE(jh, "entry"); jbd_lock_bh_state(bh); @@ -1220,6 +1224,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) spin_unlock(&journal->j_list_lock); out_unlock_bh: jbd_unlock_bh_state(bh); + jbd2_journal_put_journal_head(jh); out: JBUFFER_TRACE(jh, "exit"); WARN_ON(ret); /* All errors are bugs, so dump the stack */