ext4: return 32/64-bit dir name hash according to usage type
Traditionally ext2/3/4 has returned a 32-bit hash value from llseek() to appease NFSv2, which can only handle a 32-bit cookie for seekdir() and telldir(). However, this causes problems if there are 32-bit hash collisions, since the NFSv2 server can get stuck resending the same entries from the directory repeatedly. Allow ext4 to return a full 64-bit hash (both major and minor) for telldir to decrease the chance of hash collisions. This still needs integration on the NFS side. Patch-updated-by: Bernd Schubert <bernd.schubert@itwm.fraunhofer.de> (blame me if something is not correct) Signed-off-by: Fan Yong <yong.fan@whamcloud.com> Signed-off-by: Andreas Dilger <adilger@whamcloud.com> Signed-off-by: Bernd Schubert <bernd.schubert@itwm.fraunhofer.de> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
This commit is contained in:
parent
6a8a13e038
commit
d1f5273e9a
3 changed files with 176 additions and 48 deletions
214
fs/ext4/dir.c
214
fs/ext4/dir.c
|
@ -32,24 +32,8 @@ static unsigned char ext4_filetype_table[] = {
|
|||
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
|
||||
};
|
||||
|
||||
static int ext4_readdir(struct file *, void *, filldir_t);
|
||||
static int ext4_dx_readdir(struct file *filp,
|
||||
void *dirent, filldir_t filldir);
|
||||
static int ext4_release_dir(struct inode *inode,
|
||||
struct file *filp);
|
||||
|
||||
const struct file_operations ext4_dir_operations = {
|
||||
.llseek = ext4_llseek,
|
||||
.read = generic_read_dir,
|
||||
.readdir = ext4_readdir, /* we take BKL. needed?*/
|
||||
.unlocked_ioctl = ext4_ioctl,
|
||||
#ifdef CONFIG_COMPAT
|
||||
.compat_ioctl = ext4_compat_ioctl,
|
||||
#endif
|
||||
.fsync = ext4_sync_file,
|
||||
.release = ext4_release_dir,
|
||||
};
|
||||
|
||||
|
||||
static unsigned char get_dtype(struct super_block *sb, int filetype)
|
||||
{
|
||||
|
@ -60,6 +44,26 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
|
|||
return (ext4_filetype_table[filetype]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if the given dir-inode refers to an htree-indexed directory
|
||||
* (or a directory which chould potentially get coverted to use htree
|
||||
* indexing).
|
||||
*
|
||||
* Return 1 if it is a dx dir, 0 if not
|
||||
*/
|
||||
static int is_dx_dir(struct inode *inode)
|
||||
{
|
||||
struct super_block *sb = inode->i_sb;
|
||||
|
||||
if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
|
||||
EXT4_FEATURE_COMPAT_DIR_INDEX) &&
|
||||
((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
|
||||
((inode->i_size >> sb->s_blocksize_bits) == 1)))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return 0 if the directory entry is OK, and 1 if there is a problem
|
||||
*
|
||||
|
@ -115,18 +119,13 @@ static int ext4_readdir(struct file *filp,
|
|||
unsigned int offset;
|
||||
int i, stored;
|
||||
struct ext4_dir_entry_2 *de;
|
||||
struct super_block *sb;
|
||||
int err;
|
||||
struct inode *inode = filp->f_path.dentry->d_inode;
|
||||
struct super_block *sb = inode->i_sb;
|
||||
int ret = 0;
|
||||
int dir_has_error = 0;
|
||||
|
||||
sb = inode->i_sb;
|
||||
|
||||
if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
|
||||
EXT4_FEATURE_COMPAT_DIR_INDEX) &&
|
||||
((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
|
||||
((inode->i_size >> sb->s_blocksize_bits) == 1))) {
|
||||
if (is_dx_dir(inode)) {
|
||||
err = ext4_dx_readdir(filp, dirent, filldir);
|
||||
if (err != ERR_BAD_DX_DIR) {
|
||||
ret = err;
|
||||
|
@ -254,22 +253,134 @@ static int ext4_readdir(struct file *filp,
|
|||
return ret;
|
||||
}
|
||||
|
||||
static inline int is_32bit_api(void)
|
||||
{
|
||||
#ifdef CONFIG_COMPAT
|
||||
return is_compat_task();
|
||||
#else
|
||||
return (BITS_PER_LONG == 32);
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* These functions convert from the major/minor hash to an f_pos
|
||||
* value.
|
||||
* value for dx directories
|
||||
*
|
||||
* Currently we only use major hash numer. This is unfortunate, but
|
||||
* on 32-bit machines, the same VFS interface is used for lseek and
|
||||
* llseek, so if we use the 64 bit offset, then the 32-bit versions of
|
||||
* lseek/telldir/seekdir will blow out spectacularly, and from within
|
||||
* the ext2 low-level routine, we don't know if we're being called by
|
||||
* a 64-bit version of the system call or the 32-bit version of the
|
||||
* system call. Worse yet, NFSv2 only allows for a 32-bit readdir
|
||||
* cookie. Sigh.
|
||||
* Upper layer (for example NFS) should specify FMODE_32BITHASH or
|
||||
* FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted
|
||||
* directly on both 32-bit and 64-bit nodes, under such case, neither
|
||||
* FMODE_32BITHASH nor FMODE_64BITHASH is specified.
|
||||
*/
|
||||
#define hash2pos(major, minor) (major >> 1)
|
||||
#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff)
|
||||
#define pos2min_hash(pos) (0)
|
||||
/*
 * Build an f_pos value from a major/minor directory hash pair.
 *
 * The 32-bit form is used when the file is flagged FMODE_32BITHASH, or
 * when it is not flagged FMODE_64BITHASH and is_32bit_api() is true; it
 * carries only (major >> 1).  Otherwise (major >> 1) is placed in the
 * upper 32 bits and the minor hash in the lower 32 bits.
 */
static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
{
	int narrow = (filp->f_mode & FMODE_32BITHASH) ||
		     (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api());
	__u64 upper = (__u64)(major >> 1);

	if (narrow)
		return upper;

	return (upper << 32) | (__u64)minor;
}
|
||||
|
||||
/*
 * Recover the major hash from an f_pos value.
 *
 * In 32-bit mode (FMODE_32BITHASH set, or FMODE_64BITHASH clear and
 * is_32bit_api() true) the hash bits are the position itself; otherwise
 * they are the upper 32 bits of the position.  Either way the bits are
 * shifted left by one and truncated to 32 bits.
 */
static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
{
	int narrow = (filp->f_mode & FMODE_32BITHASH) ||
		     (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api());
	loff_t bits = pos;

	if (!narrow)
		bits = pos >> 32;

	return (bits << 1) & 0xffffffff;
}
|
||||
|
||||
/*
 * Recover the minor hash from an f_pos value.
 *
 * A 64-bit position carries the minor hash in its lower 32 bits.  In
 * 32-bit mode (FMODE_32BITHASH set, or FMODE_64BITHASH clear and
 * is_32bit_api() true) the result is always 0.
 */
static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
{
	int narrow = (filp->f_mode & FMODE_32BITHASH) ||
		     (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api());

	if (!narrow)
		return pos & 0xffffffff;

	return 0;
}
|
||||
|
||||
/*
|
||||
* Return 32- or 64-bit end-of-file for dx directories
|
||||
*/
|
||||
static inline loff_t ext4_get_htree_eof(struct file *filp)
|
||||
{
|
||||
if ((filp->f_mode & FMODE_32BITHASH) ||
|
||||
(!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
|
||||
return EXT4_HTREE_EOF_32BIT;
|
||||
else
|
||||
return EXT4_HTREE_EOF_64BIT;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* ext4_dir_llseek() based on generic_file_llseek() to handle both
|
||||
* non-htree and htree directories, where the "offset" is in terms
|
||||
* of the filename hash value instead of the byte offset.
|
||||
*
|
||||
* NOTE: offsets obtained *before* ext4_set_inode_flag(dir, EXT4_INODE_INDEX)
|
||||
* will be invalid once the directory was converted into a dx directory
|
||||
*/
|
||||
loff_t ext4_dir_llseek(struct file *file, loff_t offset, int origin)
|
||||
{
|
||||
struct inode *inode = file->f_mapping->host;
|
||||
loff_t ret = -EINVAL;
|
||||
int dx_dir = is_dx_dir(inode);
|
||||
|
||||
mutex_lock(&inode->i_mutex);
|
||||
|
||||
/* NOTE: relative offsets with dx directories might not work
|
||||
* as expected, as it is difficult to figure out the
|
||||
* correct offset between dx hashes */
|
||||
|
||||
switch (origin) {
|
||||
case SEEK_END:
|
||||
if (unlikely(offset > 0))
|
||||
goto out_err; /* not supported for directories */
|
||||
|
||||
/* so only negative offsets are left, does that have a
|
||||
* meaning for directories at all? */
|
||||
if (dx_dir)
|
||||
offset += ext4_get_htree_eof(file);
|
||||
else
|
||||
offset += inode->i_size;
|
||||
break;
|
||||
case SEEK_CUR:
|
||||
/*
|
||||
* Here we special-case the lseek(fd, 0, SEEK_CUR)
|
||||
* position-querying operation. Avoid rewriting the "same"
|
||||
* f_pos value back to the file because a concurrent read(),
|
||||
* write() or lseek() might have altered it
|
||||
*/
|
||||
if (offset == 0) {
|
||||
offset = file->f_pos;
|
||||
goto out_ok;
|
||||
}
|
||||
|
||||
offset += file->f_pos;
|
||||
break;
|
||||
}
|
||||
|
||||
if (unlikely(offset < 0))
|
||||
goto out_err;
|
||||
|
||||
if (!dx_dir) {
|
||||
if (offset > inode->i_sb->s_maxbytes)
|
||||
goto out_err;
|
||||
} else if (offset > ext4_get_htree_eof(file))
|
||||
goto out_err;
|
||||
|
||||
/* Special lock needed here? */
|
||||
if (offset != file->f_pos) {
|
||||
file->f_pos = offset;
|
||||
file->f_version = 0;
|
||||
}
|
||||
|
||||
out_ok:
|
||||
ret = offset;
|
||||
out_err:
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* This structure holds the nodes of the red-black tree used to store
|
||||
|
@ -330,15 +441,16 @@ static void free_rb_tree_fname(struct rb_root *root)
|
|||
}
|
||||
|
||||
|
||||
/*
 * Allocate a zero-filled dir_private_info with kzalloc(GFP_KERNEL) and seed
 * its curr_hash / curr_minor_hash fields from the position value "pos".
 *
 * Returns the new structure, or NULL if the allocation fails.
 *
 * NOTE(review): this span is a rendered diff, so two signatures and two
 * pairs of hash assignments appear.  The one-argument signature and the
 * pos2*_hash(pos) calls look like the pre-patch lines; the variants taking
 * "filp" look like the post-patch lines -- confirm against the real file.
 */
static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos)
|
||||
static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp,
|
||||
loff_t pos)
|
||||
{
|
||||
struct dir_private_info *p;
|
||||
|
||||
p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
|
||||
if (!p)
|
||||
return NULL;
|
||||
p->curr_hash = pos2maj_hash(pos);
|
||||
p->curr_minor_hash = pos2min_hash(pos);
|
||||
p->curr_hash = pos2maj_hash(filp, pos);
|
||||
p->curr_minor_hash = pos2min_hash(filp, pos);
|
||||
return p;
|
||||
}
|
||||
|
||||
|
@ -429,7 +541,7 @@ static int call_filldir(struct file *filp, void *dirent,
|
|||
"null fname?!?\n");
|
||||
return 0;
|
||||
}
|
||||
curr_pos = hash2pos(fname->hash, fname->minor_hash);
|
||||
curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
|
||||
while (fname) {
|
||||
error = filldir(dirent, fname->name,
|
||||
fname->name_len, curr_pos,
|
||||
|
@ -454,13 +566,13 @@ static int ext4_dx_readdir(struct file *filp,
|
|||
int ret;
|
||||
|
||||
if (!info) {
|
||||
info = ext4_htree_create_dir_info(filp->f_pos);
|
||||
info = ext4_htree_create_dir_info(filp, filp->f_pos);
|
||||
if (!info)
|
||||
return -ENOMEM;
|
||||
filp->private_data = info;
|
||||
}
|
||||
|
||||
if (filp->f_pos == EXT4_HTREE_EOF)
|
||||
if (filp->f_pos == ext4_get_htree_eof(filp))
|
||||
return 0; /* EOF */
|
||||
|
||||
/* Some one has messed with f_pos; reset the world */
|
||||
|
@ -468,8 +580,8 @@ static int ext4_dx_readdir(struct file *filp,
|
|||
free_rb_tree_fname(&info->root);
|
||||
info->curr_node = NULL;
|
||||
info->extra_fname = NULL;
|
||||
info->curr_hash = pos2maj_hash(filp->f_pos);
|
||||
info->curr_minor_hash = pos2min_hash(filp->f_pos);
|
||||
info->curr_hash = pos2maj_hash(filp, filp->f_pos);
|
||||
info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -501,7 +613,7 @@ static int ext4_dx_readdir(struct file *filp,
|
|||
if (ret < 0)
|
||||
return ret;
|
||||
if (ret == 0) {
|
||||
filp->f_pos = EXT4_HTREE_EOF;
|
||||
filp->f_pos = ext4_get_htree_eof(filp);
|
||||
break;
|
||||
}
|
||||
info->curr_node = rb_first(&info->root);
|
||||
|
@ -521,7 +633,7 @@ static int ext4_dx_readdir(struct file *filp,
|
|||
info->curr_minor_hash = fname->minor_hash;
|
||||
} else {
|
||||
if (info->next_hash == ~0) {
|
||||
filp->f_pos = EXT4_HTREE_EOF;
|
||||
filp->f_pos = ext4_get_htree_eof(filp);
|
||||
break;
|
||||
}
|
||||
info->curr_hash = info->next_hash;
|
||||
|
@ -540,3 +652,15 @@ static int ext4_release_dir(struct inode *inode, struct file *filp)
|
|||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * File operations table for ext4 directories.  Seeking goes through
 * ext4_dir_llseek and directory reads through ext4_readdir; the
 * compat ioctl handler is wired in only when CONFIG_COMPAT is defined.
 */
const struct file_operations ext4_dir_operations = {
|
||||
.llseek = ext4_dir_llseek,
|
||||
.read = generic_read_dir,
|
||||
.readdir = ext4_readdir,
|
||||
.unlocked_ioctl = ext4_ioctl,
|
||||
#ifdef CONFIG_COMPAT
|
||||
.compat_ioctl = ext4_compat_ioctl,
|
||||
#endif
|
||||
.fsync = ext4_sync_file,
|
||||
.release = ext4_release_dir,
|
||||
};
|
||||
|
|
|
@ -1612,7 +1612,11 @@ struct dx_hash_info
|
|||
u32 *seed;
|
||||
};
|
||||
|
||||
#define EXT4_HTREE_EOF 0x7fffffff
|
||||
|
||||
/* 32 and 64 bit signed EOF for dx directories */
|
||||
#define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1)
|
||||
#define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1)
|
||||
|
||||
|
||||
/*
|
||||
* Control parameters used by ext4_htree_next_block
|
||||
|
|
|
@ -200,8 +200,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
|
|||
return -1;
|
||||
}
|
||||
hash = hash & ~1;
|
||||
if (hash == (EXT4_HTREE_EOF << 1))
|
||||
hash = (EXT4_HTREE_EOF-1) << 1;
|
||||
if (hash == (EXT4_HTREE_EOF_32BIT << 1))
|
||||
hash = (EXT4_HTREE_EOF_32BIT - 1) << 1;
|
||||
hinfo->hash = hash;
|
||||
hinfo->minor_hash = minor_hash;
|
||||
return 0;
|
||||
|
|
Loading…
Reference in a new issue