ext4: return 32/64-bit dir name hash according to usage type

Traditionally ext2/3/4 has returned a 32-bit hash value from llseek()
to appease NFSv2, which can only handle a 32-bit cookie for seekdir()
and telldir().  However, this causes problems if there are 32-bit hash
collisions, since the NFSv2 server can get stuck resending the same
entries from the directory repeatedly.

Allow ext4 to return a full 64-bit hash (both major and minor) for
telldir to decrease the chance of hash collisions.  This still needs
integration on the NFS side.

Patch-updated-by: Bernd Schubert <bernd.schubert@itwm.fraunhofer.de>
(blame me if something is not correct)

Signed-off-by: Fan Yong <yong.fan@whamcloud.com>
Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
Signed-off-by: Bernd Schubert <bernd.schubert@itwm.fraunhofer.de>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
This commit is contained in:
Fan Yong 2012-03-18 22:44:40 -04:00 committed by Theodore Ts'o
parent 6a8a13e038
commit d1f5273e9a
3 changed files with 176 additions and 48 deletions

View file

@ -32,24 +32,8 @@ static unsigned char ext4_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
}; };
static int ext4_readdir(struct file *, void *, filldir_t);
static int ext4_dx_readdir(struct file *filp, static int ext4_dx_readdir(struct file *filp,
void *dirent, filldir_t filldir); void *dirent, filldir_t filldir);
static int ext4_release_dir(struct inode *inode,
struct file *filp);
const struct file_operations ext4_dir_operations = {
.llseek = ext4_llseek,
.read = generic_read_dir,
.readdir = ext4_readdir, /* we take BKL. needed?*/
.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
#endif
.fsync = ext4_sync_file,
.release = ext4_release_dir,
};
static unsigned char get_dtype(struct super_block *sb, int filetype) static unsigned char get_dtype(struct super_block *sb, int filetype)
{ {
@ -60,6 +44,26 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
return (ext4_filetype_table[filetype]); return (ext4_filetype_table[filetype]);
} }
/**
 * is_dx_dir() - decide whether a directory inode is treated as an htree dir
 * @inode: the directory inode to examine
 *
 * A directory qualifies when the filesystem has the DIR_INDEX compat
 * feature and either the inode already carries EXT4_INODE_INDEX or its
 * size is exactly one block (such a directory could still be converted
 * to use htree indexing later on).
 *
 * Return: 1 if it is a dx dir, 0 if not
 */
static int is_dx_dir(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;

	/* Without the feature flag no directory on this fs is indexed. */
	if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
		return 0;

	/* Already indexed. */
	if (ext4_test_inode_flag(inode, EXT4_INODE_INDEX))
		return 1;

	/* Exactly one block long. */
	return (inode->i_size >> sb->s_blocksize_bits) == 1;
}
/* /*
* Return 0 if the directory entry is OK, and 1 if there is a problem * Return 0 if the directory entry is OK, and 1 if there is a problem
* *
@ -115,18 +119,13 @@ static int ext4_readdir(struct file *filp,
unsigned int offset; unsigned int offset;
int i, stored; int i, stored;
struct ext4_dir_entry_2 *de; struct ext4_dir_entry_2 *de;
struct super_block *sb;
int err; int err;
struct inode *inode = filp->f_path.dentry->d_inode; struct inode *inode = filp->f_path.dentry->d_inode;
struct super_block *sb = inode->i_sb;
int ret = 0; int ret = 0;
int dir_has_error = 0; int dir_has_error = 0;
sb = inode->i_sb; if (is_dx_dir(inode)) {
if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_COMPAT_DIR_INDEX) &&
((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
((inode->i_size >> sb->s_blocksize_bits) == 1))) {
err = ext4_dx_readdir(filp, dirent, filldir); err = ext4_dx_readdir(filp, dirent, filldir);
if (err != ERR_BAD_DX_DIR) { if (err != ERR_BAD_DX_DIR) {
ret = err; ret = err;
@ -254,22 +253,134 @@ static int ext4_readdir(struct file *filp,
return ret; return ret;
} }
/*
 * Report whether the current caller should be given 32-bit directory
 * offsets: without CONFIG_COMPAT this is a compile-time property of the
 * kernel (BITS_PER_LONG), with CONFIG_COMPAT it is whatever
 * is_compat_task() reports for the running task.
 */
static inline int is_32bit_api(void)
{
#ifndef CONFIG_COMPAT
	return (BITS_PER_LONG == 32);
#else
	return is_compat_task();
#endif
}
/* /*
* These functions convert from the major/minor hash to an f_pos * These functions convert from the major/minor hash to an f_pos
* value. * value for dx directories
* *
* Currently we only use major hash numer. This is unfortunate, but * Upper layer (for example NFS) should specify FMODE_32BITHASH or
* on 32-bit machines, the same VFS interface is used for lseek and * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted
* llseek, so if we use the 64 bit offset, then the 32-bit versions of * directly on both 32-bit and 64-bit nodes, under such case, neither
* lseek/telldir/seekdir will blow out spectacularly, and from within * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
* the ext2 low-level routine, we don't know if we're being called by
* a 64-bit version of the system call or the 32-bit version of the
* system call. Worse yet, NFSv2 only allows for a 32-bit readdir
* cookie. Sigh.
*/ */
#define hash2pos(major, minor) (major >> 1) static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) {
#define pos2min_hash(pos) (0) if ((filp->f_mode & FMODE_32BITHASH) ||
(!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
return major >> 1;
else
return ((__u64)(major >> 1) << 32) | (__u64)minor;
}
/*
 * Recover the major hash from a directory position.
 *
 * In 32-bit mode (FMODE_32BITHASH set, or neither mode flag set and
 * is_32bit_api() is true) the whole position holds the major hash; in
 * 64-bit mode it lives in the upper 32 bits.  Either way the value is
 * shifted left by one and truncated to 32 bits.
 */
static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
{
	int narrow = (filp->f_mode & FMODE_32BITHASH) ||
		     (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api());

	/* 64-bit layout: bring the upper half down first. */
	if (!narrow)
		pos >>= 32;

	return (pos << 1) & 0xffffffff;
}
/*
 * Recover the minor hash from a directory position.
 *
 * A 32-bit position carries no minor hash, so 0 is returned in that
 * mode; otherwise the minor hash is the low 32 bits of the position.
 */
static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
{
	int narrow = (filp->f_mode & FMODE_32BITHASH) ||
		     (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api());

	if (narrow)
		return 0;

	return pos & 0xffffffff;
}
/*
 * Pick the end-of-file marker for a dx directory: the 64-bit marker is
 * used unless FMODE_32BITHASH is set, or neither hash-width flag is set
 * and is_32bit_api() reports a 32-bit caller.
 */
static inline loff_t ext4_get_htree_eof(struct file *filp)
{
	if (!(filp->f_mode & FMODE_32BITHASH) &&
	    ((filp->f_mode & FMODE_64BITHASH) || !is_32bit_api()))
		return EXT4_HTREE_EOF_64BIT;

	return EXT4_HTREE_EOF_32BIT;
}
/*
 * ext4_dir_llseek() based on generic_file_llseek() to handle both
 * non-htree and htree directories, where the "offset" is in terms
 * of the filename hash value instead of the byte offset.
 *
 * Only SEEK_SET, SEEK_CUR and SEEK_END are accepted; any other origin is
 * rejected instead of being silently treated as an absolute seek.
 *
 * Returns the resulting position on success, or -EINVAL when the origin
 * is unsupported, a positive offset is combined with SEEK_END, or the
 * resulting position is negative or beyond the applicable limit
 * (s_maxbytes for non-dx directories, the htree EOF marker otherwise).
 *
 * NOTE: offsets obtained *before* ext4_set_inode_flag(dir, EXT4_INODE_INDEX)
 *       will be invalid once the directory was converted into a dx directory
 */
loff_t ext4_dir_llseek(struct file *file, loff_t offset, int origin)
{
	struct inode *inode = file->f_mapping->host;
	loff_t ret = -EINVAL;
	int dx_dir = is_dx_dir(inode);

	mutex_lock(&inode->i_mutex);

	/* NOTE: relative offsets with dx directories might not work
	 *       as expected, as it is difficult to figure out the
	 *       correct offset between dx hashes */

	switch (origin) {
	case SEEK_SET:
		/* absolute position: validated below */
		break;
	case SEEK_END:
		if (unlikely(offset > 0))
			goto out_err; /* not supported for directories */

		/* so only negative offsets are left, does that have a
		 * meaning for directories at all? */
		if (dx_dir)
			offset += ext4_get_htree_eof(file);
		else
			offset += inode->i_size;
		break;
	case SEEK_CUR:
		/*
		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
		 * position-querying operation.  Avoid rewriting the "same"
		 * f_pos value back to the file because a concurrent read(),
		 * write() or lseek() might have altered it
		 */
		if (offset == 0) {
			offset = file->f_pos;
			goto out_ok;
		}

		offset += file->f_pos;
		break;
	default:
		/* e.g. SEEK_DATA/SEEK_HOLE: no meaning for hash offsets */
		goto out_err;
	}

	if (unlikely(offset < 0))
		goto out_err;

	if (!dx_dir) {
		if (offset > inode->i_sb->s_maxbytes)
			goto out_err;
	} else if (offset > ext4_get_htree_eof(file))
		goto out_err;

	/* Special lock needed here? */
	if (offset != file->f_pos) {
		file->f_pos = offset;
		/* reset f_version whenever the position actually moves */
		file->f_version = 0;
	}

out_ok:
	ret = offset;
out_err:
	mutex_unlock(&inode->i_mutex);

	return ret;
}
/* /*
* This structure holds the nodes of the red-black tree used to store * This structure holds the nodes of the red-black tree used to store
@ -330,15 +441,16 @@ static void free_rb_tree_fname(struct rb_root *root)
} }
static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos) static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp,
loff_t pos)
{ {
struct dir_private_info *p; struct dir_private_info *p;
p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
if (!p) if (!p)
return NULL; return NULL;
p->curr_hash = pos2maj_hash(pos); p->curr_hash = pos2maj_hash(filp, pos);
p->curr_minor_hash = pos2min_hash(pos); p->curr_minor_hash = pos2min_hash(filp, pos);
return p; return p;
} }
@ -429,7 +541,7 @@ static int call_filldir(struct file *filp, void *dirent,
"null fname?!?\n"); "null fname?!?\n");
return 0; return 0;
} }
curr_pos = hash2pos(fname->hash, fname->minor_hash); curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
while (fname) { while (fname) {
error = filldir(dirent, fname->name, error = filldir(dirent, fname->name,
fname->name_len, curr_pos, fname->name_len, curr_pos,
@ -454,13 +566,13 @@ static int ext4_dx_readdir(struct file *filp,
int ret; int ret;
if (!info) { if (!info) {
info = ext4_htree_create_dir_info(filp->f_pos); info = ext4_htree_create_dir_info(filp, filp->f_pos);
if (!info) if (!info)
return -ENOMEM; return -ENOMEM;
filp->private_data = info; filp->private_data = info;
} }
if (filp->f_pos == EXT4_HTREE_EOF) if (filp->f_pos == ext4_get_htree_eof(filp))
return 0; /* EOF */ return 0; /* EOF */
/* Some one has messed with f_pos; reset the world */ /* Some one has messed with f_pos; reset the world */
@ -468,8 +580,8 @@ static int ext4_dx_readdir(struct file *filp,
free_rb_tree_fname(&info->root); free_rb_tree_fname(&info->root);
info->curr_node = NULL; info->curr_node = NULL;
info->extra_fname = NULL; info->extra_fname = NULL;
info->curr_hash = pos2maj_hash(filp->f_pos); info->curr_hash = pos2maj_hash(filp, filp->f_pos);
info->curr_minor_hash = pos2min_hash(filp->f_pos); info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
} }
/* /*
@ -501,7 +613,7 @@ static int ext4_dx_readdir(struct file *filp,
if (ret < 0) if (ret < 0)
return ret; return ret;
if (ret == 0) { if (ret == 0) {
filp->f_pos = EXT4_HTREE_EOF; filp->f_pos = ext4_get_htree_eof(filp);
break; break;
} }
info->curr_node = rb_first(&info->root); info->curr_node = rb_first(&info->root);
@ -521,7 +633,7 @@ static int ext4_dx_readdir(struct file *filp,
info->curr_minor_hash = fname->minor_hash; info->curr_minor_hash = fname->minor_hash;
} else { } else {
if (info->next_hash == ~0) { if (info->next_hash == ~0) {
filp->f_pos = EXT4_HTREE_EOF; filp->f_pos = ext4_get_htree_eof(filp);
break; break;
} }
info->curr_hash = info->next_hash; info->curr_hash = info->next_hash;
@ -540,3 +652,15 @@ static int ext4_release_dir(struct inode *inode, struct file *filp)
return 0; return 0;
} }
/*
 * File operations for ext4 directories.  Seeking goes through
 * ext4_dir_llseek() and directory iteration through ext4_readdir().
 */
const struct file_operations ext4_dir_operations = {
	/* positioning and iteration */
	.llseek		= ext4_dir_llseek,
	.read		= generic_read_dir,
	.readdir	= ext4_readdir,

	/* ioctl entry points */
	.unlocked_ioctl	= ext4_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= ext4_compat_ioctl,
#endif

	/* sync and teardown */
	.fsync		= ext4_sync_file,
	.release	= ext4_release_dir,
};

View file

@ -1612,7 +1612,11 @@ struct dx_hash_info
u32 *seed; u32 *seed;
}; };
#define EXT4_HTREE_EOF 0x7fffffff
/*
 * End-of-file markers for dx directories: the largest positive value of a
 * signed 32-bit and of a signed 64-bit offset respectively.
 */
#define EXT4_HTREE_EOF_32BIT	0x7fffffffUL
#define EXT4_HTREE_EOF_64BIT	0x7fffffffffffffffULL
/* /*
* Control parameters used by ext4_htree_next_block * Control parameters used by ext4_htree_next_block

View file

@ -200,8 +200,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
return -1; return -1;
} }
hash = hash & ~1; hash = hash & ~1;
if (hash == (EXT4_HTREE_EOF << 1)) if (hash == (EXT4_HTREE_EOF_32BIT << 1))
hash = (EXT4_HTREE_EOF-1) << 1; hash = (EXT4_HTREE_EOF_32BIT - 1) << 1;
hinfo->hash = hash; hinfo->hash = hash;
hinfo->minor_hash = minor_hash; hinfo->minor_hash = minor_hash;
return 0; return 0;