dlm: fix race between remove and lookup
It was possible for a remove message on an old rsb to be sent after a lookup message on a new rsb, where the rsbs were for the same resource name. This could lead to a missing directory entry for the new rsb. It is fixed by keeping a copy of the resource name being removed until after the remove has been sent. A lookup checks if this in-progress remove matches the name it is looking up. Signed-off-by: David Teigland <teigland@redhat.com>
This commit is contained in:
parent
1d7c484eeb
commit
05c32f47bf
3 changed files with 178 additions and 41 deletions
|
@ -498,6 +498,13 @@ struct rcom_lock {
|
||||||
char rl_lvb[0];
|
char rl_lvb[0];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The max number of resources per rsbtbl bucket that shrink will attempt
|
||||||
|
* to remove in each iteration.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define DLM_REMOVE_NAMES_MAX 8
|
||||||
|
|
||||||
struct dlm_ls {
|
struct dlm_ls {
|
||||||
struct list_head ls_list; /* list of lockspaces */
|
struct list_head ls_list; /* list of lockspaces */
|
||||||
dlm_lockspace_t *ls_local_handle;
|
dlm_lockspace_t *ls_local_handle;
|
||||||
|
@ -531,6 +538,12 @@ struct dlm_ls {
|
||||||
int ls_new_rsb_count;
|
int ls_new_rsb_count;
|
||||||
struct list_head ls_new_rsb; /* new rsb structs */
|
struct list_head ls_new_rsb; /* new rsb structs */
|
||||||
|
|
||||||
|
spinlock_t ls_remove_spin;
|
||||||
|
char ls_remove_name[DLM_RESNAME_MAXLEN+1];
|
||||||
|
char *ls_remove_names[DLM_REMOVE_NAMES_MAX];
|
||||||
|
int ls_remove_len;
|
||||||
|
int ls_remove_lens[DLM_REMOVE_NAMES_MAX];
|
||||||
|
|
||||||
struct list_head ls_nodes; /* current nodes in ls */
|
struct list_head ls_nodes; /* current nodes in ls */
|
||||||
struct list_head ls_nodes_gone; /* dead node list, recovery */
|
struct list_head ls_nodes_gone; /* dead node list, recovery */
|
||||||
int ls_num_nodes; /* number of nodes in ls */
|
int ls_num_nodes; /* number of nodes in ls */
|
||||||
|
|
185
fs/dlm/lock.c
185
fs/dlm/lock.c
|
@ -1624,65 +1624,170 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
|
||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* FIXME: make this more efficient */
|
/* If there's an rsb for the same resource being removed, ensure
|
||||||
|
that the remove message is sent before the new lookup message.
|
||||||
|
It should be rare to need a delay here, but if not, then it may
|
||||||
|
be worthwhile to add a proper wait mechanism rather than a delay. */
|
||||||
|
|
||||||
static int shrink_bucket(struct dlm_ls *ls, int b)
|
static void wait_pending_remove(struct dlm_rsb *r)
|
||||||
{
|
{
|
||||||
struct rb_node *n;
|
struct dlm_ls *ls = r->res_ls;
|
||||||
|
restart:
|
||||||
|
spin_lock(&ls->ls_remove_spin);
|
||||||
|
if (ls->ls_remove_len &&
|
||||||
|
!rsb_cmp(r, ls->ls_remove_name, ls->ls_remove_len)) {
|
||||||
|
log_debug(ls, "delay lookup for remove dir %d %s",
|
||||||
|
r->res_dir_nodeid, r->res_name);
|
||||||
|
spin_unlock(&ls->ls_remove_spin);
|
||||||
|
msleep(1);
|
||||||
|
goto restart;
|
||||||
|
}
|
||||||
|
spin_unlock(&ls->ls_remove_spin);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* ls_remove_spin protects ls_remove_name and ls_remove_len which are
|
||||||
|
* read by other threads in wait_pending_remove. ls_remove_names
|
||||||
|
* and ls_remove_lens are only used by the scan thread, so they do
|
||||||
|
* not need protection.
|
||||||
|
*/
|
||||||
|
|
||||||
|
static void shrink_bucket(struct dlm_ls *ls, int b)
|
||||||
|
{
|
||||||
|
struct rb_node *n, *next;
|
||||||
struct dlm_rsb *r;
|
struct dlm_rsb *r;
|
||||||
|
char *name;
|
||||||
int our_nodeid = dlm_our_nodeid();
|
int our_nodeid = dlm_our_nodeid();
|
||||||
int count = 0, found;
|
int remote_count = 0;
|
||||||
|
int i, len, rv;
|
||||||
|
|
||||||
for (;;) {
|
memset(&ls->ls_remove_lens, 0, sizeof(int) * DLM_REMOVE_NAMES_MAX);
|
||||||
found = 0;
|
|
||||||
spin_lock(&ls->ls_rsbtbl[b].lock);
|
|
||||||
for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = rb_next(n)) {
|
|
||||||
r = rb_entry(n, struct dlm_rsb, res_hashnode);
|
|
||||||
|
|
||||||
/* If we're the directory record for this rsb, and
|
spin_lock(&ls->ls_rsbtbl[b].lock);
|
||||||
we're not the master of it, then we need to wait
|
for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = next) {
|
||||||
for the master node to send us a dir remove for
|
next = rb_next(n);
|
||||||
before removing the dir record. */
|
r = rb_entry(n, struct dlm_rsb, res_hashnode);
|
||||||
|
|
||||||
if (!dlm_no_directory(ls) && !is_master(r) &&
|
/* If we're the directory record for this rsb, and
|
||||||
(dlm_dir_nodeid(r) == our_nodeid)) {
|
we're not the master of it, then we need to wait
|
||||||
continue;
|
for the master node to send us a dir remove for
|
||||||
}
|
before removing the dir record. */
|
||||||
|
|
||||||
if (!time_after_eq(jiffies, r->res_toss_time +
|
if (!dlm_no_directory(ls) &&
|
||||||
dlm_config.ci_toss_secs * HZ))
|
(r->res_master_nodeid != our_nodeid) &&
|
||||||
continue;
|
(dlm_dir_nodeid(r) == our_nodeid)) {
|
||||||
found = 1;
|
continue;
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!found) {
|
if (!time_after_eq(jiffies, r->res_toss_time +
|
||||||
spin_unlock(&ls->ls_rsbtbl[b].lock);
|
dlm_config.ci_toss_secs * HZ)) {
|
||||||
break;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (kref_put(&r->res_ref, kill_rsb)) {
|
if (!dlm_no_directory(ls) &&
|
||||||
rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
|
(r->res_master_nodeid == our_nodeid) &&
|
||||||
spin_unlock(&ls->ls_rsbtbl[b].lock);
|
(dlm_dir_nodeid(r) != our_nodeid)) {
|
||||||
|
|
||||||
/* We're the master of this rsb but we're not
|
/* We're the master of this rsb but we're not
|
||||||
the directory record, so we need to tell the
|
the directory record, so we need to tell the
|
||||||
dir node to remove the dir record. */
|
dir node to remove the dir record. */
|
||||||
|
|
||||||
if (!dlm_no_directory(ls) && is_master(r) &&
|
ls->ls_remove_lens[remote_count] = r->res_length;
|
||||||
(dlm_dir_nodeid(r) != our_nodeid)) {
|
memcpy(ls->ls_remove_names[remote_count], r->res_name,
|
||||||
send_remove(r);
|
DLM_RESNAME_MAXLEN);
|
||||||
}
|
remote_count++;
|
||||||
|
|
||||||
dlm_free_rsb(r);
|
if (remote_count >= DLM_REMOVE_NAMES_MAX)
|
||||||
count++;
|
break;
|
||||||
} else {
|
continue;
|
||||||
spin_unlock(&ls->ls_rsbtbl[b].lock);
|
|
||||||
log_error(ls, "tossed rsb in use %s", r->res_name);
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
return count;
|
if (!kref_put(&r->res_ref, kill_rsb)) {
|
||||||
|
log_error(ls, "tossed rsb in use %s", r->res_name);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
|
||||||
|
dlm_free_rsb(r);
|
||||||
|
}
|
||||||
|
spin_unlock(&ls->ls_rsbtbl[b].lock);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* While searching for rsb's to free, we found some that require
|
||||||
|
* remote removal. We leave them in place and find them again here
|
||||||
|
* so there is a very small gap between removing them from the toss
|
||||||
|
* list and sending the removal. Keeping this gap small is
|
||||||
|
* important to keep us (the master node) from being out of sync
|
||||||
|
* with the remote dir node for very long.
|
||||||
|
*
|
||||||
|
* From the time the rsb is removed from toss until just after
|
||||||
|
* send_remove, the rsb name is saved in ls_remove_name. A new
|
||||||
|
* lookup checks this to ensure that a new lookup message for the
|
||||||
|
* same resource name is not sent just before the remove message.
|
||||||
|
*/
|
||||||
|
|
||||||
|
for (i = 0; i < remote_count; i++) {
|
||||||
|
name = ls->ls_remove_names[i];
|
||||||
|
len = ls->ls_remove_lens[i];
|
||||||
|
|
||||||
|
spin_lock(&ls->ls_rsbtbl[b].lock);
|
||||||
|
rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
|
||||||
|
if (rv) {
|
||||||
|
spin_unlock(&ls->ls_rsbtbl[b].lock);
|
||||||
|
log_debug(ls, "remove_name not toss %s", name);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (r->res_master_nodeid != our_nodeid) {
|
||||||
|
spin_unlock(&ls->ls_rsbtbl[b].lock);
|
||||||
|
log_debug(ls, "remove_name master %d dir %d our %d %s",
|
||||||
|
r->res_master_nodeid, r->res_dir_nodeid,
|
||||||
|
our_nodeid, name);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (r->res_dir_nodeid == our_nodeid) {
|
||||||
|
/* should never happen */
|
||||||
|
spin_unlock(&ls->ls_rsbtbl[b].lock);
|
||||||
|
log_error(ls, "remove_name dir %d master %d our %d %s",
|
||||||
|
r->res_dir_nodeid, r->res_master_nodeid,
|
||||||
|
our_nodeid, name);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!time_after_eq(jiffies, r->res_toss_time +
|
||||||
|
dlm_config.ci_toss_secs * HZ)) {
|
||||||
|
spin_unlock(&ls->ls_rsbtbl[b].lock);
|
||||||
|
log_debug(ls, "remove_name toss_time %lu now %lu %s",
|
||||||
|
r->res_toss_time, jiffies, name);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!kref_put(&r->res_ref, kill_rsb)) {
|
||||||
|
spin_unlock(&ls->ls_rsbtbl[b].lock);
|
||||||
|
log_error(ls, "remove_name in use %s", name);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
|
||||||
|
|
||||||
|
/* block lookup of same name until we've sent remove */
|
||||||
|
spin_lock(&ls->ls_remove_spin);
|
||||||
|
ls->ls_remove_len = len;
|
||||||
|
memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN);
|
||||||
|
spin_unlock(&ls->ls_remove_spin);
|
||||||
|
spin_unlock(&ls->ls_rsbtbl[b].lock);
|
||||||
|
|
||||||
|
send_remove(r);
|
||||||
|
|
||||||
|
/* allow lookup of name again */
|
||||||
|
spin_lock(&ls->ls_remove_spin);
|
||||||
|
ls->ls_remove_len = 0;
|
||||||
|
memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN);
|
||||||
|
spin_unlock(&ls->ls_remove_spin);
|
||||||
|
|
||||||
|
dlm_free_rsb(r);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void dlm_scan_rsbs(struct dlm_ls *ls)
|
void dlm_scan_rsbs(struct dlm_ls *ls)
|
||||||
|
@ -2608,6 +2713,8 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
wait_pending_remove(r);
|
||||||
|
|
||||||
r->res_first_lkid = lkb->lkb_id;
|
r->res_first_lkid = lkb->lkb_id;
|
||||||
send_lookup(r, lkb);
|
send_lookup(r, lkb);
|
||||||
return 1;
|
return 1;
|
||||||
|
|
|
@ -506,6 +506,15 @@ static int new_lockspace(const char *name, const char *cluster,
|
||||||
spin_lock_init(&ls->ls_rsbtbl[i].lock);
|
spin_lock_init(&ls->ls_rsbtbl[i].lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
spin_lock_init(&ls->ls_remove_spin);
|
||||||
|
|
||||||
|
for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) {
|
||||||
|
ls->ls_remove_names[i] = kzalloc(DLM_RESNAME_MAXLEN+1,
|
||||||
|
GFP_KERNEL);
|
||||||
|
if (!ls->ls_remove_names[i])
|
||||||
|
goto out_rsbtbl;
|
||||||
|
}
|
||||||
|
|
||||||
idr_init(&ls->ls_lkbidr);
|
idr_init(&ls->ls_lkbidr);
|
||||||
spin_lock_init(&ls->ls_lkbidr_spin);
|
spin_lock_init(&ls->ls_lkbidr_spin);
|
||||||
|
|
||||||
|
@ -556,7 +565,7 @@ static int new_lockspace(const char *name, const char *cluster,
|
||||||
|
|
||||||
ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS);
|
ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS);
|
||||||
if (!ls->ls_recover_buf)
|
if (!ls->ls_recover_buf)
|
||||||
goto out_lkbfree;
|
goto out_lkbidr;
|
||||||
|
|
||||||
ls->ls_slot = 0;
|
ls->ls_slot = 0;
|
||||||
ls->ls_num_slots = 0;
|
ls->ls_num_slots = 0;
|
||||||
|
@ -640,8 +649,13 @@ static int new_lockspace(const char *name, const char *cluster,
|
||||||
spin_unlock(&lslist_lock);
|
spin_unlock(&lslist_lock);
|
||||||
idr_destroy(&ls->ls_recover_idr);
|
idr_destroy(&ls->ls_recover_idr);
|
||||||
kfree(ls->ls_recover_buf);
|
kfree(ls->ls_recover_buf);
|
||||||
out_lkbfree:
|
out_lkbidr:
|
||||||
idr_destroy(&ls->ls_lkbidr);
|
idr_destroy(&ls->ls_lkbidr);
|
||||||
|
for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) {
|
||||||
|
if (ls->ls_remove_names[i])
|
||||||
|
kfree(ls->ls_remove_names[i]);
|
||||||
|
}
|
||||||
|
out_rsbtbl:
|
||||||
vfree(ls->ls_rsbtbl);
|
vfree(ls->ls_rsbtbl);
|
||||||
out_lsfree:
|
out_lsfree:
|
||||||
if (do_unreg)
|
if (do_unreg)
|
||||||
|
@ -796,6 +810,9 @@ static int release_lockspace(struct dlm_ls *ls, int force)
|
||||||
|
|
||||||
vfree(ls->ls_rsbtbl);
|
vfree(ls->ls_rsbtbl);
|
||||||
|
|
||||||
|
for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++)
|
||||||
|
kfree(ls->ls_remove_names[i]);
|
||||||
|
|
||||||
while (!list_empty(&ls->ls_new_rsb)) {
|
while (!list_empty(&ls->ls_new_rsb)) {
|
||||||
rsb = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb,
|
rsb = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb,
|
||||||
res_hashchain);
|
res_hashchain);
|
||||||
|
|
Loading…
Reference in a new issue