dlm: fix race between remove and lookup

It was possible for a remove message on an old
rsb to be sent after a lookup message on a new
rsb, where the rsbs were for the same resource
name.  This could lead to a missing directory
entry for the new rsb.

It is fixed by keeping a copy of the resource
name being removed until after the remove has
been sent.  A lookup checks if this in-progress
remove matches the name it is looking up.

Signed-off-by: David Teigland <teigland@redhat.com>
This commit is contained in:
David Teigland 2012-06-14 12:17:32 -05:00
parent 1d7c484eeb
commit 05c32f47bf
3 changed files with 178 additions and 41 deletions

View file

@ -498,6 +498,13 @@ struct rcom_lock {
char rl_lvb[0]; char rl_lvb[0];
}; };
/*
* The max number of resources per rsbtbl bucket that shrink will attempt
* to remove in each iteration.
*/
#define DLM_REMOVE_NAMES_MAX 8
struct dlm_ls { struct dlm_ls {
struct list_head ls_list; /* list of lockspaces */ struct list_head ls_list; /* list of lockspaces */
dlm_lockspace_t *ls_local_handle; dlm_lockspace_t *ls_local_handle;
@ -531,6 +538,12 @@ struct dlm_ls {
int ls_new_rsb_count; int ls_new_rsb_count;
struct list_head ls_new_rsb; /* new rsb structs */ struct list_head ls_new_rsb; /* new rsb structs */
spinlock_t ls_remove_spin;
char ls_remove_name[DLM_RESNAME_MAXLEN+1];
char *ls_remove_names[DLM_REMOVE_NAMES_MAX];
int ls_remove_len;
int ls_remove_lens[DLM_REMOVE_NAMES_MAX];
struct list_head ls_nodes; /* current nodes in ls */ struct list_head ls_nodes; /* current nodes in ls */
struct list_head ls_nodes_gone; /* dead node list, recovery */ struct list_head ls_nodes_gone; /* dead node list, recovery */
int ls_num_nodes; /* number of nodes in ls */ int ls_num_nodes; /* number of nodes in ls */

View file

@ -1624,65 +1624,170 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
return error; return error;
} }
/* FIXME: make this more efficient */ /* If there's an rsb for the same resource being removed, ensure
that the remove message is sent before the new lookup message.
It should be rare to need a delay here, but if not, then it may
be worthwhile to add a proper wait mechanism rather than a delay. */
static int shrink_bucket(struct dlm_ls *ls, int b) static void wait_pending_remove(struct dlm_rsb *r)
{ {
struct rb_node *n; struct dlm_ls *ls = r->res_ls;
restart:
spin_lock(&ls->ls_remove_spin);
if (ls->ls_remove_len &&
!rsb_cmp(r, ls->ls_remove_name, ls->ls_remove_len)) {
log_debug(ls, "delay lookup for remove dir %d %s",
r->res_dir_nodeid, r->res_name);
spin_unlock(&ls->ls_remove_spin);
msleep(1);
goto restart;
}
spin_unlock(&ls->ls_remove_spin);
}
/*
* ls_remove_spin protects ls_remove_name and ls_remove_len which are
* read by other threads in wait_pending_remove. ls_remove_names
* and ls_remove_lens are only used by the scan thread, so they do
* not need protection.
*/
static void shrink_bucket(struct dlm_ls *ls, int b)
{
struct rb_node *n, *next;
struct dlm_rsb *r; struct dlm_rsb *r;
char *name;
int our_nodeid = dlm_our_nodeid(); int our_nodeid = dlm_our_nodeid();
int count = 0, found; int remote_count = 0;
int i, len, rv;
for (;;) { memset(&ls->ls_remove_lens, 0, sizeof(int) * DLM_REMOVE_NAMES_MAX);
found = 0;
spin_lock(&ls->ls_rsbtbl[b].lock);
for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = rb_next(n)) {
r = rb_entry(n, struct dlm_rsb, res_hashnode);
/* If we're the directory record for this rsb, and spin_lock(&ls->ls_rsbtbl[b].lock);
we're not the master of it, then we need to wait for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = next) {
for the master node to send us a dir remove for next = rb_next(n);
before removing the dir record. */ r = rb_entry(n, struct dlm_rsb, res_hashnode);
if (!dlm_no_directory(ls) && !is_master(r) && /* If we're the directory record for this rsb, and
(dlm_dir_nodeid(r) == our_nodeid)) { we're not the master of it, then we need to wait
continue; for the master node to send us a dir remove for
} before removing the dir record. */
if (!time_after_eq(jiffies, r->res_toss_time + if (!dlm_no_directory(ls) &&
dlm_config.ci_toss_secs * HZ)) (r->res_master_nodeid != our_nodeid) &&
continue; (dlm_dir_nodeid(r) == our_nodeid)) {
found = 1; continue;
break;
} }
if (!found) { if (!time_after_eq(jiffies, r->res_toss_time +
spin_unlock(&ls->ls_rsbtbl[b].lock); dlm_config.ci_toss_secs * HZ)) {
break; continue;
} }
if (kref_put(&r->res_ref, kill_rsb)) { if (!dlm_no_directory(ls) &&
rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); (r->res_master_nodeid == our_nodeid) &&
spin_unlock(&ls->ls_rsbtbl[b].lock); (dlm_dir_nodeid(r) != our_nodeid)) {
/* We're the master of this rsb but we're not /* We're the master of this rsb but we're not
the directory record, so we need to tell the the directory record, so we need to tell the
dir node to remove the dir record. */ dir node to remove the dir record. */
if (!dlm_no_directory(ls) && is_master(r) && ls->ls_remove_lens[remote_count] = r->res_length;
(dlm_dir_nodeid(r) != our_nodeid)) { memcpy(ls->ls_remove_names[remote_count], r->res_name,
send_remove(r); DLM_RESNAME_MAXLEN);
} remote_count++;
dlm_free_rsb(r); if (remote_count >= DLM_REMOVE_NAMES_MAX)
count++; break;
} else { continue;
spin_unlock(&ls->ls_rsbtbl[b].lock);
log_error(ls, "tossed rsb in use %s", r->res_name);
} }
}
return count; if (!kref_put(&r->res_ref, kill_rsb)) {
log_error(ls, "tossed rsb in use %s", r->res_name);
continue;
}
rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
dlm_free_rsb(r);
}
spin_unlock(&ls->ls_rsbtbl[b].lock);
/*
* While searching for rsb's to free, we found some that require
* remote removal. We leave them in place and find them again here
* so there is a very small gap between removing them from the toss
* list and sending the removal. Keeping this gap small is
* important to keep us (the master node) from being out of sync
* with the remote dir node for very long.
*
* From the time the rsb is removed from toss until just after
* send_remove, the rsb name is saved in ls_remove_name. A new
* lookup checks this to ensure that a new lookup message for the
* same resource name is not sent just before the remove message.
*/
for (i = 0; i < remote_count; i++) {
name = ls->ls_remove_names[i];
len = ls->ls_remove_lens[i];
spin_lock(&ls->ls_rsbtbl[b].lock);
rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
if (rv) {
spin_unlock(&ls->ls_rsbtbl[b].lock);
log_debug(ls, "remove_name not toss %s", name);
continue;
}
if (r->res_master_nodeid != our_nodeid) {
spin_unlock(&ls->ls_rsbtbl[b].lock);
log_debug(ls, "remove_name master %d dir %d our %d %s",
r->res_master_nodeid, r->res_dir_nodeid,
our_nodeid, name);
continue;
}
if (r->res_dir_nodeid == our_nodeid) {
/* should never happen */
spin_unlock(&ls->ls_rsbtbl[b].lock);
log_error(ls, "remove_name dir %d master %d our %d %s",
r->res_dir_nodeid, r->res_master_nodeid,
our_nodeid, name);
continue;
}
if (!time_after_eq(jiffies, r->res_toss_time +
dlm_config.ci_toss_secs * HZ)) {
spin_unlock(&ls->ls_rsbtbl[b].lock);
log_debug(ls, "remove_name toss_time %lu now %lu %s",
r->res_toss_time, jiffies, name);
continue;
}
if (!kref_put(&r->res_ref, kill_rsb)) {
spin_unlock(&ls->ls_rsbtbl[b].lock);
log_error(ls, "remove_name in use %s", name);
continue;
}
rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
/* block lookup of same name until we've sent remove */
spin_lock(&ls->ls_remove_spin);
ls->ls_remove_len = len;
memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN);
spin_unlock(&ls->ls_remove_spin);
spin_unlock(&ls->ls_rsbtbl[b].lock);
send_remove(r);
/* allow lookup of name again */
spin_lock(&ls->ls_remove_spin);
ls->ls_remove_len = 0;
memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN);
spin_unlock(&ls->ls_remove_spin);
dlm_free_rsb(r);
}
} }
void dlm_scan_rsbs(struct dlm_ls *ls) void dlm_scan_rsbs(struct dlm_ls *ls)
@ -2608,6 +2713,8 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
return 0; return 0;
} }
wait_pending_remove(r);
r->res_first_lkid = lkb->lkb_id; r->res_first_lkid = lkb->lkb_id;
send_lookup(r, lkb); send_lookup(r, lkb);
return 1; return 1;

View file

@ -506,6 +506,15 @@ static int new_lockspace(const char *name, const char *cluster,
spin_lock_init(&ls->ls_rsbtbl[i].lock); spin_lock_init(&ls->ls_rsbtbl[i].lock);
} }
spin_lock_init(&ls->ls_remove_spin);
for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) {
ls->ls_remove_names[i] = kzalloc(DLM_RESNAME_MAXLEN+1,
GFP_KERNEL);
if (!ls->ls_remove_names[i])
goto out_rsbtbl;
}
idr_init(&ls->ls_lkbidr); idr_init(&ls->ls_lkbidr);
spin_lock_init(&ls->ls_lkbidr_spin); spin_lock_init(&ls->ls_lkbidr_spin);
@ -556,7 +565,7 @@ static int new_lockspace(const char *name, const char *cluster,
ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS); ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS);
if (!ls->ls_recover_buf) if (!ls->ls_recover_buf)
goto out_lkbfree; goto out_lkbidr;
ls->ls_slot = 0; ls->ls_slot = 0;
ls->ls_num_slots = 0; ls->ls_num_slots = 0;
@ -640,8 +649,13 @@ static int new_lockspace(const char *name, const char *cluster,
spin_unlock(&lslist_lock); spin_unlock(&lslist_lock);
idr_destroy(&ls->ls_recover_idr); idr_destroy(&ls->ls_recover_idr);
kfree(ls->ls_recover_buf); kfree(ls->ls_recover_buf);
out_lkbfree: out_lkbidr:
idr_destroy(&ls->ls_lkbidr); idr_destroy(&ls->ls_lkbidr);
for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) {
if (ls->ls_remove_names[i])
kfree(ls->ls_remove_names[i]);
}
out_rsbtbl:
vfree(ls->ls_rsbtbl); vfree(ls->ls_rsbtbl);
out_lsfree: out_lsfree:
if (do_unreg) if (do_unreg)
@ -796,6 +810,9 @@ static int release_lockspace(struct dlm_ls *ls, int force)
vfree(ls->ls_rsbtbl); vfree(ls->ls_rsbtbl);
for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++)
kfree(ls->ls_remove_names[i]);
while (!list_empty(&ls->ls_new_rsb)) { while (!list_empty(&ls->ls_new_rsb)) {
rsb = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb, rsb = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb,
res_hashchain); res_hashchain);