[DLM] fix stopping unstarted recovery
Red Hat BZ 211914 When many nodes are joining a lockspace simultaneously, the dlm gets a quick sequence of stop/start events, a pair for adding each node. dlm_controld in user space sends dlm_recoverd in the kernel each stop and start event. dlm_controld will sometimes send the stop before dlm_recoverd has had a chance to take up the previously queued start. The stop aborts the processing of the previous start by setting the RECOVERY_STOP flag. dlm_recoverd is erroneously clearing this flag and ignoring the stop/abort if it happens to take up the start after the stop meant to abort it. The fix is to check the sequence number that's incremented for each stop/start before clearing the flag. Signed-off-by: David Teigland <teigland@redhat.com> Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
This commit is contained in:
parent
91c0dc93a1
commit
2cdc98aaf0
1 changed files with 6 additions and 1 deletions
|
@ -219,6 +219,10 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
|
|||
return error;
|
||||
}
|
||||
|
||||
/* The dlm_ls_start() that created the rv we take here may already have been
|
||||
stopped via dlm_ls_stop(); in that case we need to leave the RECOVERY_STOP
|
||||
flag set. */
|
||||
|
||||
static void do_ls_recovery(struct dlm_ls *ls)
|
||||
{
|
||||
struct dlm_recover *rv = NULL;
|
||||
|
@ -226,7 +230,8 @@ static void do_ls_recovery(struct dlm_ls *ls)
|
|||
spin_lock(&ls->ls_recover_lock);
|
||||
rv = ls->ls_recover_args;
|
||||
ls->ls_recover_args = NULL;
|
||||
clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
|
||||
if (rv && ls->ls_recover_seq == rv->seq)
|
||||
clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
|
||||
spin_unlock(&ls->ls_recover_lock);
|
||||
|
||||
if (rv) {
|
||||
|
|
Loading…
Reference in a new issue