libceph: fix msgr backoff
With commit f363e45f we replaced a bunch of hacky workqueue mutual exclusion logic with the WQ_NON_REENTRANT flag. One piece of fallout is that the exponential backoff breaks in certain cases: * con_work attempts to connect. * we get an immediate failure, and the socket state change handler queues immediate work. * con_work calls ceph_fault, we decide to back off, but can't queue delayed work. In this case, we add a BACKOFF bit to make con_work reschedule delayed work next time it runs (which should be immediately). Signed-off-by: Sage Weil <sage@newdream.net>
This commit is contained in:
parent
692d20f576
commit
60bf8bf881
2 changed files with 29 additions and 2 deletions
|
@ -123,6 +123,7 @@ struct ceph_msg_pos {
|
|||
#define SOCK_CLOSED 11 /* socket state changed to closed */
|
||||
#define OPENING 13 /* open connection w/ (possibly new) peer */
|
||||
#define DEAD 14 /* dead, about to kfree */
|
||||
#define BACKOFF 15 /* delayed requeue failed; con_work must requeue itself with the backoff delay */
|
||||
|
||||
/*
|
||||
* A single connection with another host.
|
||||
|
|
|
@ -1949,6 +1949,19 @@ static void con_work(struct work_struct *work)
|
|||
work.work);
|
||||
|
||||
mutex_lock(&con->mutex);
|
||||
if (test_and_clear_bit(BACKOFF, &con->state)) {
|
||||
dout("con_work %p backing off\n", con);
|
||||
if (queue_delayed_work(ceph_msgr_wq, &con->work,
|
||||
round_jiffies_relative(con->delay))) {
|
||||
dout("con_work %p backoff %lu\n", con, con->delay);
|
||||
mutex_unlock(&con->mutex);
|
||||
return;
|
||||
} else {
|
||||
con->ops->put(con);
|
||||
dout("con_work %p FAILED to back off %lu\n", con,
|
||||
con->delay);
|
||||
}
|
||||
}
|
||||
|
||||
if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
|
||||
dout("con_work CLOSED\n");
|
||||
|
@ -2017,11 +2030,24 @@ static void ceph_fault(struct ceph_connection *con)
|
|||
con->delay = BASE_DELAY_INTERVAL;
|
||||
else if (con->delay < MAX_DELAY_INTERVAL)
|
||||
con->delay *= 2;
|
||||
dout("fault queueing %p delay %lu\n", con, con->delay);
|
||||
con->ops->get(con);
|
||||
if (queue_delayed_work(ceph_msgr_wq, &con->work,
|
||||
round_jiffies_relative(con->delay)) == 0)
|
||||
round_jiffies_relative(con->delay))) {
|
||||
dout("fault queued %p delay %lu\n", con, con->delay);
|
||||
} else {
|
||||
con->ops->put(con);
|
||||
dout("fault failed to queue %p delay %lu, backoff\n",
|
||||
con, con->delay);
|
||||
/*
|
||||
* In many cases we see a socket state change
|
||||
* while con_work is running and end up
|
||||
* queuing (non-delayed) work, such that we
|
||||
* can't backoff with a delay. Set a flag so
|
||||
* that when con_work restarts we schedule the
|
||||
* delay then.
|
||||
*/
|
||||
set_bit(BACKOFF, &con->state);
|
||||
}
|
||||
}
|
||||
|
||||
out_unlock:
|
||||
|
|
Loading…
Reference in a new issue