[NET_SCHED]: Fix fallout from dev->qdisc RCU change
The move of qdisc destruction to an RCU callback broke locking in the entire qdisc layer by invalidating previously valid assumptions about the context in which changes to the qdisc tree occur.

The two assumptions were:

- Since changes only happen in process context, read_lock doesn't need bottom half protection. Now invalid since destruction of inner qdiscs, classifiers, actions and estimators happens in the RCU callback unless they're manually deleted, resulting in deadlocks when read_lock in process context is interrupted by write_lock_bh in bottom half context.

- Since changes only happen under the RTNL, no additional locking is necessary for data not used during packet processing (e.g. u32_list). Again, since destruction now happens in the RCU callback, this assumption is no longer valid, causing races while using this data, which can result in corruption or use-after-free.

Instead of "fixing" this by disabling bottom halves everywhere and adding new locks/refcounting, this patch makes these assumptions valid again by moving destruction back to process context. Since only the dev->qdisc pointer is protected by RCU, but ->enqueue and the qdisc tree are still protected by dev->qdisc_lock, destruction of the tree can be performed immediately and only the final free needs to happen in the RCU callback to make sure dev_queue_xmit doesn't access already freed memory.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 787e0617e5
commit 85670cc1fa
4 changed files with 39 additions and 61 deletions
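To make the deadlock described in the commit message concrete, here is a stand-alone sketch (illustrative only, not part of the patch; tree_lock and the function name are invented for this note) of how a plain read_lock() in process context deadlocks against a bottom-half writer on the same CPU:

    /* Illustrative sketch -- not from the patch. */
    #include <linux/spinlock.h>

    static DEFINE_RWLOCK(tree_lock);        /* stand-in for qdisc_tree_lock */

    /* Process context, bottom halves still enabled: */
    static void walk_tree(void)
    {
            read_lock(&tree_lock);          /* safe only while no writer runs in BH context */
            /* ... walk the qdisc tree ... */
            read_unlock(&tree_lock);
    }

    /* Softirq (BH) context, e.g. the RCU callback before this patch:
     *
     *         write_lock_bh(&tree_lock);
     *
     * If the softirq preempts walk_tree() on the same CPU while the read
     * lock is held, the writer spins forever: the reader cannot resume
     * until the softirq returns.  This is why destruction had to move
     * back to process context (otherwise every read_lock would have
     * needed the _bh variant).
     */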
@@ -1480,14 +1480,16 @@ int dev_queue_xmit(struct sk_buff *skb)
 	if (q->enqueue) {
 		/* Grab device queue */
 		spin_lock(&dev->queue_lock);
-
-		rc = q->enqueue(skb, q);
-
-		qdisc_run(dev);
-
-		spin_unlock(&dev->queue_lock);
-		rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
-		goto out;
+		q = dev->qdisc;
+		if (q->enqueue) {
+			rc = q->enqueue(skb, q);
+			qdisc_run(dev);
+			spin_unlock(&dev->queue_lock);
+
+			rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
+			goto out;
+		}
+		spin_unlock(&dev->queue_lock);
 	}
 
 	/* The device has no queue. Common case for software devices:
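The new fast path is a check/lock/recheck sequence: the first q->enqueue test is only an unlocked hint, and because dev->qdisc may be replaced at any time (it is the one pointer still protected by RCU), it must be reloaded and retested once dev->queue_lock is held. A condensed restatement of the logic above (simplified for illustration):

    q = rcu_dereference(dev->qdisc);        /* unlocked, RCU-protected read */
    if (q->enqueue) {
            spin_lock(&dev->queue_lock);
            q = dev->qdisc;                 /* reload under the lock ...          */
            if (q->enqueue) {               /* ... and recheck: it may have changed */
                    rc = q->enqueue(skb, q);
                    qdisc_run(dev);
                    spin_unlock(&dev->queue_lock);
                    rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
                    goto out;
            }
            spin_unlock(&dev->queue_lock);  /* qdisc was torn down: fall through */
    }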
@@ -401,7 +401,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
 	if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
 		return skb->len;
 
-	read_lock_bh(&qdisc_tree_lock);
+	read_lock(&qdisc_tree_lock);
 	if (!tcm->tcm_parent)
 		q = dev->qdisc_sleeping;
 	else
@@ -458,7 +458,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
 	if (cl)
 		cops->put(q, cl);
 out:
-	read_unlock_bh(&qdisc_tree_lock);
+	read_unlock(&qdisc_tree_lock);
 	dev_put(dev);
 	return skb->len;
 }
@@ -195,14 +195,14 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 {
 	struct Qdisc *q;
 
-	read_lock_bh(&qdisc_tree_lock);
+	read_lock(&qdisc_tree_lock);
 	list_for_each_entry(q, &dev->qdisc_list, list) {
 		if (q->handle == handle) {
-			read_unlock_bh(&qdisc_tree_lock);
+			read_unlock(&qdisc_tree_lock);
 			return q;
 		}
 	}
-	read_unlock_bh(&qdisc_tree_lock);
+	read_unlock(&qdisc_tree_lock);
 	return NULL;
 }
 
@@ -837,7 +837,7 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
 			continue;
 		if (idx > s_idx)
 			s_q_idx = 0;
-		read_lock_bh(&qdisc_tree_lock);
+		read_lock(&qdisc_tree_lock);
 		q_idx = 0;
 		list_for_each_entry(q, &dev->qdisc_list, list) {
 			if (q_idx < s_q_idx) {
@@ -846,12 +846,12 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
 			}
 			if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
 					  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
-				read_unlock_bh(&qdisc_tree_lock);
+				read_unlock(&qdisc_tree_lock);
 				goto done;
 			}
 			q_idx++;
 		}
-		read_unlock_bh(&qdisc_tree_lock);
+		read_unlock(&qdisc_tree_lock);
 	}
 
 done:
@@ -1074,7 +1074,7 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
 	s_t = cb->args[0];
 	t = 0;
 
-	read_lock_bh(&qdisc_tree_lock);
+	read_lock(&qdisc_tree_lock);
 	list_for_each_entry(q, &dev->qdisc_list, list) {
 		if (t < s_t || !q->ops->cl_ops ||
 		    (tcm->tcm_parent &&
@@ -1096,7 +1096,7 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
 			break;
 		t++;
 	}
-	read_unlock_bh(&qdisc_tree_lock);
+	read_unlock(&qdisc_tree_lock);
 
 	cb->args[0] = t;
 
@@ -45,11 +45,10 @@
    The idea is the following:
    - enqueue, dequeue are serialized via top level device
      spinlock dev->queue_lock.
-   - tree walking is protected by read_lock_bh(qdisc_tree_lock)
+   - tree walking is protected by read_lock(qdisc_tree_lock)
      and this lock is used only in process context.
-   - updates to tree are made under rtnl semaphore or
-     from softirq context (__qdisc_destroy rcu-callback)
-     hence this lock needs local bh disabling.
+   - updates to tree are made only under rtnl semaphore,
+     hence this lock may be made without local bh disabling.
 
    qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
  */
@@ -57,14 +56,14 @@ DEFINE_RWLOCK(qdisc_tree_lock);
 
 void qdisc_lock_tree(struct net_device *dev)
 {
-	write_lock_bh(&qdisc_tree_lock);
+	write_lock(&qdisc_tree_lock);
 	spin_lock_bh(&dev->queue_lock);
 }
 
 void qdisc_unlock_tree(struct net_device *dev)
 {
 	spin_unlock_bh(&dev->queue_lock);
-	write_unlock_bh(&qdisc_tree_lock);
+	write_unlock(&qdisc_tree_lock);
 }
 
 /*
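The ordering rule from the comment above (qdisc_tree_lock before dev->queue_lock) is exactly what these two helpers encode; note that BH disabling now comes from spin_lock_bh() on the queue lock rather than from the tree lock itself. A hypothetical caller updating the tree would bracket its work like this (sketch, not from the patch):

    /* Hypothetical caller, process context under the RTNL: */
    qdisc_lock_tree(dev);       /* write_lock(&qdisc_tree_lock), then
                                 * spin_lock_bh(&dev->queue_lock)      */
    /* ... replace dev->qdisc, relink inner qdiscs, ... */
    qdisc_unlock_tree(dev);     /* unlock in reverse order */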
@@ -483,20 +482,6 @@ void qdisc_reset(struct Qdisc *qdisc)
 static void __qdisc_destroy(struct rcu_head *head)
 {
 	struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
-	struct Qdisc_ops *ops = qdisc->ops;
-
-#ifdef CONFIG_NET_ESTIMATOR
-	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
-#endif
-	write_lock(&qdisc_tree_lock);
-	if (ops->reset)
-		ops->reset(qdisc);
-	if (ops->destroy)
-		ops->destroy(qdisc);
-	write_unlock(&qdisc_tree_lock);
-	module_put(ops->owner);
-
-	dev_put(qdisc->dev);
 	kfree((char *) qdisc - qdisc->padded);
 }
 
@@ -504,32 +489,23 @@ static void __qdisc_destroy(struct rcu_head *head)
 
 void qdisc_destroy(struct Qdisc *qdisc)
 {
-	struct list_head cql = LIST_HEAD_INIT(cql);
-	struct Qdisc *cq, *q, *n;
+	struct Qdisc_ops *ops = qdisc->ops;
 
 	if (qdisc->flags & TCQ_F_BUILTIN ||
 		!atomic_dec_and_test(&qdisc->refcnt))
 		return;
 
-	if (!list_empty(&qdisc->list)) {
-		if (qdisc->ops->cl_ops == NULL)
-			list_del(&qdisc->list);
-		else
-			list_move(&qdisc->list, &cql);
-	}
-
-	/* unlink inner qdiscs from dev->qdisc_list immediately */
-	list_for_each_entry(cq, &cql, list)
-		list_for_each_entry_safe(q, n, &qdisc->dev->qdisc_list, list)
-			if (TC_H_MAJ(q->parent) == TC_H_MAJ(cq->handle)) {
-				if (q->ops->cl_ops == NULL)
-					list_del_init(&q->list);
-				else
-					list_move_tail(&q->list, &cql);
-			}
-	list_for_each_entry_safe(cq, n, &cql, list)
-		list_del_init(&cq->list);
+	list_del(&qdisc->list);
+#ifdef CONFIG_NET_ESTIMATOR
+	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
+#endif
+	if (ops->reset)
+		ops->reset(qdisc);
+	if (ops->destroy)
+		ops->destroy(qdisc);
 
+	module_put(ops->owner);
+	dev_put(qdisc->dev);
 	call_rcu(&qdisc->q_rcu, __qdisc_destroy);
 }
 
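Taken together, the two hunks above implement a standard RCU idiom: unlink and tear the object down synchronously in process context, and defer only the final kfree() past a grace period so that a concurrent dev_queue_xmit() holding an RCU-obtained pointer never touches freed memory. A generic sketch of the idiom (struct and function names are invented for illustration):

    #include <linux/kernel.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct obj {
            struct rcu_head rcu;
            /* ... payload ... */
    };

    static void obj_free_rcu(struct rcu_head *head)
    {
            /* Runs after a grace period: no RCU reader can still see us. */
            kfree(container_of(head, struct obj, rcu));
    }

    static void obj_destroy(struct obj *o)
    {
            /* Process context, update-side lock held: unpublish the
             * pointer and release classifiers, estimators, module
             * references, etc. immediately. */
            call_rcu(&o->rcu, obj_free_rcu);  /* defer only the memory free */
    }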
@@ -549,15 +525,15 @@ void dev_activate(struct net_device *dev)
 				printk(KERN_INFO "%s: activation failed\n", dev->name);
 				return;
 			}
-			write_lock_bh(&qdisc_tree_lock);
+			write_lock(&qdisc_tree_lock);
 			list_add_tail(&qdisc->list, &dev->qdisc_list);
-			write_unlock_bh(&qdisc_tree_lock);
+			write_unlock(&qdisc_tree_lock);
 		} else {
 			qdisc = &noqueue_qdisc;
 		}
-		write_lock_bh(&qdisc_tree_lock);
+		write_lock(&qdisc_tree_lock);
 		dev->qdisc_sleeping = qdisc;
-		write_unlock_bh(&qdisc_tree_lock);
+		write_unlock(&qdisc_tree_lock);
 	}
 
 	if (!netif_carrier_ok(dev))