ipv4: fib: Replay events when registering FIB notifier

Commit b90eb75494 ("fib: introduce FIB notification infrastructure")
introduced a new notification chain to notify listeners (f.e., switchdev
drivers) about addition and deletion of routes.

However, upon registration to the chain the FIB tables can already be
populated, which means potential listeners will have an incomplete view
of the tables.

Solve that by dumping the FIB tables and replaying the events to the
passed notification block. The dump itself is done using RCU in order
not to starve consumers that need RTNL to make progress.

The integrity of the dump is ensured by reading the FIB change sequence
counter before and after the dump under RTNL. This allows us to avoid
the problematic situation in which the dumping process sends a ENTRY_ADD
notification following ENTRY_DEL generated by another process holding
RTNL.

Callers of the registration function may pass a callback that is
executed in case the dump was inconsistent with current FIB tables.

The number of retries until a consistent dump is achieved is set to a
fixed number to prevent callers from looping for long periods of time.
In case current limit proves to be problematic in the future, it can be
easily converted to be configurable using a sysctl.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Ido Schimmel 2016-12-03 16:45:07 +01:00 committed by David S. Miller
parent cacaad11f4
commit c3852ef7f2
4 changed files with 174 additions and 5 deletions

View file

@ -2027,6 +2027,18 @@ static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
return NOTIFY_DONE;
}
static void mlxsw_sp_router_fib_dump_flush(struct notifier_block *nb)
{
struct mlxsw_sp *mlxsw_sp = container_of(nb, struct mlxsw_sp, fib_nb);
/* Flush pending FIB notifications and then flush the device's
* table before requesting another dump. The FIB notification
* block is unregistered, so no need to take RTNL.
*/
mlxsw_core_flush_owq();
mlxsw_sp_router_fib_flush(mlxsw_sp);
}
int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
{
int err;
@ -2047,9 +2059,15 @@ int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
goto err_neigh_init;
mlxsw_sp->fib_nb.notifier_call = mlxsw_sp_router_fib_event;
register_fib_notifier(&mlxsw_sp->fib_nb);
err = register_fib_notifier(&mlxsw_sp->fib_nb,
mlxsw_sp_router_fib_dump_flush);
if (err)
goto err_register_fib_notifier;
return 0;
err_register_fib_notifier:
mlxsw_sp_neigh_fini(mlxsw_sp);
err_neigh_init:
mlxsw_sp_vrs_fini(mlxsw_sp);
err_vrs_init:

View file

@ -2804,8 +2804,13 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id)
goto err_alloc_ordered_workqueue;
}
/* Only FIBs pointing to our own netdevs are programmed into
* the device, so no need to pass a callback.
*/
rocker->fib_nb.notifier_call = rocker_router_fib_event;
register_fib_notifier(&rocker->fib_nb);
err = register_fib_notifier(&rocker->fib_nb, NULL);
if (err)
goto err_register_fib_notifier;
rocker->hw.id = rocker_read64(rocker, SWITCH_ID);
@ -2822,6 +2827,7 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id)
err_probe_ports:
unregister_fib_notifier(&rocker->fib_nb);
err_register_fib_notifier:
destroy_workqueue(rocker->rocker_owq);
err_alloc_ordered_workqueue:
free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_EVENT), rocker);

View file

@ -221,7 +221,8 @@ enum fib_event_type {
FIB_EVENT_RULE_DEL,
};
int register_fib_notifier(struct notifier_block *nb);
int register_fib_notifier(struct notifier_block *nb,
void (*cb)(struct notifier_block *nb));
int unregister_fib_notifier(struct notifier_block *nb);
int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
struct fib_notifier_info *info);

View file

@ -84,11 +84,99 @@
#include <trace/events/fib.h>
#include "fib_lookup.h"
static unsigned int fib_seq_sum(void)
{
unsigned int fib_seq = 0;
struct net *net;
rtnl_lock();
for_each_net(net)
fib_seq += net->ipv4.fib_seq;
rtnl_unlock();
return fib_seq;
}
static ATOMIC_NOTIFIER_HEAD(fib_chain);
int register_fib_notifier(struct notifier_block *nb)
static int call_fib_notifier(struct notifier_block *nb, struct net *net,
enum fib_event_type event_type,
struct fib_notifier_info *info)
{
return atomic_notifier_chain_register(&fib_chain, nb);
info->net = net;
return nb->notifier_call(nb, event_type, info);
}
static void fib_rules_notify(struct net *net, struct notifier_block *nb,
enum fib_event_type event_type)
{
#ifdef CONFIG_IP_MULTIPLE_TABLES
struct fib_notifier_info info;
if (net->ipv4.fib_has_custom_rules)
call_fib_notifier(nb, net, event_type, &info);
#endif
}
static void fib_notify(struct net *net, struct notifier_block *nb,
enum fib_event_type event_type);
static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
enum fib_event_type event_type, u32 dst,
int dst_len, struct fib_info *fi,
u8 tos, u8 type, u32 tb_id, u32 nlflags)
{
struct fib_entry_notifier_info info = {
.dst = dst,
.dst_len = dst_len,
.fi = fi,
.tos = tos,
.type = type,
.tb_id = tb_id,
.nlflags = nlflags,
};
return call_fib_notifier(nb, net, event_type, &info.info);
}
static bool fib_dump_is_consistent(struct notifier_block *nb,
void (*cb)(struct notifier_block *nb),
unsigned int fib_seq)
{
atomic_notifier_chain_register(&fib_chain, nb);
if (fib_seq == fib_seq_sum())
return true;
atomic_notifier_chain_unregister(&fib_chain, nb);
if (cb)
cb(nb);
return false;
}
#define FIB_DUMP_MAX_RETRIES 5
int register_fib_notifier(struct notifier_block *nb,
void (*cb)(struct notifier_block *nb))
{
int retries = 0;
do {
unsigned int fib_seq = fib_seq_sum();
struct net *net;
/* Mutex semantics guarantee that every change done to
* FIB tries before we read the change sequence counter
* is now visible to us.
*/
rcu_read_lock();
for_each_net_rcu(net) {
fib_rules_notify(net, nb, FIB_EVENT_RULE_ADD);
fib_notify(net, nb, FIB_EVENT_ENTRY_ADD);
}
rcu_read_unlock();
if (fib_dump_is_consistent(nb, cb, fib_seq))
return 0;
} while (++retries < FIB_DUMP_MAX_RETRIES);
return -EBUSY;
}
EXPORT_SYMBOL(register_fib_notifier);
@ -1902,6 +1990,62 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
return found;
}
static void fib_leaf_notify(struct net *net, struct key_vector *l,
struct fib_table *tb, struct notifier_block *nb,
enum fib_event_type event_type)
{
struct fib_alias *fa;
hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
struct fib_info *fi = fa->fa_info;
if (!fi)
continue;
/* local and main table can share the same trie,
* so don't notify twice for the same entry.
*/
if (tb->tb_id != fa->tb_id)
continue;
call_fib_entry_notifier(nb, net, event_type, l->key,
KEYLENGTH - fa->fa_slen, fi, fa->fa_tos,
fa->fa_type, fa->tb_id, 0);
}
}
static void fib_table_notify(struct net *net, struct fib_table *tb,
struct notifier_block *nb,
enum fib_event_type event_type)
{
struct trie *t = (struct trie *)tb->tb_data;
struct key_vector *l, *tp = t->kv;
t_key key = 0;
while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
fib_leaf_notify(net, l, tb, nb, event_type);
key = l->key + 1;
/* stop in case of wrap around */
if (key < l->key)
break;
}
}
static void fib_notify(struct net *net, struct notifier_block *nb,
enum fib_event_type event_type)
{
unsigned int h;
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
struct fib_table *tb;
hlist_for_each_entry_rcu(tb, head, tb_hlist)
fib_table_notify(net, tb, nb, event_type);
}
}
static void __trie_free_rcu(struct rcu_head *head)
{
struct fib_table *tb = container_of(head, struct fib_table, rcu);