77162022ab
This adds two new flags, NTF_MASTER and NTF_SELF, that can now be used to
specify where PF_BRIDGE netlink commands should be sent. NTF_MASTER sends
the commands to the 'dev->master' device for parsing. Typically this will
be the Linux net/bridge, or open-vswitch devices. Also, without any flags
set the command will be handled by the master device as well, so that
current user space tools continue to work as expected.

The NTF_SELF flag will push the PF_BRIDGE commands to the device. In the
basic example below the commands are then parsed and programmed in the
embedded bridge.

Note that if both the NTF_SELF and NTF_MASTER bits are set, the command
will be sent to both 'dev->master' and 'dev'; this allows user space to
easily keep the embedded bridge and software bridge in sync.

There is a slight complication in the case with both flags set when an
error occurs. To resolve this, the rtnl handler clears the NTF_ flag in
the netlink ack to indicate which sets completed successfully. The
add/del handlers will abort as soon as any error occurs.

To support this, new net device ops were added to call into the device,
and the existing bridging code was refactored to use these. There should
be no required changes in user space to support the current bridge
behavior.

A basic setup with a SR-IOV enabled NIC looks like this:

          veth0  veth2
            |      |
          ------------
          |  bridge0 |   <---- software bridging
          ------------
               /
               /
  ethx.y      ethx
    VF         PF
     \         \          <---- propagate FDB entries to HW
     \         \
  --------------------
  |  Embedded Bridge |    <---- hardware offloaded switching
  --------------------

In this case the embedded bridge must be managed to allow 'veth0' to
communicate with 'ethx.y' correctly. At present, drivers managing the
embedded bridge either send frames onto the network, which then get
dropped by the switch, OR the embedded bridge will flood these frames.
With this patch we have a mechanism to manage the embedded bridge
correctly from user space. This example is specific to SR-IOV, but
replacing the VF with another PF or dropping this into the DSA framework
generates similar management issues.

Example session using the 'br'[1] tool to add, dump and then delete a mac
address with a new "embedded" option and enabled ixgbe driver:

# br fdb add 22:35:19:ac:60:59 dev eth3
# br fdb
port    mac addr                flags
veth0   22:35:19:ac:60:58       static
veth0   9a:5f:81:f7:f6:ec       local
eth3    00:1b:21:55:23:59       local
eth3    22:35:19:ac:60:59       static
veth0   22:35:19:ac:60:57       static
#br fdb add 22:35:19:ac:60:59 embedded dev eth3
#br fdb
port    mac addr                flags
veth0   22:35:19:ac:60:58       static
veth0   9a:5f:81:f7:f6:ec       local
eth3    00:1b:21:55:23:59       local
eth3    22:35:19:ac:60:59       static
veth0   22:35:19:ac:60:57       static
eth3    22:35:19:ac:60:59       local embedded
#br fdb del 22:35:19:ac:60:59 embedded dev eth3

I added a couple of lines to 'br' to set the flags correctly, is all. It
is my opinion that the merit of this patch is that embedded and SW
bridges can now both be modeled correctly in user space using very nearly
the same message passing.

[1] 'br' tool was published as an RFC here and will be renamed 'bridge'
    http://patchwork.ozlabs.org/patch/117664/

Thanks to Jamal Hadi Salim, Stephen Hemminger and Ben Hutchings for
valuable feedback, suggestions, and review.

v2: fixed api descriptions and error case with both NTF_SELF and
    NTF_MASTER set plus updated patch description.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
250 lines
5.9 KiB
C
250 lines
5.9 KiB
C
/*
|
|
* Bridge netlink control interface
|
|
*
|
|
* Authors:
|
|
* Stephen Hemminger <shemminger@osdl.org>
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public License
|
|
* as published by the Free Software Foundation; either version
|
|
* 2 of the License, or (at your option) any later version.
|
|
*/
|
|
|
|
#include <linux/kernel.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/etherdevice.h>
|
|
#include <net/rtnetlink.h>
|
|
#include <net/net_namespace.h>
|
|
#include <net/sock.h>
|
|
|
|
#include "br_private.h"
|
|
#include "br_private_stp.h"
|
|
|
|
static inline size_t br_nlmsg_size(void)
|
|
{
|
|
return NLMSG_ALIGN(sizeof(struct ifinfomsg))
|
|
+ nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
|
|
+ nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
|
|
+ nla_total_size(4) /* IFLA_MASTER */
|
|
+ nla_total_size(4) /* IFLA_MTU */
|
|
+ nla_total_size(4) /* IFLA_LINK */
|
|
+ nla_total_size(1) /* IFLA_OPERSTATE */
|
|
+ nla_total_size(1); /* IFLA_PROTINFO */
|
|
}
|
|
|
|
/*
|
|
* Create one netlink message for one interface
|
|
* Contains port and master info as well as carrier and bridge state.
|
|
*/
|
|
static int br_fill_ifinfo(struct sk_buff *skb, const struct net_bridge_port *port,
|
|
u32 pid, u32 seq, int event, unsigned int flags)
|
|
{
|
|
const struct net_bridge *br = port->br;
|
|
const struct net_device *dev = port->dev;
|
|
struct ifinfomsg *hdr;
|
|
struct nlmsghdr *nlh;
|
|
u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN;
|
|
|
|
br_debug(br, "br_fill_info event %d port %s master %s\n",
|
|
event, dev->name, br->dev->name);
|
|
|
|
nlh = nlmsg_put(skb, pid, seq, event, sizeof(*hdr), flags);
|
|
if (nlh == NULL)
|
|
return -EMSGSIZE;
|
|
|
|
hdr = nlmsg_data(nlh);
|
|
hdr->ifi_family = AF_BRIDGE;
|
|
hdr->__ifi_pad = 0;
|
|
hdr->ifi_type = dev->type;
|
|
hdr->ifi_index = dev->ifindex;
|
|
hdr->ifi_flags = dev_get_flags(dev);
|
|
hdr->ifi_change = 0;
|
|
|
|
if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
|
|
nla_put_u32(skb, IFLA_MASTER, br->dev->ifindex) ||
|
|
nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
|
|
nla_put_u8(skb, IFLA_OPERSTATE, operstate) ||
|
|
(dev->addr_len &&
|
|
nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) ||
|
|
(dev->ifindex != dev->iflink &&
|
|
nla_put_u32(skb, IFLA_LINK, dev->iflink)) ||
|
|
(event == RTM_NEWLINK &&
|
|
nla_put_u8(skb, IFLA_PROTINFO, port->state)))
|
|
goto nla_put_failure;
|
|
return nlmsg_end(skb, nlh);
|
|
|
|
nla_put_failure:
|
|
nlmsg_cancel(skb, nlh);
|
|
return -EMSGSIZE;
|
|
}
|
|
|
|
/*
|
|
* Notify listeners of a change in port information
|
|
*/
|
|
void br_ifinfo_notify(int event, struct net_bridge_port *port)
|
|
{
|
|
struct net *net = dev_net(port->dev);
|
|
struct sk_buff *skb;
|
|
int err = -ENOBUFS;
|
|
|
|
br_debug(port->br, "port %u(%s) event %d\n",
|
|
(unsigned int)port->port_no, port->dev->name, event);
|
|
|
|
skb = nlmsg_new(br_nlmsg_size(), GFP_ATOMIC);
|
|
if (skb == NULL)
|
|
goto errout;
|
|
|
|
err = br_fill_ifinfo(skb, port, 0, 0, event, 0);
|
|
if (err < 0) {
|
|
/* -EMSGSIZE implies BUG in br_nlmsg_size() */
|
|
WARN_ON(err == -EMSGSIZE);
|
|
kfree_skb(skb);
|
|
goto errout;
|
|
}
|
|
rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
|
|
return;
|
|
errout:
|
|
if (err < 0)
|
|
rtnl_set_sk_err(net, RTNLGRP_LINK, err);
|
|
}
|
|
|
|
/*
|
|
* Dump information about all ports, in response to GETLINK
|
|
*/
|
|
static int br_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
|
|
{
|
|
struct net *net = sock_net(skb->sk);
|
|
struct net_device *dev;
|
|
int idx;
|
|
|
|
idx = 0;
|
|
rcu_read_lock();
|
|
for_each_netdev_rcu(net, dev) {
|
|
struct net_bridge_port *port = br_port_get_rcu(dev);
|
|
|
|
/* not a bridge port */
|
|
if (!port || idx < cb->args[0])
|
|
goto skip;
|
|
|
|
if (br_fill_ifinfo(skb, port,
|
|
NETLINK_CB(cb->skb).pid,
|
|
cb->nlh->nlmsg_seq, RTM_NEWLINK,
|
|
NLM_F_MULTI) < 0)
|
|
break;
|
|
skip:
|
|
++idx;
|
|
}
|
|
rcu_read_unlock();
|
|
cb->args[0] = idx;
|
|
|
|
return skb->len;
|
|
}
|
|
|
|
/*
|
|
* Change state of port (ie from forwarding to blocking etc)
|
|
* Used by spanning tree in user space.
|
|
*/
|
|
static int br_rtm_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
|
|
{
|
|
struct net *net = sock_net(skb->sk);
|
|
struct ifinfomsg *ifm;
|
|
struct nlattr *protinfo;
|
|
struct net_device *dev;
|
|
struct net_bridge_port *p;
|
|
u8 new_state;
|
|
|
|
if (nlmsg_len(nlh) < sizeof(*ifm))
|
|
return -EINVAL;
|
|
|
|
ifm = nlmsg_data(nlh);
|
|
if (ifm->ifi_family != AF_BRIDGE)
|
|
return -EPFNOSUPPORT;
|
|
|
|
protinfo = nlmsg_find_attr(nlh, sizeof(*ifm), IFLA_PROTINFO);
|
|
if (!protinfo || nla_len(protinfo) < sizeof(u8))
|
|
return -EINVAL;
|
|
|
|
new_state = nla_get_u8(protinfo);
|
|
if (new_state > BR_STATE_BLOCKING)
|
|
return -EINVAL;
|
|
|
|
dev = __dev_get_by_index(net, ifm->ifi_index);
|
|
if (!dev)
|
|
return -ENODEV;
|
|
|
|
p = br_port_get_rtnl(dev);
|
|
if (!p)
|
|
return -EINVAL;
|
|
|
|
/* if kernel STP is running, don't allow changes */
|
|
if (p->br->stp_enabled == BR_KERNEL_STP)
|
|
return -EBUSY;
|
|
|
|
if (!netif_running(dev) ||
|
|
(!netif_carrier_ok(dev) && new_state != BR_STATE_DISABLED))
|
|
return -ENETDOWN;
|
|
|
|
p->state = new_state;
|
|
br_log_state(p);
|
|
|
|
spin_lock_bh(&p->br->lock);
|
|
br_port_state_selection(p->br);
|
|
spin_unlock_bh(&p->br->lock);
|
|
|
|
br_ifinfo_notify(RTM_NEWLINK, p);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int br_validate(struct nlattr *tb[], struct nlattr *data[])
|
|
{
|
|
if (tb[IFLA_ADDRESS]) {
|
|
if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
|
|
return -EINVAL;
|
|
if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
|
|
return -EADDRNOTAVAIL;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
 * rtnl link operations for the "bridge" link kind; registered and
 * unregistered by br_netlink_init() / br_netlink_fini().
 */
static struct rtnl_link_ops br_link_ops __read_mostly = {
	.kind		= "bridge",
	.priv_size	= sizeof(struct net_bridge),
	.setup		= br_dev_setup,
	.validate	= br_validate,
	.dellink	= br_dev_delete,
};
|
|
|
|
/*
 * Register the bridge netlink interface: the "bridge" link ops plus the
 * PF_BRIDGE RTM_GETLINK (dump) and RTM_SETLINK handlers.
 *
 * Returns 0 on success. On failure everything registered so far is undone
 * and the error from the failing registration is returned.
 */
int __init br_netlink_init(void)
{
	int ret;

	ret = rtnl_link_register(&br_link_ops);
	if (ret < 0)
		return ret;

	ret = __rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL,
			      br_dump_ifinfo, NULL);
	if (ret) {
		rtnl_link_unregister(&br_link_ops);
		return ret;
	}

	ret = __rtnl_register(PF_BRIDGE, RTM_SETLINK,
			      br_rtm_setlink, NULL, NULL);
	if (ret) {
		/* Drop the GETLINK handler registered above as well. */
		rtnl_unregister_all(PF_BRIDGE);
		rtnl_link_unregister(&br_link_ops);
		return ret;
	}

	return 0;
}
|
|
|
|
/*
 * Tear down what br_netlink_init() set up: unregister the "bridge" link
 * ops, then every PF_BRIDGE rtnetlink handler.
 */
void __exit br_netlink_fini(void)
{
	rtnl_link_unregister(&br_link_ops);

	rtnl_unregister_all(PF_BRIDGE);
}
|