Advertisement

Linux MAC VLAN实现

阅读量:

概述

1、macvlan是一种网卡虚拟化解决方案,能够将一块物理网卡虚拟成多块虚拟网卡。

2、macvlan 这种技术听起来有点像 VLAN,但它们的实现机制是完全不一样的。macvlan 子接口和原来的主接口是完全独立的,可以单独配置 MAC 地址和 IP 地址,而 VLAN 子接口和主接口共用相同的 MAC 地址。VLAN 用来划分广播域,而 macvlan 共享同一个广播域。

3、macvlan 会根据收到包的目的 MAC 地址判断这个包需要交给哪个虚拟网卡,虚拟网卡再把包交给上层的协议栈处理。

macvlan模式

根据 macvlan 子接口之间的通信模式,macvlan 有四种网络模式:

private 模式

vepa(virtual ethernet port aggregator) 模式

bridge 模式

passthru 模式

private模式

这种模式下,同一主接口下的子接口之间彼此隔离,不能通信。即使从外部的物理交换机导流,也会被无情地丢掉。

vepa模式

这种模式下,子接口之间的通信流量需要导到外部支持 802.1Qbg/VEPA 功能的交换机上(可以是物理的或者虚拟的),经由外部交换机转发,再绕回来。

注:802.1Qbg/VEPA 功能简单说就是交换机要支持发夹(`hairpin`)功能,也就是数据包从一个接口上收上来之后,还能再从同一个接口发回去。

bridge 模式

这种模式下,模拟的是 Linux bridge 的功能,但比 bridge 要好的一点是每个接口的 MAC 地址是已知的,不用学习。所以,这种模式下,子接口之间就是直接可以通信的。

passthru 模式

这种模式,只允许单个子接口连接主接口,且必须设置成混杂模式,一般用于子接口桥接和创建 VLAN 子接口的场景。

macvlan使用命令举例

x
ip link add link ethX(主设备) name ethX_1(虚拟macvlan设备) type macvlan mode private

虚拟macvlan(ethX_1)设备创建

复制代码
    /*
     * Boot-time initialisation of rtnetlink: registers the per-namespace
     * operations (panicking if that fails), a netdevice notifier, and the
     * rtnetlink message handlers listed below.
     *
     * NOTE(review): the three trailing arguments of rtnl_register() look like
     * the "do", "dump" and "calcit" callbacks - confirm against its prototype.
     */
    void __init rtnetlink_init(void)
    {
    	if (register_pernet_subsys(&rtnetlink_net_ops))
    		panic("rtnetlink_init: cannot initialize rtnetlink\n");
    
    	register_netdevice_notifier(&rtnetlink_dev_notifier);
    	
    	/* Generic link messages; RTM_NEWLINK is dispatched to rtnl_newlink. */
    	rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink,
    		      rtnl_dump_ifinfo, rtnl_calcit);
    	rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, NULL);
    	rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, NULL);
    	rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, NULL);
    	
    	/* Address and route requests are both served by rtnl_dump_all. */
    	rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, NULL);
    	rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, NULL);
    	
    	/* PF_BRIDGE neighbour messages map onto the rtnl_fdb_* handlers. */
    	rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, NULL);
    	rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, NULL);
    	rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, NULL);
    	
    	/* PF_BRIDGE link messages map onto the rtnl_bridge_* handlers. */
    	rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, NULL);
    	rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, NULL);
    	rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, NULL);
    
    }
复制代码
    /*
     * Abridged excerpt of the RTM_NEWLINK handler: the dotted lines stand for
     * code that was left out. The one call kept here shows that the new
     * net_device is obtained from rtnl_create_link().
     */
    static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh)
    {
    		.....
    		dev = rtnl_create_link(dest_net, ifname, ops, tb);
    		.....
    }
    
    
    /*
     * Abridged excerpt: the dotted lines stand for code that was left out.
     * The call kept here allocates the net_device with ops->priv_size bytes
     * of private area and passes ops->setup as the setup callback.
     */
    struct net_device *rtnl_create_link(struct net *net,
    	char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[])
    {
    		......
    		dev = alloc_netdev_mqs(ops->priv_size, ifname, ops->setup,
    			       num_tx_queues, num_rx_queues);
    		.....
    }

其中 ops->priv_size = sizeof(struct macvlan_dev)

复制代码
    /*
     * macvlan_link_register - fill in the common macvlan fields of @ops and
     * register it with rtnetlink.
     * @ops: link operations to complete and register; the caller is expected
     *       to have set the fields that are not assigned here.
     *
     * Returns whatever rtnl_link_register() returns.
     */
    int macvlan_link_register(struct rtnl_link_ops *ops)
    {
    	/* common fields */
    	ops->priv_size		= sizeof(struct macvlan_dev);
    	ops->validate		= macvlan_validate;
    	ops->maxtype		= IFLA_MACVLAN_MAX;
    	ops->policy		= macvlan_policy;
    	ops->changelink		= macvlan_changelink;
    	ops->get_size		= macvlan_get_size;
    	ops->fill_info		= macvlan_fill_info;
    
    	return rtnl_link_register(ops);
    }

macvlan代码实现

关键数据结构

macvlan_port

与主设备关联;其成员dev即主设备net_device;vlan_hash、vlans记录了所有的macvlan子设备类别,一个用hash链表组织,一个采用普通链表组织。

复制代码
    /*
     * Per-lower-device state; one instance is allocated by
     * macvlan_port_create() and handed to the lower device's rx_handler.
     */
    struct macvlan_port {
    	struct net_device	*dev;		/* the lower (master) device */
    	struct hlist_head	vlan_hash[MACVLAN_HASH_SIZE];	/* buckets searched by macvlan_hash_lookup() with a MAC address */
    	struct list_head	vlans;		/* every macvlan attached to this port */
    	struct rcu_head		rcu;		/* presumably for RCU-deferred freeing - confirm */
    	bool 			passthru;	/* set once a passthru-mode macvlan is attached */
    	int			count;		/* number of macvlans using this port */
    };

macvlan_dev

与虚拟子设备关联;macvlan_dev 存储在net_device的priv部分(dev->priv)。

其中最重要的是receive和forward两个函数指针,receive是子设备收包函数,forward函数用于子设备之间互相转发。

struct macvlan_dev *vlan = netdev_priv(dev);

复制代码
    /*
     * Private data of one macvlan device, obtained with netdev_priv(dev).
     */
    struct macvlan_dev {
    	struct net_device	*dev;		/* the macvlan net_device itself */
    	struct list_head	list;		/* linkage in port->vlans */
    	struct hlist_node	hlist;		/* presumably linkage in port->vlan_hash - confirm */
    	struct macvlan_port	*port;		/* port of the lower device */
    	struct net_device	*lowerdev;	/* the real device underneath */
    	struct macvlan_pcpu_stats __percpu *pcpu_stats;	/* per-CPU counters (tx_packets, tx_bytes, tx_dropped, ...) */
    
    	/* NOTE(review): looks like a multicast filter bitmap - confirm */
    	DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ);
    	
    	enum macvlan_mode	mode;		/* MACVLAN_MODE_*; VEPA unless IFLA_MACVLAN_MODE is given */
    	u16			flags;		/* taken from IFLA_MACVLAN_FLAGS */
    	int (*receive)(struct sk_buff *skb);	/* invoked for unicast frames delivered to this device */
    	int (*forward)(struct net_device *dev, struct sk_buff *skb);	/* forwarding callback supplied at newlink time */
    	/* NOTE(review): the three fields below appear to be macvtap-only - confirm */
    	struct macvtap_queue	*taps[MAX_MACVTAP_QUEUES];
    	int			numvtaps;
    	int			minor;
    
    };

macvlan模块注册

复制代码
    /*
     * Module entry point: installs the macvlan netdevice notifier and then
     * registers the "macvlan" link type. If the link registration fails the
     * notifier is removed again and the error is returned; otherwise 0.
     */
    static int __init macvlan_init_module(void)
    {
    	int rc;
    
    	register_netdevice_notifier(&macvlan_notifier_block);
    
    	rc = macvlan_link_register(&macvlan_link_ops);
    	if (rc >= 0)
    		return 0;
    
    	/* Roll back the notifier installed above. */
    	unregister_netdevice_notifier(&macvlan_notifier_block);
    	return rc;
    }
复制代码
    /*
     * rtnetlink link operations for the "macvlan" kind. Only the
     * macvlan-specific callbacks are set here; the remaining common fields
     * are filled in by macvlan_link_register().
     */
    static struct rtnl_link_ops macvlan_link_ops = {
    	.kind		= "macvlan",
    	.setup		= macvlan_setup,	/* handed to alloc_netdev_mqs() by rtnl_create_link() */
    	.newlink	= macvlan_newlink,	/* attaches the new device to its lower device */
    	.dellink	= macvlan_dellink,
    };

macvlan虚拟子设备与主设备建立关联关系

dev为虚拟子设备

/*
 * .newlink callback of macvlan_link_ops. Delegates to
 * macvlan_common_newlink(), supplying netif_rx as the receive callback and
 * dev_forward_skb as the forward callback, and returns its result.
 */
static int macvlan_newlink(struct net *src_net, struct net_device *dev,
			   struct nlattr *tb[], struct nlattr *data[])
{
	int rc;

	rc = macvlan_common_newlink(src_net, dev, tb, data,
				    netif_rx, dev_forward_skb);
	return rc;
}

复制代码
    /*
     * macvlan_common_newlink - attach a new macvlan device to its lower device
     * @src_net: namespace in which the IFLA_LINK ifindex is resolved
     * @dev:     the macvlan net_device being created
     * @tb:      generic link attributes (IFLA_LINK, IFLA_MTU, IFLA_ADDRESS)
     * @data:    macvlan attributes (IFLA_MACVLAN_MODE / IFLA_MACVLAN_FLAGS);
     *           may be NULL
     * @receive: stored in vlan->receive
     * @forward: stored in vlan->forward
     *
     * Creates the macvlan_port of the lower device on first use, links @dev
     * above the lower device and registers it.
     *
     * Returns 0 on success, -EINVAL / -ENODEV for bad arguments, or the error
     * reported by the port creation, upper-device link or device
     * registration step.
     */
    int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
    			   struct nlattr *tb[], struct nlattr *data[],
    			   int (*receive)(struct sk_buff *skb),
    			   int (*forward)(struct net_device *dev,
    					  struct sk_buff *skb))
    {
    	struct macvlan_dev *vlan = netdev_priv(dev);
    	struct macvlan_port *port;
    	struct net_device *lowerdev;
    	int err;
    
    	if (!tb[IFLA_LINK])
    		return -EINVAL;
    
    	/* Resolve the lower (master) device from the IFLA_LINK attribute. */
    	lowerdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK]));
    	if (lowerdev == NULL)
    		return -ENODEV;
    
    	/* When creating macvlans on top of other macvlans - use
    	 * the real device as the lowerdev.
    	 */
    	if (lowerdev->rtnl_link_ops == dev->rtnl_link_ops) {
    		struct macvlan_dev *lowervlan = netdev_priv(lowerdev);
    		lowerdev = lowervlan->lowerdev;
    	}
    
    	if (!tb[IFLA_MTU])
    		dev->mtu = lowerdev->mtu;
    	else if (dev->mtu > lowerdev->mtu)
    		return -EINVAL;
    
    	if (!tb[IFLA_ADDRESS])
    		eth_hw_addr_random(dev);
    
    	/* First macvlan on this lower device: create its macvlan_port. */
    	if (!macvlan_port_exists(lowerdev)) {
    		err = macvlan_port_create(lowerdev);
    		if (err < 0)
    			return err;
    	}
    	port = macvlan_port_get_rtnl(lowerdev);
    
    	/* Only 1 macvlan device can be created in passthru mode */
    	if (port->passthru)
    		return -EINVAL;
    
    	vlan->lowerdev = lowerdev;
    	vlan->dev      = dev;
    	vlan->port     = port;
    	vlan->receive  = receive;
    	vlan->forward  = forward;
    
    	vlan->mode     = MACVLAN_MODE_VEPA;
    	if (data && data[IFLA_MACVLAN_MODE])
    		vlan->mode = nla_get_u32(data[IFLA_MACVLAN_MODE]);
    
    	if (data && data[IFLA_MACVLAN_FLAGS])
    		vlan->flags = nla_get_u16(data[IFLA_MACVLAN_FLAGS]);
    
    	if (vlan->mode == MACVLAN_MODE_PASSTHRU) {
    		if (port->count)
    			return -EINVAL;
    		port->passthru = true;
    		memcpy(dev->dev_addr, lowerdev->dev_addr, ETH_ALEN);
    	}
    
    	/*
    	 * Take the port reference before the first step that can jump to
    	 * destroy_port. That label unconditionally drops one reference, so
    	 * bumping the count only after netdev_upper_dev_link() would make a
    	 * link failure underflow the count (leaking a fresh port) or steal
    	 * a reference from an existing macvlan.
    	 */
    	port->count += 1;
    	err = netdev_upper_dev_link(lowerdev, dev);
    	if (err)
    		goto destroy_port;
    
    	err = register_netdevice(dev);
    	if (err < 0)
    		goto upper_dev_unlink;
    
    	list_add_tail_rcu(&vlan->list, &port->vlans);
    	netif_stacked_transfer_operstate(lowerdev, dev);
    
    	return 0;
    
    upper_dev_unlink:
    	netdev_upper_dev_unlink(lowerdev, dev);
    destroy_port:
    	/* Drop our reference; the last user tears the port down. */
    	port->count -= 1;
    	if (!port->count)
    		macvlan_port_destroy(lowerdev);
    
    	return err;
    
    }

macvlan_port_create

下面创建macvlan_port的函数macvlan_port_create入参就是主设备的net_device。

复制代码
    /*
     * Allocate and initialise the macvlan_port of lower device @dev and
     * install macvlan_handle_frame as that device's rx_handler.
     *
     * Returns 0 on success, -EINVAL when @dev is not an Ethernet device or is
     * a loopback device, -ENOMEM when the allocation fails, or the error from
     * netdev_rx_handler_register() (the port is freed in that case).
     */
    static int macvlan_port_create(struct net_device *dev)
    {
    	struct macvlan_port *port;
    	unsigned int bucket;
    	int rc;
    
    	if (dev->type != ARPHRD_ETHER)
    		return -EINVAL;
    	if (dev->flags & IFF_LOOPBACK)
    		return -EINVAL;
    
    	port = kzalloc(sizeof(*port), GFP_KERNEL);
    	if (!port)
    		return -ENOMEM;
    
    	/* Tie the port to the lower device and start with no macvlans. */
    	port->dev = dev;
    	port->passthru = false;
    	INIT_LIST_HEAD(&port->vlans);
    	for (bucket = 0; bucket < MACVLAN_HASH_SIZE; bucket++)
    		INIT_HLIST_HEAD(&port->vlan_hash[bucket]);
    
    	/* From here on frames received on @dev reach macvlan_handle_frame. */
    	rc = netdev_rx_handler_register(dev, macvlan_handle_frame, port);
    	if (rc) {
    		kfree(port);
    		return rc;
    	}
    
    	dev->priv_flags |= IFF_MACVLAN_PORT;
    	return 0;
    }

macvlan_handle_frame:主设备 rx_handler

err = netdev_rx_handler_register(dev, macvlan_handle_frame, port);

数据包收发

macvlan rx

macvlan 的 rx 是通过在主设备上注册 rx_handler 实现的,rx_handler 的处理函数为 macvlan_handle_frame。macvlan 收到数据包后按照如下过程处理:

复制代码
    /*
     * rx_handler installed on the lower device by macvlan_port_create().
     * Decides, per received frame, which macvlan device(s) should get it.
     *
     * Returns RX_HANDLER_PASS when the frame should continue on the lower
     * device, RX_HANDLER_CONSUMED when it was delivered to a macvlan or
     * dropped here.
     */
    static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb)
    {
    	struct macvlan_port *port;
    	struct sk_buff *skb = *pskb;
    	const struct ethhdr *eth = eth_hdr(skb);
    	const struct macvlan_dev *vlan;
    	const struct macvlan_dev *src;
    	struct net_device *dev;
    	unsigned int len = 0;
    	int ret = NET_RX_DROP;
    
    	port = macvlan_port_get_rcu(skb->dev);
    	/*
    	 * Broadcast / multicast handling:
    	 * 1. Look the source MAC up in the port to see whether the frame
    	 *    was sent by one of the port's own macvlans.
    	 * 2. Source not found: deliver to macvlans of every mode.
    	 * 3. Source found: a VEPA source floods to VEPA and bridge
    	 *    macvlans; a bridge source floods to VEPA macvlans only
    	 *    (bridge macvlans already got the frame on transmit); any
    	 *    other mode delivers the frame to the source device alone.
    	 */
    	if (is_multicast_ether_addr(eth->h_dest)) {
    		skb = ip_check_defrag(skb, IP_DEFRAG_MACVLAN);
    		if (!skb)
    			return RX_HANDLER_CONSUMED;
    		/* skb may have changed above, so re-read the header. */
    		eth = eth_hdr(skb);
    		src = macvlan_hash_lookup(port, eth->h_source);
    		if (!src)
    			/* frame comes from an external address */
    			macvlan_broadcast(skb, port, NULL,
    					  MACVLAN_MODE_PRIVATE |
    					  MACVLAN_MODE_VEPA    |
    					  MACVLAN_MODE_PASSTHRU|
    					  MACVLAN_MODE_BRIDGE);
    		else if (src->mode == MACVLAN_MODE_VEPA)
    			/* flood to everyone except source */
    			macvlan_broadcast(skb, port, src->dev,
    					  MACVLAN_MODE_VEPA |
    					  MACVLAN_MODE_BRIDGE);
    		else if (src->mode == MACVLAN_MODE_BRIDGE)
    			/*
    			 * flood only to VEPA ports, bridge ports
    			 * already saw the frame on the way out.
    			 */
    			macvlan_broadcast(skb, port, src->dev,
    					  MACVLAN_MODE_VEPA);
    		else {
    			/* forward to original port. */
    			vlan = src;
    			ret = macvlan_broadcast_one(skb, vlan, eth, 0);
    			goto out;
    		}
    	
    		return RX_HANDLER_PASS;
    	}
    	
    	/*
    	 * Unicast handling:
    	 * 1. In passthru mode the frame goes to the first macvlan on the
    	 *    port's list.
    	 * 2. Otherwise the destination MAC is looked up in the port; on a
    	 *    match the frame is handed to that macvlan's receive callback.
    	 * 3. No match: the frame is passed on to the lower device.
    	 */
    	if (port->passthru)
    		vlan = list_first_or_null_rcu(&port->vlans,
    					      struct macvlan_dev, list);
    	else
    		vlan = macvlan_hash_lookup(port, eth->h_dest);
    	
    	if (vlan == NULL)
    		return RX_HANDLER_PASS;
    	
    	dev = vlan->dev;
    	/* Frames for a macvlan that is not up are dropped here. */
    	if (unlikely(!(dev->flags & IFF_UP))) {
    		kfree_skb(skb);
    		return RX_HANDLER_CONSUMED;
    	}
    	/* Byte count used for the rx statistics at "out". */
    	len = skb->len + ETH_HLEN;
    	skb = skb_share_check(skb, GFP_ATOMIC);
    	if (!skb)
    		goto out;
    	
    	/* Retarget the frame at the macvlan device before delivery. */
    	skb->dev = dev;
    	skb->pkt_type = PACKET_HOST;
    	
    	ret = vlan->receive(skb);
    
    out:
    	macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, 0);
    	return RX_HANDLER_CONSUMED;
    }

macvlan tx

macvlan 的 tx 和模式有关:当 macvlan 处于 bridge 模式时,发送时会把广播包、以及目的 MAC 地址属于其它虚拟网卡的单播包,通过 macvlan_dev 的 forward 函数转发给其它的虚拟网卡;其它模式下,数据包直接从主设备发送出去。

复制代码
    /*
     * net_device operations of a macvlan device. The transmit path enters
     * through .ndo_start_xmit (macvlan_start_xmit).
     */
    static const struct net_device_ops macvlan_netdev_ops = {
    	.ndo_init		= macvlan_init,
    	.ndo_uninit		= macvlan_uninit,
    	.ndo_open		= macvlan_open,
    	.ndo_stop		= macvlan_stop,
    	.ndo_start_xmit		= macvlan_start_xmit,
    	.ndo_change_mtu		= macvlan_change_mtu,
    	.ndo_change_rx_flags	= macvlan_change_rx_flags,
    	.ndo_set_mac_address	= macvlan_set_mac_address,
    	.ndo_set_rx_mode	= macvlan_set_mac_lists,
    	.ndo_get_stats64	= macvlan_dev_get_stats64,
    	.ndo_validate_addr	= eth_validate_addr,
    	.ndo_vlan_rx_add_vid	= macvlan_vlan_rx_add_vid,
    	.ndo_vlan_rx_kill_vid	= macvlan_vlan_rx_kill_vid,
    	.ndo_fdb_add		= macvlan_fdb_add,
    	.ndo_fdb_del		= macvlan_fdb_del,
    	.ndo_fdb_dump		= ndo_dflt_fdb_dump,
    };
复制代码
    /*
     * Transmit entry point of a macvlan device. Hands the skb to
     * macvlan_queue_xmit() and then updates the per-CPU statistics:
     * tx_packets / tx_bytes when the result is NET_XMIT_SUCCESS or
     * NET_XMIT_CN, tx_dropped otherwise.
     *
     * Returns the macvlan_queue_xmit() result unchanged.
     */
    netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
    			       struct net_device *dev)
    {
    	const struct macvlan_dev *vlan = netdev_priv(dev);
    	struct macvlan_pcpu_stats *stats;
    	/* Sampled up front, before the skb is handed off. */
    	unsigned int nbytes = skb->len;
    	int rc;
    
    	rc = macvlan_queue_xmit(skb, dev);
    	if (unlikely(rc != NET_XMIT_SUCCESS && rc != NET_XMIT_CN)) {
    		this_cpu_inc(vlan->pcpu_stats->tx_dropped);
    		return rc;
    	}
    
    	stats = this_cpu_ptr(vlan->pcpu_stats);
    	u64_stats_update_begin(&stats->syncp);
    	stats->tx_packets++;
    	stats->tx_bytes += nbytes;
    	u64_stats_update_end(&stats->syncp);
    
    	return rc;
    }
复制代码
    /*
     * Mode-dependent part of the transmit path.
     *
     * In bridge mode, multicast frames are first copied to the other
     * bridge-mode macvlans and then also sent out; unicast frames whose
     * destination MAC belongs to another bridge-mode macvlan on the same port
     * are forwarded locally and never reach the wire. Everything else is
     * queued on the lower device.
     *
     * Returns NET_XMIT_SUCCESS for locally forwarded unicast frames,
     * otherwise the dev_queue_xmit() result.
     */
    static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
    {
    	const struct macvlan_dev *vlan = netdev_priv(dev);
    	const struct macvlan_port *port = vlan->port;
    	const struct macvlan_dev *dest;
    	/* Saved so it can be restored before the frame leaves the host. */
    	__u8 ip_summed = skb->ip_summed;
    
    	if (vlan->mode == MACVLAN_MODE_BRIDGE) {
    		const struct ethhdr *eth = (void *)skb->data;
    		/* Local deliveries are marked as needing no checksum check. */
    		skb->ip_summed = CHECKSUM_UNNECESSARY;
    	
    		/* send to other bridge ports directly */
    		if (is_multicast_ether_addr(eth->h_dest)) {
    			macvlan_broadcast(skb, port, dev, MACVLAN_MODE_BRIDGE);
    			goto xmit_world;
    		}
    	
    		dest = macvlan_hash_lookup(port, eth->h_dest);
    		if (dest && dest->mode == MACVLAN_MODE_BRIDGE) {
    			/* send to lowerdev first for its network taps */
    			dev_forward_skb(vlan->lowerdev, skb);
    	
    			return NET_XMIT_SUCCESS;
    		}
    	}
    
    xmit_world:
    	/* Undo the bridge-mode override and transmit via the lower device. */
    	skb->ip_summed = ip_summed;
    	skb->dev = vlan->lowerdev;
    	return dev_queue_xmit(skb);
    }

全部评论 (0)

还没有任何评论哟~