Created
January 2, 2020 07:58
-
-
Save nanjj/7bdf03d1cfca65a6ac417f031cddaf18 to your computer and use it in GitHub Desktop.
ubuntu fan overlay linux kernel patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c | |
index 28a1f8cb..141d8ba0 100644 | |
--- a/drivers/net/vxlan.c | |
+++ b/drivers/net/vxlan.c | |
@@ -16,6 +16,7 @@ | |
#include <linux/slab.h> | |
#include <linux/udp.h> | |
#include <linux/igmp.h> | |
+#include <linux/inetdevice.h> | |
#include <linux/if_ether.h> | |
#include <linux/ethtool.h> | |
#include <net/arp.h> | |
@@ -91,6 +92,167 @@ static inline bool vxlan_collect_metadata(struct vxlan_sock *vs) | |
ip_tunnel_collect_metadata(); | |
} | |
+/* | |
+ * Find the fan mapping whose overlay network contains @daddr. | |
+ * | |
+ * @daddr is masked with each entry's own overlay prefix and compared with | |
+ * that entry's overlay address.  The first matching entry is returned, or | |
+ * NULL when no entry matches. | |
+ * | |
+ * NOTE(review): the entry is handed back after rcu_read_unlock(), so its | |
+ * lifetime presumably depends on the caller's context - confirm. | |
+ */ | |
+static struct ip_fan_map *vxlan_fan_find_map(struct vxlan_dev *vxlan, __be32 daddr) | |
+{ | |
+	struct ip_fan_map *entry, *found = NULL; | |
+ | |
+	rcu_read_lock(); | |
+	list_for_each_entry_rcu(entry, &vxlan->fan.fan_maps, list) { | |
+		__be32 net = daddr & inet_make_mask(entry->overlay_prefix); | |
+ | |
+		if (net != entry->overlay) | |
+			continue; | |
+ | |
+		found = entry; | |
+		break; | |
+	} | |
+	rcu_read_unlock(); | |
+ | |
+	return found; | |
+} | |
+ | |
+/* | |
+ * Remove every fan mapping from @vxlan. | |
+ * | |
+ * Each entry is unlinked with list_del_rcu() and released with kfree_rcu(). | |
+ * The _safe iterator fetches the next element before the current one is | |
+ * handed to kfree_rcu(), so the walk never reads a node that has already | |
+ * been queued for freeing. | |
+ * | |
+ * NOTE(review): assumes updaters are serialised by the caller - confirm. | |
+ */ | |
+static void vxlan_fan_flush_map(struct vxlan_dev *vxlan) | |
+{ | |
+	struct ip_fan_map *fan_map, *next; | |
+ | |
+	list_for_each_entry_safe(fan_map, next, &vxlan->fan.fan_maps, list) { | |
+		list_del_rcu(&fan_map->list); | |
+		kfree_rcu(fan_map, rcu); | |
+	} | |
+} | |
+ | |
+/* | |
+ * Delete the fan mapping that vxlan_fan_find_map() returns for @overlay. | |
+ * | |
+ * Returns 0 after unlinking the entry and scheduling it for release, or | |
+ * -ENOENT when no mapping matches. | |
+ */ | |
+static int vxlan_fan_del_map(struct vxlan_dev *vxlan, __be32 overlay) | |
+{ | |
+	struct ip_fan_map *victim = vxlan_fan_find_map(vxlan, overlay); | |
+ | |
+	if (victim) { | |
+		list_del_rcu(&victim->list); | |
+		kfree_rcu(victim, rcu); | |
+		return 0; | |
+	} | |
+ | |
+	return -ENOENT; | |
+} | |
+ | |
+/* | |
+ * Apply one user-supplied fan map request to @vxlan. | |
+ * | |
+ *  - overlay 0 and underlay 0: flush every mapping | |
+ *  - overlay set and underlay 0: delete the mapping for that overlay | |
+ *  - otherwise: add a new overlay -> underlay mapping | |
+ * | |
+ * Returns 0 on success; -EINVAL for a malformed request; -ENOENT when the | |
+ * mapping to delete is not found; -EEXIST when the overlay is already | |
+ * mapped; -ENOMEM when the new entry cannot be allocated. | |
+ */ | |
+static int vxlan_fan_add_map(struct vxlan_dev *vxlan, struct ifla_fan_map *map) | |
+{ | |
+	__be32 overlay_mask, underlay_mask; | |
+	struct ip_fan_map *fan_map; | |
+ | |
+	/* A prefix length above 32 has no meaning for an IPv4 mask. */ | |
+	if (map->overlay_prefix > 32 || map->underlay_prefix > 32) | |
+		return -EINVAL; | |
+ | |
+	overlay_mask = inet_make_mask(map->overlay_prefix); | |
+	underlay_mask = inet_make_mask(map->underlay_prefix); | |
+ | |
+	netdev_dbg(vxlan->dev, "vfam: map: o %x/%d u %x/%d om %x um %x\n", | |
+		   map->overlay, map->overlay_prefix, | |
+		   map->underlay, map->underlay_prefix, | |
+		   overlay_mask, underlay_mask); | |
+ | |
+	/* Neither address may carry bits outside its own prefix. */ | |
+	if ((map->overlay & ~overlay_mask) || (map->underlay & ~underlay_mask)) | |
+		return -EINVAL; | |
+ | |
+	/* An underlay without an overlay network is rejected. */ | |
+	if (!(map->overlay & overlay_mask) && (map->underlay & underlay_mask)) | |
+		return -EINVAL; | |
+ | |
+	/* Special case: overlay 0 and underlay 0: flush all mappings */ | |
+	if (!map->overlay && !map->underlay) { | |
+		vxlan_fan_flush_map(vxlan); | |
+		return 0; | |
+	} | |
+ | |
+	/* Special case: overlay set and underlay 0: clear map for overlay */ | |
+	if (!map->underlay) | |
+		return vxlan_fan_del_map(vxlan, map->overlay); | |
+ | |
+	/* | |
+	 * The transmit path shifts by (underlay_prefix - overlay_prefix); | |
+	 * refuse a mapping that would make that shift count negative. | |
+	 * Checked on the add path only, so flush and delete requests are | |
+	 * unaffected. | |
+	 */ | |
+	if (map->overlay_prefix > map->underlay_prefix) | |
+		return -EINVAL; | |
+ | |
+	if (vxlan_fan_find_map(vxlan, map->overlay)) | |
+		return -EEXIST; | |
+ | |
+	fan_map = kmalloc(sizeof(*fan_map), GFP_KERNEL); | |
+	if (!fan_map) | |
+		return -ENOMEM; | |
+ | |
+	fan_map->underlay = map->underlay; | |
+	fan_map->overlay = map->overlay; | |
+	fan_map->underlay_prefix = map->underlay_prefix; | |
+	/* Kept in host byte order, unlike the two addresses above. */ | |
+	fan_map->overlay_mask = ntohl(overlay_mask); | |
+	fan_map->overlay_prefix = map->overlay_prefix; | |
+ | |
+	list_add_tail_rcu(&fan_map->list, &vxlan->fan.fan_maps); | |
+ | |
+	return 0; | |
+} | |
+ | |
+/* | |
+ * Walk the nested IFLA_VXLAN_FAN_MAP attribute in @data and apply each | |
+ * contained map to @vxlan through vxlan_fan_add_map(). | |
+ * | |
+ * Returns 0 when every entry was applied, -EINVAL when an entry's payload | |
+ * is too short to hold a struct ifla_fan_map, or the first non-zero result | |
+ * of vxlan_fan_add_map().  Entries applied before a failure stay applied. | |
+ */ | |
+static int vxlan_parse_fan_map(struct nlattr *data[], struct vxlan_dev *vxlan) | |
+{ | |
+	struct ifla_fan_map *map; | |
+	struct nlattr *attr; | |
+	int rem, rv; | |
+ | |
+	/* Use the vxlan attribute index, matching the caller's check. */ | |
+	nla_for_each_nested(attr, data[IFLA_VXLAN_FAN_MAP], rem) { | |
+		/* Never read a map out of a payload shorter than the struct. */ | |
+		if (nla_len(attr) < (int)sizeof(*map)) | |
+			return -EINVAL; | |
+ | |
+		map = nla_data(attr); | |
+		rv = vxlan_fan_add_map(vxlan, map); | |
+		if (rv) | |
+			return rv; | |
+	} | |
+ | |
+	return 0; | |
+} | |
+ | |
+static int vxlan_fan_build_rdst(struct vxlan_dev *vxlan, struct sk_buff *skb, | |
+ struct vxlan_rdst *fan_rdst) | |
+{ /* Fill @fan_rdst with the underlay destination derived from @skb's destination; returns 0 or -EINVAL. */ | |
+ struct ip_fan_map *f_map; | |
+ union vxlan_addr *va; | |
+ u32 daddr, underlay; /* daddr holds network order until the ntohl() below */ | |
+ struct arphdr *arp; | |
+ void *arp_ptr; | |
+ struct ethhdr *eth; | |
+ struct iphdr *iph; | |
+ | |
+ eth = eth_hdr(skb); | |
+ switch (eth->h_proto) { /* NOTE(review): no length check on the headers is visible in this function - confirm callers guarantee them */ | |
+ case htons(ETH_P_IP): | |
+ iph = ip_hdr(skb); | |
+ if (!iph) | |
+ return -EINVAL; | |
+ daddr = iph->daddr; | |
+ break; | |
+ case htons(ETH_P_ARP): | |
+ arp = arp_hdr(skb); | |
+ if (!arp) | |
+ return -EINVAL; | |
+ arp_ptr = arp + 1; /* first byte after the fixed ARP header */ | |
+ netdev_dbg(vxlan->dev, | |
+ "vfbr: arp sha %pM sip %pI4 tha %pM tip %pI4\n", | |
+ arp_ptr, arp_ptr + skb->dev->addr_len, | |
+ arp_ptr + skb->dev->addr_len + 4, | |
+ arp_ptr + (skb->dev->addr_len * 2) + 4); | |
+ arp_ptr += (skb->dev->addr_len * 2) + 4; /* skip two hardware addresses and one 4-byte address: the field logged as "tip" */ | |
+ memcpy(&daddr, arp_ptr, 4); | |
+ break; | |
+ default: | |
+ netdev_dbg(vxlan->dev, "vfbr: unknown eth p %x\n", eth->h_proto); | |
+ return -EINVAL; | |
+ } | |
+ | |
+ f_map = vxlan_fan_find_map(vxlan, daddr); | |
+ if (!f_map) | |
+ return -EINVAL; | |
+ | |
+ daddr = ntohl(daddr); /* host order from here on */ | |
+ underlay = ntohl(f_map->underlay); | |
+ if (!underlay) | |
+ return -EINVAL; | |
+ | |
+ memset(fan_rdst, 0, sizeof(*fan_rdst)); | |
+ va = &fan_rdst->remote_ip; | |
+ va->sa.sa_family = AF_INET; | |
+ fan_rdst->remote_vni = vxlan->default_dst.remote_vni; | |
+ va->sin.sin_addr.s_addr = htonl(underlay | | |
+ ((daddr & ~f_map->overlay_mask) >> | |
+ (32 - f_map->overlay_prefix - /* shift count works out to underlay_prefix - overlay_prefix */ | |
+ (32 - f_map->underlay_prefix)))); /* NOTE(review): assumes overlay_prefix <= underlay_prefix - confirm it is enforced when maps are added */ | |
+ netdev_dbg(vxlan->dev, "vfbr: daddr %x ul %x dst %x\n", | |
+ daddr, underlay, va->sin.sin_addr.s_addr); /* dst is printed in network order, the other two in host order */ | |
+ | |
+ return 0; | |
+} | |
+ | |
#if IS_ENABLED(CONFIG_IPV6) | |
static inline | |
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) | |
@@ -2146,6 +2308,13 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, | |
goto tx_error; | |
} | |
+ if (fan_has_map(&vxlan->fan) && rt->rt_flags & RTCF_LOCAL) { | |
+ netdev_dbg(dev, "discard fan to localhost %pI4\n", | |
+ &dst->sin.sin_addr.s_addr); | |
+ ip_rt_put(rt); | |
+ goto tx_free; | |
+ } | |
+ | |
/* Bypass encapsulation if the destination is local */ | |
if (!info) { | |
err = encap_bypass_if_local(skb, dev, vxlan, dst, | |
@@ -2228,6 +2397,7 @@ tx_error: | |
dev->stats.tx_carrier_errors++; | |
dst_release(ndst); | |
dev->stats.tx_errors++; | |
+tx_free: | |
kfree_skb(skb); | |
} | |
@@ -2282,6 +2452,19 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) | |
#endif | |
} | |
+ if (fan_has_map(&vxlan->fan)) { | |
+ struct vxlan_rdst fan_rdst; | |
+ | |
+ netdev_dbg(vxlan->dev, "vxlan_xmit\n"); | |
+ if (vxlan_fan_build_rdst(vxlan, skb, &fan_rdst)) { | |
+ dev->stats.tx_dropped++; | |
+ kfree_skb(skb); | |
+ return NETDEV_TX_OK; | |
+ } | |
+ vxlan_xmit_one(skb, dev, vni, &fan_rdst, 0); | |
+ return NETDEV_TX_OK; | |
+ } | |
+ | |
eth = eth_hdr(skb); | |
f = vxlan_find_mac(vxlan, eth->h_dest, vni); | |
did_rsc = false; | |
@@ -2663,6 +2846,8 @@ static void vxlan_setup(struct net_device *dev) | |
for (h = 0; h < FDB_HASH_SIZE; ++h) | |
INIT_HLIST_HEAD(&vxlan->fdb_head[h]); | |
+ | |
+ INIT_LIST_HEAD(&vxlan->fan.fan_maps); | |
} | |
static void vxlan_ether_setup(struct net_device *dev) | |
@@ -3150,6 +3335,7 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], | |
bool changelink) | |
{ | |
struct vxlan_dev *vxlan = netdev_priv(dev); | |
+ int err; | |
memset(conf, 0, sizeof(*conf)); | |
@@ -3182,6 +3368,12 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], | |
conf->remote_ip.sa.sa_family = AF_INET6; | |
} | |
+ if (data[IFLA_VXLAN_FAN_MAP]) { | |
+ err = vxlan_parse_fan_map(data, vxlan); | |
+ if (err) | |
+ return err; | |
+ } | |
+ | |
if (data[IFLA_VXLAN_LOCAL]) { | |
if (changelink && (conf->saddr.sa.sa_family != AF_INET)) | |
return -EOPNOTSUPP; | |
@@ -3458,6 +3650,7 @@ static size_t vxlan_get_size(const struct net_device *dev) | |
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */ | |
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */ | |
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */ | |
+ nla_total_size(sizeof(struct ip_fan_map) * 256) + | |
0; | |
} | |
@@ -3504,6 +3697,26 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) | |
} | |
} | |
+ if (fan_has_map(&vxlan->fan)) { | |
+ struct nlattr *fan_nest; | |
+ struct ip_fan_map *fan_map; | |
+ | |
+ fan_nest = nla_nest_start(skb, IFLA_VXLAN_FAN_MAP); | |
+ if (!fan_nest) | |
+ goto nla_put_failure; | |
+ list_for_each_entry_rcu(fan_map, &vxlan->fan.fan_maps, list) { | |
+ struct ifla_fan_map map; | |
+ | |
+ map.underlay = fan_map->underlay; | |
+ map.underlay_prefix = fan_map->underlay_prefix; | |
+ map.overlay = fan_map->overlay; | |
+ map.overlay_prefix = fan_map->overlay_prefix; | |
+ if (nla_put(skb, IFLA_FAN_MAPPING, sizeof(map), &map)) | |
+ goto nla_put_failure; | |
+ } | |
+ nla_nest_end(skb, fan_nest); | |
+ } | |
+ | |
if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) || | |
nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) || | |
nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) || | |
@@ -3670,6 +3883,22 @@ static __net_init int vxlan_init_net(struct net *net) | |
return 0; | |
} | |
+#ifdef CONFIG_SYSCTL | |
+static struct ctl_table_header *vxlan_fan_header; /* handle returned by register_net_sysctl() in vxlan_init_module() */ | |
+static unsigned int vxlan_fan_version = 4; /* value exposed through the table below */ | |
+ | |
+static struct ctl_table vxlan_fan_sysctls[] = { /* registered under "net/fan" */ | |
+ { | |
+ .procname = "vxlan", | |
+ .data = &vxlan_fan_version, | |
+ .maxlen = sizeof(vxlan_fan_version), | |
+ .mode = 0444, /* read-only for everyone */ | |
+ .proc_handler = proc_dointvec, /* NOTE(review): .data is unsigned int - confirm proc_douintvec is not wanted */ | |
+ }, | |
+ {}, /* empty entry terminates the table */ | |
+}; | |
+#endif /* CONFIG_SYSCTL */ | |
+ | |
static void __net_exit vxlan_exit_net(struct net *net) | |
{ | |
struct vxlan_net *vn = net_generic(net, vxlan_net_id); | |
@@ -3718,8 +3947,20 @@ static int __init vxlan_init_module(void) | |
rc = rtnl_link_register(&vxlan_link_ops); | |
if (rc) | |
goto out3; | |
+#ifdef CONFIG_SYSCTL | |
+ vxlan_fan_header = register_net_sysctl(&init_net, "net/fan", | |
+ vxlan_fan_sysctls); | |
+ if (!vxlan_fan_header) { | |
+ rc = -ENOMEM; | |
+ goto sysctl_failed; | |
+ } | |
+#endif /* CONFIG_SYSCTL */ | |
return 0; | |
+#ifdef CONFIG_SYSCTL | |
+sysctl_failed: | |
+ rtnl_link_unregister(&vxlan_link_ops); | |
+#endif /* CONFIG_SYSCTL */ | |
out3: | |
unregister_netdevice_notifier_rh(&vxlan_notifier_block); | |
out2: | |
@@ -3731,6 +3972,9 @@ late_initcall(vxlan_init_module); | |
static void __exit vxlan_cleanup_module(void) | |
{ | |
+#ifdef CONFIG_SYSCTL | |
+ unregister_net_sysctl_table(vxlan_fan_header); | |
+#endif /* CONFIG_SYSCTL */ | |
rtnl_link_unregister(&vxlan_link_ops); | |
unregister_netdevice_notifier_rh(&vxlan_notifier_block); | |
unregister_pernet_subsys(&vxlan_net_ops); | |
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h | |
index 56a65e5b..b3ca3f84 100644 | |
--- a/include/net/ip_tunnels.h | |
+++ b/include/net/ip_tunnels.h | |
@@ -102,6 +102,30 @@ struct ip_tunnel_prl_entry { | |
struct metadata_dst; | |
+/* A fan overlay /8 (250.0.0.0/8, for example) maps to exactly one /16 | |
+ * underlay (10.88.0.0/16, for example). Multiple local addresses within | |
+ * the /16 may be used, but a particular overlay may not span | |
+ * multiple underlay subnets. | |
+ * | |
+ * Mappings are kept on a list and matched by each entry's overlay prefix. | |
+ */ | |
+#define FAN_OVERLAY_CNT 256 /* NOTE(review): not referenced in the visible patch; the netlink size estimates use a literal 256 - confirm */ | |
+ | |
+struct ip_fan_map { | |
+ __be32 underlay; /* underlay network address, network byte order */ | |
+ __be32 overlay; /* overlay network address, network byte order */ | |
+ u16 underlay_prefix; /* underlay prefix length in bits */ | |
+ u16 overlay_prefix; /* overlay prefix length in bits */ | |
+ u32 overlay_mask; /* overlay netmask, host byte order (stored via ntohl()) */ | |
+ struct list_head list; /* linkage on ip_tunnel_fan.fan_maps */ | |
+ struct rcu_head rcu; /* used by kfree_rcu() when the entry is removed */ | |
+}; | |
+ | |
+struct ip_tunnel_fan { | |
+ struct list_head fan_maps; /* list of struct ip_fan_map; empty means no fan mapping configured */ | |
+ | |
+}; | |
+ | |
struct ip_tunnel { | |
struct ip_tunnel __rcu *next; | |
struct hlist_node hash_node; | |
@@ -133,6 +157,7 @@ struct ip_tunnel { | |
#endif | |
struct ip_tunnel_prl_entry __rcu *prl; /* potential router list */ | |
unsigned int prl_count; /* # of entries in PRL */ | |
+ struct ip_tunnel_fan fan; | |
int ip_tnl_net_id; | |
struct gro_cells gro_cells; | |
@@ -165,6 +190,11 @@ struct ip_tunnel { | |
#define TUNNEL_OPTIONS_PRESENT (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT) | |
+/* Return 1 when @fan holds at least one mapping, 0 when its list is empty. */ | |
+static inline int fan_has_map(const struct ip_tunnel_fan *fan) | |
+{ | |
+	if (list_empty(&fan->fan_maps)) | |
+		return 0; | |
+ | |
+	return 1; | |
+} | |
+ | |
struct tnl_ptk_info { | |
__be16 flags; | |
__be16 proto; | |
diff --git a/include/net/vxlan.h b/include/net/vxlan.h | |
index d60235d6..bfab9125 100644 | |
--- a/include/net/vxlan.h | |
+++ b/include/net/vxlan.h | |
@@ -235,6 +235,8 @@ struct vxlan_dev { | |
struct net *net; /* netns for packet i/o */ | |
struct vxlan_rdst default_dst; /* default destination */ | |
+ struct ip_tunnel_fan fan; | |
+ | |
struct timer_list age_timer; | |
spinlock_t hash_lock; | |
unsigned int addrcnt; | |
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h | |
index 7c410287..de494192 100644 | |
--- a/include/uapi/linux/if_link.h | |
+++ b/include/uapi/linux/if_link.h | |
@@ -469,6 +469,7 @@ enum { | |
IFLA_VXLAN_LABEL, | |
IFLA_VXLAN_GPE, | |
IFLA_VXLAN_TTL_INHERIT, | |
+ IFLA_VXLAN_FAN_MAP = 33, | |
__IFLA_VXLAN_MAX | |
}; | |
#define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) | |
diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h | |
index 83452433..05dee8f5 100644 | |
--- a/include/uapi/linux/if_tunnel.h | |
+++ b/include/uapi/linux/if_tunnel.h | |
@@ -62,6 +62,10 @@ enum { | |
IFLA_IPTUN_ENCAP_FLAGS, | |
IFLA_IPTUN_ENCAP_SPORT, | |
IFLA_IPTUN_ENCAP_DPORT, | |
+ | |
+ __IFLA_IPTUN_VENDOR_BREAK, /* Ensure new entries do not hit the below. */ | |
+ IFLA_IPTUN_FAN_MAP = 33, | |
+ | |
__IFLA_IPTUN_MAX, | |
}; | |
#define IFLA_IPTUN_MAX (__IFLA_IPTUN_MAX - 1) | |
@@ -138,4 +142,20 @@ enum { | |
}; | |
#define IFLA_VTI_MAX (__IFLA_VTI_MAX - 1) | |
+ | |
+enum { /* attribute types used inside the nested fan map attribute */ | |
+ IFLA_FAN_UNSPEC, | |
+ IFLA_FAN_MAPPING, /* payload is one struct ifla_fan_map */ | |
+ __IFLA_FAN_MAX, | |
+}; | |
+ | |
+#define IFLA_FAN_MAX (__IFLA_FAN_MAX - 1) | |
+ | |
+struct ifla_fan_map { /* one overlay -> underlay mapping as exchanged with userspace */ | |
+ __be32 underlay; /* underlay network address, network byte order */ | |
+ __be32 overlay; /* overlay network address, network byte order */ | |
+ __u16 underlay_prefix; /* underlay prefix length in bits */ | |
+ __u16 overlay_prefix; /* overlay prefix length in bits */ | |
+}; | |
+ | |
#endif /* _UAPI_IF_TUNNEL_H_ */ | |
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c | |
index 347f9d99..328082d4 100644 | |
--- a/net/ipv4/ip_tunnel.c | |
+++ b/net/ipv4/ip_tunnel.c | |
@@ -1098,7 +1098,7 @@ int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], | |
struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id); | |
if (dev == itn->fb_tunnel_dev) | |
- return -EINVAL; | |
+ return fan_has_map(&tunnel->fan) ? 0 : -EINVAL; | |
t = ip_tunnel_find(itn, p, dev->type); | |
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c | |
index 9d311c14..327c113e 100644 | |
--- a/net/ipv4/ipip.c | |
+++ b/net/ipv4/ipip.c | |
@@ -99,6 +99,7 @@ | |
#include <asm/uaccess.h> | |
#include <linux/skbuff.h> | |
#include <linux/netdevice.h> | |
+#include <linux/rculist.h> | |
#include <linux/in.h> | |
#include <linux/tcp.h> | |
#include <linux/udp.h> | |
@@ -106,6 +107,7 @@ | |
#include <linux/init.h> | |
#include <linux/netfilter_ipv4.h> | |
#include <linux/if_ether.h> | |
+#include <linux/inetdevice.h> | |
#include <net/sock.h> | |
#include <net/ip.h> | |
@@ -207,6 +209,147 @@ drop: | |
return 0; | |
} | |
+/* | |
+ * Find the fan mapping on tunnel @t whose overlay network contains @daddr. | |
+ * | |
+ * @daddr is masked with each entry's own overlay prefix and compared with | |
+ * that entry's overlay address.  The first matching entry is returned, or | |
+ * NULL when no entry matches. | |
+ * | |
+ * NOTE(review): the entry is handed back after rcu_read_unlock(), so its | |
+ * lifetime presumably depends on the caller's context - confirm. | |
+ */ | |
+static struct ip_fan_map *ipip_fan_find_map(struct ip_tunnel *t, __be32 daddr) | |
+{ | |
+	struct ip_fan_map *entry, *found = NULL; | |
+ | |
+	rcu_read_lock(); | |
+	list_for_each_entry_rcu(entry, &t->fan.fan_maps, list) { | |
+		__be32 net = daddr & inet_make_mask(entry->overlay_prefix); | |
+ | |
+		if (net != entry->overlay) | |
+			continue; | |
+ | |
+		found = entry; | |
+		break; | |
+	} | |
+	rcu_read_unlock(); | |
+ | |
+	return found; | |
+} | |
+ | |
+/* Determine fan tunnel endpoint to send packet to, based on the inner IP | |
+ * address. | |
+ * | |
+ * Given a /8 overlay and /16 underlay, for an overlay (inner) address | |
+ * Y.A.B.C, the transformation is F.G.A.B, where "F" and "G" are the first | |
+ * two octets of the underlay network (the network portion of a /16), "A" | |
+ * and "B" are the low order two octets of the underlay network host (the | |
+ * host portion of a /16), and "Y" is a configured first octet of the | |
+ * overlay network. | |
+ * | |
+ * E.g., underlay host 10.88.3.4/16 with an overlay of 99.0.0.0/8 would | |
+ * host overlay subnet 99.3.4.0/24. An overlay network datagram from | |
+ * 99.3.4.5 to 99.6.7.8, would be directed to underlay host 10.88.6.7, | |
+ * which hosts overlay network subnet 99.6.7.0/24. This transformation is | |
+ * described in detail further below. | |
+ * | |
+ * Using netmasks for the overlay and underlay other than /8 and /16, as | |
+ * shown above, can yield larger (or smaller) overlay subnets, with the | |
+ * trade-off of allowing fewer (or more) underlay hosts to participate. | |
+ * | |
+ * The size of each overlay network subnet is defined by the total of the | |
+ * network mask of the overlay plus the size of host portion of the | |
+ * underlay network. In the above example, /8 + /16 = /24. | |
+ * | |
+ * E.g., consider underlay host 10.99.238.5/20 and overlay 99.0.0.0/8. In | |
+ * this case, the network portion of the underlay is 10.99.224.0/20, and | |
+ * the host portion is 0.0.14.5 (12 bits). To determine the overlay | |
+ * network subnet, the 12 bits of host portion are left shifted 12 bits | |
+ * (/20 - /8) and ORed with the overlay subnet prefix. This yields an | |
+ * overlay subnet of 99.224.80.0/20, composed of 8 bits overlay, followed by | |
+ * 12 bits underlay. This yields 12 bits in the overlay network portion, | |
+ * allowing for 4094 addresses in each overlay network subnet. The | |
+ * trade-off is that fewer hosts may participate in the underlay network, | |
+ * as its host address size has shrunk from 16 bits (65534 addresses) in | |
+ * the first example to 12 bits (4094 addresses) here. | |
+ * | |
+ * For fewer hosts per overlay subnet (permitting a larger number of | |
+ * underlay hosts to participate), the underlay netmask may be made | |
+ * smaller. | |
+ * | |
+ * E.g., underlay host 10.111.1.2/12 (network 10.96.0.0/12, host portion | |
+ * is 0.15.1.2, 20 bits) with an overlay of 33.0.0.0/8 would left shift | |
+ * the 20 bits of host by 4 (so that its highest order bit is adjacent to | |
+ * the lowest order bit of the /8 overlay). This yields an overlay subnet | |
+ * of 33.240.16.32/28 (8 bits overlay, 20 bits from the host portion of | |
+ * the underlay). This provides more addresses for the underlay network | |
+ * (approximately 2^20), but each host's segment of the overlay provides | |
+ * only 4 bits of addresses (14 usable). | |
+ * | |
+ * It is also possible to adjust the overlay subnet. | |
+ * | |
+ * For an overlay of 240.0.0.0/5 and underlay of 10.88.0.0/20, consider | |
+ * underlay host 10.88.129.2; the 12 bits of host, 0.0.1.2, are left | |
+ * shifted 15 bits (/20 - /5), yielding an overlay network of | |
+ * 240.129.0.0/17. An underlay host of 10.88.244.215 would yield an | |
+ * overlay network of 242.107.128.0/17. | |
+ * | |
+ * For an overlay of 100.64.0.0/10 and underlay of 10.224.220.0/24, for | |
+ * underlay host 10.224.220.10, the underlay host portion (.10) is left | |
+ * shifted 14 bits, yielding an overlay network subnet of 100.66.128.0/18. | |
+ * This would permit 254 addresses on the underlay, with each overlay | |
+ * segment providing approximately 2^14 - 2 addresses (16382). | |
+ * | |
+ * For packets being encapsulated, the overlay network destination IP | |
+ * address is deconstructed into its overlay and underlay-derived | |
+ * portions. The underlay portion (determined by the overlay mask and | |
+ * overlay subnet mask) is right shifted according to the size of the | |
+ * underlay network mask. This value is then ORed with the network | |
+ * portion of the underlay network to produce the underlay network | |
+ * destination for the encapsulated datagram. | |
+ * | |
+ * For example, using the initial example of underlay 10.88.3.4/16 and | |
+ * overlay 99.0.0.0/8, with underlay host 10.88.3.4/16 providing overlay | |
+ * subnet 99.3.4.0/24 with specific host 99.3.4.5. A datagram from | |
+ * 99.3.4.5 to 99.6.7.8 would first have the underlay host derived portion | |
+ * of the address extracted. This is a number of bits equal to underlay | |
+ * network host portion. In the destination address, the highest order of | |
+ * these bits is one bit lower than the lowest order bit from the overlay | |
+ * network mask. | |
+ * | |
+ * Using the sample value, 99.6.7.8, the overlay mask is /8, and the | |
+ * underlay mask is /16 (leaving 16 bits for the host portion). The bits | |
+ * to be shifted are the middle two octets, 0.6.7.0, as this is 99.6.7.8 | |
+ * ANDed with the mask 0x00ffff00 (which is 16 bits, the highest order of | |
+ * which is 1 bit lower than the lowest order overlay address bit). | |
+ * | |
+ * These octets, 0.6.7.0, are then right shifted 8 bits, yielding 0.0.6.7. | |
+ * This value is then ORed with the underlay network portion, | |
+ * 10.88.0.0/16, providing 10.88.6.7 as the final underlay destination for | |
+ * the encapsulated datagram. | |
+ * | |
+ * Another transform using the final example: overlay 100.64.0.0/10 and | |
+ * underlay 10.224.220.0/24. Consider overlay address 100.66.128.1 | |
+ * sending a datagram to 100.66.200.5. In this case, 8 bits (the host | |
+ * portion size of 10.224.220.0/24) beginning after the 100.64/10 overlay | |
+ * prefix are masked off, yielding 0.2.192.0. This is right shifted 14 | |
+ * (32 - 10 - (32 - 24), i.e., the number of bits between the overlay | |
+ * network portion and the underlay host portion) bits, yielding 0.0.0.11. | |
+ * This is ORed with the underlay network portion, 10.224.220.0/24, giving | |
+ * the underlay destination of 10.224.220.11 for overlay destination | |
+ * 100.66.200.5. | |
+ */ | |
+static int ipip_build_fan_iphdr(struct ip_tunnel *tunnel, struct sk_buff *skb, struct iphdr *iph) | |
+{ | |
+	const __be32 inner_dst = ip_hdr(skb)->daddr; | |
+	struct ip_fan_map *entry; | |
+	u32 host_dst, net_base; | |
+	int shift; | |
+ | |
+	/* No mapping covers the inner destination. */ | |
+	entry = ipip_fan_find_map(tunnel, inner_dst); | |
+	if (!entry) | |
+		return -ENOENT; | |
+ | |
+	host_dst = ntohl(inner_dst); | |
+	net_base = ntohl(entry->underlay); | |
+	if (!net_base) | |
+		return -EINVAL; | |
+ | |
+	/* Same value as 32 - overlay_prefix - (32 - underlay_prefix). */ | |
+	shift = entry->underlay_prefix - entry->overlay_prefix; | |
+ | |
+	/* Start from the tunnel's template header, then retarget it. */ | |
+	*iph = tunnel->parms.iph; | |
+	iph->daddr = htonl(net_base | | |
+			   ((host_dst & ~entry->overlay_mask) >> shift)); | |
+	return 0; | |
+} | |
+ | |
/* | |
* This function assumes it is being called from dev_queue_xmit() | |
* and that skb is filled properly by that function. | |
@@ -215,6 +358,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) | |
{ | |
struct ip_tunnel *tunnel = netdev_priv(dev); | |
const struct iphdr *tiph = &tunnel->parms.iph; | |
+ struct iphdr fiph; | |
if (unlikely(skb->protocol != htons(ETH_P_IP))) | |
goto tx_error; | |
@@ -222,6 +366,14 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) | |
if (iptunnel_handle_offloads(skb, SKB_GSO_IPIP)) | |
goto tx_error; | |
+ if (fan_has_map(&tunnel->fan)) { | |
+ if (ipip_build_fan_iphdr(tunnel, skb, &fiph)) | |
+ goto tx_error; | |
+ tiph = &fiph; | |
+ } else { | |
+ tiph = &tunnel->parms.iph; | |
+ } | |
+ | |
skb_set_inner_ipproto(skb, IPPROTO_IPIP); | |
ip_tunnel_xmit(skb, dev, tiph, tiph->protocol); | |
@@ -282,6 +434,8 @@ static const struct net_device_ops ipip_netdev_ops = { | |
static void ipip_tunnel_setup(struct net_device *dev) | |
{ | |
+ struct ip_tunnel *t = netdev_priv(dev); | |
+ | |
dev->netdev_ops = &ipip_netdev_ops; | |
dev->type = ARPHRD_TUNNEL; | |
@@ -293,6 +447,7 @@ static void ipip_tunnel_setup(struct net_device *dev) | |
dev->features |= IPIP_FEATURES; | |
dev->hw_features |= IPIP_FEATURES; | |
ip_tunnel_setup(dev, ipip_net_id); | |
+ INIT_LIST_HEAD(&t->fan.fan_maps); | |
} | |
static int ipip_tunnel_init(struct net_device *dev) | |
@@ -341,12 +496,107 @@ static void ipip_netlink_parms(struct nlattr *data[], | |
parms->iph.frag_off = htons(IP_DF); | |
} | |
+/* | |
+ * Remove every fan mapping from tunnel @t. | |
+ * | |
+ * Each entry is unlinked with list_del_rcu() and released with kfree_rcu(). | |
+ * The _safe iterator fetches the next element before the current one is | |
+ * handed to kfree_rcu(), so the walk never reads a node that has already | |
+ * been queued for freeing. | |
+ * | |
+ * NOTE(review): assumes updaters are serialised by the caller - confirm. | |
+ */ | |
+static void ipip_fan_flush_map(struct ip_tunnel *t) | |
+{ | |
+	struct ip_fan_map *fan_map, *next; | |
+ | |
+	list_for_each_entry_safe(fan_map, next, &t->fan.fan_maps, list) { | |
+		list_del_rcu(&fan_map->list); | |
+		kfree_rcu(fan_map, rcu); | |
+	} | |
+} | |
+ | |
+ | |
+/* | |
+ * Delete the fan mapping that ipip_fan_find_map() returns for @overlay. | |
+ * | |
+ * Returns 0 after unlinking the entry and scheduling it for release, or | |
+ * -ENOENT when no mapping matches. | |
+ */ | |
+static int ipip_fan_del_map(struct ip_tunnel *t, __be32 overlay) | |
+{ | |
+	struct ip_fan_map *victim = ipip_fan_find_map(t, overlay); | |
+ | |
+	if (victim) { | |
+		list_del_rcu(&victim->list); | |
+		kfree_rcu(victim, rcu); | |
+		return 0; | |
+	} | |
+ | |
+	return -ENOENT; | |
+} | |
+ | |
+/* | |
+ * Apply one user-supplied fan map request to tunnel @t. | |
+ * | |
+ *  - overlay 0 and underlay 0: flush every mapping | |
+ *  - overlay set and underlay 0: delete the mapping for that overlay | |
+ *  - otherwise: add a new overlay -> underlay mapping | |
+ * | |
+ * Returns 0 on success; -EINVAL for a malformed request; -ENOENT when the | |
+ * mapping to delete is not found; -EEXIST when the overlay is already | |
+ * mapped; -ENOMEM when the new entry cannot be allocated. | |
+ */ | |
+static int ipip_fan_add_map(struct ip_tunnel *t, struct ifla_fan_map *map) | |
+{ | |
+	__be32 overlay_mask, underlay_mask; | |
+	struct ip_fan_map *fan_map; | |
+ | |
+	/* A prefix length above 32 has no meaning for an IPv4 mask. */ | |
+	if (map->overlay_prefix > 32 || map->underlay_prefix > 32) | |
+		return -EINVAL; | |
+ | |
+	overlay_mask = inet_make_mask(map->overlay_prefix); | |
+	underlay_mask = inet_make_mask(map->underlay_prefix); | |
+ | |
+	/* Neither address may carry bits outside its own prefix. */ | |
+	if ((map->overlay & ~overlay_mask) || (map->underlay & ~underlay_mask)) | |
+		return -EINVAL; | |
+ | |
+	/* An underlay without an overlay network is rejected. */ | |
+	if (!(map->overlay & overlay_mask) && (map->underlay & underlay_mask)) | |
+		return -EINVAL; | |
+ | |
+	/* Special case: overlay 0 and underlay 0: flush all mappings */ | |
+	if (!map->overlay && !map->underlay) { | |
+		ipip_fan_flush_map(t); | |
+		return 0; | |
+	} | |
+ | |
+	/* Special case: overlay set and underlay 0: clear map for overlay */ | |
+	if (!map->underlay) | |
+		return ipip_fan_del_map(t, map->overlay); | |
+ | |
+	/* | |
+	 * The transmit path shifts by (underlay_prefix - overlay_prefix); | |
+	 * refuse a mapping that would make that shift count negative. | |
+	 * Checked on the add path only, so flush and delete requests are | |
+	 * unaffected. | |
+	 */ | |
+	if (map->overlay_prefix > map->underlay_prefix) | |
+		return -EINVAL; | |
+ | |
+	if (ipip_fan_find_map(t, map->overlay)) | |
+		return -EEXIST; | |
+ | |
+	fan_map = kmalloc(sizeof(*fan_map), GFP_KERNEL); | |
+	if (!fan_map) | |
+		return -ENOMEM; | |
+ | |
+	fan_map->underlay = map->underlay; | |
+	fan_map->overlay = map->overlay; | |
+	fan_map->underlay_prefix = map->underlay_prefix; | |
+	/* Kept in host byte order, unlike the two addresses above. */ | |
+	fan_map->overlay_mask = ntohl(overlay_mask); | |
+	fan_map->overlay_prefix = map->overlay_prefix; | |
+ | |
+	list_add_tail_rcu(&fan_map->list, &t->fan.fan_maps); | |
+ | |
+	return 0; | |
+} | |
+ | |
+ | |
+/* | |
+ * Apply the fan maps nested in data[IFLA_IPTUN_FAN_MAP], if present, to | |
+ * tunnel @t. | |
+ * | |
+ * Returns 0 when the attribute is absent or every entry was applied; | |
+ * -EINVAL when @parms carries a non-zero destination address or an entry's | |
+ * payload is too short to hold a struct ifla_fan_map; otherwise the first | |
+ * non-zero result of ipip_fan_add_map().  Entries applied before a failure | |
+ * stay applied. | |
+ */ | |
+static int ipip_netlink_fan(struct nlattr *data[], struct ip_tunnel *t, | |
+			    struct ip_tunnel_parm *parms) | |
+{ | |
+	struct ifla_fan_map *map; | |
+	struct nlattr *attr; | |
+	int rem, rv; | |
+ | |
+	if (!data[IFLA_IPTUN_FAN_MAP]) | |
+		return 0; | |
+ | |
+	/* Fan maps are refused when a fixed destination is configured. */ | |
+	if (parms->iph.daddr) | |
+		return -EINVAL; | |
+ | |
+	nla_for_each_nested(attr, data[IFLA_IPTUN_FAN_MAP], rem) { | |
+		/* Never read a map out of a payload shorter than the struct. */ | |
+		if (nla_len(attr) < (int)sizeof(*map)) | |
+			return -EINVAL; | |
+ | |
+		map = nla_data(attr); | |
+		rv = ipip_fan_add_map(t, map); | |
+		if (rv) | |
+			return rv; | |
+	} | |
+ | |
+	return 0; | |
+} | |
+ | |
static int ipip_newlink(struct net *src_net, struct net_device *dev, | |
struct nlattr *tb[], struct nlattr *data[]) | |
{ /* Parse tunnel parameters and any fan maps from @data, then hand off to ip_tunnel_newlink(). */ | |
struct ip_tunnel_parm p; | |
+ int err; | |
+ struct ip_tunnel *t = netdev_priv(dev); | |
ipip_netlink_parms(data, &p); | |
+ err = ipip_netlink_fan(data, t, &p); /* on a part-way failure, maps added earlier in the same request remain on t */ | |
+ if (err < 0) | |
+ return err; | |
+ | |
return ip_tunnel_newlink(dev, tb, &p); /* NOTE(review): nothing visible here removes the maps if this call fails - confirm */ | |
} | |
@@ -354,8 +604,13 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], | |
struct nlattr *data[]) | |
{ | |
struct ip_tunnel_parm p; | |
+ int err; | |
+ struct ip_tunnel *t = netdev_priv(dev); | |
ipip_netlink_parms(data, &p); | |
+ err = ipip_netlink_fan(data, t, &p); | |
+ if (err < 0) | |
+ return err; | |
if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) || | |
(!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) | |
@@ -379,6 +634,8 @@ static size_t ipip_get_size(const struct net_device *dev) | |
nla_total_size(1) + | |
/* IFLA_IPTUN_PMTUDISC */ | |
nla_total_size(1) + | |
+ /* IFLA_IPTUN_FAN_MAP */ | |
+ nla_total_size(sizeof(struct ifla_fan_map)) * 256 + | |
0; | |
} | |
@@ -395,6 +652,27 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) | |
nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, | |
!!(parm->iph.frag_off & htons(IP_DF)))) | |
goto nla_put_failure; | |
+ | |
+ if (fan_has_map(&tunnel->fan)) { | |
+ struct nlattr *fan_nest; | |
+ struct ip_fan_map *fan_map; | |
+ | |
+ fan_nest = nla_nest_start(skb, IFLA_IPTUN_FAN_MAP); | |
+ if (!fan_nest) | |
+ goto nla_put_failure; | |
+ list_for_each_entry_rcu(fan_map, &tunnel->fan.fan_maps, list) { | |
+ struct ifla_fan_map map; | |
+ | |
+ map.underlay = fan_map->underlay; | |
+ map.underlay_prefix = fan_map->underlay_prefix; | |
+ map.overlay = fan_map->overlay; | |
+ map.overlay_prefix = fan_map->overlay_prefix; | |
+ if (nla_put(skb, IFLA_FAN_MAPPING, sizeof(map), &map)) | |
+ goto nla_put_failure; | |
+ } | |
+ nla_nest_end(skb, fan_nest); | |
+ } | |
+ | |
return 0; | |
nla_put_failure: | |
@@ -408,6 +686,9 @@ static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = { | |
[IFLA_IPTUN_TTL] = { .type = NLA_U8 }, | |
[IFLA_IPTUN_TOS] = { .type = NLA_U8 }, | |
[IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 }, | |
+ | |
+ [__IFLA_IPTUN_VENDOR_BREAK ... IFLA_IPTUN_MAX] = { .type = NLA_BINARY }, | |
+ [IFLA_IPTUN_FAN_MAP] = { .type = NLA_NESTED }, | |
}; | |
static struct rtnl_link_ops ipip_link_ops __read_mostly = { | |
@@ -448,6 +729,23 @@ static struct pernet_operations ipip_net_ops = { | |
.size = sizeof(struct ip_tunnel_net), | |
}; | |
+#ifdef CONFIG_SYSCTL | |
+static struct ctl_table_header *ipip_fan_header; /* handle returned by register_net_sysctl() in ipip_init() */ | |
+static unsigned int ipip_fan_version = 3; /* value exposed through the table below */ | |
+ | |
+static struct ctl_table ipip_fan_sysctls[] = { /* registered under "net/fan" */ | |
+ { | |
+ .procname = "version", | |
+ .data = &ipip_fan_version, | |
+ .maxlen = sizeof(ipip_fan_version), | |
+ .mode = 0444, /* read-only for everyone */ | |
+ .proc_handler = proc_dointvec, /* NOTE(review): .data is unsigned int - confirm proc_douintvec is not wanted */ | |
+ }, | |
+ {}, /* empty entry terminates the table */ | |
+}; | |
+ | |
+#endif /* CONFIG_SYSCTL */ | |
+ | |
static int __init ipip_init(void) | |
{ | |
int err; | |
@@ -466,9 +764,23 @@ static int __init ipip_init(void) | |
if (err < 0) | |
goto rtnl_link_failed; | |
+#ifdef CONFIG_SYSCTL | |
+ ipip_fan_header = register_net_sysctl(&init_net, "net/fan", | |
+ ipip_fan_sysctls); | |
+ if (!ipip_fan_header) { | |
+ err = -ENOMEM; | |
+ goto sysctl_failed; | |
+ } | |
+#endif /* CONFIG_SYSCTL */ | |
+ | |
out: | |
return err; | |
+#ifdef CONFIG_SYSCTL | |
+sysctl_failed: | |
+ rtnl_link_unregister(&ipip_link_ops); | |
+#endif /* CONFIG_SYSCTL */ | |
+ | |
rtnl_link_failed: | |
xfrm4_tunnel_deregister(&ipip_handler, AF_INET); | |
xfrm_tunnel_failed: | |
@@ -478,6 +790,10 @@ xfrm_tunnel_failed: | |
static void __exit ipip_fini(void) | |
{ | |
+#ifdef CONFIG_SYSCTL | |
+ unregister_net_sysctl_table(ipip_fan_header); | |
+#endif /* CONFIG_SYSCTL */ | |
+ | |
rtnl_link_unregister(&ipip_link_ops); | |
if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET)) | |
pr_info("%s: can't deregister tunnel\n", __func__); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment