Skip to content

Instantly share code, notes, and snippets.

@nanjj
Created January 2, 2020 07:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nanjj/7bdf03d1cfca65a6ac417f031cddaf18 to your computer and use it in GitHub Desktop.
Save nanjj/7bdf03d1cfca65a6ac417f031cddaf18 to your computer and use it in GitHub Desktop.
ubuntu fan overlay linux kernel patch
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 28a1f8cb..141d8ba0 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -16,6 +16,7 @@
#include <linux/slab.h>
#include <linux/udp.h>
#include <linux/igmp.h>
+#include <linux/inetdevice.h>
#include <linux/if_ether.h>
#include <linux/ethtool.h>
#include <net/arp.h>
@@ -91,6 +92,167 @@ static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
ip_tunnel_collect_metadata();
}
+/*
+ * Look up the fan mapping whose overlay network contains @daddr.
+ *
+ * @daddr is masked with each entry's own overlay prefix and compared with
+ * that entry's overlay address (both in network byte order).  Returns the
+ * first matching entry, or NULL when none matches.
+ *
+ * NOTE(review): the entry is returned after rcu_read_unlock(), so callers
+ * presumably rely on their own protection (RTNL or an outer RCU read
+ * section) to keep it alive -- confirm.
+ */
+static struct ip_fan_map *vxlan_fan_find_map(struct vxlan_dev *vxlan, __be32 daddr)
+{
+	struct ip_fan_map *entry;
+	struct ip_fan_map *match = NULL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(entry, &vxlan->fan.fan_maps, list) {
+		__be32 net = daddr & inet_make_mask(entry->overlay_prefix);
+
+		if (net != entry->overlay)
+			continue;
+
+		match = entry;
+		break;
+	}
+	rcu_read_unlock();
+
+	return match;
+}
+
+/*
+ * Remove every fan mapping from @vxlan and hand each entry to kfree_rcu().
+ *
+ * The _safe iterator caches the next node before the current one is
+ * unlinked, so the walk never reads the list linkage of an entry that has
+ * already been queued for freeing.
+ *
+ * NOTE(review): reached from the netlink configuration path; presumably
+ * RTNL serialises writers -- confirm, nothing here takes a lock.
+ */
+static void vxlan_fan_flush_map(struct vxlan_dev *vxlan)
+{
+	struct ip_fan_map *fan_map, *next;
+
+	list_for_each_entry_safe(fan_map, next, &vxlan->fan.fan_maps, list) {
+		list_del_rcu(&fan_map->list);
+		kfree_rcu(fan_map, rcu);
+	}
+}
+
+/*
+ * Delete the fan mapping that covers @overlay (network byte order).
+ *
+ * Returns 0 when an entry was unlinked and queued for RCU-deferred freeing,
+ * or -ENOENT when no mapping matches.
+ */
+static int vxlan_fan_del_map(struct vxlan_dev *vxlan, __be32 overlay)
+{
+	struct ip_fan_map *victim = vxlan_fan_find_map(vxlan, overlay);
+
+	if (victim) {
+		list_del_rcu(&victim->list);
+		kfree_rcu(victim, rcu);
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+/*
+ * Apply one userspace-supplied fan map request to @vxlan.
+ *
+ * The request is overloaded:
+ *   - overlay == 0 and underlay == 0: flush all mappings;
+ *   - overlay != 0 and underlay == 0: delete the mapping for that overlay;
+ *   - otherwise: add a new overlay -> underlay mapping.
+ *
+ * Returns 0 on success, -EINVAL for a malformed request, -ENOENT when a
+ * delete finds nothing, -EEXIST when an existing mapping already covers the
+ * overlay, and -ENOMEM when the new entry cannot be allocated.
+ */
+static int vxlan_fan_add_map(struct vxlan_dev *vxlan, struct ifla_fan_map *map)
+{
+	__be32 overlay_mask, underlay_mask;
+	struct ip_fan_map *fan_map;
+
+	/*
+	 * Prefix lengths come from userspace; inet_make_mask() shifts by
+	 * (32 - prefix), so anything above 32 must be rejected first.
+	 */
+	if (map->overlay_prefix > 32 || map->underlay_prefix > 32)
+		return -EINVAL;
+
+	overlay_mask = inet_make_mask(map->overlay_prefix);
+	underlay_mask = inet_make_mask(map->underlay_prefix);
+
+	netdev_dbg(vxlan->dev, "vfam: map: o %x/%d u %x/%d om %x um %x\n",
+		   map->overlay, map->overlay_prefix,
+		   map->underlay, map->underlay_prefix,
+		   overlay_mask, underlay_mask);
+
+	/* Neither address may carry bits outside its own prefix. */
+	if ((map->overlay & ~overlay_mask) || (map->underlay & ~underlay_mask))
+		return -EINVAL;
+
+	/* An underlay without an overlay network is not a valid request. */
+	if (!(map->overlay & overlay_mask) && (map->underlay & underlay_mask))
+		return -EINVAL;
+
+	/* Special case: overlay 0 and underlay 0: flush all mappings */
+	if (!map->overlay && !map->underlay) {
+		vxlan_fan_flush_map(vxlan);
+		return 0;
+	}
+
+	/* Special case: overlay set and underlay 0: clear map for overlay */
+	if (!map->underlay)
+		return vxlan_fan_del_map(vxlan, map->overlay);
+
+	/*
+	 * The transmit path shifts the inner destination right by
+	 * (underlay_prefix - overlay_prefix); a negative count is undefined,
+	 * so refuse mappings whose underlay prefix is the shorter one.
+	 */
+	if (map->underlay_prefix < map->overlay_prefix)
+		return -EINVAL;
+
+	if (vxlan_fan_find_map(vxlan, map->overlay))
+		return -EEXIST;
+
+	fan_map = kmalloc(sizeof(*fan_map), GFP_KERNEL);
+	if (!fan_map)
+		return -ENOMEM;
+
+	fan_map->underlay = map->underlay;
+	fan_map->overlay = map->overlay;
+	fan_map->underlay_prefix = map->underlay_prefix;
+	/* Kept in host byte order: the transmit path applies it after ntohl(). */
+	fan_map->overlay_mask = ntohl(overlay_mask);
+	fan_map->overlay_prefix = map->overlay_prefix;
+
+	list_add_tail_rcu(&fan_map->list, &vxlan->fan.fan_maps);
+
+	return 0;
+}
+
+/*
+ * Walk the nested fan-map attribute and apply every entry in order.
+ *
+ * Processing stops at the first entry that fails and that error is
+ * returned; entries applied before the failure are left in place.
+ * Returns 0 when every entry was applied.
+ */
+static int vxlan_parse_fan_map(struct nlattr *data[], struct vxlan_dev *vxlan)
+{
+	struct ifla_fan_map *map;
+	struct nlattr *attr;
+	int rem, rv;
+
+	/*
+	 * data[] is indexed by IFLA_VXLAN_* here, so use the vxlan constant
+	 * (it has the same value, 33, as IFLA_IPTUN_FAN_MAP).
+	 */
+	nla_for_each_nested(attr, data[IFLA_VXLAN_FAN_MAP], rem) {
+		/* Payload is untrusted: never read past what was sent. */
+		if (nla_len(attr) < (int)sizeof(*map))
+			return -EINVAL;
+
+		map = nla_data(attr);
+		rv = vxlan_fan_add_map(vxlan, map);
+		if (rv)
+			return rv;
+	}
+
+	return 0;
+}
+
+/*
+ * Build the remote destination for @skb from the fan mappings of @vxlan.
+ *
+ * The inner destination is taken from the IPv4 header, or from the target
+ * IP of an ARP packet.  The matching mapping's underlay network is then
+ * combined with the bits of the inner destination that follow the overlay
+ * prefix, and the result is written to @fan_rdst as an AF_INET address
+ * together with the device's default VNI.
+ *
+ * Returns 0 on success, -EINVAL for any other ethertype, when no mapping
+ * matches, or when the mapping's underlay is zero.
+ */
+static int vxlan_fan_build_rdst(struct vxlan_dev *vxlan, struct sk_buff *skb,
+ struct vxlan_rdst *fan_rdst)
+{
+ struct ip_fan_map *f_map;
+ union vxlan_addr *va;
+ u32 daddr, underlay;
+ struct arphdr *arp;
+ void *arp_ptr;
+ struct ethhdr *eth;
+ struct iphdr *iph;
+
+ eth = eth_hdr(skb);
+ switch (eth->h_proto) {
+ case htons(ETH_P_IP):
+ /* NOTE(review): no pskb_may_pull() is visible before the header is
+  * read, and ip_hdr() presumably never returns NULL -- confirm. */
+ iph = ip_hdr(skb);
+ if (!iph)
+ return -EINVAL;
+ /* Still in network byte order at this point. */
+ daddr = iph->daddr;
+ break;
+ case htons(ETH_P_ARP):
+ arp = arp_hdr(skb);
+ if (!arp)
+ return -EINVAL;
+ /* Variable part of the ARP packet starts right after the fixed header:
+  * sender hw addr, sender IP (4 bytes), target hw addr, target IP. */
+ arp_ptr = arp + 1;
+ netdev_dbg(vxlan->dev,
+ "vfbr: arp sha %pM sip %pI4 tha %pM tip %pI4\n",
+ arp_ptr, arp_ptr + skb->dev->addr_len,
+ arp_ptr + skb->dev->addr_len + 4,
+ arp_ptr + (skb->dev->addr_len * 2) + 4);
+ /* Skip both hardware addresses and the sender IP to reach the target IP. */
+ arp_ptr += (skb->dev->addr_len * 2) + 4;
+ memcpy(&daddr, arp_ptr, 4);
+ break;
+ default:
+ netdev_dbg(vxlan->dev, "vfbr: unknown eth p %x\n", eth->h_proto);
+ return -EINVAL;
+ }
+
+ f_map = vxlan_fan_find_map(vxlan, daddr);
+ if (!f_map)
+ return -EINVAL;
+
+ /* From here on daddr and underlay are host byte order. */
+ daddr = ntohl(daddr);
+ underlay = ntohl(f_map->underlay);
+ if (!underlay)
+ return -EINVAL;
+
+ memset(fan_rdst, 0, sizeof(*fan_rdst));
+ va = &fan_rdst->remote_ip;
+ va->sa.sa_family = AF_INET;
+ fan_rdst->remote_vni = vxlan->default_dst.remote_vni;
+ /* Strip the overlay prefix, then shift right by
+  * (underlay_prefix - overlay_prefix) so the remaining bits land in the
+  * host part of the underlay network. */
+ va->sin.sin_addr.s_addr = htonl(underlay |
+ ((daddr & ~f_map->overlay_mask) >>
+ (32 - f_map->overlay_prefix -
+ (32 - f_map->underlay_prefix))));
+ /* daddr and underlay are printed in host order, dst in network order. */
+ netdev_dbg(vxlan->dev, "vfbr: daddr %x ul %x dst %x\n",
+ daddr, underlay, va->sin.sin_addr.s_addr);
+
+ return 0;
+}
+
#if IS_ENABLED(CONFIG_IPV6)
static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
@@ -2146,6 +2308,13 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
goto tx_error;
}
+ if (fan_has_map(&vxlan->fan) && rt->rt_flags & RTCF_LOCAL) {
+ netdev_dbg(dev, "discard fan to localhost %pI4\n",
+ &dst->sin.sin_addr.s_addr);
+ ip_rt_put(rt);
+ goto tx_free;
+ }
+
/* Bypass encapsulation if the destination is local */
if (!info) {
err = encap_bypass_if_local(skb, dev, vxlan, dst,
@@ -2228,6 +2397,7 @@ tx_error:
dev->stats.tx_carrier_errors++;
dst_release(ndst);
dev->stats.tx_errors++;
+tx_free:
kfree_skb(skb);
}
@@ -2282,6 +2452,19 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
#endif
}
+ if (fan_has_map(&vxlan->fan)) {
+ struct vxlan_rdst fan_rdst;
+
+ netdev_dbg(vxlan->dev, "vxlan_xmit\n");
+ if (vxlan_fan_build_rdst(vxlan, skb, &fan_rdst)) {
+ dev->stats.tx_dropped++;
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+ }
+ vxlan_xmit_one(skb, dev, vni, &fan_rdst, 0);
+ return NETDEV_TX_OK;
+ }
+
eth = eth_hdr(skb);
f = vxlan_find_mac(vxlan, eth->h_dest, vni);
did_rsc = false;
@@ -2663,6 +2846,8 @@ static void vxlan_setup(struct net_device *dev)
for (h = 0; h < FDB_HASH_SIZE; ++h)
INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
+
+ INIT_LIST_HEAD(&vxlan->fan.fan_maps);
}
static void vxlan_ether_setup(struct net_device *dev)
@@ -3150,6 +3335,7 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
bool changelink)
{
struct vxlan_dev *vxlan = netdev_priv(dev);
+ int err;
memset(conf, 0, sizeof(*conf));
@@ -3182,6 +3368,12 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
conf->remote_ip.sa.sa_family = AF_INET6;
}
+ if (data[IFLA_VXLAN_FAN_MAP]) {
+ err = vxlan_parse_fan_map(data, vxlan);
+ if (err)
+ return err;
+ }
+
if (data[IFLA_VXLAN_LOCAL]) {
if (changelink && (conf->saddr.sa.sa_family != AF_INET))
return -EOPNOTSUPP;
@@ -3458,6 +3650,7 @@ static size_t vxlan_get_size(const struct net_device *dev)
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */
nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */
+ nla_total_size(sizeof(struct ip_fan_map) * 256) +
0;
}
@@ -3504,6 +3697,26 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
}
}
+ if (fan_has_map(&vxlan->fan)) {
+ struct nlattr *fan_nest;
+ struct ip_fan_map *fan_map;
+
+ fan_nest = nla_nest_start(skb, IFLA_VXLAN_FAN_MAP);
+ if (!fan_nest)
+ goto nla_put_failure;
+ list_for_each_entry_rcu(fan_map, &vxlan->fan.fan_maps, list) {
+ struct ifla_fan_map map;
+
+ map.underlay = fan_map->underlay;
+ map.underlay_prefix = fan_map->underlay_prefix;
+ map.overlay = fan_map->overlay;
+ map.overlay_prefix = fan_map->overlay_prefix;
+ if (nla_put(skb, IFLA_FAN_MAPPING, sizeof(map), &map))
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, fan_nest);
+ }
+
if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) ||
nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
@@ -3670,6 +3883,22 @@ static __net_init int vxlan_init_net(struct net *net)
return 0;
}
+#ifdef CONFIG_SYSCTL
+/* Handle for the registered table; set in vxlan_init_module() and released
+ * in vxlan_cleanup_module(). */
+static struct ctl_table_header *vxlan_fan_header;
+/* Value exported by the entry below.
+ * NOTE(review): presumably the fan feature version for vxlan -- confirm. */
+static unsigned int vxlan_fan_version = 4;
+
+/* One read-only (0444) integer entry named "vxlan"; vxlan_init_module()
+ * registers the table under "net/fan".  The empty element ends the table. */
+static struct ctl_table vxlan_fan_sysctls[] = {
+ {
+ .procname = "vxlan",
+ .data = &vxlan_fan_version,
+ .maxlen = sizeof(vxlan_fan_version),
+ .mode = 0444,
+ .proc_handler = proc_dointvec,
+ },
+ {},
+};
+#endif /* CONFIG_SYSCTL */
+
static void __net_exit vxlan_exit_net(struct net *net)
{
struct vxlan_net *vn = net_generic(net, vxlan_net_id);
@@ -3718,8 +3947,20 @@ static int __init vxlan_init_module(void)
rc = rtnl_link_register(&vxlan_link_ops);
if (rc)
goto out3;
+#ifdef CONFIG_SYSCTL
+ vxlan_fan_header = register_net_sysctl(&init_net, "net/fan",
+ vxlan_fan_sysctls);
+ if (!vxlan_fan_header) {
+ rc = -ENOMEM;
+ goto sysctl_failed;
+ }
+#endif /* CONFIG_SYSCTL */
return 0;
+#ifdef CONFIG_SYSCTL
+sysctl_failed:
+ rtnl_link_unregister(&vxlan_link_ops);
+#endif /* CONFIG_SYSCTL */
out3:
unregister_netdevice_notifier_rh(&vxlan_notifier_block);
out2:
@@ -3731,6 +3972,9 @@ late_initcall(vxlan_init_module);
static void __exit vxlan_cleanup_module(void)
{
+#ifdef CONFIG_SYSCTL
+ unregister_net_sysctl_table(vxlan_fan_header);
+#endif /* CONFIG_SYSCTL */
rtnl_link_unregister(&vxlan_link_ops);
unregister_netdevice_notifier_rh(&vxlan_notifier_block);
unregister_pernet_subsys(&vxlan_net_ops);
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 56a65e5b..b3ca3f84 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -102,6 +102,30 @@ struct ip_tunnel_prl_entry {
struct metadata_dst;
+/* A fan overlay /8 (250.0.0.0/8, for example) maps to exactly one /16
+ * underlay (10.88.0.0/16, for example). Multiple local addresses within
+ * the /16 may be used, but a particular overlay may not span
+ * multiple underlay subnets.
+ *
+ * Each overlay-to-underlay mapping is stored as one struct ip_fan_map on
+ * the fan_maps list of struct ip_tunnel_fan.
+ */
+#define FAN_OVERLAY_CNT 256
+
+/* One overlay-to-underlay fan mapping, linked on ip_tunnel_fan.fan_maps. */
+struct ip_fan_map {
+ /* Underlay network address, network byte order. */
+ __be32 underlay;
+ /* Overlay network address, network byte order. */
+ __be32 overlay;
+ /* Prefix length of the underlay network. */
+ u16 underlay_prefix;
+ /* Prefix length of the overlay network. */
+ u16 overlay_prefix;
+ /* Netmask for overlay_prefix in host byte order (the add_map helpers
+  * store ntohl() of the mask). */
+ u32 overlay_mask;
+ /* Linkage on the owning device's fan_maps list. */
+ struct list_head list;
+ /* Used by kfree_rcu() when the entry is removed. */
+ struct rcu_head rcu;
+};
+
+/* Per-device fan state, embedded in struct ip_tunnel and struct vxlan_dev. */
+struct ip_tunnel_fan {
+ /* Head of the list of struct ip_fan_map entries; walked with the RCU
+  * list iterators on lookup. */
+ struct list_head fan_maps;
+
+};
+
struct ip_tunnel {
struct ip_tunnel __rcu *next;
struct hlist_node hash_node;
@@ -133,6 +157,7 @@ struct ip_tunnel {
#endif
struct ip_tunnel_prl_entry __rcu *prl; /* potential router list */
unsigned int prl_count; /* # of entries in PRL */
+ struct ip_tunnel_fan fan;
int ip_tnl_net_id;
struct gro_cells gro_cells;
@@ -165,6 +190,11 @@ struct ip_tunnel {
#define TUNNEL_OPTIONS_PRESENT (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT)
+/* Report whether @fan has at least one mapping configured (1) or none (0). */
+static inline int fan_has_map(const struct ip_tunnel_fan *fan)
+{
+	if (list_empty(&fan->fan_maps))
+		return 0;
+
+	return 1;
+}
+
struct tnl_ptk_info {
__be16 flags;
__be16 proto;
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index d60235d6..bfab9125 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -235,6 +235,8 @@ struct vxlan_dev {
struct net *net; /* netns for packet i/o */
struct vxlan_rdst default_dst; /* default destination */
+ struct ip_tunnel_fan fan;
+
struct timer_list age_timer;
spinlock_t hash_lock;
unsigned int addrcnt;
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 7c410287..de494192 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -469,6 +469,7 @@ enum {
IFLA_VXLAN_LABEL,
IFLA_VXLAN_GPE,
IFLA_VXLAN_TTL_INHERIT,
+ IFLA_VXLAN_FAN_MAP = 33,
__IFLA_VXLAN_MAX
};
#define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1)
diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h
index 83452433..05dee8f5 100644
--- a/include/uapi/linux/if_tunnel.h
+++ b/include/uapi/linux/if_tunnel.h
@@ -62,6 +62,10 @@ enum {
IFLA_IPTUN_ENCAP_FLAGS,
IFLA_IPTUN_ENCAP_SPORT,
IFLA_IPTUN_ENCAP_DPORT,
+
+ __IFLA_IPTUN_VENDOR_BREAK, /* Ensure new entries do not hit the below. */
+ IFLA_IPTUN_FAN_MAP = 33,
+
__IFLA_IPTUN_MAX,
};
#define IFLA_IPTUN_MAX (__IFLA_IPTUN_MAX - 1)
@@ -138,4 +142,20 @@ enum {
};
#define IFLA_VTI_MAX (__IFLA_VTI_MAX - 1)
+
+/* Attribute types used inside the nested fan-map attribute. */
+enum {
+ IFLA_FAN_UNSPEC,
+ /* One struct ifla_fan_map payload; the fill_info paths emit one of
+  * these per configured mapping. */
+ IFLA_FAN_MAPPING,
+ __IFLA_FAN_MAX,
+};
+
+#define IFLA_FAN_MAX (__IFLA_FAN_MAX - 1)
+
+/* Userspace representation of one fan mapping (IFLA_FAN_MAPPING payload).
+ * In a request, a zero underlay asks for removal: together with a zero
+ * overlay it flushes every mapping, otherwise it deletes the mapping for
+ * that overlay. */
+struct ifla_fan_map {
+ /* Underlay network address, network byte order. */
+ __be32 underlay;
+ /* Overlay network address, network byte order. */
+ __be32 overlay;
+ /* Prefix length of the underlay network. */
+ __u16 underlay_prefix;
+ /* Prefix length of the overlay network. */
+ __u16 overlay_prefix;
+};
+
#endif /* _UAPI_IF_TUNNEL_H_ */
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 347f9d99..328082d4 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -1098,7 +1098,7 @@ int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
if (dev == itn->fb_tunnel_dev)
- return -EINVAL;
+ return fan_has_map(&tunnel->fan) ? 0 : -EINVAL;
t = ip_tunnel_find(itn, p, dev->type);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 9d311c14..327c113e 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -99,6 +99,7 @@
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
+#include <linux/rculist.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
@@ -106,6 +107,7 @@
#include <linux/init.h>
#include <linux/netfilter_ipv4.h>
#include <linux/if_ether.h>
+#include <linux/inetdevice.h>
#include <net/sock.h>
#include <net/ip.h>
@@ -207,6 +209,147 @@ drop:
return 0;
}
+/*
+ * Find the fan mapping of tunnel @t whose overlay network contains @daddr
+ * (network byte order).  Each entry is tested with its own overlay prefix.
+ * Returns the first match, or NULL.
+ *
+ * NOTE(review): the entry is returned after rcu_read_unlock(); callers
+ * presumably hold RTNL or an outer RCU read section -- confirm.
+ */
+static struct ip_fan_map *ipip_fan_find_map(struct ip_tunnel *t, __be32 daddr)
+{
+	struct ip_fan_map *cur;
+	struct ip_fan_map *hit = NULL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(cur, &t->fan.fan_maps, list) {
+		if ((daddr & inet_make_mask(cur->overlay_prefix)) != cur->overlay)
+			continue;
+
+		hit = cur;
+		break;
+	}
+	rcu_read_unlock();
+
+	return hit;
+}
+
+/* Determine fan tunnel endpoint to send packet to, based on the inner IP
+ * address.
+ *
+ * Given a /8 overlay and /16 underlay, for an overlay (inner) address
+ * Y.A.B.C, the transformation is F.G.A.B, where "F" and "G" are the first
+ * two octets of the underlay network (the network portion of a /16), "A"
+ * and "B" are the low order two octets of the underlay network host (the
+ * host portion of a /16), and "Y" is a configured first octet of the
+ * overlay network.
+ *
+ * E.g., underlay host 10.88.3.4/16 with an overlay of 99.0.0.0/8 would
+ * host overlay subnet 99.3.4.0/24. An overlay network datagram from
+ * 99.3.4.5 to 99.6.7.8, would be directed to underlay host 10.88.6.7,
+ * which hosts overlay network subnet 99.6.7.0/24. This transformation is
+ * described in detail further below.
+ *
+ * Using netmasks for the overlay and underlay other than /8 and /16, as
+ * shown above, can yield larger (or smaller) overlay subnets, with the
+ * trade-off of allowing fewer (or more) underlay hosts to participate.
+ *
+ * The size of each overlay network subnet is defined by the total of the
+ * network mask of the overlay plus the size of host portion of the
+ * underlay network. In the above example, /8 + /16 = /24.
+ *
+ * E.g., consider underlay host 10.99.238.5/20 and overlay 99.0.0.0/8. In
+ * this case, the network portion of the underlay is 10.99.224.0/20, and
+ * the host portion is 0.0.14.5 (12 bits). To determine the overlay
+ * network subnet, the 12 bits of host portion are left shifted 12 bits
+ * (/20 - /8) and ORed with the overlay subnet prefix. This yields an
+ * overlay subnet of 99.224.80/20, composed of 8 bits overlay, followed by
+ * 12 bits underlay. This yields 12 bits in the overlay network portion,
+ * allowing for 4094 addresses in each overlay network subnet. The
+ * trade-off is that fewer hosts may participate in the underlay network,
+ * as its host address size has shrunk from 16 bits (65534 addresses) in
+ * the first example to 12 bits (4094 addresses) here.
+ *
+ * For fewer hosts per overlay subnet (permitting a larger number of
+ * underlay hosts to participate), the underlay netmask may be made
+ * smaller.
+ *
+ * E.g., underlay host 10.111.1.2/12 (network 10.96.0.0/12, host portion
+ * is 0.15.1.2, 20 bits) with an overlay of 33.0.0.0/8 would left shift
+ * the 20 bits of host by 4 (so that its highest order bit is adjacent to
+ * the lowest order bit of the /8 overlay). This yields an overlay subnet
+ * of 33.240.16.32/28 (8 bits overlay, 20 bits from the host portion of
+ * the underlay). This provides more addresses for the underlay network
+ * (approximately 2^20), but each host's segment of the overlay provides
+ * only 4 bits of addresses (14 usable).
+ *
+ * It is also possible to adjust the overlay subnet.
+ *
+ * For an overlay of 240.0.0.0/5 and underlay of 10.88.0.0/20, consider
+ * underlay host 10.88.129.2; the 12 bits of host, 0.0.1.2, are left
+ * shifted 15 bits (/20 - /5), yielding an overlay network of
+ * 240.129.0.0/17. An underlay host of 10.88.244.215 would yield an
+ * overlay network of 242.107.128.0/17.
+ *
+ * For an overlay of 100.64.0.0/10 and underlay of 10.224.220.0/24, for
+ * underlay host 10.224.220.10, the underlay host portion (.10) is left
+ * shifted 14 bits, yielding an overlay network subnet of 100.66.128.0/18.
+ * This would permit 254 addresses on the underlay, with each overlay
+ * segment providing approximately 2^14 - 2 addresses (16382).
+ *
+ * For packets being encapsulated, the overlay network destination IP
+ * address is deconstructed into its overlay and underlay-derived
+ * portions. The underlay portion (determined by the overlay mask and
+ * overlay subnet mask) is right shifted according to the size of the
+ * underlay network mask. This value is then ORed with the network
+ * portion of the underlay network to produce the underlay network
+ * destination for the encapsulated datagram.
+ *
+ * For example, using the initial example of underlay 10.88.3.4/16 and
+ * overlay 99.0.0.0/8, with underlay host 10.88.3.4/16 providing overlay
+ * subnet 99.3.4.0/24 with specific host 99.3.4.5. A datagram from
+ * 99.3.4.5 to 99.6.7.8 would first have the underlay host derived portion
+ * of the address extracted. This is a number of bits equal to underlay
+ * network host portion. In the destination address, the highest order of
+ * these bits is one bit lower than the lowest order bit from the overlay
+ * network mask.
+ *
+ * Using the sample value, 99.6.7.8, the overlay mask is /8, and the
+ * underlay mask is /16 (leaving 16 bits for the host portion). The bits
+ * to be shifted are the middle two octets, 0.6.7.0, as this is 99.6.7.8
+ * ANDed with the mask 0x00ffff00 (which is 16 bits, the highest order of
+ * which is 1 bit lower than the lowest order overlay address bit).
+ *
+ * These octets, 0.6.7.0, are then right shifted 8 bits, yielding 0.0.6.7.
+ * This value is then ORed with the underlay network portion,
+ * 10.88.0.0/16, providing 10.88.6.7 as the final underlay destination for
+ * the encapsulated datagram.
+ *
+ * Another transform using the final example: overlay 100.64.0.0/10 and
+ * underlay 10.224.220.0/24. Consider overlay address 100.66.128.1
+ * sending a datagram to 100.66.200.5. In this case, 8 bits (the host
+ * portion size of 10.224.220.0/24) beginning after the 100.64/10 overlay
+ * prefix are masked off, yielding 0.2.192.0. This is right shifted 14
+ * (32 - 10 - (32 - 24), i.e., the number of bits between the overlay
+ * network portion and the underlay host portion) bits, yielding 0.0.0.11.
+ * This is ORed with the underlay network portion, 10.224.220.0/24, giving
+ * the underlay destination of 10.224.220.11 for overlay destination
+ * 100.66.200.5.
+ */
+static int ipip_build_fan_iphdr(struct ip_tunnel *tunnel, struct sk_buff *skb, struct iphdr *iph)
+{
+	const __be32 inner_dst = ip_hdr(skb)->daddr;
+	struct ip_fan_map *entry;
+	u32 net_part, host_part;
+	int shift;
+
+	entry = ipip_fan_find_map(tunnel, inner_dst);
+	if (!entry)
+		return -ENOENT;
+
+	/* A mapping without an underlay network cannot be used for transmit. */
+	net_part = ntohl(entry->underlay);
+	if (!net_part)
+		return -EINVAL;
+
+	/* Number of bits between the overlay prefix and the underlay host part. */
+	shift = 32 - entry->overlay_prefix - (32 - entry->underlay_prefix);
+	host_part = (ntohl(inner_dst) & ~entry->overlay_mask) >> shift;
+
+	/* Start from the tunnel's configured outer header, then retarget it. */
+	*iph = tunnel->parms.iph;
+	iph->daddr = htonl(net_part | host_part);
+
+	return 0;
+}
+
/*
* This function assumes it is being called from dev_queue_xmit()
* and that skb is filled properly by that function.
@@ -215,6 +358,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
const struct iphdr *tiph = &tunnel->parms.iph;
+ struct iphdr fiph;
if (unlikely(skb->protocol != htons(ETH_P_IP)))
goto tx_error;
@@ -222,6 +366,14 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
if (iptunnel_handle_offloads(skb, SKB_GSO_IPIP))
goto tx_error;
+ if (fan_has_map(&tunnel->fan)) {
+ if (ipip_build_fan_iphdr(tunnel, skb, &fiph))
+ goto tx_error;
+ tiph = &fiph;
+ } else {
+ tiph = &tunnel->parms.iph;
+ }
+
skb_set_inner_ipproto(skb, IPPROTO_IPIP);
ip_tunnel_xmit(skb, dev, tiph, tiph->protocol);
@@ -282,6 +434,8 @@ static const struct net_device_ops ipip_netdev_ops = {
static void ipip_tunnel_setup(struct net_device *dev)
{
+ struct ip_tunnel *t = netdev_priv(dev);
+
dev->netdev_ops = &ipip_netdev_ops;
dev->type = ARPHRD_TUNNEL;
@@ -293,6 +447,7 @@ static void ipip_tunnel_setup(struct net_device *dev)
dev->features |= IPIP_FEATURES;
dev->hw_features |= IPIP_FEATURES;
ip_tunnel_setup(dev, ipip_net_id);
+ INIT_LIST_HEAD(&t->fan.fan_maps);
}
static int ipip_tunnel_init(struct net_device *dev)
@@ -341,12 +496,107 @@ static void ipip_netlink_parms(struct nlattr *data[],
parms->iph.frag_off = htons(IP_DF);
}
+/*
+ * Remove every fan mapping from tunnel @t and hand each entry to
+ * kfree_rcu().
+ *
+ * The _safe iterator caches the next node before the current one is
+ * unlinked, so the walk never reads the list linkage of an entry that has
+ * already been queued for freeing.
+ *
+ * NOTE(review): reached from the netlink configuration path; presumably
+ * RTNL serialises writers -- confirm, nothing here takes a lock.
+ */
+static void ipip_fan_flush_map(struct ip_tunnel *t)
+{
+	struct ip_fan_map *fan_map, *next;
+
+	list_for_each_entry_safe(fan_map, next, &t->fan.fan_maps, list) {
+		list_del_rcu(&fan_map->list);
+		kfree_rcu(fan_map, rcu);
+	}
+}
+
+
+/*
+ * Delete the fan mapping of tunnel @t that covers @overlay (network byte
+ * order).  Returns 0 when an entry was unlinked and queued for RCU-deferred
+ * freeing, or -ENOENT when no mapping matches.
+ */
+static int ipip_fan_del_map(struct ip_tunnel *t, __be32 overlay)
+{
+	struct ip_fan_map *victim = ipip_fan_find_map(t, overlay);
+
+	if (victim) {
+		list_del_rcu(&victim->list);
+		kfree_rcu(victim, rcu);
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+/*
+ * Apply one userspace-supplied fan map request to tunnel @t.
+ *
+ * The request is overloaded:
+ *   - overlay == 0 and underlay == 0: flush all mappings;
+ *   - overlay != 0 and underlay == 0: delete the mapping for that overlay;
+ *   - otherwise: add a new overlay -> underlay mapping.
+ *
+ * Returns 0 on success, -EINVAL for a malformed request, -ENOENT when a
+ * delete finds nothing, -EEXIST when an existing mapping already covers the
+ * overlay, and -ENOMEM when the new entry cannot be allocated.
+ */
+static int ipip_fan_add_map(struct ip_tunnel *t, struct ifla_fan_map *map)
+{
+	__be32 overlay_mask, underlay_mask;
+	struct ip_fan_map *fan_map;
+
+	/*
+	 * Prefix lengths come from userspace; inet_make_mask() shifts by
+	 * (32 - prefix), so anything above 32 must be rejected first.
+	 */
+	if (map->overlay_prefix > 32 || map->underlay_prefix > 32)
+		return -EINVAL;
+
+	overlay_mask = inet_make_mask(map->overlay_prefix);
+	underlay_mask = inet_make_mask(map->underlay_prefix);
+
+	/* Neither address may carry bits outside its own prefix. */
+	if ((map->overlay & ~overlay_mask) || (map->underlay & ~underlay_mask))
+		return -EINVAL;
+
+	/* An underlay without an overlay network is not a valid request. */
+	if (!(map->overlay & overlay_mask) && (map->underlay & underlay_mask))
+		return -EINVAL;
+
+	/* Special case: overlay 0 and underlay 0: flush all mappings */
+	if (!map->overlay && !map->underlay) {
+		ipip_fan_flush_map(t);
+		return 0;
+	}
+
+	/* Special case: overlay set and underlay 0: clear map for overlay */
+	if (!map->underlay)
+		return ipip_fan_del_map(t, map->overlay);
+
+	/*
+	 * The transmit path shifts the inner destination right by
+	 * (underlay_prefix - overlay_prefix); a negative count is undefined,
+	 * so refuse mappings whose underlay prefix is the shorter one.
+	 */
+	if (map->underlay_prefix < map->overlay_prefix)
+		return -EINVAL;
+
+	if (ipip_fan_find_map(t, map->overlay))
+		return -EEXIST;
+
+	fan_map = kmalloc(sizeof(*fan_map), GFP_KERNEL);
+	if (!fan_map)
+		return -ENOMEM;
+
+	fan_map->underlay = map->underlay;
+	fan_map->overlay = map->overlay;
+	fan_map->underlay_prefix = map->underlay_prefix;
+	/* Kept in host byte order: the transmit path applies it after ntohl(). */
+	fan_map->overlay_mask = ntohl(overlay_mask);
+	fan_map->overlay_prefix = map->overlay_prefix;
+
+	list_add_tail_rcu(&fan_map->list, &t->fan.fan_maps);
+
+	return 0;
+}
+
+
+/*
+ * Apply the fan-map entries carried in the link attributes to tunnel @t.
+ *
+ * Returns 0 when no fan-map attribute is present or every entry was
+ * applied, -EINVAL when the tunnel parameters name a fixed remote address
+ * or an entry is too short, otherwise the error of the first entry that
+ * failed.  Entries applied before a failure are left in place.
+ */
+static int ipip_netlink_fan(struct nlattr *data[], struct ip_tunnel *t,
+			    struct ip_tunnel_parm *parms)
+{
+	struct ifla_fan_map *map;
+	struct nlattr *attr;
+	int rem, rv;
+
+	/*
+	 * NOTE(review): data is presumably NULL when the request carried no
+	 * link-specific attributes -- confirm; the guard is harmless if not.
+	 */
+	if (!data || !data[IFLA_IPTUN_FAN_MAP])
+		return 0;
+
+	/* Fan picks the remote per packet, so a configured remote conflicts. */
+	if (parms->iph.daddr)
+		return -EINVAL;
+
+	nla_for_each_nested(attr, data[IFLA_IPTUN_FAN_MAP], rem) {
+		/* Payload is untrusted: never read past what was sent. */
+		if (nla_len(attr) < (int)sizeof(*map))
+			return -EINVAL;
+
+		map = nla_data(attr);
+		rv = ipip_fan_add_map(t, map);
+		if (rv)
+			return rv;
+	}
+
+	return 0;
+}
+
static int ipip_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
struct ip_tunnel_parm p;
+ int err;
+ struct ip_tunnel *t = netdev_priv(dev);
ipip_netlink_parms(data, &p);
+ err = ipip_netlink_fan(data, t, &p);
+ if (err < 0)
+ return err;
+
return ip_tunnel_newlink(dev, tb, &p);
}
@@ -354,8 +604,13 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
struct nlattr *data[])
{
struct ip_tunnel_parm p;
+ int err;
+ struct ip_tunnel *t = netdev_priv(dev);
ipip_netlink_parms(data, &p);
+ err = ipip_netlink_fan(data, t, &p);
+ if (err < 0)
+ return err;
if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) ||
(!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr))
@@ -379,6 +634,8 @@ static size_t ipip_get_size(const struct net_device *dev)
nla_total_size(1) +
/* IFLA_IPTUN_PMTUDISC */
nla_total_size(1) +
+ /* IFLA_IPTUN_FAN_MAP */
+ nla_total_size(sizeof(struct ifla_fan_map)) * 256 +
0;
}
@@ -395,6 +652,27 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
nla_put_u8(skb, IFLA_IPTUN_PMTUDISC,
!!(parm->iph.frag_off & htons(IP_DF))))
goto nla_put_failure;
+
+ if (fan_has_map(&tunnel->fan)) {
+ struct nlattr *fan_nest;
+ struct ip_fan_map *fan_map;
+
+ fan_nest = nla_nest_start(skb, IFLA_IPTUN_FAN_MAP);
+ if (!fan_nest)
+ goto nla_put_failure;
+ list_for_each_entry_rcu(fan_map, &tunnel->fan.fan_maps, list) {
+ struct ifla_fan_map map;
+
+ map.underlay = fan_map->underlay;
+ map.underlay_prefix = fan_map->underlay_prefix;
+ map.overlay = fan_map->overlay;
+ map.overlay_prefix = fan_map->overlay_prefix;
+ if (nla_put(skb, IFLA_FAN_MAPPING, sizeof(map), &map))
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, fan_nest);
+ }
+
return 0;
nla_put_failure:
@@ -408,6 +686,9 @@ static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = {
[IFLA_IPTUN_TTL] = { .type = NLA_U8 },
[IFLA_IPTUN_TOS] = { .type = NLA_U8 },
[IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 },
+
+ [__IFLA_IPTUN_VENDOR_BREAK ... IFLA_IPTUN_MAX] = { .type = NLA_BINARY },
+ [IFLA_IPTUN_FAN_MAP] = { .type = NLA_NESTED },
};
static struct rtnl_link_ops ipip_link_ops __read_mostly = {
@@ -448,6 +729,23 @@ static struct pernet_operations ipip_net_ops = {
.size = sizeof(struct ip_tunnel_net),
};
+#ifdef CONFIG_SYSCTL
+/* Handle for the registered table; set in ipip_init() and released in
+ * ipip_fini(). */
+static struct ctl_table_header *ipip_fan_header;
+/* Value exported by the entry below.
+ * NOTE(review): presumably the fan feature version for ipip -- confirm. */
+static unsigned int ipip_fan_version = 3;
+
+/* One read-only (0444) integer entry named "version"; ipip_init() registers
+ * the table under "net/fan".  The empty element ends the table. */
+static struct ctl_table ipip_fan_sysctls[] = {
+ {
+ .procname = "version",
+ .data = &ipip_fan_version,
+ .maxlen = sizeof(ipip_fan_version),
+ .mode = 0444,
+ .proc_handler = proc_dointvec,
+ },
+ {},
+};
+
+#endif /* CONFIG_SYSCTL */
+
static int __init ipip_init(void)
{
int err;
@@ -466,9 +764,23 @@ static int __init ipip_init(void)
if (err < 0)
goto rtnl_link_failed;
+#ifdef CONFIG_SYSCTL
+ ipip_fan_header = register_net_sysctl(&init_net, "net/fan",
+ ipip_fan_sysctls);
+ if (!ipip_fan_header) {
+ err = -ENOMEM;
+ goto sysctl_failed;
+ }
+#endif /* CONFIG_SYSCTL */
+
out:
return err;
+#ifdef CONFIG_SYSCTL
+sysctl_failed:
+ rtnl_link_unregister(&ipip_link_ops);
+#endif /* CONFIG_SYSCTL */
+
rtnl_link_failed:
xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
xfrm_tunnel_failed:
@@ -478,6 +790,10 @@ xfrm_tunnel_failed:
static void __exit ipip_fini(void)
{
+#ifdef CONFIG_SYSCTL
+ unregister_net_sysctl_table(ipip_fan_header);
+#endif /* CONFIG_SYSCTL */
+
rtnl_link_unregister(&ipip_link_ops);
if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
pr_info("%s: can't deregister tunnel\n", __func__);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment