nanjj/linux-kernel-fan-overlay-patch

## linux-kernel-fan-overlay-patch
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 28a1f8cb..141d8ba0 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -16,6 +16,7 @@
 #include <linux/slab.h>
 #include <linux/udp.h>
 #include <linux/igmp.h>
+#include <linux/inetdevice.h>
 #include <linux/if_ether.h>
 #include <linux/ethtool.h>
 #include <net/arp.h>
@@ -91,6 +92,167 @@ static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
 	       ip_tunnel_collect_metadata();
 }

+/* Return the fan map whose overlay network contains @daddr, or NULL. */
+static struct ip_fan_map *vxlan_fan_find_map(struct vxlan_dev *vxlan, __be32 daddr)
+{
+	struct ip_fan_map *entry, *found = NULL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(entry, &vxlan->fan.fan_maps, list) {
+		if ((daddr & inet_make_mask(entry->overlay_prefix)) !=
+		    entry->overlay)
+			continue;
+		found = entry;
+		break;
+	}
+	rcu_read_unlock();
+	return found;
+}
+
+static void vxlan_fan_flush_map(struct vxlan_dev *vxlan)
+{
+	struct ip_fan_map *fan_map, *next;	/* next: fetched before unlink */
+
+	list_for_each_entry_safe(fan_map, next, &vxlan->fan.fan_maps, list) {
+		list_del_rcu(&fan_map->list);
+		kfree_rcu(fan_map, rcu);	/* freed after a grace period */
+	}
+}
+
+/* Remove the fan map covering @overlay; -ENOENT when none matches. */
+static int vxlan_fan_del_map(struct vxlan_dev *vxlan, __be32 overlay)
+{
+	struct ip_fan_map *victim = vxlan_fan_find_map(vxlan, overlay);
+
+	if (victim) {
+		list_del_rcu(&victim->list);
+		kfree_rcu(victim, rcu);
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+/* Add a fan map, or delete/flush existing maps when @map's underlay is 0. */
+static int vxlan_fan_add_map(struct vxlan_dev *vxlan, struct ifla_fan_map *map)
+{
+	__be32 overlay_mask, underlay_mask;
+	struct ip_fan_map *fan_map;
+
+	if (map->overlay_prefix > 32 || map->underlay_prefix > 32)
+		return -EINVAL;	/* inet_make_mask() takes 0..32 only */
+	overlay_mask = inet_make_mask(map->overlay_prefix);
+	underlay_mask = inet_make_mask(map->underlay_prefix);
+	netdev_dbg(vxlan->dev, "vfam: map: o %x/%d u %x/%d om %x um %x\n",
+		   map->overlay, map->overlay_prefix, map->underlay,
+		   map->underlay_prefix, overlay_mask, underlay_mask);
+	if ((map->overlay & ~overlay_mask) || (map->underlay & ~underlay_mask))
+		return -EINVAL;
+	if (!(map->overlay & overlay_mask) && (map->underlay & underlay_mask))
+		return -EINVAL;
+
+	/* Special case: overlay 0 and underlay 0: flush all mappings */
+	if (!map->overlay && !map->underlay) {
+		vxlan_fan_flush_map(vxlan);
+		return 0;
+	}
+	/* Special case: overlay set and underlay 0: clear map for overlay */
+	if (!map->underlay)
+		return vxlan_fan_del_map(vxlan, map->overlay);
+
+	if (map->underlay_prefix < map->overlay_prefix)
+		return -EINVAL;	/* xmit-time shift would be negative */
+	if (vxlan_fan_find_map(vxlan, map->overlay))
+		return -EEXIST;
+
+	fan_map = kmalloc(sizeof(*fan_map), GFP_KERNEL);
+	if (!fan_map)
+		return -ENOMEM;
+	fan_map->underlay = map->underlay;
+	fan_map->overlay = map->overlay;
+	fan_map->underlay_prefix = map->underlay_prefix;
+	fan_map->overlay_mask = ntohl(overlay_mask);
+	fan_map->overlay_prefix = map->overlay_prefix;
+	list_add_tail_rcu(&fan_map->list, &vxlan->fan.fan_maps);
+	return 0;
+}
+
+/* Apply every nested fan-map attribute to @vxlan; stop on the first error. */
+static int vxlan_parse_fan_map(struct nlattr *data[], struct vxlan_dev *vxlan)
+{
+	struct nlattr *attr;
+	int rem, rv;
+
+	nla_for_each_nested(attr, data[IFLA_VXLAN_FAN_MAP], rem) {
+		if (nla_len(attr) < sizeof(struct ifla_fan_map))
+			return -EINVAL;	/* payload too short to read */
+		rv = vxlan_fan_add_map(vxlan, nla_data(attr));
+		if (rv)
+			return rv;
+	}
+	return 0;
+}
+
+static int vxlan_fan_build_rdst(struct vxlan_dev *vxlan, struct sk_buff *skb,
+				      struct vxlan_rdst *fan_rdst)
+{
+	struct ip_fan_map *f_map;
+	union vxlan_addr *va;
+	u32 daddr, underlay;
+	struct arphdr *arp;
+	void *arp_ptr;
+	struct ethhdr *eth;
+	struct iphdr *iph;
+	/* Fill @fan_rdst with the underlay endpoint for @skb; 0 or -EINVAL. */
+	eth = eth_hdr(skb);
+	switch (eth->h_proto) {
+	case htons(ETH_P_IP):
+		iph = ip_hdr(skb);
+		if (!iph)
+			return -EINVAL;
+		daddr = iph->daddr;
+		break;
+	case htons(ETH_P_ARP):
+		arp = arp_hdr(skb);
+		if (!arp)
+			return -EINVAL;
+		arp_ptr = arp + 1;
+		netdev_dbg(vxlan->dev,
+			   "vfbr: arp sha %pM sip %pI4 tha %pM tip %pI4\n",
+			   arp_ptr, arp_ptr + skb->dev->addr_len,
+			   arp_ptr + skb->dev->addr_len + 4,
+			   arp_ptr + (skb->dev->addr_len * 2) + 4);
+		arp_ptr += (skb->dev->addr_len * 2) + 4;	/* -> target IP */
+		memcpy(&daddr, arp_ptr, 4);
+		break;
+	default:
+		netdev_dbg(vxlan->dev, "vfbr: unknown eth p %x\n", eth->h_proto);
+		return -EINVAL;
+	}
+	/* daddr is still in network byte order for the map lookup. */
+	f_map = vxlan_fan_find_map(vxlan, daddr);
+	if (!f_map)
+		return -EINVAL;
+	/* Host byte order from here on, for the mask and shift below. */
+	daddr = ntohl(daddr);
+	underlay = ntohl(f_map->underlay);
+	if (!underlay)
+		return -EINVAL;
+	/* Remote = underlay network | overlay host bits shifted into place. */
+	memset(fan_rdst, 0, sizeof(*fan_rdst));
+	va = &fan_rdst->remote_ip;
+	va->sa.sa_family = AF_INET;
+	fan_rdst->remote_vni = vxlan->default_dst.remote_vni;
+	va->sin.sin_addr.s_addr = htonl(underlay |
+					((daddr & ~f_map->overlay_mask) >>
+					 (32 - f_map->overlay_prefix -
+					  (32 - f_map->underlay_prefix))));
+	netdev_dbg(vxlan->dev, "vfbr: daddr %x ul %x dst %x\n",
+		   daddr, underlay, va->sin.sin_addr.s_addr);
+
+	return 0;
+}
+
 #if IS_ENABLED(CONFIG_IPV6)
 static inline
 bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
@@ -2146,6 +2308,13 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 			goto tx_error;
 		}

+		if (fan_has_map(&vxlan->fan) && rt->rt_flags & RTCF_LOCAL) {
+			netdev_dbg(dev, "discard fan to localhost %pI4\n",
+				   &dst->sin.sin_addr.s_addr);
+			ip_rt_put(rt);
+			goto tx_free;
+		}
+
 		/* Bypass encapsulation if the destination is local */
 		if (!info) {
 			err = encap_bypass_if_local(skb, dev, vxlan, dst,
@@ -2228,6 +2397,7 @@ tx_error:
 		dev->stats.tx_carrier_errors++;
 	dst_release(ndst);
 	dev->stats.tx_errors++;
+tx_free:
 	kfree_skb(skb);
 }

@@ -2282,6 +2452,19 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
 #endif
 	}

+	if (fan_has_map(&vxlan->fan)) {
+		struct vxlan_rdst fan_rdst;
+
+		netdev_dbg(vxlan->dev, "vxlan_xmit\n");
+		if (vxlan_fan_build_rdst(vxlan, skb, &fan_rdst)) {
+			dev->stats.tx_dropped++;
+			kfree_skb(skb);
+			return NETDEV_TX_OK;
+		}
+		vxlan_xmit_one(skb, dev, vni, &fan_rdst, 0);
+		return NETDEV_TX_OK;
+	}
+
 	eth = eth_hdr(skb);
 	f = vxlan_find_mac(vxlan, eth->h_dest, vni);
 	did_rsc = false;
@@ -2663,6 +2846,8 @@ static void vxlan_setup(struct net_device *dev)

 	for (h = 0; h < FDB_HASH_SIZE; ++h)
 		INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
+
+	INIT_LIST_HEAD(&vxlan->fan.fan_maps);
 }

 static void vxlan_ether_setup(struct net_device *dev)
@@ -3150,6 +3335,7 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
 			 bool changelink)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
+	int err;

 	memset(conf, 0, sizeof(*conf));

@@ -3182,6 +3368,12 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
 		conf->remote_ip.sa.sa_family = AF_INET6;
 	}

+	if (data[IFLA_VXLAN_FAN_MAP]) {
+		err = vxlan_parse_fan_map(data, vxlan);
+		if (err)
+			return err;
+	}
+
 	if (data[IFLA_VXLAN_LOCAL]) {
 		if (changelink && (conf->saddr.sa.sa_family != AF_INET))
 			return -EOPNOTSUPP;
@@ -3458,6 +3650,7 @@ static size_t vxlan_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */
 		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */
 		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */
+		nla_total_size(sizeof(struct ip_fan_map) * 256) +
 		0;
 }

@@ -3504,6 +3697,26 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 		}
 	}

+	if (fan_has_map(&vxlan->fan)) {
+		struct nlattr *fan_nest;
+		struct ip_fan_map *fan_map;
+
+		fan_nest = nla_nest_start(skb, IFLA_VXLAN_FAN_MAP);
+		if (!fan_nest)
+			goto nla_put_failure;
+		list_for_each_entry_rcu(fan_map, &vxlan->fan.fan_maps, list) {
+			struct ifla_fan_map map;
+
+			map.underlay = fan_map->underlay;
+			map.underlay_prefix = fan_map->underlay_prefix;
+			map.overlay = fan_map->overlay;
+			map.overlay_prefix = fan_map->overlay_prefix;
+			if (nla_put(skb, IFLA_FAN_MAPPING, sizeof(map), &map))
+				goto nla_put_failure;
+		}
+		nla_nest_end(skb, fan_nest);
+	}
+
 	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) ||
 	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
 	    nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
@@ -3670,6 +3883,22 @@ static __net_init int vxlan_init_net(struct net *net)
 	return 0;
 }

+#ifdef CONFIG_SYSCTL
+static struct ctl_table_header *vxlan_fan_header;	/* registration handle */
+static unsigned int vxlan_fan_version = 4;	/* value exposed via sysctl */
+/* Single read-only entry "vxlan" that exposes vxlan_fan_version. */
+static struct ctl_table vxlan_fan_sysctls[] = {
+	{
+		.procname	= "vxlan",
+		.data		= &vxlan_fan_version,
+		.maxlen		= sizeof(vxlan_fan_version),
+		.mode		= 0444,	/* read-only for everyone */
+		.proc_handler	= proc_dointvec,
+	},
+	{},	/* empty terminating entry */
+};
+#endif /* CONFIG_SYSCTL */
+
 static void __net_exit vxlan_exit_net(struct net *net)
 {
 	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
@@ -3718,8 +3947,20 @@ static int __init vxlan_init_module(void)
 	rc = rtnl_link_register(&vxlan_link_ops);
 	if (rc)
 		goto out3;
+#ifdef CONFIG_SYSCTL
+	vxlan_fan_header = register_net_sysctl(&init_net, "net/fan",
+					      vxlan_fan_sysctls);
+	if (!vxlan_fan_header) {
+		rc = -ENOMEM;
+		goto sysctl_failed;
+	}
+#endif /* CONFIG_SYSCTL */

 	return 0;
+#ifdef CONFIG_SYSCTL
+sysctl_failed:
+	rtnl_link_unregister(&vxlan_link_ops);
+#endif /* CONFIG_SYSCTL */
 out3:
 	unregister_netdevice_notifier_rh(&vxlan_notifier_block);
 out2:
@@ -3731,6 +3972,9 @@ late_initcall(vxlan_init_module);

 static void __exit vxlan_cleanup_module(void)
 {
+#ifdef CONFIG_SYSCTL
+	unregister_net_sysctl_table(vxlan_fan_header);
+#endif /* CONFIG_SYSCTL */
 	rtnl_link_unregister(&vxlan_link_ops);
 	unregister_netdevice_notifier_rh(&vxlan_notifier_block);
 	unregister_pernet_subsys(&vxlan_net_ops);
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 56a65e5b..b3ca3f84 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -102,6 +102,30 @@ struct ip_tunnel_prl_entry {

 struct metadata_dst;

+/* A fan overlay /8 (250.0.0.0/8, for example) maps to exactly one /16
+ * underlay (10.88.0.0/16, for example).  Multiple local addresses within
+ * the /16 may be used, but a particular overlay may not span
+ * multiple underlay subnets.
+ *
+ * Each mapping is kept as a struct ip_fan_map on the tunnel's fan_maps list.
+ */
+#define FAN_OVERLAY_CNT		256
+
+struct ip_fan_map {
+	__be32			underlay;	/* underlay network, network order */
+	__be32			overlay;	/* overlay network, network order */
+	u16			underlay_prefix;	/* underlay prefix length */
+	u16			overlay_prefix;	/* overlay prefix length */
+	u32			overlay_mask;	/* overlay netmask, host order */
+	struct list_head	list;		/* entry in ip_tunnel_fan.fan_maps */
+	struct rcu_head		rcu;		/* used by kfree_rcu() */
+};
+
+struct ip_tunnel_fan {
+	struct list_head	fan_maps;	/* RCU list of struct ip_fan_map */
+
+};
+
 struct ip_tunnel {
 	struct ip_tunnel __rcu	*next;
 	struct hlist_node hash_node;
@@ -133,6 +157,7 @@ struct ip_tunnel {
 #endif
 	struct ip_tunnel_prl_entry __rcu *prl;	/* potential router list */
 	unsigned int		prl_count;	/* # of entries in PRL */
+	struct ip_tunnel_fan	fan;
 	int			ip_tnl_net_id;
 	struct gro_cells	gro_cells;

@@ -165,6 +190,11 @@ struct ip_tunnel {

 #define TUNNEL_OPTIONS_PRESENT	(TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT)

+static inline int fan_has_map(const struct ip_tunnel_fan *fan)
+{
+	return list_empty(&fan->fan_maps) ? 0 : 1;	/* 1: maps configured */
+}
+
 struct tnl_ptk_info {
 	__be16 flags;
 	__be16 proto;
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index d60235d6..bfab9125 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -235,6 +235,8 @@ struct vxlan_dev {
 	struct net	  *net;		/* netns for packet i/o */
 	struct vxlan_rdst default_dst;	/* default destination */

+	struct ip_tunnel_fan fan;
+
 	struct timer_list age_timer;
 	spinlock_t	  hash_lock;
 	unsigned int	  addrcnt;
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 7c410287..de494192 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -469,6 +469,7 @@ enum {
 	IFLA_VXLAN_LABEL,
 	IFLA_VXLAN_GPE,
 	IFLA_VXLAN_TTL_INHERIT,
+	IFLA_VXLAN_FAN_MAP = 33,
 	__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h
index 83452433..05dee8f5 100644
--- a/include/uapi/linux/if_tunnel.h
+++ b/include/uapi/linux/if_tunnel.h
@@ -62,6 +62,10 @@ enum {
 	IFLA_IPTUN_ENCAP_FLAGS,
 	IFLA_IPTUN_ENCAP_SPORT,
 	IFLA_IPTUN_ENCAP_DPORT,
+
+	__IFLA_IPTUN_VENDOR_BREAK, /* Ensure new entries do not hit the below. */
+	IFLA_IPTUN_FAN_MAP = 33,
+
 	__IFLA_IPTUN_MAX,
 };
 #define IFLA_IPTUN_MAX	(__IFLA_IPTUN_MAX - 1)
@@ -138,4 +142,20 @@ enum {
 };

 #define IFLA_VTI_MAX	(__IFLA_VTI_MAX - 1)
+
+enum {
+	IFLA_FAN_UNSPEC,
+	IFLA_FAN_MAPPING,	/* payload: struct ifla_fan_map */
+	__IFLA_FAN_MAX,
+};
+
+#define IFLA_FAN_MAX (__IFLA_FAN_MAX - 1)
+
+struct ifla_fan_map {
+	__be32		underlay;	/* underlay network; 0 deletes/flushes */
+	__be32		overlay;	/* overlay network */
+	__u16		underlay_prefix;	/* underlay prefix length */
+	__u16		overlay_prefix;	/* overlay prefix length */
+};
+
 #endif /* _UAPI_IF_TUNNEL_H_ */
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 347f9d99..328082d4 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -1098,7 +1098,7 @@ int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
 	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

 	if (dev == itn->fb_tunnel_dev)
-		return -EINVAL;
+		return fan_has_map(&tunnel->fan) ? 0 : -EINVAL;

 	t = ip_tunnel_find(itn, p, dev->type);

diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 9d311c14..327c113e 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -99,6 +99,7 @@
 #include <asm/uaccess.h>
 #include <linux/skbuff.h>
 #include <linux/netdevice.h>
+#include <linux/rculist.h>
 #include <linux/in.h>
 #include <linux/tcp.h>
 #include <linux/udp.h>
@@ -106,6 +107,7 @@
 #include <linux/init.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/if_ether.h>
+#include <linux/inetdevice.h>

 #include <net/sock.h>
 #include <net/ip.h>
@@ -207,6 +209,147 @@ drop:
 	return 0;
 }

+/* Return the fan map whose overlay network contains @daddr, or NULL. */
+static struct ip_fan_map *ipip_fan_find_map(struct ip_tunnel *t, __be32 daddr)
+{
+	struct ip_fan_map *entry, *found = NULL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(entry, &t->fan.fan_maps, list) {
+		if ((daddr & inet_make_mask(entry->overlay_prefix)) !=
+		    entry->overlay)
+			continue;
+		found = entry;
+		break;
+	}
+	rcu_read_unlock();
+	return found;
+}
+
+/* Determine fan tunnel endpoint to send packet to, based on the inner IP
+ * address.
+ *
+ * Given a /8 overlay and /16 underlay, for an overlay (inner) address
+ * Y.A.B.C, the transformation is F.G.A.B, where "F" and "G" are the first
+ * two octets of the underlay network (the network portion of a /16), "A"
+ * and "B" are the low order two octets of the underlay network host (the
+ * host portion of a /16), and "Y" is a configured first octet of the
+ * overlay network.
+ *
+ * E.g., underlay host 10.88.3.4/16 with an overlay of 99.0.0.0/8 would
+ * host overlay subnet 99.3.4.0/24.  An overlay network datagram from
+ * 99.3.4.5 to 99.6.7.8, would be directed to underlay host 10.88.6.7,
+ * which hosts overlay network subnet 99.6.7.0/24.  This transformation is
+ * described in detail further below.
+ *
+ * Using netmasks for the overlay and underlay other than /8 and /16, as
+ * shown above, can yield larger (or smaller) overlay subnets, with the
+ * trade-off of allowing fewer (or more) underlay hosts to participate.
+ *
+ * The size of each overlay network subnet is defined by the total of the
+ * network mask of the overlay plus the size of host portion of the
+ * underlay network. In the above example, /8 + /16 = /24.
+ *
+ * E.g., consider underlay host 10.99.238.5/20 and overlay 99.0.0.0/8. In
+ * this case, the network portion of the underlay is 10.99.224.0/20, and
+ * the host portion is 0.0.14.5 (12 bits).  To determine the overlay
+ * network subnet, the 12 bits of host portion are left shifted 12 bits
+ * (/20 - /8) and ORed with the overlay subnet prefix.  This yields an
+ * overlay subnet of 99.224.80.0/20, composed of 8 bits overlay, followed by
+ * 12 bits underlay.  This yields 12 bits in the overlay network portion,
+ * allowing for 4094 addresses in each overlay network subnet.  The
+ * trade-off is that fewer hosts may participate in the underlay network,
+ * as its host address size has shrunk from 16 bits (65534 addresses) in
+ * the first example to 12 bits (4094 addresses) here.
+ *
+ * For fewer hosts per overlay subnet (permitting a larger number of
+ * underlay hosts to participate), the underlay netmask may be made
+ * smaller.
+ *
+ * E.g., underlay host 10.111.1.2/12 (network 10.96.0.0/12, host portion
+ * is 0.15.1.2, 20 bits) with an overlay of 33.0.0.0/8 would left shift
+ * the 20 bits of host by 4 (so that its highest order bit is adjacent to
+ * the lowest order bit of the /8 overlay).  This yields an overlay subnet
+ * of 33.240.16.32/28 (8 bits overlay, 20 bits from the host portion of
+ * the underlay).  This provides more addresses for the underlay network
+ * (approximately 2^20), but each host's segment of the overlay provides
+ * only 4 bits of addresses (14 usable).
+ *
+ * It is also possible to adjust the overlay subnet.
+ *
+ * For an overlay of 240.0.0.0/5 and underlay of 10.88.0.0/20, consider
+ * underlay host 10.88.129.2; the 12 bits of host, 0.0.1.2, are left
+ * shifted 15 bits (/20 - /5), yielding an overlay network of
+ * 240.129.0.0/17.  An underlay host of 10.88.244.215 would yield an
+ * overlay network of 242.107.128.0/17.
+ *
+ * For an overlay of 100.64.0.0/10 and underlay of 10.224.220.0/24, for
+ * underlay host 10.224.220.10, the underlay host portion (.10) is left
+ * shifted 14 bits, yielding an overlay network subnet of 100.66.128.0/18.
+ * This would permit 254 addresses on the underlay, with each overlay
+ * segment providing approximately 2^14 - 2 addresses (16382).
+ *
+ * For packets being encapsulated, the overlay network destination IP
+ * address is deconstructed into its overlay and underlay-derived
+ * portions.  The underlay portion (determined by the overlay mask and
+ * overlay subnet mask) is right shifted according to the size of the
+ * underlay network mask.  This value is then ORed with the network
+ * portion of the underlay network to produce the underlay network
+ * destination for the encapsulated datagram.
+ *
+ * For example, using the initial example of underlay 10.88.3.4/16 and
+ * overlay 99.0.0.0/8, with underlay host 10.88.3.4/16 providing overlay
+ * subnet 99.3.4.0/24 with specific host 99.3.4.5.  A datagram from
+ * 99.3.4.5 to 99.6.7.8 would first have the underlay host derived portion
+ * of the address extracted.  This is a number of bits equal to underlay
+ * network host portion.  In the destination address, the highest order of
+ * these bits is one bit lower than the lowest order bit from the overlay
+ * network mask.
+ *
+ * Using the sample value, 99.6.7.8, the overlay mask is /8, and the
+ * underlay mask is /16 (leaving 16 bits for the host portion).  The bits
+ * to be shifted are the middle two octets, 0.6.7.0, as this is 99.6.7.8
+ * ANDed with the mask 0x00ffff00 (which is 16 bits, the highest order of
+ * which is 1 bit lower than the lowest order overlay address bit).
+ *
+ * These octets, 0.6.7.0, are then right shifted 8 bits, yielding 0.0.6.7.
+ * This value is then ORed with the underlay network portion,
+ * 10.88.0.0/16, providing 10.88.6.7 as the final underlay destination for
+ * the encapsulated datagram.
+ *
+ * Another transform using the final example: overlay 100.64.0.0/10 and
+ * underlay 10.224.220.0/24.  Consider overlay address 100.66.128.1
+ * sending a datagram to 100.66.200.5.  In this case, 8 bits (the host
+ * portion size of 10.224.220.0/24) beginning after the 100.64/10 overlay
+ * prefix are masked off, yielding 0.2.192.0.  This is right shifted 14
+ * (32 - 10 - (32 - 24), i.e., the number of bits between the overlay
+ * network portion and the underlay host portion) bits, yielding 0.0.0.11.
+ * This is ORed with the underlay network portion, 10.224.220.0/24, giving
+ * the underlay destination of 10.224.220.11 for overlay destination
+ * 100.66.200.5.
+ */
+static int ipip_build_fan_iphdr(struct ip_tunnel *tunnel, struct sk_buff *skb, struct iphdr *iph)
+{
+	const __be32 inner_dst = ip_hdr(skb)->daddr;
+	struct ip_fan_map *f_map = ipip_fan_find_map(tunnel, inner_dst);
+	u32 net_bits, host_bits;
+	int shift;
+
+	if (!f_map)
+		return -ENOENT;
+	net_bits = ntohl(f_map->underlay);
+	if (!net_bits)
+		return -EINVAL;
+
+	/* Bits below the overlay prefix become the underlay host part. */
+	shift = 32 - f_map->overlay_prefix - (32 - f_map->underlay_prefix);
+	host_bits = (ntohl(inner_dst) & ~f_map->overlay_mask) >> shift;
+
+	*iph = tunnel->parms.iph;
+	iph->daddr = htonl(net_bits | host_bits);
+	return 0;
+}
+
 /*
  *	This function assumes it is being called from dev_queue_xmit()
  *	and that skb is filled properly by that function.
@@ -215,6 +358,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 	const struct iphdr  *tiph = &tunnel->parms.iph;
+	struct iphdr fiph;

 	if (unlikely(skb->protocol != htons(ETH_P_IP)))
 		goto tx_error;
@@ -222,6 +366,14 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (iptunnel_handle_offloads(skb, SKB_GSO_IPIP))
 		goto tx_error;

+	if (fan_has_map(&tunnel->fan)) {
+		if (ipip_build_fan_iphdr(tunnel, skb, &fiph))
+			goto tx_error;
+		tiph = &fiph;
+	} else {
+		tiph = &tunnel->parms.iph;
+	}
+
 	skb_set_inner_ipproto(skb, IPPROTO_IPIP);

 	ip_tunnel_xmit(skb, dev, tiph, tiph->protocol);
@@ -282,6 +434,8 @@ static const struct net_device_ops ipip_netdev_ops = {

 static void ipip_tunnel_setup(struct net_device *dev)
 {
+	struct ip_tunnel *t = netdev_priv(dev);
+
 	dev->netdev_ops		= &ipip_netdev_ops;

 	dev->type		= ARPHRD_TUNNEL;
@@ -293,6 +447,7 @@ static void ipip_tunnel_setup(struct net_device *dev)
 	dev->features		|= IPIP_FEATURES;
 	dev->hw_features	|= IPIP_FEATURES;
 	ip_tunnel_setup(dev, ipip_net_id);
+	INIT_LIST_HEAD(&t->fan.fan_maps);
 }

 static int ipip_tunnel_init(struct net_device *dev)
@@ -341,12 +496,107 @@ static void ipip_netlink_parms(struct nlattr *data[],
 		parms->iph.frag_off = htons(IP_DF);
 }

+static void ipip_fan_flush_map(struct ip_tunnel *t)
+{
+	struct ip_fan_map *fan_map, *next;	/* next: fetched before unlink */
+
+	list_for_each_entry_safe(fan_map, next, &t->fan.fan_maps, list) {
+		list_del_rcu(&fan_map->list);
+		kfree_rcu(fan_map, rcu);	/* freed after a grace period */
+	}
+}
+
+
+/* Remove the fan map covering @overlay; -ENOENT when none matches. */
+static int ipip_fan_del_map(struct ip_tunnel *t, __be32 overlay)
+{
+	struct ip_fan_map *victim = ipip_fan_find_map(t, overlay);
+
+	if (victim) {
+		list_del_rcu(&victim->list);
+		kfree_rcu(victim, rcu);
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+/* Add a fan map, or delete/flush existing maps when @map's underlay is 0. */
+static int ipip_fan_add_map(struct ip_tunnel *t, struct ifla_fan_map *map)
+{
+	__be32 overlay_mask, underlay_mask;
+	struct ip_fan_map *fan_map;
+
+	if (map->overlay_prefix > 32 || map->underlay_prefix > 32)
+		return -EINVAL;	/* inet_make_mask() takes 0..32 only */
+	overlay_mask = inet_make_mask(map->overlay_prefix);
+	underlay_mask = inet_make_mask(map->underlay_prefix);
+	if ((map->overlay & ~overlay_mask) || (map->underlay & ~underlay_mask))
+		return -EINVAL;
+	if (!(map->overlay & overlay_mask) && (map->underlay & underlay_mask))
+		return -EINVAL;
+
+	/* Special case: overlay 0 and underlay 0: flush all mappings */
+	if (!map->overlay && !map->underlay) {
+		ipip_fan_flush_map(t);
+		return 0;
+	}
+	/* Special case: overlay set and underlay 0: clear map for overlay */
+	if (!map->underlay)
+		return ipip_fan_del_map(t, map->overlay);
+
+	if (map->underlay_prefix < map->overlay_prefix)
+		return -EINVAL;	/* xmit-time shift would be negative */
+	if (ipip_fan_find_map(t, map->overlay))
+		return -EEXIST;
+	fan_map = kmalloc(sizeof(*fan_map), GFP_KERNEL);
+	if (!fan_map)
+		return -ENOMEM;
+	fan_map->underlay = map->underlay;
+	fan_map->overlay = map->overlay;
+	fan_map->underlay_prefix = map->underlay_prefix;
+	fan_map->overlay_mask = ntohl(overlay_mask);
+	fan_map->overlay_prefix = map->overlay_prefix;
+	list_add_tail_rcu(&fan_map->list, &t->fan.fan_maps);
+	return 0;
+}
+
+
+/* Apply IFLA_IPTUN_FAN_MAP entries to @t; 0 if absent, -errno on failure. */
+static int ipip_netlink_fan(struct nlattr *data[], struct ip_tunnel *t,
+			    struct ip_tunnel_parm *parms)
+{
+	struct nlattr *attr;
+	int rem, rv;
+
+	if (!data || !data[IFLA_IPTUN_FAN_MAP])
+		return 0;
+
+	if (parms->iph.daddr)
+		return -EINVAL;	/* fan maps exclude a fixed remote address */
+
+	nla_for_each_nested(attr, data[IFLA_IPTUN_FAN_MAP], rem) {
+		if (nla_len(attr) < sizeof(struct ifla_fan_map))
+			return -EINVAL;	/* payload too short to read */
+		rv = ipip_fan_add_map(t, nla_data(attr));
+		if (rv)
+			return rv;
+	}
+	return 0;
+}
+
 static int ipip_newlink(struct net *src_net, struct net_device *dev,
 			struct nlattr *tb[], struct nlattr *data[])
 {
 	struct ip_tunnel_parm p;
+	int err;
+	struct ip_tunnel *t = netdev_priv(dev);

 	ipip_netlink_parms(data, &p);
+	err = ipip_netlink_fan(data, t, &p);
+	if (err < 0)
+		return err;
+
 	return ip_tunnel_newlink(dev, tb, &p);
 }

@@ -354,8 +604,13 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
 			   struct nlattr *data[])
 {
 	struct ip_tunnel_parm p;
+	int err;
+	struct ip_tunnel *t = netdev_priv(dev);

 	ipip_netlink_parms(data, &p);
+	err = ipip_netlink_fan(data, t, &p);
+	if (err < 0)
+		return err;

 	if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) ||
 	    (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr))
@@ -379,6 +634,8 @@ static size_t ipip_get_size(const struct net_device *dev)
 		nla_total_size(1) +
 		/* IFLA_IPTUN_PMTUDISC */
 		nla_total_size(1) +
+		/* IFLA_IPTUN_FAN_MAP */
+		nla_total_size(sizeof(struct ifla_fan_map)) * 256 +
 		0;
 }

@@ -395,6 +652,27 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	    nla_put_u8(skb, IFLA_IPTUN_PMTUDISC,
 		       !!(parm->iph.frag_off & htons(IP_DF))))
 		goto nla_put_failure;
+
+	if (fan_has_map(&tunnel->fan)) {
+		struct nlattr *fan_nest;
+		struct ip_fan_map *fan_map;
+
+		fan_nest = nla_nest_start(skb, IFLA_IPTUN_FAN_MAP);
+		if (!fan_nest)
+			goto nla_put_failure;
+		list_for_each_entry_rcu(fan_map, &tunnel->fan.fan_maps, list) {
+			struct ifla_fan_map map;
+
+			map.underlay = fan_map->underlay;
+			map.underlay_prefix = fan_map->underlay_prefix;
+			map.overlay = fan_map->overlay;
+			map.overlay_prefix = fan_map->overlay_prefix;
+			if (nla_put(skb, IFLA_FAN_MAPPING, sizeof(map), &map))
+				goto nla_put_failure;
+		}
+		nla_nest_end(skb, fan_nest);
+	}
+
 	return 0;

 nla_put_failure:
@@ -408,6 +686,9 @@ static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = {
 	[IFLA_IPTUN_TTL]		= { .type = NLA_U8 },
 	[IFLA_IPTUN_TOS]		= { .type = NLA_U8 },
 	[IFLA_IPTUN_PMTUDISC]		= { .type = NLA_U8 },
+
+	[__IFLA_IPTUN_VENDOR_BREAK ... IFLA_IPTUN_MAX]	= { .type = NLA_BINARY },
+	[IFLA_IPTUN_FAN_MAP]		= { .type = NLA_NESTED },
 };

 static struct rtnl_link_ops ipip_link_ops __read_mostly = {
@@ -448,6 +729,23 @@ static struct pernet_operations ipip_net_ops = {
 	.size = sizeof(struct ip_tunnel_net),
 };

+#ifdef CONFIG_SYSCTL
+static struct ctl_table_header *ipip_fan_header;	/* registration handle */
+static unsigned int ipip_fan_version = 3;	/* value exposed via sysctl */
+/* Single read-only entry "version" that exposes ipip_fan_version. */
+static struct ctl_table ipip_fan_sysctls[] = {
+	{
+		.procname	= "version",
+		.data		= &ipip_fan_version,
+		.maxlen		= sizeof(ipip_fan_version),
+		.mode		= 0444,	/* read-only for everyone */
+		.proc_handler	= proc_dointvec,
+	},
+	{},	/* empty terminating entry */
+};
+
+#endif /* CONFIG_SYSCTL */
+
 static int __init ipip_init(void)
 {
 	int err;
@@ -466,9 +764,23 @@ static int __init ipip_init(void)
 	if (err < 0)
 		goto rtnl_link_failed;

+#ifdef CONFIG_SYSCTL
+	ipip_fan_header = register_net_sysctl(&init_net, "net/fan",
+					      ipip_fan_sysctls);
+	if (!ipip_fan_header) {
+		err = -ENOMEM;
+		goto sysctl_failed;
+	}
+#endif /* CONFIG_SYSCTL */
+
 out:
 	return err;

+#ifdef CONFIG_SYSCTL
+sysctl_failed:
+	rtnl_link_unregister(&ipip_link_ops);
+#endif /* CONFIG_SYSCTL */
+
 rtnl_link_failed:
 	xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
 xfrm_tunnel_failed:
@@ -478,6 +790,10 @@ xfrm_tunnel_failed:

 static void __exit ipip_fini(void)
 {
+#ifdef CONFIG_SYSCTL
+	unregister_net_sysctl_table(ipip_fan_header);
+#endif /* CONFIG_SYSCTL */
+
 	rtnl_link_unregister(&ipip_link_ops);
 	if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
 		pr_info("%s: can't deregister tunnel\n", __func__);