@Ansuel
Last active June 9, 2020 15:27
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -45,10 +45,26 @@ struct br_ip_list {
#define BR_PROXYARP BIT(8)
#define BR_LEARNING_SYNC BIT(9)
#define BR_PROXYARP_WIFI BIT(10)
+#define BR_ISOLATE_MODE BIT(11)
+#define BR_MULTICAST_TO_UCAST BIT(12)
#define BR_DEFAULT_AGEING_TIME (300 * HZ)
+struct net_bridge_port;
+
extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *));
+extern struct net_device *br_port_dev_get(struct net_device *dev,
+ unsigned char *addr,
+ struct sk_buff *skb,
+ unsigned int cookie);
+extern void br_refresh_fdb_entry(struct net_device *dev, const char *addr);
+extern void br_dev_update_stats(struct net_device *dev,
+ struct rtnl_link_stats64 *nlstats);
+extern struct net_bridge_fdb_entry *br_fdb_has_entry(struct net_device *dev,
+ const char *addr,
+ __u16 vid);
+extern void br_fdb_update_register_notify(struct notifier_block *nb);
+extern void br_fdb_update_unregister_notify(struct notifier_block *nb);
typedef int br_should_route_hook_t(struct sk_buff *skb);
extern br_should_route_hook_t __rcu *br_should_route_hook;
@@ -76,4 +92,36 @@ static inline bool br_multicast_has_querier_adjacent(struct net_device *dev,
}
#endif
+typedef struct net_bridge_port *br_port_dev_get_hook_t(struct net_device *dev,
+ struct sk_buff *skb,
+ unsigned char *addr,
+ unsigned int cookie);
+extern br_port_dev_get_hook_t __rcu *br_port_dev_get_hook;
+
+typedef void (br_notify_hook_t)(int group, int event, const void *ptr);
+extern br_notify_hook_t __rcu *br_notify_hook;
+typedef int (br_multicast_handle_hook_t)(const struct net_bridge_port *src,
+ struct sk_buff *skb);
+extern br_multicast_handle_hook_t __rcu *br_multicast_handle_hook;
+
+#define BR_FDB_EVENT_ADD 0x01
+#define BR_FDB_EVENT_DEL 0x02
+struct br_fdb_event {
+ unsigned char addr[6];
+ unsigned char is_local;
+ struct net_device *dev;
+ struct net_bridge *br;
+ struct net_device *orig_dev;
+};
+extern void br_fdb_register_notify(struct notifier_block *nb);
+extern void br_fdb_unregister_notify(struct notifier_block *nb);
+extern struct net_device *br_fdb_bridge_dev_get_and_hold(struct net_bridge *br);
+
+typedef struct net_bridge_port *br_get_dst_hook_t(
+ const struct net_bridge_port *src,
+ struct sk_buff **skb);
+extern br_get_dst_hook_t __rcu *br_get_dst_hook;
+
+typedef void (br_notify_hook_t)(int group, int event, const void *ptr);
+extern br_notify_hook_t __rcu *br_notify_hook;
#endif
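
A minimal sketch (not part of the patch) of how an out-of-tree offload driver might consume the FDB update notifier declared above; the module name and the pr_debug message are illustrative only, and the handler assumes the notifier data is the struct br_fdb_event filled in by br_fdb_update()/br_fdb_cleanup() later in this series.

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/if_bridge.h>

static int example_fdb_update_event(struct notifier_block *nb,
                                    unsigned long action, void *data)
{
        struct br_fdb_event *event = data;

        /* An offload engine would flush any rule keyed on this MAC. */
        pr_debug("bridge fdb update for %pM\n", event->addr);
        return NOTIFY_DONE;
}

static struct notifier_block example_fdb_update_nb = {
        .notifier_call = example_fdb_update_event,
};

static int __init example_init(void)
{
        br_fdb_update_register_notify(&example_fdb_update_nb);
        return 0;
}

static void __exit example_exit(void)
{
        br_fdb_update_unregister_notify(&example_fdb_update_nb);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
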
--- a/include/linux/if_pppol2tp.h
+++ b/include/linux/if_pppol2tp.h
@@ -18,4 +18,27 @@
#include <linux/in6.h>
#include <uapi/linux/if_pppol2tp.h>
+/*
+ * Holds L2TP channel info
+ */
+struct pppol2tp_common_addr {
+ int tunnel_version; /* v2 or v3 */
+ __u32 local_tunnel_id, remote_tunnel_id; /* tunnel id */
+ __u32 local_session_id, remote_session_id; /* session id */
+ struct sockaddr_in local_addr, remote_addr; /* ip address and port */
+};
+
+/*
+ * L2TP channel operations
+ */
+struct pppol2tp_channel_ops {
+ struct ppp_channel_ops ops; /* ppp channel ops */
+};
+
+/*
+ * Exported function which calls the pppol2tp channel's get-addressing
+ * handler
+ */
+extern int pppol2tp_channel_addressing_get(struct ppp_channel *,
+ struct pppol2tp_common_addr *);
#endif
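
A hedged usage sketch (not part of the patch): reading back the L2TP addressing of a PPP channel through the accessor declared above, assuming a zero return means the structure was filled. The chan pointer would normally arrive via a ppp_channel callback in an offload driver.

#include <linux/printk.h>
#include <linux/ppp_channel.h>
#include <linux/if_pppol2tp.h>

static void example_dump_l2tp_channel(struct ppp_channel *chan)
{
        struct pppol2tp_common_addr addr;

        if (pppol2tp_channel_addressing_get(chan, &addr))
                return;

        pr_info("l2tpv%d tunnel %u/%u session %u/%u\n",
                addr.tunnel_version,
                addr.local_tunnel_id, addr.remote_tunnel_id,
                addr.local_session_id, addr.remote_session_id);
}
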
--- a/include/linux/if_pppox.h
+++ b/include/linux/if_pppox.h
@@ -1,6 +1,22 @@
+/*
+ **************************************************************************
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ * Permission to use, copy, modify, and/or distribute this software for
+ * any purpose with or without fee is hereby granted, provided that the
+ * above copyright notice and this permission notice appear in all copies.
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+ * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ **************************************************************************
+ */
+
/***************************************************************************
* Linux PPP over X - Generic PPP transport layer sockets
- * Linux PPP over Ethernet (PPPoE) Socket Implementation (RFC 2516)
+ * Linux PPP over Ethernet (PPPoE) Socket Implementation (RFC 2516)
*
* This file supplies definitions required by the PPP over Ethernet driver
* (pppox.c). All version information wrt this file is located in pppox.c
@@ -12,6 +28,7 @@
* 2 of the License, or (at your option) any later version.
*
*/
+
#ifndef __LINUX_IF_PPPOX_H
#define __LINUX_IF_PPPOX_H
@@ -42,6 +59,7 @@ struct pptp_opt {
u32 ack_sent, ack_recv;
u32 seq_sent, seq_recv;
int ppp_flags;
+ bool pptp_offload_mode;
};
#include <net/sock.h>
@@ -95,4 +113,45 @@ enum {
PPPOX_DEAD = 16 /* dead, useless, please clean me up!*/
};
+/*
+ * PPPoE Channel specific operations
+ */
+struct pppoe_channel_ops {
+ /* Must be first - general to all PPP channels */
+ struct ppp_channel_ops ops;
+ void (*get_addressing)(struct ppp_channel *, struct pppoe_opt *);
+};
+
+/* PPTP client callback */
+typedef int (*pptp_gre_seq_offload_callback_t)(struct sk_buff *skb,
+ struct net_device *pptp_dev);
+
+/* Return PPPoE channel specific addressing information */
+extern void pppoe_channel_addressing_get(struct ppp_channel *chan,
+ struct pppoe_opt *addressing);
+
+/* Lookup PPTP session info and return PPTP session */
+extern int pptp_session_find(struct pptp_opt *opt, __be16 peer_call_id,
+ __be32 peer_ip_addr);
+
+/* Return PPTP session information given the channel */
+extern void pptp_channel_addressing_get(struct pptp_opt *opt,
+ struct ppp_channel *chan);
+
+/* Enable the PPTP session offload flag */
+extern int pptp_session_enable_offload_mode(__be16 peer_call_id,
+ __be32 peer_ip_addr);
+
+/* Disable the PPTP session offload flag */
+extern int pptp_session_disable_offload_mode(__be16 peer_call_id,
+ __be32 peer_ip_addr);
+
+/* Register the PPTP GRE packets sequence number offload callback */
+extern int
+pptp_register_gre_seq_offload_callback(pptp_gre_seq_offload_callback_t
+ pptp_client_cb);
+
+/* Unregister the PPTP GRE packets sequence number offload callback */
+extern void pptp_unregister_gre_seq_offload_callback(void);
+
#endif /* !(__LINUX_IF_PPPOX_H) */
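
A hedged sketch (not part of the patch) of how an offload engine might use the PPTP exports above: register the GRE sequence-number callback once, then mark a session for offload. The callback body, call-id and peer IP values are placeholders.

#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/if_pppox.h>

static int example_gre_seq_offload(struct sk_buff *skb,
                                   struct net_device *pptp_dev)
{
        /* Stamp/validate GRE sequence numbers on behalf of the stack. */
        return 0;
}

static int example_pptp_offload_setup(__be16 peer_call_id,
                                      __be32 peer_ip_addr)
{
        int ret;

        ret = pptp_register_gre_seq_offload_callback(example_gre_seq_offload);
        if (ret)
                return ret;

        return pptp_session_enable_offload_mode(peer_call_id, peer_ip_addr);
}
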
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -19,6 +19,12 @@
#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
struct socket *tun_get_socket(struct file *);
+#ifdef __KERNEL__
+typedef void (*tun_get_offload_stats_t)(struct net_device *dev,
+ struct rtnl_link_stats64 *stats);
+void tun_register_offload_stats_callback(tun_get_offload_stats_t stats_cb);
+void tun_unregister_offload_stats_callback(void);
+#endif
#else
#include <linux/err.h>
#include <linux/errno.h>
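
A hedged sketch (not part of the patch): registering the tun offload-stats callback added above so packets moved by an offload engine are reflected in the tun device's counters. How the per-device counters are looked up is left out.

#include <linux/netdevice.h>
#include <linux/if_tun.h>

static void example_tun_offload_stats(struct net_device *dev,
                                      struct rtnl_link_stats64 *stats)
{
        /* Fill *stats with the packets/bytes the offload path has
         * forwarded through this tun device.
         */
}

static void example_tun_stats_register(void)
{
        tun_register_offload_stats_callback(example_tun_offload_stats);
}
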
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -108,7 +108,15 @@ struct vlan_pcpu_stats {
extern struct net_device *__vlan_find_dev_deep_rcu(struct net_device *real_dev,
__be16 vlan_proto, u16 vlan_id);
+extern void __vlan_dev_update_accel_stats(struct net_device *dev,
+ struct rtnl_link_stats64 *stats);
+
+extern u16 vlan_dev_get_egress_prio(struct net_device *dev, u32 skb_prio);
+
+extern struct net_device *__vlan_find_dev_deep(struct net_device *real_dev,
+ __be16 vlan_proto, u16 vlan_id);
extern struct net_device *vlan_dev_real_dev(const struct net_device *dev);
+extern struct net_device *vlan_dev_next_dev(const struct net_device *dev);
extern u16 vlan_dev_vlan_id(const struct net_device *dev);
extern __be16 vlan_dev_vlan_proto(const struct net_device *dev);
@@ -204,6 +212,19 @@ static inline int vlan_get_encap_level(struct net_device *dev)
return vlan_dev_priv(dev)->nest_level;
}
#else
+static inline void __vlan_dev_update_accel_stats(struct net_device *dev,
+ struct rtnl_link_stats64 *stats)
+{
+
+}
+
+static inline u16 vlan_dev_get_egress_prio(struct net_device *dev,
+ u32 skb_prio)
+{
+ return 0;
+}
+
+
static inline struct net_device *
__vlan_find_dev_deep_rcu(struct net_device *real_dev,
__be16 vlan_proto, u16 vlan_id)
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -64,6 +64,46 @@ bool vlan_do_receive(struct sk_buff **skbp)
return true;
}
+/* Update the VLAN device with statistics from network offload engines */
+void __vlan_dev_update_accel_stats(struct net_device *dev,
+ struct rtnl_link_stats64 *nlstats)
+{
+ struct vlan_pcpu_stats *stats;
+
+ if (!is_vlan_dev(dev))
+ return;
+
+ stats = per_cpu_ptr(vlan_dev_priv(dev)->vlan_pcpu_stats, 0);
+
+ u64_stats_update_begin(&stats->syncp);
+ stats->rx_packets += nlstats->rx_packets;
+ stats->rx_bytes += nlstats->rx_bytes;
+ stats->tx_packets += nlstats->tx_packets;
+ stats->tx_bytes += nlstats->tx_bytes;
+ u64_stats_update_end(&stats->syncp);
+}
+EXPORT_SYMBOL(__vlan_dev_update_accel_stats);
+
+/* Lookup the 802.1p egress_map table and return the 802.1p value */
+u16 vlan_dev_get_egress_prio(struct net_device *dev, u32 skb_prio)
+{
+ struct vlan_priority_tci_mapping *mp;
+
+ mp = vlan_dev_priv(dev)->egress_priority_map[(skb_prio & 0xf)];
+ while (mp) {
+ if (mp->priority == skb_prio) {
+ /* This should already be shifted
+ * to mask correctly with the
+ * VLAN's TCI
+ */
+ return mp->vlan_qos;
+ }
+ mp = mp->next;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(vlan_dev_get_egress_prio);
+
/* Must be invoked with rcu_read_lock. */
struct net_device *__vlan_find_dev_deep_rcu(struct net_device *dev,
__be16 vlan_proto, u16 vlan_id)
@@ -102,6 +142,12 @@ struct net_device *vlan_dev_real_dev(const struct net_device *dev)
}
EXPORT_SYMBOL(vlan_dev_real_dev);
+struct net_device *vlan_dev_next_dev(const struct net_device *dev)
+{
+ return vlan_dev_priv(dev)->real_dev;
+}
+EXPORT_SYMBOL(vlan_dev_next_dev);
+
u16 vlan_dev_vlan_id(const struct net_device *dev)
{
return vlan_dev_priv(dev)->vlan_id;
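
A hedged usage sketch (not part of the patch): an offload engine periodically folding the traffic it forwarded on behalf of a VLAN device back into the VLAN statistics via __vlan_dev_update_accel_stats(). The delta values are placeholders.

#include <linux/if_vlan.h>

static void example_sync_vlan_stats(struct net_device *vlan_dev,
                                    u64 rx_pkts, u64 rx_bytes,
                                    u64 tx_pkts, u64 tx_bytes)
{
        struct rtnl_link_stats64 stats = { 0 };

        stats.rx_packets = rx_pkts;
        stats.rx_bytes = rx_bytes;
        stats.tx_packets = tx_pkts;
        stats.tx_bytes = tx_bytes;

        __vlan_dev_update_accel_stats(vlan_dev, &stats);
}
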
--- a/net/8021q/vlanproc.c
+++ b/net/8021q/vlanproc.c
@@ -127,6 +127,9 @@ void vlan_proc_cleanup(struct net *net)
{
struct vlan_net *vn = net_generic(net, vlan_net_id);
+ if (IS_ENABLED(CONFIG_PROC_STRIPPED))
+ return;
+
if (vn->proc_vlan_conf)
remove_proc_entry(name_conf, vn->proc_vlan_dir);
@@ -146,6 +149,9 @@ int __net_init vlan_proc_init(struct net *net)
{
struct vlan_net *vn = net_generic(net, vlan_net_id);
+ if (IS_ENABLED(CONFIG_PROC_STRIPPED))
+ return 0;
+
vn->proc_vlan_dir = proc_net_mkdir(net, name_root, net->proc_net);
if (!vn->proc_vlan_dir)
goto err;
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -25,6 +25,12 @@ menuconfig NET
if NET
+config ETHERNET_PACKET_MANGLE
+ bool
+ help
+ This option can be selected by phy drivers that need to mangle
+ packets going in or out of an ethernet device.
+
config WANT_COMPAT_NETLINK_MESSAGES
bool
help
@@ -86,6 +92,9 @@ source "net/netlabel/Kconfig"
endif # if INET
+config SOCK_DIAG
+ bool
+
config NETWORK_SECMARK
bool "Security Marking"
help
@@ -233,6 +242,8 @@ source "net/mpls/Kconfig"
source "net/hsr/Kconfig"
source "net/switchdev/Kconfig"
source "net/l3mdev/Kconfig"
+source "net/rmnet_data/Kconfig"
+source "net/qrtr/Kconfig"
config RPS
bool
@@ -297,6 +308,45 @@ config NET_FLOW_LIMIT
with many clients some protection against DoS by a single (spoofed)
flow that greatly exceeds average workload.
+config SKB_RECYCLER
+ bool "Generic skb recycling"
+ default y
+ ---help---
+ SKB_RECYCLER is used to implement RX-to-RX skb recycling.
+ This config enables the recycling scheme for bridging and
+ routing workloads. It can reduce skbuff freeing or
+ reallocation overhead.
+
+config SKB_RECYCLER_MULTI_CPU
+ bool "Cross-CPU recycling for CPU-locked workloads"
+ depends on SMP && SKB_RECYCLER
+ default n
+
+config SKB_RECYCLER_PREALLOC
+ bool "Enable preallocation of SKBs"
+ depends on SKB_RECYCLER
+ default n
+ ---help---
+ Preallocates SKBs on the recycling lists. The number of
+ preallocated SKBs is configured through
+ CONFIG_SKB_RECYCLE_MAX_PREALLOC_SKBS and requires
+ SKB_RECYCLER to be enabled.
+
+config SKB_RECYCLE_MAX_PREALLOC_SKBS
+ int "Number of SKBs to be preallocated"
+ depends on SKB_RECYCLER_PREALLOC
+ default 16384
+ ---help---
+ Number of SKBs, each of 4K size, to be preallocated for recycling.
+
+config ALLOC_SKB_PAGE_FRAG_DISABLE
+ bool "Disable page fragment based skbuff payload allocations"
+ depends on !SKB_RECYCLER
+ default n
+ ---help---
+ Disable page fragment based allocations for skbuff payloads.
+
menu "Network testing"
config NET_PKTGEN
@@ -383,6 +433,8 @@ config LWTUNNEL
weight tunnel endpoint. Tunnel encapsulation parameters are stored
with light weight tunnel state associated with fib routes.
+source "drivers/soc/qcom/ipc_router/Kconfig"
+
endif # if NET
# Used by archs to tell that they support BPF_JIT
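
ETHERNET_PACKET_MANGLE is a hidden bool, so a driver opts in with select. A hypothetical Kconfig entry (driver name illustrative) could look like:

config EXAMPLE_ETH_DRIVER
        tristate "Example Ethernet driver that rewrites frames"
        select ETHERNET_PACKET_MANGLE
        help
          Illustrative only: selecting ETHERNET_PACKET_MANGLE lets the
          driver install the eth_mangle_tx handler consulted in
          net/core/dev.c below.
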
--- a/net/Makefile
+++ b/net/Makefile
@@ -5,6 +5,8 @@
# Rewritten to use lists instead of if-statements.
#
+KBUILD_CFLAGS_KERNEL := $(filter-out -Werror, $(KBUILD_CFLAGS_KERNEL))
+
obj-$(CONFIG_NET) := socket.o core/
tmp-$(CONFIG_COMPAT) := compat.o
@@ -77,3 +79,5 @@ endif
ifneq ($(CONFIG_NET_L3_MASTER_DEV),)
obj-y += l3mdev/
endif
+obj-$(CONFIG_RMNET_DATA) += rmnet_data/
+obj-$(CONFIG_QRTR) += qrtr/
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -431,7 +431,7 @@ static void hidp_del_timer(struct hidp_session *session)
}
static void hidp_process_report(struct hidp_session *session,
- int type, const u8 *data, int len, int intr)
+ int type, const u8 *data, unsigned int len, int intr)
{
if (len > HID_MAX_BUFFER_SIZE)
len = HID_MAX_BUFFER_SIZE;
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -266,6 +266,10 @@ static void __exit br_deinit(void)
br_fdb_fini();
}
+/* Hook for bridge event notifications */
+br_notify_hook_t __rcu *br_notify_hook __read_mostly;
+EXPORT_SYMBOL_GPL(br_notify_hook);
+
module_init(br_init)
module_exit(br_deinit)
MODULE_LICENSE("GPL");
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -40,6 +40,8 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats);
const struct nf_br_ops *nf_ops;
u16 vid = 0;
+ struct net_bridge_port *pdst;
+ br_get_dst_hook_t *get_dst_hook;
rcu_read_lock();
nf_ops = rcu_dereference(nf_br_ops);
@@ -61,9 +63,16 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid))
goto out;
+ get_dst_hook = rcu_dereference(br_get_dst_hook);
+
if (is_broadcast_ether_addr(dest))
br_flood_deliver(br, skb, false);
else if (is_multicast_ether_addr(dest)) {
+ br_multicast_handle_hook_t *multicast_handle_hook =
+ rcu_dereference(br_multicast_handle_hook);
+ if (!__br_get(multicast_handle_hook, true, NULL, skb))
+ goto out;
+
if (unlikely(netpoll_tx_running(dev))) {
br_flood_deliver(br, skb, false);
goto out;
@@ -79,10 +88,20 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
br_multicast_deliver(mdst, skb);
else
br_flood_deliver(br, skb, false);
- } else if ((dst = __br_fdb_get(br, dest, vid)) != NULL)
- br_deliver(dst->dst, skb);
- else
- br_flood_deliver(br, skb, true);
+ } else {
+ pdst = __br_get(get_dst_hook, NULL, NULL, &skb);
+ if (pdst) {
+ if (!skb)
+ goto out;
+ br_deliver(pdst, skb);
+ } else {
+ dst = __br_fdb_get(br, dest, vid);
+ if (dst)
+ br_deliver(dst->dst, skb);
+ else
+ br_flood_deliver(br, skb, true);
+ }
+ }
out:
rcu_read_unlock();
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -38,6 +38,20 @@ static void fdb_notify(struct net_bridge *br,
static u32 fdb_salt __read_mostly;
+ATOMIC_NOTIFIER_HEAD(br_fdb_notifier_list);
+
+void br_fdb_register_notify(struct notifier_block *nb)
+{
+ atomic_notifier_chain_register(&br_fdb_notifier_list, nb);
+}
+EXPORT_SYMBOL_GPL(br_fdb_register_notify);
+
+void br_fdb_unregister_notify(struct notifier_block *nb)
+{
+ atomic_notifier_chain_unregister(&br_fdb_notifier_list, nb);
+}
+EXPORT_SYMBOL_GPL(br_fdb_unregister_notify);
+
int __init br_fdb_init(void)
{
br_fdb_cache = kmem_cache_create("bridge_fdb_cache",
@@ -289,12 +303,27 @@ out:
spin_unlock_bh(&br->hash_lock);
}
+ATOMIC_NOTIFIER_HEAD(br_fdb_update_notifier_list);
+
+void br_fdb_update_register_notify(struct notifier_block *nb)
+{
+ atomic_notifier_chain_register(&br_fdb_update_notifier_list, nb);
+}
+EXPORT_SYMBOL_GPL(br_fdb_update_register_notify);
+
+void br_fdb_update_unregister_notify(struct notifier_block *nb)
+{
+ atomic_notifier_chain_unregister(&br_fdb_update_notifier_list, nb);
+}
+EXPORT_SYMBOL_GPL(br_fdb_update_unregister_notify);
+
void br_fdb_cleanup(unsigned long _data)
{
struct net_bridge *br = (struct net_bridge *)_data;
unsigned long delay = hold_time(br);
unsigned long next_timer = jiffies + br->ageing_time;
int i;
+ struct br_fdb_event fdb_event;
spin_lock(&br->hash_lock);
for (i = 0; i < BR_HASH_SIZE; i++) {
@@ -308,10 +337,16 @@ void br_fdb_cleanup(unsigned long _data)
if (f->added_by_external_learn)
continue;
this_timer = f->updated + delay;
- if (time_before_eq(this_timer, jiffies))
+ if (time_before_eq(this_timer, jiffies)) {
+ memset(&fdb_event, 0, sizeof(fdb_event));
+ ether_addr_copy(fdb_event.addr, f->addr.addr);
fdb_delete(br, f);
- else if (time_before(this_timer, next_timer))
+ atomic_notifier_call_chain(
+ &br_fdb_update_notifier_list, 0,
+ (void *)&fdb_event);
+ } else if (time_before(this_timer, next_timer)) {
next_timer = this_timer;
+ }
}
}
spin_unlock(&br->hash_lock);
@@ -389,6 +424,7 @@ struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br,
return NULL;
}
+EXPORT_SYMBOL_GPL(__br_fdb_get);
#if IS_ENABLED(CONFIG_ATM_LANE)
/* Interface used by ATM LANE hook to test
@@ -561,12 +597,21 @@ int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
return ret;
}
+/* Get the bridge device */
+struct net_device *br_fdb_bridge_dev_get_and_hold(struct net_bridge *br)
+{
+ dev_hold(br->dev);
+ return br->dev;
+}
+EXPORT_SYMBOL_GPL(br_fdb_bridge_dev_get_and_hold);
+
void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
const unsigned char *addr, u16 vid, bool added_by_user)
{
struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
struct net_bridge_fdb_entry *fdb;
bool fdb_modified = false;
+ struct br_fdb_event fdb_event;
/* some users want to always flood. */
if (hold_time(br) == 0)
@@ -588,8 +633,16 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
} else {
/* fastpath: update of existing entry */
if (unlikely(source != fdb->dst)) {
+ ether_addr_copy(fdb_event.addr, addr);
+ fdb_event.br = br;
+ fdb_event.orig_dev = fdb->dst->dev;
+ fdb_event.dev = source->dev;
fdb->dst = source;
fdb_modified = true;
+
+ atomic_notifier_call_chain(
+ &br_fdb_update_notifier_list,
+ 0, (void *)&fdb_event);
}
fdb->updated = jiffies;
if (unlikely(added_by_user))
@@ -614,8 +667,46 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
}
}
+/* Refresh FDB entries for bridge packets being forwarded by offload engines */
+void br_refresh_fdb_entry(struct net_device *dev, const char *addr)
+{
+ struct net_bridge_port *p = br_port_get_rcu(dev);
+
+ if (!p || p->state == BR_STATE_DISABLED)
+ return;
+
+ if (!is_valid_ether_addr(addr)) {
+ pr_info("bridge: Attempt to refresh with invalid ether address %pM\n",
+ addr);
+ return;
+ }
+
+ rcu_read_lock();
+ br_fdb_update(p->br, p, addr, 0, true);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(br_refresh_fdb_entry);
+
+/* Look up the MAC address in the device's bridge fdb table */
+struct net_bridge_fdb_entry *br_fdb_has_entry(struct net_device *dev,
+ const char *addr, __u16 vid)
+{
+ struct net_bridge_port *p = br_port_get_rcu(dev);
+ struct net_bridge_fdb_entry *fdb;
+
+ if (!p || p->state == BR_STATE_DISABLED)
+ return NULL;
+
+ rcu_read_lock();
+ fdb = fdb_find_rcu(&p->br->hash[br_mac_hash(addr, vid)], addr, vid);
+ rcu_read_unlock();
+
+ return fdb;
+}
+EXPORT_SYMBOL_GPL(br_fdb_has_entry);
+
static int fdb_to_nud(const struct net_bridge *br,
- const struct net_bridge_fdb_entry *fdb)
+ const struct net_bridge_fdb_entry *fdb)
{
if (fdb->is_local)
return NUD_PERMANENT;
@@ -687,6 +778,23 @@ static void fdb_notify(struct net_bridge *br,
struct sk_buff *skb;
int err = -ENOBUFS;
+ if (fdb->dst) {
+ int event;
+ struct br_fdb_event fdb_event;
+
+ if (type == RTM_NEWNEIGH)
+ event = BR_FDB_EVENT_ADD;
+ else
+ event = BR_FDB_EVENT_DEL;
+
+ fdb_event.dev = fdb->dst->dev;
+ ether_addr_copy(fdb_event.addr, fdb->addr.addr);
+ fdb_event.is_local = fdb->is_local;
+ atomic_notifier_call_chain(&br_fdb_notifier_list,
+ event,
+ (void *)&fdb_event);
+ }
+
skb = nlmsg_new(fdb_nlmsg_size(), GFP_ATOMIC);
if (skb == NULL)
goto errout;
@@ -698,6 +806,7 @@ static void fdb_notify(struct net_bridge *br,
kfree_skb(skb);
goto errout;
}
+ __br_notify(RTNLGRP_NEIGH, type, fdb);
rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
return;
errout:
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -33,7 +33,8 @@ static inline int should_deliver(const struct net_bridge_port *p,
struct net_bridge_vlan_group *vg;
vg = nbp_vlan_group_rcu(p);
- return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) &&
+ return ((skb->dev != p->dev) || ((p->flags & BR_HAIRPIN_MODE) &&
+ (!is_multicast_ether_addr(eth_hdr(skb)->h_dest)))) &&
br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING;
}
@@ -69,7 +70,7 @@ EXPORT_SYMBOL_GPL(br_dev_queue_push_xmit);
int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING,
+ return BR_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING,
net, sk, skb, NULL, skb->dev,
br_dev_queue_push_xmit);
@@ -97,7 +98,7 @@ static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
return;
}
- NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT,
+ BR_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT,
dev_net(skb->dev), NULL, skb,NULL, skb->dev,
br_forward_finish);
}
@@ -121,7 +122,7 @@ static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
skb->dev = to->dev;
skb_forward_csum(skb);
- NF_HOOK(NFPROTO_BRIDGE, NF_BR_FORWARD,
+ BR_HOOK(NFPROTO_BRIDGE, NF_BR_FORWARD,
dev_net(indev), NULL, skb, indev, skb->dev,
br_forward_finish);
}
@@ -136,12 +137,11 @@ void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
kfree_skb(skb);
}
-EXPORT_SYMBOL_GPL(br_deliver);
/* called with rcu_read_lock */
void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, struct sk_buff *skb0)
{
- if (to && should_deliver(to, skb)) {
+ if (to && should_deliver(to, skb) && !(to->flags & BR_ISOLATE_MODE)) {
if (skb0)
deliver_clone(to, skb, __br_forward);
else
@@ -192,12 +192,40 @@ out:
return p;
}
+static struct net_bridge_port *maybe_deliver_addr(
+ struct net_bridge_port *prev, struct net_bridge_port *p,
+ struct sk_buff *skb, const unsigned char *addr,
+ void (*__packet_hook)(const struct net_bridge_port *p,
+ struct sk_buff *skb))
+{
+ struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev;
+ const unsigned char *src = eth_hdr(skb)->h_source;
+
+ if (!should_deliver(p, skb))
+ return prev;
+
+ /* Even with hairpin, no soliloquies - prevent breaking IPv6 DAD */
+ if (skb->dev == p->dev && ether_addr_equal(src, addr))
+ return prev;
+
+ skb = skb_copy(skb, GFP_ATOMIC);
+ if (!skb) {
+ dev->stats.tx_dropped++;
+ return prev;
+ }
+
+ memcpy(eth_hdr(skb)->h_dest, addr, ETH_ALEN);
+ __packet_hook(p, skb);
+
+ return prev;
+}
+
/* called under bridge lock */
static void br_flood(struct net_bridge *br, struct sk_buff *skb,
struct sk_buff *skb0,
void (*__packet_hook)(const struct net_bridge_port *p,
struct sk_buff *skb),
- bool unicast)
+ bool unicast, bool forward)
{
struct net_bridge_port *p;
struct net_bridge_port *prev;
@@ -205,6 +233,8 @@ static void br_flood(struct net_bridge *br, struct sk_buff *skb,
prev = NULL;
list_for_each_entry_rcu(p, &br->port_list, list) {
+ if (forward && (p->flags & BR_ISOLATE_MODE))
+ continue;
/* Do not flood unicast traffic to ports that turn it off */
if (unicast && !(p->flags & BR_FLOOD))
continue;
@@ -239,15 +269,17 @@ out:
/* called with rcu_read_lock */
void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb, bool unicast)
{
- br_flood(br, skb, NULL, __br_deliver, unicast);
+ br_flood(br, skb, NULL, __br_deliver, unicast, false);
}
+EXPORT_SYMBOL_GPL(br_deliver);
/* called under bridge lock */
void br_flood_forward(struct net_bridge *br, struct sk_buff *skb,
struct sk_buff *skb2, bool unicast)
{
- br_flood(br, skb, skb2, __br_forward, unicast);
+ br_flood(br, skb, skb2, __br_forward, unicast, true);
}
+EXPORT_SYMBOL_GPL(br_forward);
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
/* called with rcu_read_lock */
@@ -262,6 +294,7 @@ static void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
struct net_bridge_port *prev = NULL;
struct net_bridge_port_group *p;
struct hlist_node *rp;
+ const unsigned char *addr;
rp = rcu_dereference(hlist_first_rcu(&br->router_list));
p = mdst ? rcu_dereference(mdst->ports) : NULL;
@@ -272,10 +305,19 @@ static void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
rport = rp ? hlist_entry(rp, struct net_bridge_port, rlist) :
NULL;
- port = (unsigned long)lport > (unsigned long)rport ?
- lport : rport;
+ if ((unsigned long)lport > (unsigned long)rport) {
+ port = lport;
+ addr = p->unicast ? p->eth_addr : NULL;
+ } else {
+ port = rport;
+ addr = NULL;
+ }
- prev = maybe_deliver(prev, port, skb, __packet_hook);
+ if (addr)
+ prev = maybe_deliver_addr(prev, port, skb, addr,
+ __packet_hook);
+ else
+ prev = maybe_deliver(prev, port, skb, __packet_hook);
if (IS_ERR(prev))
goto out;
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -1,3 +1,9 @@
+/*
+ **************************************************************************
+ * Copyright (c) 2015-2016, The Linux Foundation. All rights reserved.
+ **************************************************************************
+ */
+
/*
* Userspace interface
* Linux ethernet bridge
@@ -28,6 +34,10 @@
#include "br_private.h"
+/* Hook for external forwarding logic */
+br_port_dev_get_hook_t __rcu *br_port_dev_get_hook __read_mostly;
+EXPORT_SYMBOL_GPL(br_port_dev_get_hook);
+
/*
* Determine initial path cost based on speed.
* using recommendations from 802.1d standard
@@ -456,8 +466,8 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit)
return -ELOOP;
- /* Device is already being bridged */
- if (br_port_exists(dev))
+ /* Device has master upper dev */
+ if (netdev_master_upper_dev_get(dev))
return -EBUSY;
/* No bridging devices that dislike that (e.g. wireless) */
@@ -530,6 +540,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
dev_set_mtu(br->dev, br_min_mtu(br));
kobject_uevent(&p->kobj, KOBJ_ADD);
+ call_netdevice_notifiers(NETDEV_BR_JOIN, dev);
return 0;
@@ -561,6 +572,8 @@ int br_del_if(struct net_bridge *br, struct net_device *dev)
if (!p || p->br != br)
return -EINVAL;
+ call_netdevice_notifiers(NETDEV_BR_LEAVE, dev);
+
/* Since more than one interface can be attached to a bridge,
* there still maybe an alternate path for netconsole to use;
* therefore there is no reason for a NETDEV_RELEASE event.
@@ -588,3 +601,86 @@ void br_port_flags_change(struct net_bridge_port *p, unsigned long mask)
if (mask & BR_AUTO_MASK)
nbp_update_port_count(br);
}
+
+/* br_port_dev_get()
+ * If a skb is provided, and the br_port_dev_get_hook_t hook exists,
+ * use that to try and determine the egress port for that skb.
+ * If not, or if no egress port could be determined, use the given addr
+ * to identify the port via which it is reachable,
+ * returning a reference to the net device associated with that port.
+ *
+ * NOTE: Return NULL if given dev is not a bridge or the mac has no
+ * associated port.
+ */
+struct net_device *br_port_dev_get(struct net_device *dev, unsigned char *addr,
+ struct sk_buff *skb,
+ unsigned int cookie)
+{
+ struct net_bridge_fdb_entry *fdbe;
+ struct net_bridge *br;
+ struct net_device *netdev = NULL;
+
+ /* Is this a bridge? */
+ if (!(dev->priv_flags & IFF_EBRIDGE))
+ return NULL;
+
+ rcu_read_lock();
+
+ /* If the hook exists and the skb isn't NULL, try and get the port */
+ if (skb) {
+ br_port_dev_get_hook_t *port_dev_get_hook;
+
+ port_dev_get_hook = rcu_dereference(br_port_dev_get_hook);
+ if (port_dev_get_hook) {
+ struct net_bridge_port *pdst =
+ __br_get(port_dev_get_hook, NULL, dev, skb,
+ addr, cookie);
+ if (pdst) {
+ dev_hold(pdst->dev);
+ netdev = pdst->dev;
+ goto out;
+ }
+ }
+ }
+
+ /* Either there is no hook, or it could not
+ * determine the port to use - fall back to the FDB
+ */
+
+ br = netdev_priv(dev);
+
+ /* Lookup the fdb entry and get reference to the port dev */
+ fdbe = __br_fdb_get(br, addr, 0);
+ if (fdbe && fdbe->dst) {
+ netdev = fdbe->dst->dev; /* port device */
+ dev_hold(netdev);
+ }
+out:
+ rcu_read_unlock();
+ return netdev;
+}
+EXPORT_SYMBOL_GPL(br_port_dev_get);
+
+/* Update bridge statistics for bridge packets processed by offload engines */
+void br_dev_update_stats(struct net_device *dev,
+ struct rtnl_link_stats64 *nlstats)
+{
+ struct net_bridge *br;
+ struct pcpu_sw_netstats *stats;
+
+ /* Is this a bridge? */
+ if (!(dev->priv_flags & IFF_EBRIDGE))
+ return;
+
+ br = netdev_priv(dev);
+ stats = per_cpu_ptr(br->stats, 0);
+
+ u64_stats_update_begin(&stats->syncp);
+ stats->rx_packets += nlstats->rx_packets;
+ stats->rx_bytes += nlstats->rx_bytes;
+ stats->tx_packets += nlstats->tx_packets;
+ stats->tx_bytes += nlstats->tx_bytes;
+ u64_stats_update_end(&stats->syncp);
+}
+EXPORT_SYMBOL_GPL(br_dev_update_stats);
+
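
A hedged sketch (not part of the patch) of an offload engine using the two exports above: resolve the egress port behind a bridge for a destination MAC (the returned device carries a reference that must be released with dev_put()), and credit forwarded traffic back to the bridge device. The cookie value is a placeholder.

#include <linux/netdevice.h>
#include <linux/if_bridge.h>

static struct net_device *example_resolve_bridge_port(struct net_device *br_dev,
                                                      unsigned char *dest_mac,
                                                      struct sk_buff *skb)
{
        /* Returns the port netdev with a reference held, or NULL. */
        return br_port_dev_get(br_dev, dest_mac, skb, 0);
}

static void example_credit_bridge(struct net_device *br_dev,
                                  struct rtnl_link_stats64 *delta)
{
        br_dev_update_stats(br_dev, delta);
}
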
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -33,7 +33,15 @@ br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb)
return netif_receive_skb(skb);
}
-static int br_pass_frame_up(struct sk_buff *skb)
+/* Hook for external Multicast handler */
+br_multicast_handle_hook_t __rcu *br_multicast_handle_hook __read_mostly;
+EXPORT_SYMBOL_GPL(br_multicast_handle_hook);
+
+/* Hook for external forwarding logic */
+br_get_dst_hook_t __rcu *br_get_dst_hook __read_mostly;
+EXPORT_SYMBOL_GPL(br_get_dst_hook);
+
+int br_pass_frame_up(struct sk_buff *skb)
{
struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev;
struct net_bridge *br = netdev_priv(brdev);
@@ -62,10 +70,11 @@ static int br_pass_frame_up(struct sk_buff *skb)
if (!skb)
return NET_RX_DROP;
- return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN,
+ return BR_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN,
dev_net(indev), NULL, skb, indev, NULL,
br_netif_receive_skb);
}
+EXPORT_SYMBOL_GPL(br_pass_frame_up);
static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br,
u16 vid, struct net_bridge_port *p)
@@ -135,6 +144,8 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
struct net_bridge_fdb_entry *dst;
struct net_bridge_mdb_entry *mdst;
struct sk_buff *skb2;
+ struct net_bridge_port *pdst = NULL;
+ br_get_dst_hook_t *get_dst_hook = rcu_dereference(br_get_dst_hook);
bool unicast = true;
u16 vid = 0;
@@ -153,7 +164,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
br_multicast_rcv(br, p, skb, vid))
goto drop;
- if (p->state == BR_STATE_LEARNING)
+ if ((p->state == BR_STATE_LEARNING) && skb->protocol != htons(ETH_P_PAE))
goto drop;
BR_INPUT_SKB_CB(skb)->brdev = br->dev;
@@ -169,10 +180,19 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
if (IS_ENABLED(CONFIG_INET) && skb->protocol == htons(ETH_P_ARP))
br_do_proxy_arp(skb, br, vid, p);
- if (is_broadcast_ether_addr(dest)) {
+ if (skb->protocol == htons(ETH_P_PAE)) {
+ skb2 = skb;
+ /* Do not forward 802.1x/EAP frames */
+ skb = NULL;
+ } else if (is_broadcast_ether_addr(dest)) {
skb2 = skb;
unicast = false;
} else if (is_multicast_ether_addr(dest)) {
+ br_multicast_handle_hook_t *multicast_handle_hook =
+ rcu_dereference(br_multicast_handle_hook);
+ if (!__br_get(multicast_handle_hook, true, p, skb))
+ goto out;
+
mdst = br_mdb_get(br, skb, vid);
if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
br_multicast_querier_exists(br, eth_hdr(skb))) {
@@ -188,18 +208,31 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
unicast = false;
br->dev->stats.multicast++;
- } else if ((dst = __br_fdb_get(br, dest, vid)) &&
- dst->is_local) {
- skb2 = skb;
- /* Do not forward the packet since it's local. */
- skb = NULL;
+ } else {
+ pdst = __br_get(get_dst_hook, NULL, p, &skb);
+ if (pdst) {
+ if (!skb)
+ goto out;
+ } else {
+ dst = __br_fdb_get(br, dest, vid);
+ if ((p->flags & BR_ISOLATE_MODE) ||
+ (dst && dst->is_local)) {
+ skb2 = skb;
+ /* Do not forward the packet since it's local.*/
+ skb = NULL;
+ }
+ }
}
if (skb) {
if (dst) {
dst->used = jiffies;
- br_forward(dst->dst, skb, skb2);
- } else
+ pdst = dst->dst;
+ }
+
+ if (pdst)
+ br_forward(pdst, skb, skb2);
+ else
br_flood_forward(br, skb, skb2, unicast);
}
@@ -218,11 +251,13 @@ EXPORT_SYMBOL_GPL(br_handle_frame_finish);
static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_bridge_port *p = br_port_get_rcu(skb->dev);
- u16 vid = 0;
+ if (p->state != BR_STATE_DISABLED) {
+ u16 vid = 0;
- /* check if vlan is allowed, to avoid spoofing */
- if (p->flags & BR_LEARNING && br_should_learn(p, skb, &vid))
- br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, false);
+ /* check if vlan is allowed, to avoid spoofing */
+ if (p->flags & BR_LEARNING && br_should_learn(p, skb, &vid))
+ br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, false);
+ }
return 0; /* process further */
}
@@ -285,7 +320,7 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
}
/* Deliver packet to local host only */
- if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN,
+ if (BR_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN,
dev_net(skb->dev), NULL, skb, skb->dev, NULL,
br_handle_local_finish)) {
return RX_HANDLER_CONSUMED; /* consumed by filter */
@@ -297,6 +332,21 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
forward:
switch (p->state) {
+ case BR_STATE_DISABLED:
+ if (skb->protocol == htons(ETH_P_PAE)) {
+ if (ether_addr_equal(p->br->dev->dev_addr, dest))
+ skb->pkt_type = PACKET_HOST;
+
+ if (BR_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, dev_net(skb->dev), NULL,
+ skb, skb->dev, NULL, br_handle_local_finish))
+ break;
+
+ BR_INPUT_SKB_CB(skb)->brdev = p->br->dev;
+ br_pass_frame_up(skb);
+ break;
+ }
+ goto drop;
+
case BR_STATE_FORWARDING:
rhook = rcu_dereference(br_should_route_hook);
if (rhook) {
@@ -311,7 +361,7 @@ forward:
if (ether_addr_equal(p->br->dev->dev_addr, dest))
skb->pkt_type = PACKET_HOST;
- NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING,
+ BR_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING,
dev_net(skb->dev), NULL, skb, skb->dev, NULL,
br_handle_frame_finish);
break;
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -416,7 +416,7 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
break;
}
- p = br_multicast_new_port_group(port, group, *pp, state);
+ p = br_multicast_new_port_group(port, group, *pp, state, NULL);
if (unlikely(!p))
return -ENOMEM;
rcu_assign_pointer(*pp, p);
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -30,6 +30,7 @@
#include <net/ipv6.h>
#include <net/mld.h>
#include <net/ip6_checksum.h>
+#include <net/ip6_route.h>
#include <net/addrconf.h>
#endif
@@ -42,12 +43,13 @@ static void br_multicast_add_router(struct net_bridge *br,
static void br_ip4_multicast_leave_group(struct net_bridge *br,
struct net_bridge_port *port,
__be32 group,
- __u16 vid);
+ __u16 vid,
+ const unsigned char *src);
#if IS_ENABLED(CONFIG_IPV6)
static void br_ip6_multicast_leave_group(struct net_bridge *br,
struct net_bridge_port *port,
const struct in6_addr *group,
- __u16 vid);
+ __u16 vid, const unsigned char *src);
#endif
unsigned int br_mdb_rehash_seq;
@@ -652,7 +654,8 @@ struct net_bridge_port_group *br_multicast_new_port_group(
struct net_bridge_port *port,
struct br_ip *group,
struct net_bridge_port_group __rcu *next,
- unsigned char state)
+ unsigned char state,
+ const unsigned char *src)
{
struct net_bridge_port_group *p;
@@ -667,12 +670,33 @@ struct net_bridge_port_group *br_multicast_new_port_group(
hlist_add_head(&p->mglist, &port->mglist);
setup_timer(&p->timer, br_multicast_port_group_expired,
(unsigned long)p);
+ if ((port->flags & BR_MULTICAST_TO_UCAST) && src) {
+ memcpy(p->eth_addr, src, ETH_ALEN);
+ p->unicast = true;
+ }
return p;
}
+static bool br_port_group_equal(struct net_bridge_port_group *p,
+ struct net_bridge_port *port,
+ const unsigned char *src)
+{
+ if (p->port != port)
+ return false;
+
+ if (!p->unicast)
+ return true;
+
+ if (!src)
+ return false;
+
+ return ether_addr_equal(src, p->eth_addr);
+}
+
static int br_multicast_add_group(struct net_bridge *br,
struct net_bridge_port *port,
- struct br_ip *group)
+ struct br_ip *group,
+ const unsigned char *src)
{
struct net_bridge_mdb_entry *mp;
struct net_bridge_port_group *p;
@@ -699,13 +723,13 @@ static int br_multicast_add_group(struct net_bridge *br,
for (pp = &mp->ports;
(p = mlock_dereference(*pp, br)) != NULL;
pp = &p->next) {
- if (p->port == port)
+ if (br_port_group_equal(p, port, src))
goto found;
if ((unsigned long)p->port < (unsigned long)port)
break;
}
- p = br_multicast_new_port_group(port, group, *pp, MDB_TEMPORARY);
+ p = br_multicast_new_port_group(port, group, *pp, MDB_TEMPORARY, src);
if (unlikely(!p))
goto err;
rcu_assign_pointer(*pp, p);
@@ -724,7 +748,7 @@ err:
static int br_ip4_multicast_add_group(struct net_bridge *br,
struct net_bridge_port *port,
__be32 group,
- __u16 vid)
+ __u16 vid, const unsigned char *src)
{
struct br_ip br_group;
@@ -735,14 +759,14 @@ static int br_ip4_multicast_add_group(struct net_bridge *br,
br_group.proto = htons(ETH_P_IP);
br_group.vid = vid;
- return br_multicast_add_group(br, port, &br_group);
+ return br_multicast_add_group(br, port, &br_group, src);
}
#if IS_ENABLED(CONFIG_IPV6)
static int br_ip6_multicast_add_group(struct net_bridge *br,
struct net_bridge_port *port,
const struct in6_addr *group,
- __u16 vid)
+ __u16 vid, const unsigned char *src)
{
struct br_ip br_group;
@@ -753,7 +777,7 @@ static int br_ip6_multicast_add_group(struct net_bridge *br,
br_group.proto = htons(ETH_P_IPV6);
br_group.vid = vid;
- return br_multicast_add_group(br, port, &br_group);
+ return br_multicast_add_group(br, port, &br_group, src);
}
#endif
@@ -832,7 +856,7 @@ static void __br_multicast_send_query(struct net_bridge *br,
if (port) {
skb->dev = port->dev;
- NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT,
+ BR_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT,
dev_net(port->dev), NULL, skb, NULL, skb->dev,
br_dev_queue_push_xmit);
} else {
@@ -1003,6 +1027,7 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
struct sk_buff *skb,
u16 vid)
{
+ const unsigned char *src = eth_hdr(skb)->h_source;
struct igmpv3_report *ih;
struct igmpv3_grec *grec;
int i;
@@ -1046,9 +1071,9 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
if ((type == IGMPV3_CHANGE_TO_INCLUDE ||
type == IGMPV3_MODE_IS_INCLUDE) &&
ntohs(grec->grec_nsrcs) == 0) {
- br_ip4_multicast_leave_group(br, port, group, vid);
+ br_ip4_multicast_leave_group(br, port, group, vid, src);
} else {
- err = br_ip4_multicast_add_group(br, port, group, vid);
+ err = br_ip4_multicast_add_group(br, port, group, vid, src);
if (err)
break;
}
@@ -1063,6 +1088,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
struct sk_buff *skb,
u16 vid)
{
+ const unsigned char *src = eth_hdr(skb)->h_source;
struct icmp6hdr *icmp6h;
struct mld2_grec *grec;
int i;
@@ -1114,10 +1140,10 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
grec->grec_type == MLD2_MODE_IS_INCLUDE) &&
ntohs(*nsrcs) == 0) {
br_ip6_multicast_leave_group(br, port, &grec->grec_mca,
- vid);
+ vid, src);
} else {
err = br_ip6_multicast_add_group(br, port,
- &grec->grec_mca, vid);
+ &grec->grec_mca, vid, src);
if (err)
break;
}
@@ -1432,7 +1458,8 @@ br_multicast_leave_group(struct net_bridge *br,
struct net_bridge_port *port,
struct br_ip *group,
struct bridge_mcast_other_query *other_query,
- struct bridge_mcast_own_query *own_query)
+ struct bridge_mcast_own_query *own_query,
+ const unsigned char *src)
{
struct net_bridge_mdb_htable *mdb;
struct net_bridge_mdb_entry *mp;
@@ -1456,7 +1483,7 @@ br_multicast_leave_group(struct net_bridge *br,
for (pp = &mp->ports;
(p = mlock_dereference(*pp, br)) != NULL;
pp = &p->next) {
- if (p->port != port)
+ if (!br_port_group_equal(p, port, src))
continue;
rcu_assign_pointer(*pp, p->next);
@@ -1519,7 +1546,7 @@ br_multicast_leave_group(struct net_bridge *br,
for (p = mlock_dereference(mp->ports, br);
p != NULL;
p = mlock_dereference(p->next, br)) {
- if (p->port != port)
+ if (!br_port_group_equal(p, port, src))
continue;
if (!hlist_unhashed(&p->mglist) &&
@@ -1537,8 +1564,8 @@ out:
static void br_ip4_multicast_leave_group(struct net_bridge *br,
struct net_bridge_port *port,
- __be32 group,
- __u16 vid)
+ __be32 group, __u16 vid,
+ const unsigned char *src)
{
struct br_ip br_group;
struct bridge_mcast_own_query *own_query;
@@ -1553,14 +1580,14 @@ static void br_ip4_multicast_leave_group(struct net_bridge *br,
br_group.vid = vid;
br_multicast_leave_group(br, port, &br_group, &br->ip4_other_query,
- own_query);
+ own_query, src);
}
#if IS_ENABLED(CONFIG_IPV6)
static void br_ip6_multicast_leave_group(struct net_bridge *br,
struct net_bridge_port *port,
const struct in6_addr *group,
- __u16 vid)
+ __u16 vid, const unsigned char *src)
{
struct br_ip br_group;
struct bridge_mcast_own_query *own_query;
@@ -1575,7 +1602,7 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br,
br_group.vid = vid;
br_multicast_leave_group(br, port, &br_group, &br->ip6_other_query,
- own_query);
+ own_query, src);
}
#endif
@@ -1584,6 +1611,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
struct sk_buff *skb,
u16 vid)
{
+ const unsigned char *src;
struct sk_buff *skb_trimmed = NULL;
struct igmphdr *ih;
int err;
@@ -1600,12 +1628,13 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
BR_INPUT_SKB_CB(skb)->igmp = 1;
ih = igmp_hdr(skb);
+ src = eth_hdr(skb)->h_source;
switch (ih->type) {
case IGMP_HOST_MEMBERSHIP_REPORT:
case IGMPV2_HOST_MEMBERSHIP_REPORT:
BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
- err = br_ip4_multicast_add_group(br, port, ih->group, vid);
+ err = br_ip4_multicast_add_group(br, port, ih->group, vid, src);
break;
case IGMPV3_HOST_MEMBERSHIP_REPORT:
err = br_ip4_multicast_igmp3_report(br, port, skb_trimmed, vid);
@@ -1614,7 +1643,7 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
err = br_ip4_multicast_query(br, port, skb_trimmed, vid);
break;
case IGMP_HOST_LEAVE_MESSAGE:
- br_ip4_multicast_leave_group(br, port, ih->group, vid);
+ br_ip4_multicast_leave_group(br, port, ih->group, vid, src);
break;
}
@@ -1625,11 +1654,268 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
}
#if IS_ENABLED(CONFIG_IPV6)
+static int br_ndisc_send_na_finish(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ return dev_queue_xmit(skb);
+}
+
+static int br_ndisc_send_na(struct net_device *dev,
+ const struct in6_addr *daddr,
+ const struct in6_addr *solicited_addr,
+ const u8 *target_lladdr, bool solicited,
+ bool override, const u8 *dest_hw)
+{
+ struct sk_buff *skb;
+ struct nd_msg *msg;
+ int hlen = LL_RESERVED_SPACE(dev);
+ int tlen = dev->needed_tailroom;
+ struct dst_entry *dst;
+ struct net *net = dev_net(dev);
+ struct sock *sk = net->ipv6.ndisc_sk;
+ struct inet6_dev *idev;
+ int err;
+ struct ipv6hdr *hdr;
+ struct icmp6hdr *icmp6h;
+ u8 type;
+ const struct in6_addr *saddr = solicited_addr;
+ int pad, data_len, space;
+ u8 *opt;
+
+ skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + sizeof(*msg) +
+ ndisc_opt_addr_space(dev) + tlen, GFP_ATOMIC);
+ if (!skb)
+ return -ENOMEM;
+
+ skb->protocol = htons(ETH_P_IPV6);
+ skb->dev = dev;
+
+ skb_reserve(skb, hlen + sizeof(struct ipv6hdr));
+ skb_reset_transport_header(skb);
+
+ /* Manually assign socket ownership as we avoid calling
+ * sock_alloc_send_pskb() to bypass wmem buffer limits
+ */
+ skb_set_owner_w(skb, sk);
+
+ msg = (struct nd_msg *)skb_put(skb, sizeof(*msg));
+ *msg = (struct nd_msg) {
+ .icmph = {
+ .icmp6_type = ICMPV6_NDISC_NBR_ADVERTISEMENT,
+ .icmp6_router = false,
+ .icmp6_solicited = solicited,
+ .icmp6_override = override,
+ },
+ .target = *solicited_addr,
+ };
+
+ /* We are replying on behalf of another entity. Let that entity's
+ * address be the target ll addr and src_addr.
+ */
+ pad = ndisc_addr_option_pad(skb->dev->type);
+ data_len = skb->dev->addr_len;
+ space = ndisc_opt_addr_space(skb->dev);
+ opt = skb_put(skb, space);
+
+ opt[0] = ND_OPT_TARGET_LL_ADDR;
+ opt[1] = space >> 3;
+
+ memset(opt + 2, 0, pad);
+ opt += pad;
+ space -= pad;
+
+ memcpy(opt + 2, target_lladdr, dev->addr_len);
+ data_len += 2;
+ opt += data_len;
+ space -= data_len;
+ if (space > 0)
+ memset(opt, 0, space);
+
+ dst = skb_dst(skb);
+ icmp6h = icmp6_hdr(skb);
+
+ type = icmp6h->icmp6_type;
+
+ if (!dst) {
+ struct flowi6 fl6;
+
+ icmpv6_flow_init(sk, &fl6, type, saddr, daddr,
+ skb->dev->ifindex);
+ dst = icmp6_dst_alloc(skb->dev, &fl6);
+ if (IS_ERR(dst))
+ goto out;
+
+ skb_dst_set(skb, dst);
+ }
+
+ icmp6h->icmp6_cksum = csum_ipv6_magic(saddr, daddr, skb->len,
+ IPPROTO_ICMPV6,
+ csum_partial(icmp6h,
+ skb->len, 0));
+
+ skb_push(skb, sizeof(*hdr));
+ skb_reset_network_header(skb);
+ hdr = ipv6_hdr(skb);
+
+ ip6_flow_hdr(hdr, 0, 0);
+
+ hdr->payload_len = htons(skb->len - sizeof(*hdr));
+ hdr->nexthdr = IPPROTO_ICMPV6;
+ hdr->hop_limit = inet6_sk(sk)->hop_limit;
+
+ hdr->saddr = *saddr;
+ hdr->daddr = *daddr;
+
+ /* We are replying on behalf of another entity. Use that entity's
+ * address as the source link layer address if we have all the needed
+ * information to build the link layer header.
+ */
+ if (dest_hw &&
+ dev_hard_header(skb, dev, ETH_P_IPV6, dest_hw, target_lladdr,
+ skb->len) < 0)
+ goto out;
+
+ rcu_read_lock();
+ idev = __in6_dev_get(dst->dev);
+ IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
+
+ err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL,
+ dst->dev, dest_hw ? br_ndisc_send_na_finish : dst_output);
+
+ if (!err) {
+ ICMP6MSGOUT_INC_STATS(net, idev, type);
+ ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
+ }
+
+ rcu_read_unlock();
+ return 0;
+
+out:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+static const u8 *br_get_ndisc_lladdr(const u8 *opt, int opt_len,
+ unsigned int alen)
+{
+ const struct nd_opt_hdr *nd_opt = (const struct nd_opt_hdr *)opt;
+
+ while (opt_len > sizeof(struct nd_opt_hdr)) {
+ int l;
+
+ l = nd_opt->nd_opt_len << 3;
+ if (opt_len < l || l == 0)
+ return NULL;
+
+ if (nd_opt->nd_opt_type == ND_OPT_SOURCE_LL_ADDR) {
+ if (l >= 2 + alen)
+ return (const u8 *)(nd_opt + 1);
+ }
+
+ opt_len -= l;
+ nd_opt = ((void *)nd_opt) + l;
+ }
+
+ return NULL;
+}
+
+static void br_do_proxy_ndisc(struct sk_buff *skb, struct net_bridge *br,
+ u16 vid, struct net_bridge_port *p)
+{
+ struct net_device *dev = br->dev;
+ struct nd_msg *msg;
+ const struct ipv6hdr *iphdr;
+ const struct in6_addr *saddr, *daddr;
+ struct neighbour *n, *n_sender = NULL;
+ struct net_bridge_fdb_entry *f;
+ int ndoptlen;
+ bool override = false, solicited = true;
+ bool dad;
+ const struct in6_addr *daddr_na;
+ const u8 *dest_hw = NULL;
+
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = false;
+
+ if (!p)
+ return;
+
+ if (!pskb_may_pull(skb, skb->len))
+ return;
+
+ iphdr = ipv6_hdr(skb);
+ saddr = &iphdr->saddr;
+ daddr = &iphdr->daddr;
+
+ msg = (struct nd_msg *)skb_transport_header(skb);
+ if (msg->icmph.icmp6_code != 0 ||
+ msg->icmph.icmp6_type != ICMPV6_NDISC_NBR_SOLICITATION)
+ return;
+
+ if (ipv6_addr_loopback(daddr) ||
+ ipv6_addr_is_multicast(&msg->target))
+ return;
+
+ n = neigh_lookup(&nd_tbl, &msg->target, dev);
+ if (!n)
+ return;
+
+ if (!(n->nud_state & NUD_VALID))
+ goto out;
+
+ f = __br_fdb_get(br, n->ha, vid);
+ if (!f)
+ goto out;
+
+ if (!(p->flags & BR_PROXYARP) &&
+ !(f->dst && (f->dst->flags & BR_PROXYARP_WIFI)))
+ goto out;
+
+ dad = ipv6_addr_any(saddr);
+ daddr_na = saddr;
+
+ if (dad && !ipv6_addr_is_solict_mult(daddr))
+ goto out;
+
+ if (dad) {
+ override = true;
+ solicited = false;
+ daddr_na = &in6addr_linklocal_allnodes;
+ }
+
+ if (!(p->flags & BR_PROXYARP)) {
+ ndoptlen = skb_tail_pointer(skb) -
+ (skb_transport_header(skb) +
+ offsetof(struct nd_msg, opt));
+ dest_hw = br_get_ndisc_lladdr(msg->opt, ndoptlen,
+ dev->addr_len);
+ if (!dest_hw && !dad) {
+ n_sender = neigh_lookup(&nd_tbl, saddr, dev);
+ if (n_sender)
+ dest_hw = n_sender->ha;
+ }
+
+ if (dest_hw && is_multicast_ether_addr(dest_hw))
+ dest_hw = NULL;
+ }
+
+ if (br_ndisc_send_na(dev, daddr_na, &msg->target, n->ha, solicited,
+ override, dest_hw))
+ goto out;
+
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+
+out:
+ neigh_release(n);
+ if (n_sender)
+ neigh_release(n_sender);
+}
+
static int br_multicast_ipv6_rcv(struct net_bridge *br,
struct net_bridge_port *port,
struct sk_buff *skb,
u16 vid)
{
+ const unsigned char *src;
struct sk_buff *skb_trimmed = NULL;
struct mld_msg *mld;
int err;
@@ -1649,8 +1935,9 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
switch (mld->mld_type) {
case ICMPV6_MGM_REPORT:
+ src = eth_hdr(skb)->h_source;
BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
- err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid);
+ err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid, src);
break;
case ICMPV6_MLD2_REPORT:
err = br_ip6_multicast_mld2_report(br, port, skb_trimmed, vid);
@@ -1659,7 +1946,11 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
err = br_ip6_multicast_query(br, port, skb_trimmed, vid);
break;
case ICMPV6_MGM_REDUCTION:
- br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid);
+ src = eth_hdr(skb)->h_source;
+ br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid, src);
+ break;
+ case ICMPV6_NDISC_NBR_SOLICITATION:
+ br_do_proxy_ndisc(skb, br, vid, port);
break;
}
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -49,6 +49,7 @@ static struct ctl_table_header *brnf_sysctl_header;
static int brnf_call_iptables __read_mostly = 1;
static int brnf_call_ip6tables __read_mostly = 1;
static int brnf_call_arptables __read_mostly = 1;
+static int brnf_call_custom __read_mostly;
static int brnf_filter_vlan_tagged __read_mostly;
static int brnf_filter_pppoe_tagged __read_mostly;
static int brnf_pass_vlan_indev __read_mostly;
@@ -56,6 +57,7 @@ static int brnf_pass_vlan_indev __read_mostly;
#define brnf_call_iptables 1
#define brnf_call_ip6tables 1
#define brnf_call_arptables 1
+#define brnf_call_custom 1
#define brnf_filter_vlan_tagged 0
#define brnf_filter_pppoe_tagged 0
#define brnf_pass_vlan_indev 0
@@ -70,6 +72,15 @@ static int brnf_pass_vlan_indev __read_mostly;
#define IS_ARP(skb) \
(!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_ARP))
+int brnf_call_ebtables __read_mostly;
+EXPORT_SYMBOL_GPL(brnf_call_ebtables);
+
+bool br_netfilter_run_hooks(void)
+{
+ return brnf_call_iptables | brnf_call_ip6tables | brnf_call_arptables |
+ brnf_call_ebtables | brnf_call_custom;
+}
+
static inline __be16 vlan_proto(const struct sk_buff *skb)
{
if (skb_vlan_tag_present(skb))
@@ -974,6 +985,13 @@ static struct ctl_table brnf_table[] = {
.mode = 0644,
.proc_handler = brnf_sysctl_call_tables,
},
+ {
+ .procname = "bridge-nf-call-custom",
+ .data = &brnf_call_custom,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = brnf_sysctl_call_tables,
+ },
{ }
};
#endif
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -449,6 +449,7 @@ void br_ifinfo_notify(int event, struct net_bridge_port *port)
kfree_skb(skb);
goto errout;
}
+ __br_notify(RTNLGRP_LINK, event, port);
rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
return;
errout:
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -21,6 +21,8 @@
#include <net/ip6_fib.h>
#include <linux/if_vlan.h>
#include <linux/rhashtable.h>
+#include <linux/export.h>
+#include <linux/netfilter.h>
#define BR_HASH_BITS 8
#define BR_HASH_SIZE (1 << BR_HASH_BITS)
@@ -158,6 +160,9 @@ struct net_bridge_port_group {
struct timer_list timer;
struct br_ip addr;
unsigned char state;
+
+ unsigned char eth_addr[ETH_ALEN];
+ bool unicast;
};
struct net_bridge_mdb_entry
@@ -504,6 +509,7 @@ void br_port_flags_change(struct net_bridge_port *port, unsigned long mask);
void br_manage_promisc(struct net_bridge *br);
/* br_input.c */
+int br_pass_frame_up(struct sk_buff *skb);
int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
rx_handler_result_t br_handle_frame(struct sk_buff **pskb);
@@ -555,7 +561,8 @@ void br_multicast_free_pg(struct rcu_head *head);
struct net_bridge_port_group *
br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group,
struct net_bridge_port_group __rcu *next,
- unsigned char state);
+ unsigned char state,
+ const unsigned char *src);
void br_mdb_init(void);
void br_mdb_uninit(void);
void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
@@ -902,15 +909,29 @@ extern const struct nf_br_ops __rcu *nf_br_ops;
/* br_netfilter.c */
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+extern int brnf_call_ebtables;
int br_nf_core_init(void);
void br_nf_core_fini(void);
void br_netfilter_rtable_init(struct net_bridge *);
+bool br_netfilter_run_hooks(void);
#else
static inline int br_nf_core_init(void) { return 0; }
static inline void br_nf_core_fini(void) {}
#define br_netfilter_rtable_init(x)
+#define br_netfilter_run_hooks() false
#endif
+static inline int
+BR_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
+ struct sk_buff *skb, struct net_device *in, struct net_device *out,
+ int (*okfn)(struct net *, struct sock *, struct sk_buff *))
+{
+ if (!br_netfilter_run_hooks())
+ return okfn(net, sk, skb);
+
+ return NF_HOOK(pf, hook, net, sk, skb, in, out, okfn);
+}
+
/* br_stp.c */
void br_log_state(const struct net_bridge_port *p);
void br_set_state(struct net_bridge_port *p, unsigned int state);
@@ -981,4 +1002,15 @@ static inline int br_sysfs_addbr(struct net_device *dev) { return 0; }
static inline void br_sysfs_delbr(struct net_device *dev) { return; }
#endif /* CONFIG_SYSFS */
+#define __br_get(__hook, __default, __args ...) \
+ (__hook ? (__hook(__args)) : (__default))
+
+static inline void __br_notify(int group, int type, const void *data)
+{
+ br_notify_hook_t *notify_hook = rcu_dereference(br_notify_hook);
+
+ if (notify_hook)
+ notify_hook(group, type, data);
+}
+
#endif
--- a/net/bridge/br_stp_bpdu.c
+++ b/net/bridge/br_stp_bpdu.c
@@ -60,7 +60,7 @@ static void br_send_bpdu(struct net_bridge_port *p,
skb_reset_mac_header(skb);
- NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT,
+ BR_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT,
dev_net(p->dev), NULL, skb, NULL, skb->dev,
br_send_bpdu_finish);
}
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -166,6 +166,7 @@ static void br_stp_start(struct net_bridge *br)
br_debug(br, "using kernel STP\n");
/* To start timers on any ports left in blocking */
+ mod_timer(&br->hello_timer, jiffies + br->hello_time);
br_port_state_selection(br);
}
--- a/net/bridge/br_stp_timer.c
+++ b/net/bridge/br_stp_timer.c
@@ -40,7 +40,7 @@ static void br_hello_timer_expired(unsigned long arg)
if (br->dev->flags & IFF_UP) {
br_config_bpdu_generation(br);
- if (br->stp_enabled != BR_USER_STP)
+ if (br->stp_enabled == BR_KERNEL_STP)
mod_timer(&br->hello_timer,
round_jiffies(jiffies + br->hello_time));
}
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -173,6 +173,22 @@ BRPORT_ATTR_FLAG(unicast_flood, BR_FLOOD);
BRPORT_ATTR_FLAG(proxyarp, BR_PROXYARP);
BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI);
+static ssize_t show_isolate_mode(struct net_bridge_port *p, char *buf)
+{
+ int isolate_mode = (p->flags & BR_ISOLATE_MODE) ? 1 : 0;
+ return sprintf(buf, "%d\n", isolate_mode);
+}
+static int store_isolate_mode(struct net_bridge_port *p, unsigned long v)
+{
+ if (v)
+ p->flags |= BR_ISOLATE_MODE;
+ else
+ p->flags &= ~BR_ISOLATE_MODE;
+ return 0;
+}
+static BRPORT_ATTR(isolate_mode, S_IRUGO | S_IWUSR,
+ show_isolate_mode, store_isolate_mode);
+
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf)
{
@@ -188,6 +204,7 @@ static BRPORT_ATTR(multicast_router, S_IRUGO | S_IWUSR, show_multicast_router,
store_multicast_router);
BRPORT_ATTR_FLAG(multicast_fast_leave, BR_MULTICAST_FAST_LEAVE);
+BRPORT_ATTR_FLAG(multicast_to_unicast, BR_MULTICAST_TO_UCAST);
#endif
static const struct brport_attribute *brport_attrs[] = {
@@ -214,9 +231,11 @@ static const struct brport_attribute *brport_attrs[] = {
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
&brport_attr_multicast_router,
&brport_attr_multicast_fast_leave,
+ &brport_attr_multicast_to_unicast,
#endif
&brport_attr_proxyarp,
&brport_attr_proxyarp_wifi,
+ &brport_attr_isolate_mode,
NULL
};
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -2416,11 +2416,13 @@ static int __init ebtables_init(void)
}
printk(KERN_INFO "Ebtables v2.0 registered\n");
+ brnf_call_ebtables = 1;
return 0;
}
static void __exit ebtables_fini(void)
{
+ brnf_call_ebtables = 0;
nf_unregister_sockopt(&ebt_sockopts);
xt_unregister_target(&ebt_standard_target);
printk(KERN_INFO "Ebtables v2.0 unregistered\n");
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -9,8 +9,9 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
- sock_diag.o dev_ioctl.o tso.o
+ dev_ioctl.o tso.o
+obj-$(CONFIG_SOCK_DIAG) += sock_diag.o
obj-$(CONFIG_XFRM) += flow.o
obj-y += net-sysfs.o
obj-$(CONFIG_PROC_FS) += net-procfs.o
@@ -24,3 +25,5 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
+obj-$(CONFIG_SKB_RECYCLER) += skbuff_recycle.o
+obj-$(CONFIG_DEBUG_OBJECTS_SKBUFF) += skbuff_debug.o skbuff_notifier.o
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -139,6 +139,7 @@
#include <linux/netfilter_ingress.h>
#include "net-sysfs.h"
+#include "skbuff_debug.h"
/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8
@@ -2732,13 +2733,28 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev,
unsigned int len;
int rc;
- if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
- dev_queue_xmit_nit(skb, dev);
+ /* If this skb has been fast forwarded then we don't want it to
+ * go to any taps (by definition we're trying to bypass them).
+ */
+ if (unlikely(!skb->fast_forwarded)) {
+ if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
+ dev_queue_xmit_nit(skb, dev);
+ }
- len = skb->len;
- trace_net_dev_start_xmit(skb, dev);
- rc = netdev_start_xmit(skb, dev, txq, more);
- trace_net_dev_xmit(skb, rc, dev, len);
+#ifdef CONFIG_ETHERNET_PACKET_MANGLE
+ if (!dev->eth_mangle_tx ||
+ (skb = dev->eth_mangle_tx(dev, skb)) != NULL)
+#else
+ if (1)
+#endif
+ {
+ len = skb->len;
+ trace_net_dev_start_xmit(skb, dev);
+ rc = netdev_start_xmit(skb, dev, txq, more);
+ trace_net_dev_xmit(skb, rc, dev, len);
+ } else {
+ rc = NETDEV_TX_OK;
+ }
return rc;
}
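
The CONFIG_ETHERNET_PACKET_MANGLE branch above calls a dev->eth_mangle_tx callback that is assumed to be added to struct net_device elsewhere in this patch set; returning NULL makes xmit_one() skip netdev_start_xmit() and report NETDEV_TX_OK. A hedged sketch of such a callback, with the name and drop policy invented for illustration:

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/if_ether.h>

/* Hypothetical eth_mangle_tx implementation: rewrite or veto outgoing frames. */
static struct sk_buff *example_mangle_tx(struct net_device *dev,
					 struct sk_buff *skb)
{
	if (skb->len < ETH_HLEN) {
		/* Example policy only: drop runt frames. */
		dev_kfree_skb_any(skb);
		return NULL;	/* tells xmit_one() the skb was consumed */
	}
	return skb;		/* transmit unchanged */
}
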
@@ -3813,6 +3829,9 @@ void netdev_rx_handler_unregister(struct net_device *dev)
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
+int (*athrs_fast_nat_recv)(struct sk_buff *skb) __rcu __read_mostly;
+EXPORT_SYMBOL_GPL(athrs_fast_nat_recv);
+
/*
* Limit the use of PFMEMALLOC reserves to those protocols that implement
* the special handling of PFMEMALLOC skbs.
@@ -3855,6 +3874,7 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
bool deliver_exact = false;
int ret = NET_RX_DROP;
__be16 type;
+ int (*fast_recv)(struct sk_buff *skb);
net_timestamp_check(!netdev_tstamp_prequeue, skb);
@@ -3881,6 +3901,14 @@ another_round:
goto out;
}
+ fast_recv = rcu_dereference(athrs_fast_nat_recv);
+ if (fast_recv) {
+ if (fast_recv(skb)) {
+ ret = NET_RX_SUCCESS;
+ goto out;
+ }
+ }
+
#ifdef CONFIG_NET_CLS_ACT
if (skb->tc_verd & TC_NCLS) {
skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
@@ -4246,6 +4274,9 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
enum gro_result ret;
int grow;
+ if (skb->gro_skip)
+ goto normal;
+
if (!(skb->dev->features & NETIF_F_GRO))
goto normal;
@@ -4388,6 +4419,7 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
skb_dst_drop(skb);
kmem_cache_free(skbuff_head_cache, skb);
+ skbuff_debugobj_deactivate(skb);
} else {
__kfree_skb(skb);
}
@@ -4824,6 +4856,14 @@ void netif_napi_del(struct napi_struct *napi)
}
EXPORT_SYMBOL(netif_napi_del);
+struct napi_struct *get_current_napi_context(void)
+{
+ struct softnet_data *sd = this_cpu_ptr(&softnet_data);
+
+ return sd->current_napi;
+}
+EXPORT_SYMBOL(get_current_napi_context);
+
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
void *have;
@@ -5405,6 +5445,48 @@ static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
&upper_dev->adj_list.lower);
}
+static void __netdev_addr_mask(unsigned char *mask, const unsigned char *addr,
+ struct net_device *dev)
+{
+ int i;
+
+ for (i = 0; i < dev->addr_len; i++)
+ mask[i] |= addr[i] ^ dev->dev_addr[i];
+}
+
+static void __netdev_upper_mask(unsigned char *mask, struct net_device *dev,
+ struct net_device *lower)
+{
+ struct net_device *cur;
+ struct list_head *iter;
+
+ netdev_for_each_upper_dev_rcu(dev, cur, iter) {
+ __netdev_addr_mask(mask, cur->dev_addr, lower);
+ __netdev_upper_mask(mask, cur, lower);
+ }
+}
+
+static void __netdev_update_addr_mask(struct net_device *dev)
+{
+ unsigned char mask[MAX_ADDR_LEN];
+ struct net_device *cur;
+ struct list_head *iter;
+
+ memset(mask, 0, sizeof(mask));
+ __netdev_upper_mask(mask, dev, dev);
+ memcpy(dev->local_addr_mask, mask, dev->addr_len);
+
+ netdev_for_each_lower_dev(dev, cur, iter)
+ __netdev_update_addr_mask(cur);
+}
+
+static void netdev_update_addr_mask(struct net_device *dev)
+{
+ rcu_read_lock();
+ __netdev_update_addr_mask(dev);
+ rcu_read_unlock();
+}
+
static int __netdev_upper_dev_link(struct net_device *dev,
struct net_device *upper_dev, bool master,
void *private)
@@ -5476,6 +5558,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
goto rollback_lower_mesh;
}
+ netdev_update_addr_mask(dev);
call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
&changeupper_info.info);
return 0;
@@ -5602,6 +5685,7 @@ void netdev_upper_dev_unlink(struct net_device *dev,
list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
__netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
+ netdev_update_addr_mask(dev);
call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
&changeupper_info.info);
}
@@ -6142,6 +6226,7 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
if (err)
return err;
dev->addr_assign_type = NET_ADDR_SET;
+ netdev_update_addr_mask(dev);
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
add_device_randomness(dev->dev_addr, dev->addr_len);
return 0;
@@ -6453,6 +6538,18 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
#endif
features &= ~NETIF_F_BUSY_POLL;
+ if (!(features & NETIF_F_RXCSUM)) {
+ /* NETIF_F_GRO_HW implies doing RXCSUM since every packet
+ * successfully merged by hardware must also have the
+ * checksum verified by hardware. If the user does not
+ * want to enable RXCSUM, logically, we should disable GRO_HW.
+ */
+ if (features & NETIF_F_GRO_HW) {
+ netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
+ features &= ~NETIF_F_GRO_HW;
+ }
+ }
+
return features;
}
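
The athrs_fast_nat_recv pointer exported above is consulted near the top of __netif_receive_skb_core(); a non-zero return means the packet was consumed by the fast path and the normal stack is skipped. A sketch of how an accelerator module might install it — the extern declaration is assumed to live in a header elsewhere in the series, and the names are invented:

#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/skbuff.h>

extern int (*athrs_fast_nat_recv)(struct sk_buff *skb) __rcu;

static int example_fast_recv(struct sk_buff *skb)
{
	/* Return 1 if the skb was handled (stolen) by the fast path,
	 * 0 to hand it back to the regular receive path.
	 */
	return 0;
}

static int __init example_fastpath_init(void)
{
	rcu_assign_pointer(athrs_fast_nat_recv, example_fast_recv);
	return 0;
}

static void __exit example_fastpath_exit(void)
{
	rcu_assign_pointer(athrs_fast_nat_recv, NULL);
	synchronize_rcu();
}

module_init(example_fastpath_init);
module_exit(example_fastpath_exit);
MODULE_LICENSE("GPL");
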
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -73,6 +73,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_LLTX_BIT] = "tx-lockless",
[NETIF_F_NETNS_LOCAL_BIT] = "netns-local",
[NETIF_F_GRO_BIT] = "rx-gro",
+ [NETIF_F_GRO_HW_BIT] = "rx-gro-hw",
[NETIF_F_LRO_BIT] = "rx-lro",
[NETIF_F_TSO_BIT] = "tx-tcp-segmentation",
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -95,7 +95,7 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
ports = __skb_header_pointer(skb, thoff + poff,
sizeof(_ports), data, hlen, &_ports);
if (ports)
- return *ports;
+ return (__be32)net_hdr_word(ports);
}
return 0;
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -687,7 +687,7 @@ void neigh_destroy(struct neighbour *neigh)
NEIGH_CACHE_STAT_INC(neigh->tbl, destroys);
if (!neigh->dead) {
- pr_warn("Destroying alive neighbour %p\n", neigh);
+ pr_warn("Destroying alive neighbour %pK\n", neigh);
dump_stack();
return;
}
@@ -1049,7 +1049,19 @@ static void neigh_update_hhs(struct neighbour *neigh)
}
}
+ATOMIC_NOTIFIER_HEAD(neigh_mac_update_notifier_list);
+void neigh_mac_update_register_notify(struct notifier_block *nb)
+{
+ atomic_notifier_chain_register(&neigh_mac_update_notifier_list, nb);
+}
+EXPORT_SYMBOL_GPL(neigh_mac_update_register_notify);
+
+void neigh_mac_update_unregister_notify(struct notifier_block *nb)
+{
+ atomic_notifier_chain_unregister(&neigh_mac_update_notifier_list, nb);
+}
+EXPORT_SYMBOL_GPL(neigh_mac_update_unregister_notify);
/* Generic update routine.
-- lladdr is new lladdr or NULL, if it is not supplied.
@@ -1080,6 +1092,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
int notify = 0;
struct net_device *dev;
int update_isrouter = 0;
+ struct neigh_mac_update nmu;
write_lock_bh(&neigh->lock);
@@ -1087,6 +1100,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
old = neigh->nud_state;
err = -EPERM;
+ memset(&nmu, 0, sizeof(struct neigh_mac_update));
+
if (!(flags & NEIGH_UPDATE_F_ADMIN) &&
(old & (NUD_NOARP | NUD_PERMANENT)))
goto out;
@@ -1117,7 +1132,11 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
and a new address is proposed:
- compare new & old
- if they are different, check override flag
+ - copy old and new addresses for neigh update notification
*/
+ memcpy(nmu.old_mac, neigh->ha, dev->addr_len);
+ memcpy(nmu.update_mac, lladdr, dev->addr_len);
+
if ((old & NUD_VALID) &&
!memcmp(lladdr, neigh->ha, dev->addr_len))
lladdr = neigh->ha;
@@ -1231,8 +1250,11 @@ out:
}
write_unlock_bh(&neigh->lock);
- if (notify)
+ if (notify) {
neigh_update_notify(neigh);
+ atomic_notifier_call_chain(&neigh_mac_update_notifier_list, 0,
+ (struct neigh_mac_update *)&nmu);
+ }
return err;
}
@@ -3225,4 +3247,3 @@ static int __init neigh_init(void)
}
subsys_initcall(neigh_init);
-
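
The atomic notifier chain added to neigh_update() hands listeners the previous and proposed link-layer addresses. A sketch of a listener; struct neigh_mac_update (with the old_mac/update_mac fields used above) is assumed to be declared in a header elsewhere in this patch set, and the callback name is invented:

#include <linux/notifier.h>
#include <linux/netdevice.h>

static int example_mac_update(struct notifier_block *nb,
			      unsigned long event, void *data)
{
	struct neigh_mac_update *nmu = data;

	/* e.g. invalidate a hardware flow entry keyed on the old MAC */
	pr_debug("neighbour MAC change %pM -> %pM\n",
		 nmu->old_mac, nmu->update_mac);
	return NOTIFY_DONE;
}

static struct notifier_block example_mac_nb = {
	.notifier_call = example_mac_update,
};

/* In module init/exit:
 *	neigh_mac_update_register_notify(&example_mac_nb);
 *	neigh_mac_update_unregister_notify(&example_mac_nb);
 */
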
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -318,10 +318,12 @@ static int __net_init dev_proc_net_init(struct net *net)
if (!proc_create("dev", S_IRUGO, net->proc_net, &dev_seq_fops))
goto out;
- if (!proc_create("softnet_stat", S_IRUGO, net->proc_net,
+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED) &&
+ !proc_create("softnet_stat", S_IRUGO, net->proc_net,
&softnet_seq_fops))
goto out_dev;
- if (!proc_create("ptype", S_IRUGO, net->proc_net, &ptype_seq_fops))
+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED) &&
+ !proc_create("ptype", S_IRUGO, net->proc_net, &ptype_seq_fops))
goto out_softnet;
if (wext_proc_init(net))
@@ -330,9 +332,11 @@ static int __net_init dev_proc_net_init(struct net *net)
out:
return rc;
out_ptype:
- remove_proc_entry("ptype", net->proc_net);
+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED))
+ remove_proc_entry("ptype", net->proc_net);
out_softnet:
- remove_proc_entry("softnet_stat", net->proc_net);
+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED))
+ remove_proc_entry("softnet_stat", net->proc_net);
out_dev:
remove_proc_entry("dev", net->proc_net);
goto out;
@@ -342,8 +346,10 @@ static void __net_exit dev_proc_net_exit(struct net *net)
{
wext_proc_exit(net);
- remove_proc_entry("ptype", net->proc_net);
- remove_proc_entry("softnet_stat", net->proc_net);
+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED)) {
+ remove_proc_entry("ptype", net->proc_net);
+ remove_proc_entry("softnet_stat", net->proc_net);
+ }
remove_proc_entry("dev", net->proc_net);
}
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -261,7 +261,7 @@ struct net *get_net_ns_by_id(struct net *net, int id)
spin_lock_irqsave(&net->nsid_lock, flags);
peer = idr_find(&net->netns_ids, id);
if (peer)
- get_net(peer);
+ peer = maybe_get_net(peer);
spin_unlock_irqrestore(&net->nsid_lock, flags);
rcu_read_unlock();
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -46,11 +46,12 @@ __u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
u32 secret[MD5_MESSAGE_BYTES / 4];
u32 hash[MD5_DIGEST_WORDS];
u32 i;
+ const struct in6_addr *daddr6 = (struct in6_addr *) daddr;
net_secret_init();
memcpy(hash, saddr, 16);
for (i = 0; i < 4; i++)
- secret[i] = net_secret[i] + (__force u32)daddr[i];
+ secret[i] = net_secret[i] + (__force u32)daddr6->s6_addr32[i];
secret[4] = net_secret[4] +
(((__force u16)sport << 16) + (__force u16)dport);
for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
@@ -68,11 +69,12 @@ u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
u32 secret[MD5_MESSAGE_BYTES / 4];
u32 hash[MD5_DIGEST_WORDS];
u32 i;
+ const struct in6_addr *daddr6 = (struct in6_addr *) daddr;
net_secret_init();
memcpy(hash, saddr, 16);
for (i = 0; i < 4; i++)
- secret[i] = net_secret[i] + (__force u32) daddr[i];
+ secret[i] = net_secret[i] + (__force u32) daddr6->s6_addr32[i];
secret[4] = net_secret[4] + (__force u32)dport;
for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
secret[i] = net_secret[i];
@@ -146,6 +148,7 @@ EXPORT_SYMBOL(secure_dccp_sequence_number);
u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
__be16 sport, __be16 dport)
{
+ const struct in6_addr *daddr6 = (struct in6_addr *) daddr;
u32 secret[MD5_MESSAGE_BYTES / 4];
u32 hash[MD5_DIGEST_WORDS];
u64 seq;
@@ -154,7 +157,7 @@ u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
net_secret_init();
memcpy(hash, saddr, 16);
for (i = 0; i < 4; i++)
- secret[i] = net_secret[i] + (__force u32)daddr[i];
+ secret[i] = net_secret[i] + (__force u32)daddr6->s6_addr32[i];
secret[4] = net_secret[4] +
(((__force u16)sport << 16) + (__force u16)dport);
for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -63,6 +63,7 @@
#include <linux/errqueue.h>
#include <linux/prefetch.h>
#include <linux/if_vlan.h>
+#include <linux/if.h>
#include <net/protocol.h>
#include <net/dst.h>
@@ -77,6 +78,9 @@
#include <linux/capability.h>
#include <linux/user_namespace.h>
+#include "skbuff_recycle.h"
+#include "skbuff_debug.h"
+
struct kmem_cache *skbuff_head_cache __read_mostly;
static struct kmem_cache *skbuff_fclone_cache __read_mostly;
int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
@@ -166,6 +170,7 @@ struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node)
gfp_mask & ~__GFP_DMA, node);
if (!skb)
goto out;
+ skbuff_debugobj_init_and_activate(skb);
/*
* Only clear those fields we need to clear, not those that we will
@@ -218,6 +223,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
if (!skb)
goto out;
+ skbuff_debugobj_init_and_activate(skb);
prefetchw(skb);
/* We do our best to align skb_shared_info on a separate cache
@@ -275,6 +281,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
out:
return skb;
nodata:
+ skbuff_debugobj_deactivate(skb);
kmem_cache_free(cache, skb);
skb = NULL;
goto out;
@@ -309,6 +316,7 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size)
skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
if (!skb)
return NULL;
+ skbuff_debugobj_init_and_activate(skb);
size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
@@ -394,7 +402,7 @@ EXPORT_SYMBOL(napi_alloc_frag);
/**
* __netdev_alloc_skb - allocate an skbuff for rx on a specific device
* @dev: network device to receive on
- * @len: length to allocate
+ * @length: length to allocate
* @gfp_mask: get_free_pages mask, passed to alloc_skb
*
* Allocate a new &sk_buff and assign it a usage count of one. The
@@ -404,19 +412,56 @@ EXPORT_SYMBOL(napi_alloc_frag);
*
* %NULL is returned if there is no free memory.
*/
-struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
- gfp_t gfp_mask)
+struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
+ unsigned int length, gfp_t gfp_mask)
{
+#ifndef CONFIG_SKB_RECYCLER
struct page_frag_cache *nc;
unsigned long flags;
- struct sk_buff *skb;
bool pfmemalloc;
+ bool page_frag_alloc_enable = true;
void *data;
+#endif
+
+ struct sk_buff *skb;
+ unsigned int len = length;
+
+#ifdef CONFIG_SKB_RECYCLER
+ skb = skb_recycler_alloc(dev, length);
+ if (likely(skb)) {
+ /* SKBs in the recycler are from various unknown sources.
+ * Their truesize is unknown. We should set truesize
+ * as the needed buffer size before using it.
+ */
+ skb->truesize = SKB_TRUESIZE(SKB_DATA_ALIGN(len + NET_SKB_PAD));
+ return skb;
+ }
+
+ len = SKB_RECYCLE_SIZE;
+ if (unlikely(length > SKB_RECYCLE_SIZE))
+ len = length;
+
+ skb = __alloc_skb(len + NET_SKB_PAD, gfp_mask,
+ SKB_ALLOC_RX, NUMA_NO_NODE);
+ if (!skb)
+ goto skb_fail;
+
+ /* Set truesize as the needed buffer size
+ * rather than the allocated size by __alloc_skb().
+ */
+ if (length + NET_SKB_PAD < SKB_WITH_OVERHEAD(PAGE_SIZE))
+ skb->truesize = SKB_TRUESIZE(SKB_DATA_ALIGN(length + NET_SKB_PAD));
+ goto skb_success;
+#else
len += NET_SKB_PAD;
+#ifdef CONFIG_ALLOC_SKB_PAGE_FRAG_DISABLE
+ page_frag_alloc_enable = false;
+#endif
if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
- (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
+ (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA)) ||
+ !page_frag_alloc_enable) {
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
if (!skb)
goto skb_fail;
@@ -450,6 +495,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
if (pfmemalloc)
skb->pfmemalloc = 1;
skb->head_frag = 1;
+#endif
skb_success:
skb_reserve(skb, NET_SKB_PAD);
@@ -520,6 +566,22 @@ skb_fail:
}
EXPORT_SYMBOL(__napi_alloc_skb);
+struct sk_buff *__netdev_alloc_skb_ip_align(struct net_device *dev,
+ unsigned int length, gfp_t gfp)
+{
+ struct sk_buff *skb = __netdev_alloc_skb(dev, length + NET_IP_ALIGN, gfp);
+
+#ifdef CONFIG_ETHERNET_PACKET_MANGLE
+ if (dev && (dev->priv_flags & IFF_NO_IP_ALIGN))
+ return skb;
+#endif
+
+ if (NET_IP_ALIGN && skb)
+ skb_reserve(skb, NET_IP_ALIGN);
+ return skb;
+}
+EXPORT_SYMBOL(__netdev_alloc_skb_ip_align);
+
void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
int size, unsigned int truesize)
{
@@ -571,7 +633,7 @@ static void skb_free_head(struct sk_buff *skb)
kfree(head);
}
-static void skb_release_data(struct sk_buff *skb)
+void skb_release_data(struct sk_buff *skb)
{
struct skb_shared_info *shinfo = skb_shinfo(skb);
int i;
@@ -605,12 +667,13 @@ static void skb_release_data(struct sk_buff *skb)
/*
* Free an skbuff by memory without cleaning the state.
*/
-static void kfree_skbmem(struct sk_buff *skb)
+void kfree_skbmem(struct sk_buff *skb)
{
struct sk_buff_fclones *fclones;
switch (skb->fclone) {
case SKB_FCLONE_UNAVAILABLE:
+ skbuff_debugobj_deactivate(skb);
kmem_cache_free(skbuff_head_cache, skb);
return;
@@ -631,7 +694,9 @@ static void kfree_skbmem(struct sk_buff *skb)
}
if (!atomic_dec_and_test(&fclones->fclone_ref))
return;
+
fastpath:
+ skbuff_debugobj_deactivate(&fclones->skb1);
kmem_cache_free(skbuff_fclone_cache, fclones);
}
@@ -740,12 +805,38 @@ void consume_skb(struct sk_buff *skb)
{
if (unlikely(!skb))
return;
+
+ prefetch(&skb->destructor);
+
if (likely(atomic_read(&skb->users) == 1))
smp_rmb();
else if (likely(!atomic_dec_and_test(&skb->users)))
return;
+
+ /* If possible we'd like to recycle any skb rather than just free it,
+ * but in order to do that we need to release any head state too.
+ * We don't want to do this later because we'll be in a pre-emption
+ * disabled state.
+ */
+ skb_release_head_state(skb);
+
+ /* Can we recycle this skb? If we can then it will be much faster
+ * for us to recycle this one later than to allocate a new one
+ * from scratch.
+ */
+ if (likely(skb->head) && likely(skb_recycler_consume(skb)))
+ return;
+
trace_consume_skb(skb);
- __kfree_skb(skb);
+
+ /* We're not recycling so now we need to do the rest of what we would
+ * have done in __kfree_skb (above and beyond the skb_release_head_state
+ * that we already did).
+ */
+ if (likely(skb->head))
+ skb_release_data(skb);
+
+ kfree_skbmem(skb);
}
EXPORT_SYMBOL(consume_skb);
@@ -956,6 +1047,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
if (!n)
return NULL;
+ skbuff_debugobj_init_and_activate(n);
kmemcheck_annotate_bitfield(n, flags1);
n->fclone = SKB_FCLONE_UNAVAILABLE;
@@ -3327,6 +3419,7 @@ void __init skb_init(void)
0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC,
NULL);
+ skb_recycler_init();
}
/**
@@ -4115,6 +4208,7 @@ void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
{
if (head_stolen) {
skb_release_head_state(skb);
+ skbuff_debugobj_deactivate(skb);
kmem_cache_free(skbuff_head_cache, skb);
} else {
__kfree_skb(skb);
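
Drivers do not need to change for the recycler: the usual allocation helpers still apply, and __netdev_alloc_skb_ip_align() above is now an out-of-line, exported wrapper around the recycler-aware __netdev_alloc_skb(). A minimal sketch of the unchanged RX refill usage (names invented):

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/gfp.h>

static struct sk_buff *example_rx_refill(struct net_device *dev,
					 unsigned int rx_buf_len)
{
	struct sk_buff *skb;

	/* May be satisfied from the per-CPU recycle list when available. */
	skb = __netdev_alloc_skb_ip_align(dev, rx_buf_len, GFP_ATOMIC);
	if (unlikely(!skb))
		return NULL;	/* retry on the next refill pass */

	/* map skb->data for DMA and post the buffer to the NIC here */
	return skb;
}
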
new file mode 100644
--- /dev/null
+++ b/net/core/skbuff_debug.c
@@ -0,0 +1,316 @@
+/* Copyright (c) 2015-2016, The Linux Foundation. All rights reserved.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <asm/stacktrace.h>
+#include <asm/current.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/smp.h>
+
+#include "skbuff_debug.h"
+#include "skbuff_notifier.h"
+#include "skbuff_recycle.h"
+
+static int skbuff_debugobj_enabled __read_mostly = 1;
+
+static int skbuff_debug_event_handler(struct notifier_block *nb,
+ unsigned long action, void *data);
+static struct notifier_block skbuff_debug_notify = {
+ .notifier_call = skbuff_debug_event_handler,
+ .priority = 0
+};
+
+inline u32 skbuff_debugobj_sum(struct sk_buff *skb)
+{
+ int pos = offsetof(struct sk_buff, free_addr);
+ u32 sum = 0;
+
+ while (pos--)
+ sum += ((u8 *)skb)[pos];
+
+ return sum;
+}
+
+struct skbuff_debugobj_walking {
+ int pos;
+ void **d;
+};
+
+static int skbuff_debugobj_walkstack(struct stackframe *frame, void *p)
+{
+ struct skbuff_debugobj_walking *w = (struct skbuff_debugobj_walking *)p;
+ unsigned long pc = frame->pc;
+
+ if (w->pos < DEBUG_OBJECTS_SKBUFF_STACKSIZE - 1) {
+ w->d[w->pos++] = (void *)pc;
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+#if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
+static void skbuff_debugobj_get_stack(void **ret)
+{
+ struct stackframe frame;
+
+ register unsigned long current_sp asm ("sp");
+ struct skbuff_debugobj_walking w = {0, ret};
+ void *p = &w;
+
+ frame.fp = (unsigned long)__builtin_frame_address(0);
+ frame.sp = current_sp;
+
+#ifdef CONFIG_ARM
+ frame.lr = (unsigned long)__builtin_return_address(0);
+#endif
+
+ frame.pc = (unsigned long)skbuff_debugobj_get_stack;
+
+ walk_stackframe(&frame, skbuff_debugobj_walkstack, p);
+
+ ret[w.pos] = NULL;
+}
+#else
+static void skbuff_debugobj_get_stack(void **ret)
+{
+	/* not supported: record a sentinel instead of a stack trace */
+	ret[0] = (void *)0xdeadbeef;
+}
+#endif
+
+void skbuff_debugobj_print_stack(void *const *stack)
+{
+ int i;
+
+ for (i = 0; stack[i]; i++)
+ pr_emerg("\t %pS (0x%p)\n", stack[i], stack[i]);
+}
+
+static const char *skbuff_debugobj_state_name(const struct sk_buff *skb)
+{
+ int obj_state;
+
+ obj_state = debug_object_get_state((struct sk_buff *)skb);
+ switch (obj_state) {
+ case ODEBUG_STATE_NONE:
+ return "none";
+ case ODEBUG_STATE_INIT:
+ return "init";
+ case ODEBUG_STATE_INACTIVE:
+ return "inactive";
+ case ODEBUG_STATE_ACTIVE:
+ return "active";
+ case ODEBUG_STATE_DESTROYED:
+ return "destroyed";
+ case ODEBUG_STATE_NOTAVAILABLE:
+ return "not available";
+ default:
+ return "invalid";
+ }
+}
+
+void skbuff_debugobj_print_skb(const struct sk_buff *skb)
+{
+ pr_emerg("skb_debug: current process = %s (pid %i)\n",
+ current->comm, current->pid);
+ pr_emerg("skb_debug: skb 0x%p, next 0x%p, prev 0x%p, state = %s\n", skb,
+ skb->next, skb->prev, skbuff_debugobj_state_name(skb));
+ pr_emerg("skb_debug: free stack:\n");
+ skbuff_debugobj_print_stack(skb->free_addr);
+ pr_emerg("skb_debug: alloc stack:\n");
+ skbuff_debugobj_print_stack(skb->alloc_addr);
+}
+EXPORT_SYMBOL(skbuff_debugobj_print_skb);
+
+/* skbuff_debugobj_fixup():
+ * Called when an error is detected in the state machine for
+ * the objects
+ */
+static int skbuff_debugobj_fixup(void *addr, enum debug_obj_state state)
+{
+ struct sk_buff *skb = (struct sk_buff *)addr;
+ ftrace_dump(DUMP_ALL);
+ WARN(1, "skb_debug: state = %d, skb = 0x%p sum = %d (now %d)\n",
+ state, skb, skb->sum, skbuff_debugobj_sum(skb));
+ skb_recycler_notifier_send_event(SKB_RECYCLER_NOTIFIER_FSM, skb);
+
+ return 0;
+}
+
+static struct debug_obj_descr skbuff_debug_descr = {
+ .name = "sk_buff_struct",
+ .fixup_init = skbuff_debugobj_fixup,
+ .fixup_activate = skbuff_debugobj_fixup,
+ .fixup_destroy = skbuff_debugobj_fixup,
+ .fixup_free = skbuff_debugobj_fixup,
+};
+
+inline void skbuff_debugobj_activate(struct sk_buff *skb)
+{
+ int ret = 0;
+
+ if (!skbuff_debugobj_enabled)
+ return;
+
+ skbuff_debugobj_get_stack(skb->alloc_addr);
+ ret = debug_object_activate(skb, &skbuff_debug_descr);
+ if (ret)
+ goto err_act;
+
+ skbuff_debugobj_sum_validate(skb);
+
+ return;
+
+err_act:
+ ftrace_dump(DUMP_ALL);
+ WARN(1, "skb_debug: failed to activate err = %d skb = 0x%p sum = %d (now %d)\n",
+ ret, skb, skb->sum, skbuff_debugobj_sum(skb));
+ skb_recycler_notifier_send_event(SKB_RECYCLER_NOTIFIER_DBLALLOC, skb);
+}
+
+inline void skbuff_debugobj_init_and_activate(struct sk_buff *skb)
+{
+ if (!skbuff_debugobj_enabled)
+ return;
+
+	/* if we're coming from the slab, skb->sum may be
+	 * invalid anyway
+	 */
+ skb->sum = skbuff_debugobj_sum(skb);
+
+ debug_object_init(skb, &skbuff_debug_descr);
+ skbuff_debugobj_activate(skb);
+}
+
+inline void skbuff_debugobj_deactivate(struct sk_buff *skb)
+{
+ int obj_state;
+
+ if (!skbuff_debugobj_enabled)
+ return;
+
+ skb->sum = skbuff_debugobj_sum(skb);
+
+ obj_state = debug_object_get_state(skb);
+
+ if (obj_state == ODEBUG_STATE_ACTIVE) {
+ debug_object_deactivate(skb, &skbuff_debug_descr);
+ skbuff_debugobj_get_stack(skb->free_addr);
+ return;
+ }
+
+ ftrace_dump(DUMP_ALL);
+ WARN(1, "skb_debug: deactivating inactive object skb=0x%p state=%d sum = %d (now %d)\n",
+ skb, obj_state, skb->sum, skbuff_debugobj_sum(skb));
+ skb_recycler_notifier_send_event(SKB_RECYCLER_NOTIFIER_DBLFREE, skb);
+}
+
+inline void _skbuff_debugobj_sum_validate(struct sk_buff *skb,
+ const char *var, const char *src,
+ int line, const char *fxn)
+{
+ if (!skbuff_debugobj_enabled || !skb)
+ return;
+
+ if (skb->sum == skbuff_debugobj_sum(skb))
+ return;
+
+ ftrace_dump(DUMP_ALL);
+ WARN(1, "skb_debug: skb sum changed skb = 0x%p sum = %d (now %d)\n",
+ skb, skb->sum, skbuff_debugobj_sum(skb));
+ pr_emerg("skb_debug: %s() checking %s in %s:%d\n", fxn, var, src, line);
+ skb_recycler_notifier_send_event(SKB_RECYCLER_NOTIFIER_SUMERR, skb);
+}
+
+inline void skbuff_debugobj_sum_update(struct sk_buff *skb)
+{
+ if (!skbuff_debugobj_enabled || !skb)
+ return;
+
+ skb->sum = skbuff_debugobj_sum(skb);
+}
+
+inline void skbuff_debugobj_destroy(struct sk_buff *skb)
+{
+ if (!skbuff_debugobj_enabled)
+ return;
+
+ debug_object_destroy(skb, &skbuff_debug_descr);
+}
+
+static int __init disable_object_debug(char *str)
+{
+ skbuff_debugobj_enabled = 0;
+
+ pr_info("skb_debug: debug objects is disabled\n");
+ return 0;
+}
+
+early_param("no_skbuff_debug_objects", disable_object_debug);
+
+void skbuff_debugobj_print_skb_list(const struct sk_buff *skb_list,
+ const char *list_title, int cpu)
+{
+ int count;
+ struct sk_buff *skb_i = (struct sk_buff *)skb_list;
+ u32 sum_i, sum_now;
+ int obj_state;
+
+ if (cpu < 0) {
+ cpu = get_cpu();
+ put_cpu();
+ }
+ pr_emerg("skb_debug: start skb list '%s' [CPU#%d]\n", list_title, cpu);
+ count = 0;
+ if (skb_list) {
+ do {
+			obj_state = debug_object_get_state(skb_i);
+ if (obj_state < ODEBUG_STATE_NOTAVAILABLE) {
+ sum_i = skb_i->sum;
+ sum_now = skbuff_debugobj_sum(skb_i);
+ } else {
+ sum_i = 0;
+ sum_now = 0;
+ }
+ pr_emerg("skb_debug: [%02d] skb 0x%p, next 0x%p, prev 0x%p, state %d (%s), sum %d (now %d)\n",
+ count, skb_i, skb_i->next, skb_i->prev,
+ obj_state, skbuff_debugobj_state_name(skb_i),
+ sum_i, sum_now);
+ skb_i = skb_i->next;
+ count++;
+ } while (skb_list != skb_i);
+ }
+ pr_emerg("skb_debug: end skb list '%s'\n", list_title);
+}
+
+void skbuff_debugobj_register_callback(void)
+{
+ skb_recycler_notifier_register(&skbuff_debug_notify);
+}
+
+int skbuff_debug_event_handler(struct notifier_block *nb, unsigned long action,
+ void *data)
+{
+ struct sk_buff *skb = (struct sk_buff *)data;
+
+ pr_emerg("skb_debug: notifier event %lu\n", action);
+ skbuff_debugobj_print_skb(skb);
+ skb_recycler_print_all_lists();
+
+ return NOTIFY_DONE;
+}
new file mode 100644
--- /dev/null
+++ b/net/core/skbuff_debug.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2015, The Linux Foundation. All rights reserved.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <linux/skbuff.h>
+#include <linux/debugobjects.h>
+
+#ifndef _LINUX_SKBBUFF_DEBUG_OBJECTS
+#define _LINUX_SKBBUFF_DEBUG_OBJECTS
+
+#ifdef CONFIG_DEBUG_OBJECTS_SKBUFF
+void skbuff_debugobj_init_and_activate(struct sk_buff *skb);
+void skbuff_debugobj_activate(struct sk_buff *skb);
+void skbuff_debugobj_deactivate(struct sk_buff *skb);
+void skbuff_debugobj_destroy(struct sk_buff *skb);
+#define skbuff_debugobj_sum_validate(skb) _skbuff_debugobj_sum_validate(skb, \
+ #skb, __FILE__, __LINE__, __func__)
+void _skbuff_debugobj_sum_validate(struct sk_buff *skb, const char *var,
+ const char *src, int line, const char *fxn);
+void skbuff_debugobj_sum_update(struct sk_buff *skb);
+void skbuff_debugobj_print_skb(const struct sk_buff *skb);
+void skbuff_debugobj_print_skb_list(const struct sk_buff *skb_list,
+ const char *list_title, int cpu);
+void skbuff_debugobj_register_callback(void);
+#else
+static inline void skbuff_debugobj_init_and_activate(struct sk_buff *skb) { }
+static inline void skbuff_debugobj_activate(struct sk_buff *skb) { }
+static inline void skbuff_debugobj_deactivate(struct sk_buff *skb) { }
+static inline void skbuff_debugobj_destroy(struct sk_buff *skb) { }
+static inline void skbuff_debugobj_sum_validate(struct sk_buff *skb) { }
+static inline void skbuff_debugobj_sum_update(struct sk_buff *skb) { }
+static inline void skbuff_debugobj_print_skb(const struct sk_buff *skb) { }
+static inline void skbuff_debugobj_print_skb_list
+ (const struct sk_buff *skb_list, const char *list_title, int cpu) { }
+static inline void skbuff_debugobj_register_callback(void) { }
+#endif
+
+#endif /* _LINUX_SKBBUFF_DEBUG_OBJECTS */
new file mode 100644
--- /dev/null
+++ b/net/core/skbuff_notifier.c
@@ -0,0 +1,41 @@
+/* Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/* Notifier interface for the SKB Recycler */
+
+#include "skbuff_notifier.h"
+
+static BLOCKING_NOTIFIER_HEAD(skb_recycler_notifier);
+
+int skb_recycler_notifier_register(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&skb_recycler_notifier, nb);
+}
+EXPORT_SYMBOL(skb_recycler_notifier_register);
+
+int skb_recycler_notifier_unregister(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&skb_recycler_notifier, nb);
+}
+EXPORT_SYMBOL(skb_recycler_notifier_unregister);
+
+int skb_recycler_notifier_send_event(unsigned long action, struct sk_buff *skb)
+{
+	blocking_notifier_call_chain(&skb_recycler_notifier, action, skb);
+
+	return 0;
+}
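
skbuff_debug.c registers on this chain through skbuff_debugobj_register_callback(); any other in-kernel listener would follow the same pattern. A sketch (listener name invented):

#include <linux/notifier.h>
#include <linux/skbuff.h>
#include "skbuff_notifier.h"

static int example_recycler_event(struct notifier_block *nb,
				  unsigned long action, void *data)
{
	struct sk_buff *skb = data;

	pr_warn("skb recycler event 0x%lx on skb %p\n", action, skb);
	return NOTIFY_DONE;
}

static struct notifier_block example_recycler_nb = {
	.notifier_call = example_recycler_event,
};

/* During init: skb_recycler_notifier_register(&example_recycler_nb); */
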
new file mode 100644
--- /dev/null
+++ b/net/core/skbuff_notifier.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef SKBUFF_NOTIFIER_H
+#define SKBUFF_NOTIFIER_H
+
+#include <linux/notifier.h>
+#include <linux/skbuff.h>
+
+/* notifier events */
+#define SKB_RECYCLER_NOTIFIER_SUMERR 0x0001
+#define SKB_RECYCLER_NOTIFIER_DBLFREE 0x0002
+#define SKB_RECYCLER_NOTIFIER_DBLALLOC 0x0004
+#define SKB_RECYCLER_NOTIFIER_FSM 0x0008
+
+#if defined(CONFIG_DEBUG_OBJECTS_SKBUFF)
+int skb_recycler_notifier_register(struct notifier_block *nb);
+int skb_recycler_notifier_unregister(struct notifier_block *nb);
+int skb_recycler_notifier_send_event(unsigned long action,
+ struct sk_buff *skb);
+#else
+static inline int skb_recycler_notifier_register(struct notifier_block *nb)
+{
+ return 0;
+}
+
+static inline int skb_recycler_notifier_unregister(struct notifier_block *nb)
+{
+ return 0;
+}
+
+static inline int skb_recycler_notifier_send_event(unsigned long action,
+ struct sk_buff *skb)
+{
+ return 1;
+}
+#endif /* CONFIG_DEBUG_OBJECTS_SKBUFF */
+
+#endif /* SKBUFF_NOTIFIER_H */
new file mode 100644
--- /dev/null
+++ b/net/core/skbuff_recycle.c
@@ -0,0 +1,582 @@
+/* Copyright (c) 2013-2016, 2019, The Linux Foundation. All rights reserved.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+/* Generic skb recycler */
+#include "skbuff_recycle.h"
+#include <linux/proc_fs.h>
+#include <linux/string.h>
+
+#include "skbuff_debug.h"
+
+static struct proc_dir_entry *proc_net_skbrecycler;
+
+static DEFINE_PER_CPU(struct sk_buff_head, recycle_list);
+static int skb_recycle_max_skbs = SKB_RECYCLE_MAX_SKBS;
+
+#ifdef CONFIG_SKB_RECYCLER_MULTI_CPU
+static DEFINE_PER_CPU(struct sk_buff_head, recycle_spare_list);
+static struct global_recycler glob_recycler;
+static int skb_recycle_spare_max_skbs = SKB_RECYCLE_SPARE_MAX_SKBS;
+#endif
+
+inline struct sk_buff *skb_recycler_alloc(struct net_device *dev,
+ unsigned int length)
+{
+ unsigned long flags;
+ struct sk_buff_head *h;
+ struct sk_buff *skb = NULL;
+ struct sk_buff *ln = NULL;
+
+ if (unlikely(length > SKB_RECYCLE_SIZE))
+ return NULL;
+
+ h = &get_cpu_var(recycle_list);
+ local_irq_save(flags);
+ skb = skb_peek(h);
+ if (skb) {
+ ln = skb_peek_next(skb, h);
+ skbuff_debugobj_activate(skb);
+ /* Recalculate the sum for skb->next as next and prev pointers
+ * of skb->next will be updated in __skb_unlink
+ */
+ skbuff_debugobj_sum_validate(ln);
+ __skb_unlink(skb, h);
+ skbuff_debugobj_sum_update(ln);
+ }
+#ifdef CONFIG_SKB_RECYCLER_MULTI_CPU
+ if (unlikely(!skb)) {
+ u8 head;
+
+ spin_lock(&glob_recycler.lock);
+ /* If global recycle list is not empty, use global buffers */
+ head = glob_recycler.head;
+ if (unlikely(head == glob_recycler.tail)) {
+ spin_unlock(&glob_recycler.lock);
+ } else {
+ struct sk_buff *gn = glob_recycler.pool[head].next;
+ struct sk_buff *gp = glob_recycler.pool[head].prev;
+
+ /* Move SKBs from global list to CPU pool */
+ skbuff_debugobj_sum_validate(gn);
+ skbuff_debugobj_sum_validate(gp);
+ skb_queue_splice_init(&glob_recycler.pool[head], h);
+ skbuff_debugobj_sum_update(gn);
+ skbuff_debugobj_sum_update(gp);
+
+ head = (head + 1) & SKB_RECYCLE_MAX_SHARED_POOLS_MASK;
+ glob_recycler.head = head;
+ spin_unlock(&glob_recycler.lock);
+ /* We have refilled the CPU pool - dequeue */
+ skb = skb_peek(h);
+ if (skb) {
+ /* Recalculate the sum for skb->next as next and
+ * prev pointers of skb->next will be updated
+ * in __skb_unlink
+ */
+ ln = skb_peek_next(skb, h);
+ skbuff_debugobj_activate(skb);
+ skbuff_debugobj_sum_validate(ln);
+ __skb_unlink(skb, h);
+ skbuff_debugobj_sum_update(ln);
+ }
+ }
+ }
+#endif
+ local_irq_restore(flags);
+ put_cpu_var(recycle_list);
+
+ if (likely(skb)) {
+ struct skb_shared_info *shinfo;
+
+ /* We're about to write a large amount to the skb to
+ * zero most of the structure so prefetch the start
+ * of the shinfo region now so it's in the D-cache
+ * before we start to write that.
+ */
+ shinfo = skb_shinfo(skb);
+ prefetchw(shinfo);
+
+ zero_struct(skb, offsetof(struct sk_buff, tail));
+ atomic_set(&skb->users, 1);
+ skb->mac_header = (typeof(skb->mac_header))~0U;
+ skb->transport_header = (typeof(skb->transport_header))~0U;
+ zero_struct(shinfo, offsetof(struct skb_shared_info, dataref));
+ atomic_set(&shinfo->dataref, 1);
+
+ skb->data = skb->head + NET_SKB_PAD;
+ skb_reset_tail_pointer(skb);
+
+ skb->dev = dev;
+ }
+
+ return skb;
+}
+
+inline bool skb_recycler_consume(struct sk_buff *skb)
+{
+ unsigned long flags;
+ struct sk_buff_head *h;
+ struct sk_buff *ln = NULL;
+ /* Can we recycle this skb? If not, simply return that we cannot */
+ if (unlikely(!consume_skb_can_recycle(skb, SKB_RECYCLE_MIN_SIZE,
+ SKB_RECYCLE_MAX_SIZE)))
+ return false;
+
+ /* If we can, then it will be much faster for us to recycle this one
+ * later than to allocate a new one from scratch.
+ */
+ h = &get_cpu_var(recycle_list);
+ local_irq_save(flags);
+ /* Attempt to enqueue the CPU hot recycle list first */
+ if (likely(skb_queue_len(h) < skb_recycle_max_skbs)) {
+ ln = skb_peek(h);
+ /* Recalculate the sum for peek of list as next and prev
+ * pointers of skb->next will be updated in __skb_queue_head
+ */
+ skbuff_debugobj_sum_validate(ln);
+ __skb_queue_head(h, skb);
+ skbuff_debugobj_deactivate(skb);
+ skbuff_debugobj_sum_update(ln);
+ local_irq_restore(flags);
+ preempt_enable();
+ return true;
+ }
+#ifdef CONFIG_SKB_RECYCLER_MULTI_CPU
+ h = this_cpu_ptr(&recycle_spare_list);
+
+ /* The CPU hot recycle list was full; if the spare list is also full,
+ * attempt to move the spare list to the global list for other CPUs to
+ * use.
+ */
+ if (unlikely(skb_queue_len(h) >= skb_recycle_spare_max_skbs)) {
+ u8 cur_tail, next_tail;
+
+ spin_lock(&glob_recycler.lock);
+ cur_tail = glob_recycler.tail;
+ next_tail = (cur_tail + 1) & SKB_RECYCLE_MAX_SHARED_POOLS_MASK;
+ if (next_tail != glob_recycler.head) {
+ struct sk_buff_head *p = &glob_recycler.pool[cur_tail];
+ struct sk_buff *hn = h->next, *hp = h->prev;
+
+ /* Move SKBs from CPU pool to Global pool*/
+ skbuff_debugobj_sum_validate(hp);
+ skbuff_debugobj_sum_validate(hn);
+ skb_queue_splice_init(h, p);
+ skbuff_debugobj_sum_update(hp);
+ skbuff_debugobj_sum_update(hn);
+
+ /* Done with global list init */
+ glob_recycler.tail = next_tail;
+ spin_unlock(&glob_recycler.lock);
+
+ /* Recalculate the sum for peek of list as next and prev
+ * pointers of skb->next will be updated in
+ * __skb_queue_head
+ */
+ ln = skb_peek(h);
+ skbuff_debugobj_sum_validate(ln);
+ /* We have now cleared room in the spare;
+ * Initialize and enqueue skb into spare
+ */
+ __skb_queue_head(h, skb);
+ skbuff_debugobj_sum_update(ln);
+ skbuff_debugobj_deactivate(skb);
+
+ local_irq_restore(flags);
+ preempt_enable();
+ return true;
+ }
+ /* We still have a full spare because the global is also full */
+ spin_unlock(&glob_recycler.lock);
+ } else {
+ /* We have room in the spare list; enqueue to spare list */
+ ln = skb_peek(h);
+ /* Recalculate the sum for peek of list as next and prev
+ * pointers of skb->next will be updated in __skb_queue_head
+ */
+ skbuff_debugobj_sum_validate(ln);
+ __skb_queue_head(h, skb);
+ skbuff_debugobj_deactivate(skb);
+ skbuff_debugobj_sum_update(ln);
+ local_irq_restore(flags);
+ preempt_enable();
+ return true;
+ }
+#endif
+
+ local_irq_restore(flags);
+ preempt_enable();
+
+ return false;
+}
+
+static void skb_recycler_free_skb(struct sk_buff_head *list)
+{
+ struct sk_buff *skb = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&list->lock, flags);
+ while ((skb = skb_peek(list)) != NULL) {
+ skbuff_debugobj_activate(skb);
+ __skb_unlink(skb, list);
+ skb_release_data(skb);
+ kfree_skbmem(skb);
+ }
+ spin_unlock_irqrestore(&list->lock, flags);
+}
+
+static int skb_cpu_callback(struct notifier_block *nfb,
+ unsigned long action, void *ocpu)
+{
+ unsigned long oldcpu = (unsigned long)ocpu;
+
+ if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+ skb_recycler_free_skb(&per_cpu(recycle_list, oldcpu));
+#ifdef CONFIG_SKB_RECYCLER_MULTI_CPU
+ spin_lock(&glob_recycler.lock);
+ skb_recycler_free_skb(&per_cpu(recycle_spare_list, oldcpu));
+ spin_unlock(&glob_recycler.lock);
+#endif
+ }
+
+ return NOTIFY_OK;
+}
+
+#ifdef CONFIG_SKB_RECYCLER_PREALLOC
+static int __init skb_prealloc_init_list(void)
+{
+ int i;
+ struct sk_buff *skb;
+
+ for (i = 0; i < SKB_RECYCLE_MAX_PREALLOC_SKBS; i++) {
+ skb = __alloc_skb(SKB_RECYCLE_MAX_SIZE + NET_SKB_PAD,
+ GFP_KERNEL, 0, NUMA_NO_NODE);
+ if (unlikely(!skb))
+ return -ENOMEM;
+
+ skb_reserve(skb, NET_SKB_PAD);
+
+ skb_recycler_consume(skb);
+ }
+ return 0;
+}
+#endif
+
+/* procfs: count
+ * Show skb counts
+ */
+static int proc_skb_count_show(struct seq_file *seq, void *v)
+{
+ int cpu;
+ int len;
+ int total;
+#ifdef CONFIG_SKB_RECYCLER_MULTI_CPU
+ unsigned int i;
+ unsigned long flags;
+#endif
+
+ total = 0;
+
+ for_each_online_cpu(cpu) {
+ len = skb_queue_len(&per_cpu(recycle_list, cpu));
+ seq_printf(seq, "recycle_list[%d]: %d\n", cpu, len);
+ total += len;
+ }
+
+#ifdef CONFIG_SKB_RECYCLER_MULTI_CPU
+ for_each_online_cpu(cpu) {
+ len = skb_queue_len(&per_cpu(recycle_spare_list, cpu));
+ seq_printf(seq, "recycle_spare_list[%d]: %d\n", cpu, len);
+ total += len;
+ }
+
+ for (i = 0; i < SKB_RECYCLE_MAX_SHARED_POOLS; i++) {
+ spin_lock_irqsave(&glob_recycler.lock, flags);
+ len = skb_queue_len(&glob_recycler.pool[i]);
+ spin_unlock_irqrestore(&glob_recycler.lock, flags);
+ seq_printf(seq, "global_list[%d]: %d\n", i, len);
+ total += len;
+ }
+#endif
+
+ seq_printf(seq, "total: %d\n", total);
+ return 0;
+}
+
+static int proc_skb_count_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, proc_skb_count_show, PDE_DATA(inode));
+}
+
+static const struct file_operations proc_skb_count_fops = {
+ .owner = THIS_MODULE,
+ .open = proc_skb_count_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/* procfs: flush
+ * Flush skbs
+ */
+static void skb_recycler_flush_task(struct work_struct *work)
+{
+ unsigned long flags;
+ struct sk_buff_head *h;
+ struct sk_buff_head tmp;
+
+ skb_queue_head_init(&tmp);
+
+ h = &get_cpu_var(recycle_list);
+ local_irq_save(flags);
+ skb_queue_splice_init(h, &tmp);
+ local_irq_restore(flags);
+ put_cpu_var(recycle_list);
+ skb_recycler_free_skb(&tmp);
+
+#ifdef CONFIG_SKB_RECYCLER_MULTI_CPU
+ h = &get_cpu_var(recycle_spare_list);
+ local_irq_save(flags);
+ skb_queue_splice_init(h, &tmp);
+ local_irq_restore(flags);
+ put_cpu_var(recycle_spare_list);
+ skb_recycler_free_skb(&tmp);
+#endif
+}
+
+static ssize_t proc_skb_flush_write(struct file *file,
+ const char __user *buf,
+ size_t count,
+ loff_t *ppos)
+{
+#ifdef CONFIG_SKB_RECYCLER_MULTI_CPU
+ unsigned int i;
+ unsigned long flags;
+#endif
+ schedule_on_each_cpu(&skb_recycler_flush_task);
+
+#ifdef CONFIG_SKB_RECYCLER_MULTI_CPU
+ spin_lock_irqsave(&glob_recycler.lock, flags);
+ for (i = 0; i < SKB_RECYCLE_MAX_SHARED_POOLS; i++)
+ skb_recycler_free_skb(&glob_recycler.pool[i]);
+ glob_recycler.head = 0;
+ glob_recycler.tail = 0;
+ spin_unlock_irqrestore(&glob_recycler.lock, flags);
+#endif
+ return count;
+}
+
+static const struct file_operations proc_skb_flush_fops = {
+ .owner = THIS_MODULE,
+ .write = proc_skb_flush_write,
+ .open = simple_open,
+ .llseek = noop_llseek,
+};
+
+/* procfs: max_skbs
+ * Show max skbs
+ */
+static int proc_skb_max_skbs_show(struct seq_file *seq, void *v)
+{
+ seq_printf(seq, "%d\n", skb_recycle_max_skbs);
+ return 0;
+}
+
+static int proc_skb_max_skbs_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, proc_skb_max_skbs_show, PDE_DATA(inode));
+}
+
+static ssize_t proc_skb_max_skbs_write(struct file *file,
+ const char __user *buf,
+ size_t count,
+ loff_t *ppos)
+{
+ int ret;
+ int max;
+ char buffer[13];
+
+ memset(buffer, 0, sizeof(buffer));
+ if (count > sizeof(buffer) - 1)
+ count = sizeof(buffer) - 1;
+ if (copy_from_user(buffer, buf, count) != 0)
+ return -EFAULT;
+ ret = kstrtoint(strstrip(buffer), 10, &max);
+ if (ret == 0 && max >= 0)
+ skb_recycle_max_skbs = max;
+
+ return count;
+}
+
+static const struct file_operations proc_skb_max_skbs_fops = {
+ .owner = THIS_MODULE,
+ .open = proc_skb_max_skbs_open,
+ .read = seq_read,
+ .write = proc_skb_max_skbs_write,
+ .release = single_release,
+};
+
+#ifdef CONFIG_SKB_RECYCLER_MULTI_CPU
+/* procfs: max_spare_skbs
+ * Show max spare skbs
+ */
+static int proc_skb_max_spare_skbs_show(struct seq_file *seq, void *v)
+{
+ seq_printf(seq, "%d\n", skb_recycle_spare_max_skbs);
+ return 0;
+}
+
+static int proc_skb_max_spare_skbs_open(struct inode *inode, struct file *file)
+{
+ return single_open(file,
+ proc_skb_max_spare_skbs_show,
+ PDE_DATA(inode));
+}
+
+static ssize_t
+proc_skb_max_spare_skbs_write(struct file *file,
+ const char __user *buf,
+ size_t count,
+ loff_t *ppos)
+{
+ int ret;
+ int max;
+ char buffer[13];
+
+ memset(buffer, 0, sizeof(buffer));
+ if (count > sizeof(buffer) - 1)
+ count = sizeof(buffer) - 1;
+ if (copy_from_user(buffer, buf, count) != 0)
+ return -EFAULT;
+ ret = kstrtoint(strstrip(buffer), 10, &max);
+ if (ret == 0 && max >= 0)
+ skb_recycle_spare_max_skbs = max;
+
+ return count;
+}
+
+static const struct file_operations proc_skb_max_spare_skbs_fops = {
+ .owner = THIS_MODULE,
+ .open = proc_skb_max_spare_skbs_open,
+ .read = seq_read,
+ .write = proc_skb_max_spare_skbs_write,
+ .release = single_release,
+};
+#endif /* CONFIG_SKB_RECYCLER_MULTI_CPU */
+
+static void skb_recycler_init_procfs(void)
+{
+ proc_net_skbrecycler = proc_mkdir("skb_recycler", init_net.proc_net);
+ if (!proc_net_skbrecycler) {
+		pr_err("cannot create skb_recycler proc dir\n");
+ return;
+ }
+
+ if (!proc_create("count",
+ S_IRUGO,
+ proc_net_skbrecycler,
+ &proc_skb_count_fops))
+ pr_err("cannot create proc net skb_recycle held\n");
+
+ if (!proc_create("flush",
+ S_IWUGO,
+ proc_net_skbrecycler,
+ &proc_skb_flush_fops))
+ pr_err("cannot create proc net skb_recycle flush\n");
+
+ if (!proc_create("max_skbs",
+ S_IRUGO | S_IWUGO,
+ proc_net_skbrecycler,
+ &proc_skb_max_skbs_fops))
+ pr_err("cannot create proc net skb_recycle max_skbs\n");
+
+#ifdef CONFIG_SKB_RECYCLER_MULTI_CPU
+ if (!proc_create("max_spare_skbs",
+ S_IRUGO | S_IWUGO,
+ proc_net_skbrecycler,
+ &proc_skb_max_spare_skbs_fops))
+ pr_err("cannot create proc net skb_recycle max_spare_skbs\n");
+#endif
+}
+
+void __init skb_recycler_init(void)
+{
+ int cpu;
+#ifdef CONFIG_SKB_RECYCLER_MULTI_CPU
+ unsigned int i;
+#endif
+
+ for_each_possible_cpu(cpu) {
+ skb_queue_head_init(&per_cpu(recycle_list, cpu));
+ }
+
+#ifdef CONFIG_SKB_RECYCLER_MULTI_CPU
+ for_each_possible_cpu(cpu) {
+ skb_queue_head_init(&per_cpu(recycle_spare_list, cpu));
+ }
+
+ spin_lock_init(&glob_recycler.lock);
+
+ for (i = 0; i < SKB_RECYCLE_MAX_SHARED_POOLS; i++)
+ skb_queue_head_init(&glob_recycler.pool[i]);
+ glob_recycler.head = 0;
+ glob_recycler.tail = 0;
+#endif
+
+#ifdef CONFIG_SKB_RECYCLER_PREALLOC
+ if (skb_prealloc_init_list())
+ pr_err("Failed to preallocate SKBs for recycle list\n");
+#endif
+
+ hotcpu_notifier(skb_cpu_callback, 0);
+ skbuff_debugobj_register_callback();
+ skb_recycler_init_procfs();
+}
+
+void skb_recycler_print_all_lists(void)
+{
+ unsigned long flags;
+ int cpu;
+#ifdef CONFIG_SKB_RECYCLER_MULTI_CPU
+ int i;
+
+ spin_lock_irqsave(&glob_recycler.lock, flags);
+ for (i = 0; i < SKB_RECYCLE_MAX_SHARED_POOLS; i++)
+ skbuff_debugobj_print_skb_list((&glob_recycler.pool[i])->next,
+ "Global Pool", -1);
+ spin_unlock_irqrestore(&glob_recycler.lock, flags);
+
+ preempt_disable();
+ local_irq_save(flags);
+ for_each_possible_cpu(cpu) {
+ struct sk_buff_head *h;
+
+ h = &per_cpu(recycle_spare_list, cpu);
+ skbuff_debugobj_print_skb_list(h->next, "Recycle Spare", cpu);
+ }
+ local_irq_restore(flags);
+ preempt_enable();
+#endif
+
+ preempt_disable();
+ local_irq_save(flags);
+ for_each_possible_cpu(cpu) {
+ struct sk_buff_head *h;
+
+ h = &per_cpu(recycle_list, cpu);
+ skbuff_debugobj_print_skb_list(h->next, "Recycle List", cpu);
+ }
+ local_irq_restore(flags);
+ preempt_enable();
+}
new file mode 100644
--- /dev/null
+++ b/net/core/skbuff_recycle.h
@@ -0,0 +1,170 @@
+/* Copyright (c) 2013-2017, The Linux Foundation. All rights reserved.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+/* Definitions for the skb recycler functions */
+#ifndef _LINUX_SKBUFF_RECYCLE_H
+#define _LINUX_SKBUFF_RECYCLE_H
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/cpu.h>
+#include <linux/kernel.h>
+#include <linux/kmemcheck.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#ifdef CONFIG_NET_CLS_ACT
+#include <net/pkt_sched.h>
+#endif
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/splice.h>
+#include <linux/init.h>
+#include <linux/prefetch.h>
+#include <linux/if.h>
+
+#define SKB_RECYCLE_SIZE 2304
+#define SKB_RECYCLE_MIN_SIZE SKB_RECYCLE_SIZE
+#define SKB_RECYCLE_MAX_SIZE (3904 - NET_SKB_PAD)
+#define SKB_RECYCLE_MAX_SKBS 1024
+
+#define SKB_RECYCLE_SPARE_MAX_SKBS 256
+
+#ifdef CONFIG_SKB_RECYCLER_PREALLOC
+#define SKB_RECYCLE_MAX_PREALLOC_SKBS CONFIG_SKB_RECYCLE_MAX_PREALLOC_SKBS
+#define SKB_RECYCLE_MAX_SHARED_POOLS \
+ DIV_ROUND_UP(SKB_RECYCLE_MAX_PREALLOC_SKBS, \
+ SKB_RECYCLE_SPARE_MAX_SKBS)
+#else
+#define SKB_RECYCLE_MAX_SHARED_POOLS 8
+#endif
+
+#define SKB_RECYCLE_MAX_SHARED_POOLS_MASK \
+ (SKB_RECYCLE_MAX_SHARED_POOLS - 1)
+
+#ifdef CONFIG_SKB_RECYCLER_MULTI_CPU
+struct global_recycler {
+ /* Global circular list which holds the shared skb pools */
+ struct sk_buff_head pool[SKB_RECYCLE_MAX_SHARED_POOLS];
+ u8 head; /* head of the circular list */
+ u8 tail; /* tail of the circular list */
+ spinlock_t lock;
+};
+#endif
+
+static __always_inline void zero_struct(void *v, int size)
+{
+ u32 *s = (u32 *)v;
+
+ /* We assume that size is word aligned; in fact, it's constant */
+ WARN_ON((size & 3) != 0);
+
+ /* This looks odd but we "know" size is a constant, and so the
+ * compiler can fold away all of the conditionals. The compiler is
+ * pretty smart here, and can fold away the loop, too!
+ */
+ while (size > 0) {
+ if (size >= 4)
+ s[0] = 0;
+ if (size >= 8)
+ s[1] = 0;
+ if (size >= 12)
+ s[2] = 0;
+ if (size >= 16)
+ s[3] = 0;
+ if (size >= 20)
+ s[4] = 0;
+ if (size >= 24)
+ s[5] = 0;
+ if (size >= 28)
+ s[6] = 0;
+ if (size >= 32)
+ s[7] = 0;
+ if (size >= 36)
+ s[8] = 0;
+ if (size >= 40)
+ s[9] = 0;
+ if (size >= 44)
+ s[10] = 0;
+ if (size >= 48)
+ s[11] = 0;
+ if (size >= 52)
+ s[12] = 0;
+ if (size >= 56)
+ s[13] = 0;
+ if (size >= 60)
+ s[14] = 0;
+ if (size >= 64)
+ s[15] = 0;
+ size -= 64;
+ s += 16;
+ }
+}
+
+static inline bool consume_skb_can_recycle(const struct sk_buff *skb,
+ int min_skb_size, int max_skb_size)
+{
+ if (unlikely(irqs_disabled()))
+ return false;
+
+ if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY))
+ return false;
+
+ if (unlikely(skb_is_nonlinear(skb)))
+ return false;
+
+ if (unlikely(skb_shinfo(skb)->frag_list))
+ return false;
+
+ if (unlikely(skb_shinfo(skb)->nr_frags))
+ return false;
+
+ if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE))
+ return false;
+
+ min_skb_size = SKB_DATA_ALIGN(min_skb_size + NET_SKB_PAD);
+ if (unlikely(skb_end_pointer(skb) - skb->head < min_skb_size))
+ return false;
+
+ max_skb_size = SKB_DATA_ALIGN(max_skb_size + NET_SKB_PAD);
+ if (unlikely(skb_end_pointer(skb) - skb->head > max_skb_size))
+ return false;
+
+ if (unlikely(skb_cloned(skb)))
+ return false;
+
+ if (unlikely(skb_pfmemalloc(skb)))
+ return false;
+
+ return true;
+}
+
+#ifdef CONFIG_SKB_RECYCLER
+void __init skb_recycler_init(void);
+struct sk_buff *skb_recycler_alloc(struct net_device *dev, unsigned int length);
+bool skb_recycler_consume(struct sk_buff *skb);
+void skb_recycler_print_all_lists(void);
+#else
+#define skb_recycler_init() {}
+#define skb_recycler_alloc(dev, len) NULL
+#define skb_recycler_consume(skb) false
+#define skb_recycler_print_all_lists() false
+#endif
+#endif
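
For a feel of the bounds enforced by consume_skb_can_recycle(): assuming NET_SKB_PAD and SMP_CACHE_BYTES are both 64 (a common 64-byte cache line configuration), the window works out to SKB_DATA_ALIGN(2304 + 64) = 2368 bytes at the low end and SKB_DATA_ALIGN((3904 - 64) + 64) = 3904 bytes at the high end, so only linear, unshared, non-cloned skbs whose buffer (skb_end_pointer(skb) - skb->head) falls within [2368, 3904] bytes are parked on the recycle lists; everything else takes the normal free path. With the default limits of 1024 skbs per CPU, 256 spare per CPU, and up to 8 shared pools of 256, the recycler can pin a few megabytes of buffer memory per core, which is the trade-off the flush and max_skbs proc knobs exist to control.
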
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1474,9 +1474,11 @@ void sk_destruct(struct sock *sk)
static void __sk_free(struct sock *sk)
{
+#ifdef CONFIG_SOCK_DIAG
if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
sock_diag_broadcast_destroy(sk);
else
+#endif
sk_destruct(sk);
}
@@ -3040,6 +3042,8 @@ static __net_initdata struct pernet_operations proto_net_ops = {
static int __init proto_init(void)
{
+ if (IS_ENABLED(CONFIG_PROC_STRIPPED))
+ return 0;
return register_pernet_subsys(&proto_net_ops);
}
--- a/net/core/timestamping.c
+++ b/net/core/timestamping.c
@@ -63,6 +63,9 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb)
if (!skb->dev || !skb->dev->phydev || !skb->dev->phydev->drv)
return false;
+ if (!(skb->dev->phydev->advertising & ADVERTISED_PTP))
+ return false;
+
if (skb_headroom(skb) < ETH_HLEN)
return false;
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -422,6 +422,9 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
newsk->sk_backlog_rcv = dccp_v4_do_rcv;
newnp->pktoptions = NULL;
newnp->opt = NULL;
+ newnp->ipv6_mc_list = NULL;
+ newnp->ipv6_ac_list = NULL;
+ newnp->ipv6_fl_list = NULL;
newnp->mcast_oif = inet6_iif(skb);
newnp->mcast_hops = ipv6_hdr(skb)->hop_limit;
@@ -486,6 +489,9 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
/* Clone RX bits */
newnp->rxopt.all = np->rxopt.all;
+ newnp->ipv6_mc_list = NULL;
+ newnp->ipv6_ac_list = NULL;
+ newnp->ipv6_fl_list = NULL;
newnp->pktoptions = NULL;
newnp->opt = NULL;
newnp->mcast_oif = inet6_iif(skb);
new file mode 100644
--- /dev/null
+++ b/net/dsa/mv88e6063.c
@@ -0,0 +1,294 @@
+/*
+ * net/dsa/mv88e6063.c - Driver for Marvell 88e6063 switch chips
+ * Copyright (c) 2009 Gabor Juhos <juhosg@openwrt.org>
+ *
+ * This driver was based on: net/dsa/mv88e6060.c
+ * net/dsa/mv88e6060.c - Driver for Marvell 88e6060 switch chips
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/phy.h>
+#include "dsa_priv.h"
+
+#define REG_BASE 0x10
+#define REG_PHY(p) (REG_BASE + (p))
+#define REG_PORT(p) (REG_BASE + 8 + (p))
+#define REG_GLOBAL (REG_BASE + 0x0f)
+#define NUM_PORTS 7
+
+static int reg_read(struct dsa_switch *ds, int addr, int reg)
+{
+ return mdiobus_read(ds->master_mii_bus, addr, reg);
+}
+
+#define REG_READ(addr, reg) \
+ ({ \
+ int __ret; \
+ \
+ __ret = reg_read(ds, addr, reg); \
+ if (__ret < 0) \
+ return __ret; \
+ __ret; \
+ })
+
+
+static int reg_write(struct dsa_switch *ds, int addr, int reg, u16 val)
+{
+ return mdiobus_write(ds->master_mii_bus, addr, reg, val);
+}
+
+#define REG_WRITE(addr, reg, val) \
+ ({ \
+ int __ret; \
+ \
+ __ret = reg_write(ds, addr, reg, val); \
+ if (__ret < 0) \
+ return __ret; \
+ })
+
+static char *mv88e6063_probe(struct mii_bus *bus, int sw_addr)
+{
+ int ret;
+
+ ret = mdiobus_read(bus, REG_PORT(0), 0x03);
+ if (ret >= 0) {
+ ret &= 0xfff0;
+ if (ret == 0x1530)
+ return "Marvell 88E6063";
+ }
+
+ return NULL;
+}
+
+static int mv88e6063_switch_reset(struct dsa_switch *ds)
+{
+ int i;
+ int ret;
+
+ /*
+ * Set all ports to the disabled state.
+ */
+ for (i = 0; i < NUM_PORTS; i++) {
+ ret = REG_READ(REG_PORT(i), 0x04);
+ REG_WRITE(REG_PORT(i), 0x04, ret & 0xfffc);
+ }
+
+ /*
+ * Wait for transmit queues to drain.
+ */
+ msleep(2);
+
+ /*
+ * Reset the switch.
+ */
+ REG_WRITE(REG_GLOBAL, 0x0a, 0xa130);
+
+ /*
+ * Wait up to one second for reset to complete.
+ */
+ for (i = 0; i < 1000; i++) {
+ ret = REG_READ(REG_GLOBAL, 0x00);
+ if ((ret & 0x8000) == 0x0000)
+ break;
+
+ msleep(1);
+ }
+ if (i == 1000)
+ return -ETIMEDOUT;
+
+ return 0;
+}
+
+static int mv88e6063_setup_global(struct dsa_switch *ds)
+{
+ /*
+ * Disable discarding of frames with excessive collisions,
+ * set the maximum frame size to 1536 bytes, and mask all
+ * interrupt sources.
+ */
+ REG_WRITE(REG_GLOBAL, 0x04, 0x0800);
+
+ /*
+ * Enable automatic address learning, set the address
+ * database size to 1024 entries, and set the default aging
+ * time to 5 minutes.
+ */
+ REG_WRITE(REG_GLOBAL, 0x0a, 0x2130);
+
+ return 0;
+}
+
+static int mv88e6063_setup_port(struct dsa_switch *ds, int p)
+{
+ int addr = REG_PORT(p);
+
+ /*
+ * Do not force flow control, disable Ingress and Egress
+ * Header tagging, disable VLAN tunneling, and set the port
+ * state to Forwarding. Additionally, if this is the CPU
+ * port, enable Ingress and Egress Trailer tagging mode.
+ */
+ REG_WRITE(addr, 0x04, dsa_is_cpu_port(ds, p) ? 0x4103 : 0x0003);
+
+ /*
+ * Port based VLAN map: give each port its own address
+ * database, allow the CPU port to talk to each of the 'real'
+ * ports, and allow each of the 'real' ports to only talk to
+ * the CPU port.
+ */
+ REG_WRITE(addr, 0x06,
+ ((p & 0xf) << 12) |
+ (dsa_is_cpu_port(ds, p) ?
+ ds->phys_port_mask :
+ (1 << ds->dst->cpu_port)));
+
+ /*
+ * Port Association Vector: when learning source addresses
+ * of packets, add the address to the address database using
+ * a port bitmap that has only the bit for this port set and
+ * the other bits clear.
+ */
+ REG_WRITE(addr, 0x0b, 1 << p);
+
+ return 0;
+}
+
+static int mv88e6063_setup(struct dsa_switch *ds)
+{
+ int i;
+ int ret;
+
+ ret = mv88e6063_switch_reset(ds);
+ if (ret < 0)
+ return ret;
+
+ /* @@@ initialise atu */
+
+ ret = mv88e6063_setup_global(ds);
+ if (ret < 0)
+ return ret;
+
+ for (i = 0; i < NUM_PORTS; i++) {
+ ret = mv88e6063_setup_port(ds, i);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int mv88e6063_set_addr(struct dsa_switch *ds, u8 *addr)
+{
+ REG_WRITE(REG_GLOBAL, 0x01, (addr[0] << 8) | addr[1]);
+ REG_WRITE(REG_GLOBAL, 0x02, (addr[2] << 8) | addr[3]);
+ REG_WRITE(REG_GLOBAL, 0x03, (addr[4] << 8) | addr[5]);
+
+ return 0;
+}
+
+static int mv88e6063_port_to_phy_addr(int port)
+{
+ if (port >= 0 && port < NUM_PORTS)
+ return REG_PHY(port);
+ return -1;
+}
+
+static int mv88e6063_phy_read(struct dsa_switch *ds, int port, int regnum)
+{
+ int addr;
+
+ addr = mv88e6063_port_to_phy_addr(port);
+ if (addr == -1)
+ return 0xffff;
+
+ return reg_read(ds, addr, regnum);
+}
+
+static int
+mv88e6063_phy_write(struct dsa_switch *ds, int port, int regnum, u16 val)
+{
+ int addr;
+
+ addr = mv88e6063_port_to_phy_addr(port);
+ if (addr == -1)
+ return 0xffff;
+
+ return reg_write(ds, addr, regnum, val);
+}
+
+static void mv88e6063_poll_link(struct dsa_switch *ds)
+{
+ int i;
+
+ for (i = 0; i < DSA_MAX_PORTS; i++) {
+ struct net_device *dev;
+ int uninitialized_var(port_status);
+ int link;
+ int speed;
+ int duplex;
+ int fc;
+
+ dev = ds->ports[i];
+ if (dev == NULL)
+ continue;
+
+ link = 0;
+ if (dev->flags & IFF_UP) {
+ port_status = reg_read(ds, REG_PORT(i), 0x00);
+ if (port_status < 0)
+ continue;
+
+ link = !!(port_status & 0x1000);
+ }
+
+ if (!link) {
+ if (netif_carrier_ok(dev)) {
+ printk(KERN_INFO "%s: link down\n", dev->name);
+ netif_carrier_off(dev);
+ }
+ continue;
+ }
+
+ speed = (port_status & 0x0100) ? 100 : 10;
+ duplex = (port_status & 0x0200) ? 1 : 0;
+ fc = ((port_status & 0xc000) == 0xc000) ? 1 : 0;
+
+ if (!netif_carrier_ok(dev)) {
+ printk(KERN_INFO "%s: link up, %d Mb/s, %s duplex, "
+ "flow control %sabled\n", dev->name,
+ speed, duplex ? "full" : "half",
+ fc ? "en" : "dis");
+ netif_carrier_on(dev);
+ }
+ }
+}
+
+static struct dsa_switch_driver mv88e6063_switch_driver = {
+ .tag_protocol = htons(ETH_P_TRAILER),
+ .probe = mv88e6063_probe,
+ .setup = mv88e6063_setup,
+ .set_addr = mv88e6063_set_addr,
+ .phy_read = mv88e6063_phy_read,
+ .phy_write = mv88e6063_phy_write,
+ .poll_link = mv88e6063_poll_link,
+};
+
+static int __init mv88e6063_init(void)
+{
+ register_switch_driver(&mv88e6063_switch_driver);
+ return 0;
+}
+module_init(mv88e6063_init);
+
+static void __exit mv88e6063_cleanup(void)
+{
+ unregister_switch_driver(&mv88e6063_switch_driver);
+}
+module_exit(mv88e6063_cleanup);
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -140,6 +140,18 @@ u32 eth_get_headlen(void *data, unsigned int len)
}
EXPORT_SYMBOL(eth_get_headlen);
+static inline bool
+eth_check_local_mask(const void *addr1, const void *addr2, const void *mask)
+{
+ const u16 *a1 = addr1;
+ const u16 *a2 = addr2;
+ const u16 *m = mask;
+
+ return (((a1[0] ^ a2[0]) & ~m[0]) |
+ ((a1[1] ^ a2[1]) & ~m[1]) |
+ ((a1[2] ^ a2[2]) & ~m[2]));
+}
+
/**
* eth_type_trans - determine the packet's protocol ID.
* @skb: received socket data
@@ -156,6 +168,12 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
const struct ethhdr *eth;
skb->dev = dev;
+
+#ifdef CONFIG_ETHERNET_PACKET_MANGLE
+ if (dev->eth_mangle_rx)
+ dev->eth_mangle_rx(dev, skb);
+#endif
+
skb_reset_mac_header(skb);
eth = (struct ethhdr *)skb->data;
@@ -168,8 +186,12 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
skb->pkt_type = PACKET_MULTICAST;
}
else if (unlikely(!ether_addr_equal_64bits(eth->h_dest,
- dev->dev_addr)))
+ dev->dev_addr))) {
skb->pkt_type = PACKET_OTHERHOST;
+ if (eth_check_local_mask(eth->h_dest, dev->dev_addr,
+ dev->local_addr_mask))
+ skb->gro_skip = 1;
+ }
/*
* Some variants of DSA tagging don't have an ethertype field
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -414,6 +414,7 @@ config INET_LRO
config INET_DIAG
tristate "INET: socket monitoring interface"
+ select SOCK_DIAG
default y
---help---
Support for INET (TCP, DCCP, etc) socket monitoring interface used by
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1321,8 +1321,8 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
if (unlikely(ip_fast_csum((u8 *)iph, 5)))
goto out_unlock;
- id = ntohl(*(__be32 *)&iph->id);
- flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
+ id = ntohl(net_hdr_word(&iph->id));
+ flush = (u16)((ntohl(net_hdr_word(iph)) ^ skb_gro_len(skb)) | (id & ~IP_DF));
id >>= 16;
for (p = *head; p; p = p->next) {
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -150,6 +150,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
int assoclen;
int extralen;
__be64 seqno;
+ bool nosupp_sg;
/* skb is pure payload to encrypt */
@@ -157,6 +158,12 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
alen = crypto_aead_authsize(aead);
ivlen = crypto_aead_ivsize(aead);
+ nosupp_sg = crypto_tfm_alg_flags(&aead->base) & CRYPTO_ALG_NOSUPP_SG;
+ if (nosupp_sg && skb_linearize(skb)) {
+ err = -ENOMEM;
+ goto error;
+ }
+
tfclen = 0;
if (x->tfcpad) {
struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
@@ -430,6 +437,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
u8 *iv;
struct scatterlist *sg;
int err = -EINVAL;
+ bool nosupp_sg;
if (!pskb_may_pull(skb, sizeof(*esph) + ivlen))
goto out;
@@ -437,6 +445,12 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
if (elen <= 0)
goto out;
+ nosupp_sg = crypto_tfm_alg_flags(&aead->base) & CRYPTO_ALG_NOSUPP_SG;
+ if (nosupp_sg && skb_linearize(skb)) {
+ err = -ENOMEM;
+ goto out;
+ }
+
err = skb_cow_data(skb, 0, &trailer);
if (err < 0)
goto out;
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -138,6 +138,10 @@ const struct fib_prop fib_props[RTN_MAX + 1] = {
.error = -EINVAL,
.scope = RT_SCOPE_NOWHERE,
},
+ [RTN_POLICY_FAILED] = {
+ .error = -EACCES,
+ .scope = RT_SCOPE_UNIVERSE,
+ },
};
static void rt_fibinfo_free(struct rtable __rcu **rtp)
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1077,6 +1077,9 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp,
return 0;
}
+/* Define route change notification chain. */
+static BLOCKING_NOTIFIER_HEAD(iproute_chain);
+
/* Caller must hold RTNL. */
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
{
@@ -1246,6 +1249,8 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
&cfg->fc_nlinfo, nlflags);
succeeded:
+ blocking_notifier_call_chain(&iproute_chain,
+ RTM_NEWROUTE, fi);
return 0;
out_sw_fib_del:
@@ -1554,6 +1559,8 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
if (fa_to_delete->fa_state & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net);
+ blocking_notifier_call_chain(&iproute_chain,
+ RTM_DELROUTE, fa_to_delete->fa_info);
fib_release_info(fa_to_delete->fa_info);
alias_free_mem_rcu(fa_to_delete);
return 0;
@@ -1982,6 +1989,18 @@ void __init fib_trie_init(void)
0, SLAB_PANIC, NULL);
}
+int ip_rt_register_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&iproute_chain, nb);
+}
+EXPORT_SYMBOL(ip_rt_register_notifier);
+
+int ip_rt_unregister_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&iproute_chain, nb);
+}
+EXPORT_SYMBOL(ip_rt_unregister_notifier);
+
struct fib_table *fib_trie_table(u32 id, struct fib_table *alias)
{
struct fib_table *tb;
@@ -2368,6 +2387,7 @@ static const char *const rtn_type_names[__RTN_MAX] = {
[RTN_THROW] = "THROW",
[RTN_NAT] = "NAT",
[RTN_XRESOLVE] = "XRESOLVE",
+ [RTN_POLICY_FAILED] = "POLICY_FAILED",
};
static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
@@ -2638,10 +2658,12 @@ static const struct file_operations fib_route_fops = {
int __net_init fib_proc_init(struct net *net)
{
- if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED) &&
+ !proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
goto out1;
- if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED) &&
+ !proc_create("fib_triestat", S_IRUGO, net->proc_net,
&fib_triestat_fops))
goto out2;
@@ -2651,17 +2673,21 @@ int __net_init fib_proc_init(struct net *net)
return 0;
out3:
- remove_proc_entry("fib_triestat", net->proc_net);
+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED))
+ remove_proc_entry("fib_triestat", net->proc_net);
out2:
- remove_proc_entry("fib_trie", net->proc_net);
+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED))
+ remove_proc_entry("fib_trie", net->proc_net);
out1:
return -ENOMEM;
}
void __net_exit fib_proc_exit(struct net *net)
{
- remove_proc_entry("fib_trie", net->proc_net);
- remove_proc_entry("fib_triestat", net->proc_net);
+ if (!IS_ENABLED(CONFIG_PROC_STRIPPED)) {
+ remove_proc_entry("fib_trie", net->proc_net);
+ remove_proc_entry("fib_triestat", net->proc_net);
+ }
remove_proc_entry("route", net->proc_net);
}
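
As a usage illustration for the route notifier chain added to fib_trie.c above, here is a hedged sketch of an out-of-tree module registering for RTM_NEWROUTE/RTM_DELROUTE events. It assumes the ip_rt_register_notifier()/ip_rt_unregister_notifier() prototypes are exposed by the companion header change; the module and function names are placeholders.

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/rtnetlink.h>
#include <net/ip_fib.h>

/* Called from the blocking notifier chain with a struct fib_info pointer,
 * matching the blocking_notifier_call_chain() calls in fib_table_insert()
 * and fib_table_delete() above.
 */
static int example_route_event(struct notifier_block *nb,
			       unsigned long event, void *ptr)
{
	struct fib_info *fi = ptr;

	if (event == RTM_NEWROUTE)
		pr_info("route added (prefsrc %pI4)\n", &fi->fib_prefsrc);
	else if (event == RTM_DELROUTE)
		pr_info("route deleted\n");

	return NOTIFY_DONE;
}

static struct notifier_block example_route_nb = {
	.notifier_call = example_route_event,
};

static int __init example_route_init(void)
{
	return ip_rt_register_notifier(&example_route_nb);
}

static void __exit example_route_exit(void)
{
	ip_rt_unregister_notifier(&example_route_nb);
}

module_init(example_route_init);
module_exit(example_route_exit);
MODULE_LICENSE("GPL");
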
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -505,7 +505,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
if (!skb)
return NULL;
psrc = (__be32 *)skb_put(skb, sizeof(__be32));
- *psrc = psf->sf_inaddr;
+ net_hdr_word(psrc) = psf->sf_inaddr;
scount++; stotal++;
if ((type == IGMPV3_ALLOW_NEW_SOURCES ||
type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) {
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -669,6 +669,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);
newsk->sk_write_space = sk_stream_write_space;
+ inet_sk(newsk)->mc_list = NULL;
+
newsk->sk_mark = inet_rsk(req)->ir_mark;
atomic64_set(&newsk->sk_cookie,
atomic64_read(&inet_rsk(req)->ir_cookie));
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -39,6 +39,9 @@
#include <net/route.h>
#include <net/xfrm.h>
+int sysctl_ip_use_legacy_tos __read_mostly = 1;
+EXPORT_SYMBOL(sysctl_ip_use_legacy_tos);
+
static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
{
if (skb->len <= mtu)
@@ -143,7 +146,11 @@ int ip_forward(struct sk_buff *skb)
!skb_sec_path(skb))
ip_rt_send_redirect(skb);
- skb->priority = rt_tos2priority(iph->tos);
+ /*
+ * Set skb priority using legacy ToS method if required.
+ */
+ if (sysctl_ip_use_legacy_tos != 0)
+ skb->priority = rt_tos2priority(iph->tos);
return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
net, NULL, skb, skb->dev, rt->dst.dev,
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -633,6 +633,8 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
if (IS_ERR(skb))
goto out;
+ skb->skb_iif = dev->ifindex;
+
__gre_xmit(skb, dev, tnl_params, skb->protocol);
return NETDEV_TX_OK;
@@ -660,6 +662,8 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
if (skb_cow_head(skb, dev->needed_headroom))
goto free_skb;
+ skb->skb_iif = dev->ifindex;
+
__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
return NETDEV_TX_OK;
@@ -703,7 +707,6 @@ static int ipgre_tunnel_ioctl(struct net_device *dev,
It allows to construct virtual multiprotocol broadcast "LAN"
over the Internet, provided multicast routing is tuned.
-
I have no idea was this bicycle invented before me,
so that I had to set ARPHRD_IPGRE to a random value.
I have an impression, that Cisco could make something similar,
@@ -1062,7 +1065,7 @@ static void ipgre_tap_setup(struct net_device *dev)
{
ether_setup(dev);
dev->netdev_ops = &gre_tap_netdev_ops;
- dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_GRE_V4_TAP;
ip_tunnel_setup(dev, gre_tap_net_id);
}
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -922,10 +922,12 @@ static int __ip_append_data(struct sock *sk,
csummode = CHECKSUM_PARTIAL;
cork->length += length;
- if (((length > mtu) || (skb && skb_is_gso(skb))) &&
+ if ((skb && skb_is_gso(skb)) ||
+ (((length + (skb ? skb->len : fragheaderlen)) > mtu) &&
+ (skb_queue_len(queue) <= 1) &&
(sk->sk_protocol == IPPROTO_UDP) &&
- (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
- (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) {
+ (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) &&
+ (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx)) {
err = ip_ufo_append_data(sk, queue, getfrag, from, length,
hh_len, fragheaderlen, transhdrlen,
maxfraglen, flags);
@@ -1241,6 +1243,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
return -EINVAL;
if ((size + skb->len > mtu) &&
+ (skb_queue_len(&sk->sk_write_queue) == 1) &&
(sk->sk_protocol == IPPROTO_UDP) &&
(rt->dst.dev->features & NETIF_F_UFO)) {
if (skb->ip_summed != CHECKSUM_PARTIAL)
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -47,6 +47,7 @@
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/dst_metadata.h>
+#include <net/vxlan.h>
int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
__be32 src, __be32 dst, __u8 proto,
@@ -55,7 +56,12 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
int pkt_len = skb->len - skb_inner_network_offset(skb);
struct net *net = dev_net(rt->dst.dev);
struct iphdr *iph;
+ struct net_device *in_dev = NULL;
int err;
+ int skb_iif;
+
+ /* Save input interface index */
+ skb_iif = skb->skb_iif;
skb_scrub_packet(skb, xnet);
@@ -79,7 +85,16 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
iph->ttl = ttl;
__ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1);
+ /* Get input interface */
+ if (skb_iif)
+ in_dev = __dev_get_by_index(&init_net, skb_iif);
+
+ if (proto == IPPROTO_IPV6 || proto == IPPROTO_GRE ||
+ is_vxlan_dev(in_dev))
+ skb->skb_iif = skb_iif;
+
err = ip_local_out(net, sk, skb);
+
if (unlikely(net_xmit_eval(err)))
pkt_len = 0;
return pkt_len;
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -136,6 +136,9 @@ static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
int cmd);
static void mroute_clean_tables(struct mr_table *mrt, bool all);
static void ipmr_expire_process(unsigned long arg);
+static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, __be32 origin,
+ __be32 mcastgrp);
+static ipmr_mfc_event_offload_callback_t __rcu ipmr_mfc_event_offload_callback;
#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
#define ipmr_for_each_table(mrt, net) \
@@ -182,6 +185,7 @@ static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
case FR_ACT_UNREACHABLE:
return -ENETUNREACH;
case FR_ACT_PROHIBIT:
+ case FR_ACT_POLICY_FAILED:
return -EACCES;
case FR_ACT_BLACKHOLE:
default:
@@ -225,6 +229,78 @@ static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
return 0;
}
+/* ipmr_sync_entry_update()
+ * Call the registered offload callback to report an update to a multicast
+ * route entry. The callback receives the list of destination interfaces and
+ * the interface count
+ */
+static void ipmr_sync_entry_update(struct mr_table *mrt,
+ struct mfc_cache *cache)
+{
+ int vifi, dest_if_count = 0;
+ u32 dest_dev[MAXVIFS];
+ __be32 origin;
+ __be32 group;
+ ipmr_mfc_event_offload_callback_t offload_update_cb_f;
+
+ memset(dest_dev, 0, sizeof(dest_dev));
+
+ origin = cache->mfc_origin;
+ group = cache->mfc_mcastgrp;
+
+ read_lock(&mrt_lock);
+ for (vifi = 0; vifi < cache->mfc_un.res.maxvif; vifi++) {
+ if (!((cache->mfc_un.res.ttls[vifi] > 0) &&
+ (cache->mfc_un.res.ttls[vifi] < 255))) {
+ continue;
+ }
+ if (dest_if_count == MAXVIFS) {
+ read_unlock(&mrt_lock);
+ return;
+ }
+
+ if (!VIF_EXISTS(mrt, vifi)) {
+ read_unlock(&mrt_lock);
+ return;
+ }
+ dest_dev[dest_if_count] = mrt->vif_table[vifi].dev->ifindex;
+ dest_if_count++;
+ }
+ read_unlock(&mrt_lock);
+
+ rcu_read_lock();
+ offload_update_cb_f = rcu_dereference(ipmr_mfc_event_offload_callback);
+
+ if (!offload_update_cb_f) {
+ rcu_read_unlock();
+ return;
+ }
+
+ offload_update_cb_f(group, origin, dest_if_count, dest_dev,
+ IPMR_MFC_EVENT_UPDATE);
+ rcu_read_unlock();
+}
+
+/* ipmr_sync_entry_delete()
+ * Call the registered offload callback to inform of a multicast route entry
+ * delete event
+ */
+static void ipmr_sync_entry_delete(u32 origin, u32 group)
+{
+ ipmr_mfc_event_offload_callback_t offload_update_cb_f;
+
+ rcu_read_lock();
+ offload_update_cb_f = rcu_dereference(ipmr_mfc_event_offload_callback);
+
+ if (!offload_update_cb_f) {
+ rcu_read_unlock();
+ return;
+ }
+
+ offload_update_cb_f(group, origin, 0, NULL, IPMR_MFC_EVENT_DELETE);
+ rcu_read_unlock();
+}
+
static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = {
.family = RTNL_FAMILY_IPMR,
.rule_size = sizeof(struct ipmr_rule),
@@ -239,6 +315,150 @@ static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = {
.owner = THIS_MODULE,
};
+/* ipmr_register_mfc_event_offload_callback()
+ * Register the IPv4 Multicast update offload callback with IPMR
+ */
+bool ipmr_register_mfc_event_offload_callback(
+ ipmr_mfc_event_offload_callback_t mfc_offload_cb)
+{
+ ipmr_mfc_event_offload_callback_t offload_update_cb_f;
+
+ rcu_read_lock();
+ offload_update_cb_f = rcu_dereference(ipmr_mfc_event_offload_callback);
+
+ if (offload_update_cb_f) {
+ rcu_read_unlock();
+ return false;
+ }
+
+ rcu_assign_pointer(ipmr_mfc_event_offload_callback, mfc_offload_cb);
+ rcu_read_unlock();
+ return true;
+}
+EXPORT_SYMBOL(ipmr_register_mfc_event_offload_callback);
+
+/* ipmr_unregister_mfc_event_offload_callback()
+ * De-register the IPv4 Multicast update offload callback with IPMR
+ */
+void ipmr_unregister_mfc_event_offload_callback(void)
+{
+ rcu_read_lock();
+ rcu_assign_pointer(ipmr_mfc_event_offload_callback, NULL);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(ipmr_unregister_mfc_event_offload_callback);
+
+/* ipmr_find_mfc_entry()
+ * Returns destination interface list for a particular multicast flow, and
+ * the number of interfaces in the list
+ */
+int ipmr_find_mfc_entry(struct net *net, __be32 origin, __be32 group,
+ u32 max_dest_cnt, u32 dest_dev[])
+{
+ int vifi, dest_if_count = 0;
+ struct mr_table *mrt;
+ struct mfc_cache *cache;
+
+ mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
+ if (!mrt)
+ return -ENOENT;
+
+ rcu_read_lock();
+ cache = ipmr_cache_find(mrt, origin, group);
+ if (!cache) {
+ rcu_read_unlock();
+ return -ENOENT;
+ }
+
+ read_lock(&mrt_lock);
+ for (vifi = 0; vifi < cache->mfc_un.res.maxvif; vifi++) {
+ if (!((cache->mfc_un.res.ttls[vifi] > 0) &&
+ (cache->mfc_un.res.ttls[vifi] < 255))) {
+ continue;
+ }
+
+ /* We have another valid destination interface entry. Check
+ * whether the number of destination interfaces for the route
+ * exceeds the size of the array given to us
+ */
+ if (dest_if_count == max_dest_cnt) {
+ read_unlock(&mrt_lock);
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+
+ if (!VIF_EXISTS(mrt, vifi)) {
+ read_unlock(&mrt_lock);
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+
+ dest_dev[dest_if_count] = mrt->vif_table[vifi].dev->ifindex;
+ dest_if_count++;
+ }
+ read_unlock(&mrt_lock);
+ rcu_read_unlock();
+
+ return dest_if_count;
+}
+EXPORT_SYMBOL(ipmr_find_mfc_entry);
+
+/* ipmr_mfc_stats_update()
+ * Update the MFC/VIF statistics for offloaded flows
+ */
+int ipmr_mfc_stats_update(struct net *net, __be32 origin, __be32 group,
+ u64 pkts_in, u64 bytes_in,
+ u64 pkts_out, u64 bytes_out)
+{
+ int vif, vifi;
+ struct mr_table *mrt;
+ struct mfc_cache *cache;
+
+ mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
+ if (!mrt)
+ return -ENOENT;
+
+ rcu_read_lock();
+ cache = ipmr_cache_find(mrt, origin, group);
+ if (!cache) {
+ rcu_read_unlock();
+ return -ENOENT;
+ }
+
+ vif = cache->mfc_parent;
+
+ read_lock(&mrt_lock);
+ if (!VIF_EXISTS(mrt, vif)) {
+ read_unlock(&mrt_lock);
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+
+ mrt->vif_table[vif].pkt_in += pkts_in;
+ mrt->vif_table[vif].bytes_in += bytes_in;
+ cache->mfc_un.res.pkt += pkts_out;
+ cache->mfc_un.res.bytes += bytes_out;
+
+ for (vifi = cache->mfc_un.res.minvif;
+ vifi < cache->mfc_un.res.maxvif; vifi++) {
+ if ((cache->mfc_un.res.ttls[vifi] > 0) &&
+ (cache->mfc_un.res.ttls[vifi] < 255)) {
+ if (!VIF_EXISTS(mrt, vifi)) {
+ read_unlock(&mrt_lock);
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ mrt->vif_table[vifi].pkt_out += pkts_out;
+ mrt->vif_table[vifi].bytes_out += bytes_out;
+ }
+ }
+ read_unlock(&mrt_lock);
+ rcu_read_unlock();
+
+ return 0;
+}
+EXPORT_SYMBOL(ipmr_mfc_stats_update);
+
static int __net_init ipmr_rules_init(struct net *net)
{
struct fib_rules_ops *ops;
@@ -1106,6 +1326,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
{
int line;
struct mfc_cache *c, *next;
+ u32 origin, group;
line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
@@ -1113,9 +1334,14 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr &&
(parent == -1 || parent == c->mfc_parent)) {
+ origin = c->mfc_origin;
+ group = c->mfc_mcastgrp;
list_del_rcu(&c->list);
mroute_netlink_event(mrt, c, RTM_DELROUTE);
ipmr_cache_free(c);
+
+ /* Inform offload modules of the delete event */
+ ipmr_sync_entry_delete(origin, group);
return 0;
}
}
@@ -1151,6 +1377,9 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
c->mfc_flags |= MFC_STATIC;
write_unlock_bh(&mrt_lock);
mroute_netlink_event(mrt, c, RTM_NEWROUTE);
+
+ /* Inform offload modules of the update event */
+ ipmr_sync_entry_update(mrt, c);
return 0;
}
@@ -1207,6 +1436,7 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
int i;
LIST_HEAD(list);
struct mfc_cache *c, *next;
+ u32 origin, group;
/* Shut down all active vif entries */
@@ -1223,9 +1453,14 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
if (!all && (c->mfc_flags & MFC_STATIC))
continue;
+ origin = c->mfc_origin;
+ group = c->mfc_mcastgrp;
list_del_rcu(&c->list);
mroute_netlink_event(mrt, c, RTM_DELROUTE);
ipmr_cache_free(c);
+
+ /* Inform offload modules of the delete event */
+ ipmr_sync_entry_delete(origin, group);
}
}
@@ -2496,7 +2731,7 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
const char *name = vif->dev ? vif->dev->name : "none";
seq_printf(seq,
- "%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n",
+ "%2Zd %-10s %8llu %7llu %8llu %7llu %05X %08X %08X\n",
vif - mrt->vif_table,
name, vif->bytes_in, vif->pkt_in,
vif->bytes_out, vif->pkt_out,
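
To make the new ipmr offload hooks above easier to follow, here is a hedged sketch of an offload driver registering the MFC event callback. The exact typedef and the IPMR_MFC_EVENT_* constants are assumed to come from the companion linux/mroute.h change, and all example_ names are placeholders.

#include <linux/module.h>
#include <linux/mroute.h>

/* Signature mirrors how ipmr_sync_entry_update()/_delete() invoke the
 * callback above: (group, origin, destination count, destination ifindex
 * array, event type).  Treat the exact prototype as an assumption until
 * checked against the header change.
 */
static void example_mfc_event(__be32 group, __be32 origin, u32 if_cnt,
			      u32 dst_dev[], u32 op)
{
	if (op == IPMR_MFC_EVENT_UPDATE)
		pr_info("mfc (%pI4, %pI4): %u destination(s)\n",
			&origin, &group, if_cnt);
	else if (op == IPMR_MFC_EVENT_DELETE)
		pr_info("mfc (%pI4, %pI4) deleted\n", &origin, &group);
}

static int __init example_mfc_init(void)
{
	/* Only one callback may be registered at a time. */
	if (!ipmr_register_mfc_event_offload_callback(example_mfc_event))
		return -EBUSY;
	return 0;
}

static void __exit example_mfc_exit(void)
{
	ipmr_unregister_mfc_event_offload_callback();
}

module_init(example_mfc_init);
module_exit(example_mfc_exit);
MODULE_LICENSE("GPL");
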
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -329,6 +329,10 @@ unsigned int arpt_do_table(struct sk_buff *skb,
}
if (table_base + v
!= arpt_next_entry(e)) {
+ if (unlikely(stackidx >= private->stacksize)) {
+ verdict = NF_DROP;
+ break;
+ }
jumpstack[stackidx++] = e;
}
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -82,9 +82,14 @@ ip_packet_match(const struct iphdr *ip,
#define FWINV(bool, invflg) ((bool) ^ !!(ipinfo->invflags & (invflg)))
- if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr,
+ if (ipinfo->flags & IPT_F_NO_DEF_MATCH)
+ return true;
+
+ if (FWINV(ipinfo->smsk.s_addr &&
+ (ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr,
IPT_INV_SRCIP) ||
- FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr,
+ FWINV(ipinfo->dmsk.s_addr &&
+ (ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr,
IPT_INV_DSTIP)) {
dprintf("Source or dest mismatch.\n");
@@ -135,6 +140,29 @@ ip_packet_match(const struct iphdr *ip,
return true;
}
+static void
+ip_checkdefault(struct ipt_ip *ip)
+{
+ static const char iface_mask[IFNAMSIZ] = {};
+
+ if (ip->invflags || ip->flags & IPT_F_FRAG)
+ return;
+
+ if (memcmp(ip->iniface_mask, iface_mask, IFNAMSIZ) != 0)
+ return;
+
+ if (memcmp(ip->outiface_mask, iface_mask, IFNAMSIZ) != 0)
+ return;
+
+ if (ip->smsk.s_addr || ip->dmsk.s_addr)
+ return;
+
+ if (ip->proto)
+ return;
+
+ ip->flags |= IPT_F_NO_DEF_MATCH;
+}
+
static bool
ip_checkentry(const struct ipt_ip *ip)
{
@@ -282,6 +310,33 @@ struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry)
return (void *)entry + entry->next_offset;
}
+static bool
+ipt_handle_default_rule(struct ipt_entry *e, unsigned int *verdict)
+{
+ struct xt_entry_target *t;
+ struct xt_standard_target *st;
+
+ if (e->target_offset != sizeof(struct ipt_entry))
+ return false;
+
+ if (!(e->ip.flags & IPT_F_NO_DEF_MATCH))
+ return false;
+
+ t = ipt_get_target(e);
+ if (t->u.kernel.target->target)
+ return false;
+
+ st = (struct xt_standard_target *) t;
+ if (st->verdict == XT_RETURN)
+ return false;
+
+ if (st->verdict >= 0)
+ return false;
+
+ *verdict = (unsigned)(-st->verdict) - 1;
+ return true;
+}
+
/* Returns one of the generic firewall policies, like NF_ACCEPT. */
unsigned int
ipt_do_table(struct sk_buff *skb,
@@ -302,28 +357,8 @@ ipt_do_table(struct sk_buff *skb,
unsigned int addend;
/* Initialization */
- stackidx = 0;
- ip = ip_hdr(skb);
- indev = state->in ? state->in->name : nulldevname;
- outdev = state->out ? state->out->name : nulldevname;
- /* We handle fragments by dealing with the first fragment as
- * if it was a normal packet. All other fragments are treated
- * normally, except that they will NEVER match rules that ask
- * things we don't know, ie. tcp syn flag or ports). If the
- * rule is also a fragment-specific rule, non-fragments won't
- * match it. */
- acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
- acpar.thoff = ip_hdrlen(skb);
- acpar.hotdrop = false;
- acpar.net = state->net;
- acpar.in = state->in;
- acpar.out = state->out;
- acpar.family = NFPROTO_IPV4;
- acpar.hooknum = hook;
-
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
local_bh_disable();
- addend = xt_write_recseq_begin();
private = table->private;
cpu = smp_processor_id();
/*
@@ -332,6 +367,23 @@ ipt_do_table(struct sk_buff *skb,
*/
smp_read_barrier_depends();
table_base = private->entries;
+
+ e = get_entry(table_base, private->hook_entry[hook]);
+ if (ipt_handle_default_rule(e, &verdict)) {
+ struct xt_counters *counter;
+
+ counter = xt_get_this_cpu_counter(&e->counters);
+ ADD_COUNTER(*counter, skb->len, 1);
+ local_bh_enable();
+ return verdict;
+ }
+
+ stackidx = 0;
+ ip = ip_hdr(skb);
+ indev = state->in ? state->in->name : nulldevname;
+ outdev = state->out ? state->out->name : nulldevname;
+
+ addend = xt_write_recseq_begin();
jumpstack = (struct ipt_entry **)private->jumpstack[cpu];
/* Switch to alternate jumpstack if we're being invoked via TEE.
@@ -344,7 +396,20 @@ ipt_do_table(struct sk_buff *skb,
if (static_key_false(&xt_tee_enabled))
jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
- e = get_entry(table_base, private->hook_entry[hook]);
+ /* We handle fragments by dealing with the first fragment as
+ * if it was a normal packet. All other fragments are treated
+ * normally, except that they will NEVER match rules that ask
+ * things we don't know, ie. tcp syn flag or ports). If the
+ * rule is also a fragment-specific rule, non-fragments won't
+ * match it. */
+ acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
+ acpar.thoff = ip_hdrlen(skb);
+ acpar.hotdrop = false;
+ acpar.net = state->net;
+ acpar.in = state->in;
+ acpar.out = state->out;
+ acpar.family = NFPROTO_IPV4;
+ acpar.hooknum = hook;
pr_debug("Entering %s(hook %u), UF %p\n",
table->name, hook,
@@ -408,9 +473,11 @@ ipt_do_table(struct sk_buff *skb,
}
if (table_base + v != ipt_next_entry(e) &&
!(e->ip.flags & IPT_F_GOTO)) {
+ if (unlikely(stackidx >= private->stacksize)) {
+ verdict = NF_DROP;
+ break;
+ }
jumpstack[stackidx++] = e;
- pr_debug("Pushed %p into pos %u\n",
- e, stackidx - 1);
}
e = get_entry(table_base, v);
@@ -587,6 +654,28 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net)
module_put(par.match->me);
}
+static int
+check_entry(struct ipt_entry *e)
+{
+ const struct xt_entry_target *t;
+
+ if (!ip_checkentry(&e->ip))
+ return -EINVAL;
+
+ ip_checkdefault(&e->ip);
+
+ if (e->target_offset + sizeof(struct xt_entry_target) >
+ e->next_offset)
+ return -EINVAL;
+
+ t = ipt_get_target_c(e);
+
+ if (e->target_offset + t->u.target_size > e->next_offset)
+ return -EINVAL;
+
+ return 0;
+}
+
static int
check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
{
@@ -664,6 +753,10 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
struct xt_mtchk_param mtpar;
struct xt_entry_match *ematch;
+ ret = check_entry(e);
+ if (ret)
+ return ret;
+
e->counters.pcnt = xt_percpu_counter_alloc();
if (IS_ERR_VALUE(e->counters.pcnt))
return -ENOMEM;
@@ -948,6 +1041,7 @@ copy_entries_to_user(unsigned int total_size,
const struct xt_table_info *private = table->private;
int ret = 0;
const void *loc_cpu_entry;
+ u8 flags;
counters = alloc_counters(table);
if (IS_ERR(counters))
@@ -975,6 +1069,14 @@ copy_entries_to_user(unsigned int total_size,
goto free_counters;
}
+ flags = e->ip.flags & IPT_F_MASK;
+ if (copy_to_user(userptr + off
+ + offsetof(struct ipt_entry, ip.flags),
+ &flags, sizeof(flags)) != 0) {
+ ret = -EFAULT;
+ goto free_counters;
+ }
+
for (i = sizeof(struct ipt_entry);
i < e->target_offset;
i += m->u.match_size) {
@@ -1468,8 +1570,10 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
return -EINVAL;
}
- if (!ip_checkentry(&e->ip))
- return -EINVAL;
+ /* For purposes of check_entry casting the compat entry is fine */
+ ret = check_entry((struct ipt_entry *)e);
+ if (ret)
+ return ret;
ret = xt_compat_check_entry_offsets(e, e->elems,
e->target_offset, e->next_offset);
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -41,8 +41,8 @@ static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
if (ap == NULL)
return false;
- tuple->src.u3.ip = ap[0];
- tuple->dst.u3.ip = ap[1];
+ tuple->src.u3.ip = net_hdr_word(ap++);
+ tuple->dst.u3.ip = net_hdr_word(ap);
return true;
}
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -25,6 +25,9 @@
#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_nat_l3proto.h>
#include <net/netfilter/nf_nat_l4proto.h>
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+#include <net/netfilter/br_netfilter.h>
+#endif
static const struct nf_nat_l3proto nf_nat_l3proto_ipv4;
@@ -302,6 +305,32 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
}
/* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
case IP_CT_NEW:
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ /* When an skb is being forwarded between ports of a bridge, the
+ * nf_bridge info is set and nf_bridge->physoutdev is non-NULL, so
+ * we can assume it is not expecting any NAT operation.
+ *
+ * When the bridge netfilter hook is enabled, multicast packets
+ * reach postrouting twice: first when they are forwarded between
+ * ports of a bridge, and again when they are forwarded to the
+ * upstream port.
+ *
+ * The NAT table is only traversed on the first pass; the second
+ * pass reuses that result. Since bridge-internal forwarding never
+ * matches SNAT rules, the skb would end up with no NAT applied
+ * when it is finally forwarded to the upstream port.
+ *
+ * To avoid this, accept multicast packets while they are being
+ * forwarded between ports of a bridge.
+ */
+ if (skb->pkt_type == PACKET_MULTICAST) {
+ struct nf_bridge_info *nf_bridge =
+ nf_bridge_info_get(skb);
+ if (nf_bridge && nf_bridge->physoutdev)
+ return NF_ACCEPT;
+ }
+#endif
/* Seen it before? This can happen for loopback, retrans,
* or local packets.
*/
@@ -312,7 +341,7 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
if (ret != NF_ACCEPT)
return ret;
- if (nf_nat_initialized(ct, HOOK2MANIP(state->hook)))
+ if (nf_nat_initialized(ct, maniptype))
break;
ret = nf_nat_alloc_null_binding(ct, state->hook);
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -124,6 +124,8 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook)
/* ip_route_me_harder expects skb->dst to be set */
skb_dst_set_noref(nskb, skb_dst(oldskb));
+ nskb->mark = IP4_REPLY_MARK(net, oldskb->mark);
+
skb_reserve(nskb, LL_MAX_HEADER);
niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
ip4_dst_hoplimit(skb_dst(nskb)));
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -539,6 +539,9 @@ static __net_initdata struct pernet_operations ip_proc_ops = {
int __init ip_misc_proc_init(void)
{
+ if (IS_ENABLED(CONFIG_PROC_STRIPPED))
+ return 0;
+
return register_pernet_subsys(&ip_proc_ops);
}
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -77,3 +77,29 @@ int inet_del_offload(const struct net_offload *prot, unsigned char protocol)
return ret;
}
EXPORT_SYMBOL(inet_del_offload);
+
+int inet_update_protocol(const struct net_protocol *new_prot,
+ unsigned char protocol, const struct net_protocol **old_prot)
+{
+ int ret;
+
+ rcu_read_lock();
+ *old_prot = rcu_dereference(inet_protos[protocol]);
+ if (!*old_prot) {
+ rcu_read_unlock();
+ return -1;
+ }
+ rcu_read_unlock();
+
+ /*
+ * No locking is needed for old_prot: cmpxchg() succeeds only if
+ * old_prot still matches the value in inet_protos[protocol]
+ */
+ ret = (cmpxchg((const struct net_protocol **)&inet_protos[protocol],
+ *old_prot, new_prot) == *old_prot) ? 0 : -1;
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(inet_update_protocol);
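
A hedged usage sketch of the inet_update_protocol() helper added above: a module swaps in its own handler for a protocol and restores the saved one on exit. It assumes the prototype is declared in net/protocol.h by the companion header change; the protocol choice and all example_ names are placeholders.

#include <linux/module.h>
#include <linux/in.h>
#include <linux/skbuff.h>
#include <net/protocol.h>

static const struct net_protocol *example_saved_prot;

static int example_rcv(struct sk_buff *skb)
{
	/* Inspect the packet, then hand it to the original handler. */
	return example_saved_prot->handler(skb);
}

static const struct net_protocol example_prot = {
	.handler = example_rcv,
	.netns_ok = 1,
};

static int __init example_prot_init(void)
{
	/* Fails if no handler is currently registered for the protocol. */
	if (inet_update_protocol(&example_prot, IPPROTO_GRE,
				 &example_saved_prot))
		return -EBUSY;
	return 0;
}

static void __exit example_prot_exit(void)
{
	const struct net_protocol *cur;

	/* Put the original handler back. */
	inet_update_protocol(example_saved_prot, IPPROTO_GRE, &cur);
}

module_init(example_prot_init);
module_exit(example_prot_exit);
MODULE_LICENSE("GPL");
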
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -497,11 +497,16 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
int err;
struct ip_options_data opt_copy;
struct raw_frag_vec rfv;
+ int hdrincl;
err = -EMSGSIZE;
if (len > 0xFFFF)
goto out;
+ /* hdrincl should be READ_ONCE(inet->hdrincl)
+ * but READ_ONCE() doesn't work with bit fields
+ */
+ hdrincl = inet->hdrincl;
/*
* Check the flags.
*/
@@ -576,7 +581,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
/* Linux does not mangle headers on raw sockets,
* so that IP options + IP_HDRINCL is non-sense.
*/
- if (inet->hdrincl)
+ if (hdrincl)
goto done;
if (ipc.opt->opt.srr) {
if (!daddr)
@@ -598,9 +603,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
RT_SCOPE_UNIVERSE,
- inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
+ hdrincl ? IPPROTO_RAW : sk->sk_protocol,
inet_sk_flowi_flags(sk) |
- (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
+ (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
daddr, saddr, 0, 0);
if (!saddr && ipc.oif) {
@@ -609,7 +614,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
goto done;
}
- if (!inet->hdrincl) {
+ if (!hdrincl) {
rfv.msg = msg;
rfv.hlen = 0;
@@ -634,7 +639,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
goto do_confirm;
back_from_confirm:
- if (inet->hdrincl)
+ if (hdrincl)
err = raw_send_hdrinc(sk, &fl4, msg, len,
&rt, msg->msg_flags);
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -420,6 +420,9 @@ static struct pernet_operations ip_rt_proc_ops __net_initdata = {
static int __init ip_rt_proc_init(void)
{
+ if (IS_ENABLED(CONFIG_PROC_STRIPPED))
+ return 0;
+
return register_pernet_subsys(&ip_rt_proc_ops);
}
@@ -455,7 +458,7 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
else if (skb)
pkey = &ip_hdr(skb)->daddr;
- n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
+ n = __ipv4_neigh_lookup(dev, net_hdr_word(pkey));
if (n)
return n;
return neigh_create(&arp_tbl, pkey, dev);
@@ -1148,6 +1151,9 @@ static void ipv4_link_failure(struct sk_buff *skb)
{
struct rtable *rt;
+ /* Forwarded packets do not have IPCB() initialized, so do that here. */
+ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
rt = skb_rtable(skb);
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -799,6 +799,13 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = &one
},
+ {
+ .procname = "ip_use_legacy_tos",
+ .data = &sysctl_ip_use_legacy_tos,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
{ }
};
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3818,14 +3818,16 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr
{
const __be32 *ptr = (const __be32 *)(th + 1);
- if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
- | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
+ if (net_hdr_word(ptr) ==
+ htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+ (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
tp->rx_opt.saw_tstamp = 1;
++ptr;
- tp->rx_opt.rcv_tsval = ntohl(*ptr);
+ tp->rx_opt.rcv_tsval = get_unaligned_be32(ptr);
++ptr;
- if (*ptr)
- tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset;
+ if (net_hdr_word(ptr))
+ tp->rx_opt.rcv_tsecr = get_unaligned_be32(ptr) -
+ tp->tsoffset;
else
tp->rx_opt.rcv_tsecr = 0;
return true;
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -451,48 +451,53 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
u16 options = opts->options; /* mungable copy */
if (unlikely(OPTION_MD5 & options)) {
- *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
- (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
+ net_hdr_word(ptr++) =
+ htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+ (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
/* overload cookie hash location */
opts->hash_location = (__u8 *)ptr;
ptr += 4;
}
if (unlikely(opts->mss)) {
- *ptr++ = htonl((TCPOPT_MSS << 24) |
- (TCPOLEN_MSS << 16) |
- opts->mss);
+ net_hdr_word(ptr++) =
+ htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) |
+ opts->mss);
}
if (likely(OPTION_TS & options)) {
if (unlikely(OPTION_SACK_ADVERTISE & options)) {
- *ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
- (TCPOLEN_SACK_PERM << 16) |
- (TCPOPT_TIMESTAMP << 8) |
- TCPOLEN_TIMESTAMP);
+ net_hdr_word(ptr++) =
+ htonl((TCPOPT_SACK_PERM << 24) |
+ (TCPOLEN_SACK_PERM << 16) |
+ (TCPOPT_TIMESTAMP << 8) |
+ TCPOLEN_TIMESTAMP);
options &= ~OPTION_SACK_ADVERTISE;
} else {
- *ptr++ = htonl((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
- (TCPOPT_TIMESTAMP << 8) |
- TCPOLEN_TIMESTAMP);
+ net_hdr_word(ptr++) =
+ htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_TIMESTAMP << 8) |
+ TCPOLEN_TIMESTAMP);
}
- *ptr++ = htonl(opts->tsval);
- *ptr++ = htonl(opts->tsecr);
+ net_hdr_word(ptr++) = htonl(opts->tsval);
+ net_hdr_word(ptr++) = htonl(opts->tsecr);
}
if (unlikely(OPTION_SACK_ADVERTISE & options)) {
- *ptr++ = htonl((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
- (TCPOPT_SACK_PERM << 8) |
- TCPOLEN_SACK_PERM);
+ net_hdr_word(ptr++) =
+ htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_SACK_PERM << 8) |
+ TCPOLEN_SACK_PERM);
}
if (unlikely(OPTION_WSCALE & options)) {
- *ptr++ = htonl((TCPOPT_NOP << 24) |
- (TCPOPT_WINDOW << 16) |
- (TCPOLEN_WINDOW << 8) |
- opts->ws);
+ net_hdr_word(ptr++) =
+ htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_WINDOW << 16) |
+ (TCPOLEN_WINDOW << 8) |
+ opts->ws);
}
if (unlikely(opts->num_sack_blocks)) {
@@ -500,16 +505,17 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
tp->duplicate_sack : tp->selective_acks;
int this_sack;
- *ptr++ = htonl((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
- (TCPOPT_SACK << 8) |
- (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
+ net_hdr_word(ptr++) =
+ htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_SACK << 8) |
+ (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
TCPOLEN_SACK_PERBLOCK)));
for (this_sack = 0; this_sack < opts->num_sack_blocks;
++this_sack) {
- *ptr++ = htonl(sp[this_sack].start_seq);
- *ptr++ = htonl(sp[this_sack].end_seq);
+ net_hdr_word(ptr++) = htonl(sp[this_sack].start_seq);
+ net_hdr_word(ptr++) = htonl(sp[this_sack].end_seq);
}
tp->rx_opt.dsack = 0;
@@ -522,13 +528,14 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
if (foc->exp) {
len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
- *ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
+ net_hdr_word(ptr) =
+ htonl((TCPOPT_EXP << 24) | (len << 16) |
TCPOPT_FASTOPEN_MAGIC);
p += TCPOLEN_EXP_FASTOPEN_BASE;
} else {
len = TCPOLEN_FASTOPEN_BASE + foc->len;
- *p++ = TCPOPT_FASTOPEN;
- *p++ = len;
+ net_hdr_word(p++) = TCPOPT_FASTOPEN;
+ net_hdr_word(p++) = len;
}
memcpy(p, foc->val, foc->len);
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -819,7 +819,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
if (is_udplite) /* UDP-Lite */
csum = udplite_csum(skb);
- else if (sk->sk_no_check_tx) { /* UDP csum disabled */
+ else if (sk->sk_no_check_tx && !skb_is_gso(skb)) { /* UDP csum off */
skb->ip_summed = CHECKSUM_NONE;
goto send;
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1768,6 +1768,35 @@ struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *add
return result;
}
+/* ipv6_dev_find()
+ * Find (and hold) net device that has the given address.
+ * Or NULL on failure.
+ */
+struct net_device *ipv6_dev_find(struct net *net, struct in6_addr *addr,
+ int strict)
+{
+ struct inet6_ifaddr *ifp;
+ struct net_device *dev;
+
+ ifp = ipv6_get_ifaddr(net, addr, NULL, strict);
+ if (!ifp)
+ return NULL;
+
+ if (!ifp->idev) {
+ in6_ifa_put(ifp);
+ return NULL;
+ }
+
+ dev = ifp->idev->dev;
+ if (dev)
+ dev_hold(dev);
+
+ in6_ifa_put(ifp);
+
+ return dev;
+}
+EXPORT_SYMBOL(ipv6_dev_find);
+
/* Gets referenced address, destroys ifaddr */
static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed)
@@ -2053,6 +2082,7 @@ static int ipv6_generate_eui64(u8 *eui, struct net_device *dev)
case ARPHRD_IEEE1394:
return addrconf_ifid_ieee1394(eui, dev);
case ARPHRD_TUNNEL6:
+ case ARPHRD_RAWIP:
return addrconf_ifid_ip6tnl(eui, dev);
}
return -1;
@@ -3076,7 +3106,8 @@ static void addrconf_dev_config(struct net_device *dev)
(dev->type != ARPHRD_IEEE802154) &&
(dev->type != ARPHRD_IEEE1394) &&
(dev->type != ARPHRD_TUNNEL6) &&
- (dev->type != ARPHRD_6LOWPAN)) {
+ (dev->type != ARPHRD_6LOWPAN) &&
+ (dev->type != ARPHRD_RAWIP)) {
/* Alas, we support only Ethernet autoconfiguration. */
return;
}
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -429,7 +429,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
ipv6_iface_scope_id(&sin->sin6_addr,
IP6CB(skb)->iif);
} else {
- ipv6_addr_set_v4mapped(*(__be32 *)(nh + serr->addr_offset),
+ ipv6_addr_set_v4mapped(net_hdr_word(nh + serr->addr_offset),
&sin->sin6_addr);
sin->sin6_scope_id = 0;
}
@@ -766,12 +766,12 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
}
if (fl6->flowlabel&IPV6_FLOWINFO_MASK) {
- if ((fl6->flowlabel^*(__be32 *)CMSG_DATA(cmsg))&~IPV6_FLOWINFO_MASK) {
+ if ((fl6->flowlabel^net_hdr_word(CMSG_DATA(cmsg)))&~IPV6_FLOWINFO_MASK) {
err = -EINVAL;
goto exit_f;
}
}
- fl6->flowlabel = IPV6_FLOWINFO_MASK & *(__be32 *)CMSG_DATA(cmsg);
+ fl6->flowlabel = IPV6_FLOWINFO_MASK & net_hdr_word(CMSG_DATA(cmsg));
break;
case IPV6_2292HOPOPTS:
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -168,12 +168,19 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
u8 *tail;
__be32 *seqhi;
__be64 seqno;
+ bool nosupp_sg;
/* skb is pure payload to encrypt */
aead = x->data;
alen = crypto_aead_authsize(aead);
ivlen = crypto_aead_ivsize(aead);
+ nosupp_sg = crypto_tfm_alg_flags(&aead->base) & CRYPTO_ALG_NOSUPP_SG;
+ if (nosupp_sg && skb_linearize(skb)) {
+ err = -ENOMEM;
+ goto error;
+ }
+
tfclen = 0;
if (x->tfcpad) {
struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
@@ -367,6 +374,7 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
__be32 *seqhi;
u8 *iv;
struct scatterlist *sg;
+ bool nosupp_sg;
if (!pskb_may_pull(skb, sizeof(*esph) + ivlen)) {
ret = -EINVAL;
@@ -378,6 +386,12 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
goto out;
}
+ nosupp_sg = crypto_tfm_alg_flags(&aead->base) & CRYPTO_ALG_NOSUPP_SG;
+ if (nosupp_sg && skb_linearize(skb)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
nfrags = skb_cow_data(skb, 0, &trailer);
if (nfrags < 0) {
ret = -EINVAL;
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -573,7 +573,7 @@ static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff)
goto drop;
}
- pkt_len = ntohl(*(__be32 *)(nh + optoff + 2));
+ pkt_len = ntohl(net_hdr_word(nh + optoff + 2));
if (pkt_len <= IPV6_MAXPLEN) {
IP6_INC_STATS_BH(net, ipv6_skb_idev(skb),
IPSTATS_MIB_INHDRERRORS);
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -84,6 +84,10 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
err = -EACCES;
rt = net->ipv6.ip6_prohibit_entry;
goto discard_pkt;
+ case FR_ACT_POLICY_FAILED:
+ err = -EACCES;
+ rt = net->ipv6.ip6_policy_failed_entry;
+ goto discard_pkt;
}
table = fib6_get_table(net, rule->table);
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -138,7 +138,7 @@ static __be32 addr_bit_set(const void *token, int fn_bit)
* See include/asm-generic/bitops/le.h.
*/
return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) &
- addr[fn_bit >> 5];
+ net_hdr_word(&addr[fn_bit >> 5]);
}
static struct fib6_node *node_alloc(void)
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -57,7 +57,6 @@
#include <net/ip6_tunnel.h>
#include <net/gre.h>
-
static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
@@ -366,7 +365,6 @@ static void ip6gre_tunnel_uninit(struct net_device *dev)
dev_put(dev);
}
-
static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
@@ -398,7 +396,7 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
key = key_off ? *(__be32 *)(skb->data + key_off) : 0;
t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr,
- key, greh->protocol);
+ key, greh->protocol);
if (!t)
return;
@@ -479,11 +477,11 @@ static int ip6gre_rcv(struct sk_buff *skb)
offset += 4;
}
if (flags&GRE_KEY) {
- key = *(__be32 *)(h + offset);
+ key = net_hdr_word(h + offset);
offset += 4;
}
if (flags&GRE_SEQ) {
- seqno = ntohl(*(__be32 *)(h + offset));
+ seqno = ntohl(net_hdr_word(h + offset));
offset += 4;
}
}
@@ -745,7 +743,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,
if (tunnel->parms.o_flags&GRE_SEQ) {
++tunnel->o_seqno;
- *ptr = htonl(tunnel->o_seqno);
+ net_hdr_word(ptr) = htonl(tunnel->o_seqno);
ptr--;
}
if (tunnel->parms.o_flags&GRE_KEY) {
@@ -760,7 +758,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,
}
skb_set_inner_protocol(skb, protocol);
-
+ skb->skb_iif = dev->ifindex;
ip6tunnel_xmit(NULL, skb, dev);
return 0;
tx_err_link_failure:
@@ -841,7 +839,7 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev)
dsfield = ipv6_get_dsfield(ipv6h);
if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
- fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK);
+ fl6.flowlabel |= net_hdr_word(ipv6h) & IPV6_TCLASS_MASK;
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
fl6.flowlabel |= ip6_flowlabel(ipv6h);
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
@@ -1305,7 +1303,6 @@ static void ip6gre_fb_tunnel_init(struct net_device *dev)
dev_hold(dev);
}
-
static struct inet6_protocol ip6gre_protocol __read_mostly = {
.handler = ip6gre_rcv,
.err_handler = ip6gre_err,
@@ -1361,7 +1358,6 @@ static int __net_init ip6gre_init_net(struct net *net)
*/
ign->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
-
ip6gre_fb_tunnel_init(ign->fb_tunnel_dev);
ign->fb_tunnel_dev->rtnl_link_ops = &ip6gre_link_ops;
@@ -1438,7 +1434,6 @@ out:
return ip6gre_tunnel_validate(tb, data);
}
-
static void ip6gre_netlink_parms(struct nlattr *data[],
struct __ip6_tnl_parm *parms)
{
@@ -1515,7 +1510,7 @@ static void ip6gre_tap_setup(struct net_device *dev)
dev->netdev_ops = &ip6gre_tap_netdev_ops;
dev->destructor = ip6gre_dev_free;
-
+ dev->priv_flags |= IFF_GRE_V6_TAP;
dev->features |= NETIF_F_NETNS_LOCAL;
}
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -222,7 +222,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
continue;
iph2 = (struct ipv6hdr *)(p->data + off);
- first_word = *(__be32 *)iph ^ *(__be32 *)iph2;
+ first_word = net_hdr_word(iph) ^ net_hdr_word(iph2);
/* All fields must match except length and Traffic Class.
* XXX skbs on the gro_list have all been parsed and pulled
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1353,11 +1353,12 @@ emsgsize:
*/
cork->length += length;
- if (((length > mtu) ||
- (skb && skb_is_gso(skb))) &&
+ if ((skb && skb_is_gso(skb)) ||
+ (((length + (skb ? skb->len : headersize)) > mtu) &&
+ (skb_queue_len(queue) <= 1) &&
(sk->sk_protocol == IPPROTO_UDP) &&
- (rt->dst.dev->features & NETIF_F_UFO) &&
- (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
+ (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) &&
+ (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk))) {
err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
hh_len, fragheaderlen, exthdrlen,
transhdrlen, mtu, flags, fl6);
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -16,6 +16,8 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
+ * Changes:
+ * Steven Barth <cyrus@openwrt.org>: MAP-E FMR support
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -71,11 +73,9 @@ static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
-static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2)
+static u32 HASH(const struct in6_addr *addr)
{
- u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2);
-
- return hash_32(hash, HASH_SIZE_SHIFT);
+ return hash_32(ipv6_addr_hash(addr), HASH_SIZE_SHIFT);
}
static int ip6_tnl_dev_init(struct net_device *dev);
@@ -122,6 +122,24 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev)
return &dev->stats;
}
+/*
+ * Update offload stats
+ */
+void ip6_update_offload_stats(struct net_device *dev, void *ptr)
+{
+ struct pcpu_sw_netstats *tstats = per_cpu_ptr(dev->tstats, 0);
+ const struct pcpu_sw_netstats *offload_stats =
+ (struct pcpu_sw_netstats *)ptr;
+
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->tx_packets += offload_stats->tx_packets;
+ tstats->tx_bytes += offload_stats->tx_bytes;
+ tstats->rx_packets += offload_stats->rx_packets;
+ tstats->rx_bytes += offload_stats->rx_bytes;
+ u64_stats_update_end(&tstats->syncp);
+}
+EXPORT_SYMBOL(ip6_update_offload_stats);
+
/*
* Locking : hash tables are protected by RCU and RTNL
*/
@@ -230,20 +248,29 @@ EXPORT_SYMBOL_GPL(ip6_tnl_dst_init);
static struct ip6_tnl *
ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_addr *local)
{
- unsigned int hash = HASH(remote, local);
+ unsigned int hash = HASH(local);
struct ip6_tnl *t;
struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
struct in6_addr any;
+ struct __ip6_tnl_fmr *fmr;
for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
- if (ipv6_addr_equal(local, &t->parms.laddr) &&
- ipv6_addr_equal(remote, &t->parms.raddr) &&
- (t->dev->flags & IFF_UP))
+ if (!ipv6_addr_equal(local, &t->parms.laddr) ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (ipv6_addr_equal(remote, &t->parms.raddr))
return t;
+
+ for (fmr = t->parms.fmrs; fmr; fmr = fmr->next) {
+ if (ipv6_prefix_equal(remote, &fmr->ip6_prefix,
+ fmr->ip6_prefix_len))
+ return t;
+ }
}
memset(&any, 0, sizeof(any));
- hash = HASH(&any, local);
+ hash = HASH(local);
for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
if (ipv6_addr_equal(local, &t->parms.laddr) &&
ipv6_addr_any(&t->parms.raddr) &&
@@ -251,7 +278,7 @@ ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_
return t;
}
- hash = HASH(remote, &any);
+ hash = HASH(&any);
for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
if (ipv6_addr_equal(remote, &t->parms.raddr) &&
ipv6_addr_any(&t->parms.laddr) &&
@@ -287,7 +314,7 @@ ip6_tnl_bucket(struct ip6_tnl_net *ip6n, const struct __ip6_tnl_parm *p)
if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) {
prio = 1;
- h = HASH(remote, local);
+ h = HASH(local);
}
return &ip6n->tnls[prio][h];
}
@@ -460,6 +487,12 @@ ip6_tnl_dev_uninit(struct net_device *dev)
struct net *net = t->net;
struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+ while (t->parms.fmrs) {
+ struct __ip6_tnl_fmr *next = t->parms.fmrs->next;
+ kfree(t->parms.fmrs);
+ t->parms.fmrs = next;
+ }
+
if (dev == ip6n->fb_tnl_dev)
RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL);
else
@@ -856,6 +889,127 @@ int ip6_tnl_rcv_ctl(struct ip6_tnl *t,
}
EXPORT_SYMBOL_GPL(ip6_tnl_rcv_ctl);
+
+/**
+ * ip4ip6_fmr_calc - calculate target / source IPv6-address based on FMR
+ * @dest: destination IPv6 address buffer
+ * @iph: IPv4 header of the packet being translated
+ * @end: end of the packet data, used for bounds checking
+ * @fmr: MAP FMR
+ * @xmit: calculate for the xmit path (true) or the rcv path (false)
+ * @draft03: use the draft-03 MAP IPv6 address format
+ **/
+static void ip4ip6_fmr_calc(struct in6_addr *dest,
+ const struct iphdr *iph, const uint8_t *end,
+ const struct __ip6_tnl_fmr *fmr, bool xmit, bool draft03)
+{
+ int psidlen = fmr->ea_len - (32 - fmr->ip4_prefix_len);
+ u8 *portp = NULL;
+ bool use_dest_addr;
+ const struct iphdr *dsth = iph;
+
+ if ((u8*)dsth >= end)
+ return;
+
+ /* find significant IP header */
+ if (iph->protocol == IPPROTO_ICMP) {
+ struct icmphdr *ih = (struct icmphdr*)(((u8*)dsth) + dsth->ihl * 4);
+ if (ih && ((u8*)&ih[1]) <= end && (
+ ih->type == ICMP_DEST_UNREACH ||
+ ih->type == ICMP_SOURCE_QUENCH ||
+ ih->type == ICMP_TIME_EXCEEDED ||
+ ih->type == ICMP_PARAMETERPROB ||
+ ih->type == ICMP_REDIRECT))
+ dsth = (const struct iphdr*)&ih[1];
+ }
+
+ /* in the xmit path use the dest port by default and the source port only
+    if this is an ICMP reply to something else; vice versa in the rcv path */
+ use_dest_addr = (xmit && dsth == iph) || (!xmit && dsth != iph);
+
+ /* get dst port */
+ if (((u8 *)&dsth[1]) <= end && (
+ dsth->protocol == IPPROTO_UDP ||
+ dsth->protocol == IPPROTO_TCP ||
+ dsth->protocol == IPPROTO_SCTP ||
+ dsth->protocol == IPPROTO_DCCP)) {
+ /* for UDP, TCP, SCTP and DCCP the source and destination ports
+    follow the IPv4 header directly */
+ portp = ((u8*)dsth) + dsth->ihl * 4;
+
+ if (use_dest_addr)
+ portp += sizeof(u16);
+ } else if (iph->protocol == IPPROTO_ICMP) {
+ struct icmphdr *ih = (struct icmphdr*)(((u8*)dsth) + dsth->ihl * 4);
+
+ /* use icmp identifier as port */
+ if (((u8 *)ih) <= end && (
+ (use_dest_addr && (
+ ih->type == ICMP_ECHOREPLY ||
+ ih->type == ICMP_TIMESTAMPREPLY ||
+ ih->type == ICMP_INFO_REPLY ||
+ ih->type == ICMP_ADDRESSREPLY)) ||
+ (!use_dest_addr && (
+ ih->type == ICMP_ECHO ||
+ ih->type == ICMP_TIMESTAMP ||
+ ih->type == ICMP_INFO_REQUEST ||
+ ih->type == ICMP_ADDRESS)
+ )))
+ portp = (u8*)&ih->un.echo.id;
+ }
+
+ if ((portp && &portp[2] <= end) || psidlen == 0) {
+ int frombyte = fmr->ip6_prefix_len / 8;
+ int fromrem = fmr->ip6_prefix_len % 8;
+ int bytes = sizeof(struct in6_addr) - frombyte;
+ const u32 *addr = (use_dest_addr) ? &dsth->daddr : &dsth->saddr;
+ u64 eabits = ((u64)ntohl(*addr)) << (32 + fmr->ip4_prefix_len);
+ u64 t = 0;
+
+ /* extract PSID from port and add it to eabits */
+ u16 psidbits = 0;
+ if (psidlen > 0) {
+ psidbits = ((u16)portp[0]) << 8 | ((u16)portp[1]);
+ psidbits >>= 16 - psidlen - fmr->offset;
+ psidbits = (u16)(psidbits << (16 - psidlen));
+ eabits |= ((u64)psidbits) << (48 - (fmr->ea_len - psidlen));
+ }
+
+ /* rewrite destination address */
+ *dest = fmr->ip6_prefix;
+ memcpy(&dest->s6_addr[10], addr, sizeof(*addr));
+ dest->s6_addr16[7] = htons(psidbits >> (16 - psidlen));
+
+ if (bytes > sizeof(u64))
+ bytes = sizeof(u64);
+
+ /* insert eabits */
+ memcpy(&t, &dest->s6_addr[frombyte], bytes);
+ t = be64_to_cpu(t) & ~(((((u64)1) << fmr->ea_len) - 1)
+ << (64 - fmr->ea_len - fromrem));
+ t = cpu_to_be64(t | (eabits >> fromrem));
+ memcpy(&dest->s6_addr[frombyte], &t, bytes);
+ if (draft03) {
+ /**
+ * Draft03 IPv6 address format
+ * +--+---+---+---+---+---+---+---+---+
+ * |PL| 8 16 24 32 40 48 56 |
+ * +--+---+---+---+---+---+---+---+---+
+ * |64| u | IPv4 address |PSID |0 |
+ * +--+---+---+---+---+---+---+---+---+
+ * Final specification IPv6 address format
+ * +--+---+---+---+---+---+---+---+---+
+ * |PL| 8 16 24 32 40 48 56 |
+ * +--+---+---+---+---+---+---+---+---+
+ * |64| 0 | IPv4 address |PSID |
+ * +--+---+---+---+---+---+---+---+---+
+ * We need to move the last six bytes one byte towards the start of the address
+ */
+ memmove(&dest->s6_addr[9], &dest->s6_addr[10], 6);
+ dest->s6_addr[15] = 0;
+ }
+ }
+}
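
The port/PSID arithmetic above is easier to follow with concrete numbers. Below is a self-contained sketch (not part of the patch) that mirrors the PSID extraction for one assumed rule: ip4_prefix_len 24 and ea_len 16, hence an 8-bit PSID, with the default offset of 6.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const unsigned ip4_prefix_len = 24, ea_len = 16, offset = 6;
	const int psidlen = ea_len - (32 - ip4_prefix_len);	/* 8 bits */
	uint16_t port = 9030;					/* 0x2346 */
	uint16_t psidbits = port;

	/* drop the low-order bits that are not part of the PSID ... */
	psidbits >>= 16 - psidlen - offset;
	/* ... and left-align the PSID, exactly as ip4ip6_fmr_calc() does */
	psidbits = (uint16_t)(psidbits << (16 - psidlen));

	printf("PSID = 0x%02x\n", psidbits >> (16 - psidlen));	/* 0xd1 */
	return 0;
}
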
+
+
/**
* ip6_tnl_rcv - decapsulate IPv6 packet and retransmit it locally
* @skb: received socket buffer
@@ -901,6 +1055,28 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol,
skb_reset_network_header(skb);
skb->protocol = htons(protocol);
memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
+ if (protocol == ETH_P_IP &&
+ !ipv6_addr_equal(&ipv6h->saddr, &t->parms.raddr)) {
+ /* Packet didn't come from the BR, so look up the FMR */
+ struct __ip6_tnl_fmr *fmr;
+ struct in6_addr expected = t->parms.raddr;
+ for (fmr = t->parms.fmrs; fmr; fmr = fmr->next)
+ if (ipv6_prefix_equal(&ipv6h->saddr,
+ &fmr->ip6_prefix, fmr->ip6_prefix_len))
+ break;
+
+ /* Check that IPv6 matches IPv4 source to prevent spoofing */
+ if (fmr)
+ ip4ip6_fmr_calc(&expected, ip_hdr(skb),
+ skb_tail_pointer(skb),
+ fmr, false,
+ t->parms.draft03);
+
+ if (!ipv6_addr_equal(&ipv6h->saddr, &expected)) {
+ rcu_read_unlock();
+ goto discard;
+ }
+ }
__skb_tunnel_rx(skb, t->dev, t->net);
@@ -924,6 +1100,8 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol,
tstats->rx_bytes += skb->len;
u64_stats_update_end(&tstats->syncp);
+ /* Reset the skb_iif to the tunnel's interface index */
+ skb->skb_iif = t->dev->ifindex;
netif_rx(skb);
rcu_read_unlock();
@@ -1173,12 +1351,15 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
skb_push(skb, sizeof(struct ipv6hdr));
skb_reset_network_header(skb);
ipv6h = ipv6_hdr(skb);
- ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield),
+ ip6_flow_hdr(ipv6h, dsfield,
ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6));
ipv6h->hop_limit = t->parms.hop_limit;
ipv6h->nexthdr = proto;
ipv6h->saddr = fl6->saddr;
ipv6h->daddr = fl6->daddr;
+
+ /* Reset the skb_iif to the tunnel's interface index */
+ skb->skb_iif = dev->ifindex;
ip6tunnel_xmit(NULL, skb, dev);
return 0;
tx_err_link_failure:
@@ -1200,6 +1381,7 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
__u32 mtu;
u8 tproto;
int err;
+ struct __ip6_tnl_fmr *fmr;
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
@@ -1221,6 +1403,19 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark;
+ /* try to find matching FMR */
+ for (fmr = t->parms.fmrs; fmr; fmr = fmr->next) {
+ unsigned mshift = 32 - fmr->ip4_prefix_len;
+ if (ntohl(fmr->ip4_prefix.s_addr) >> mshift ==
+ ntohl(iph->daddr) >> mshift)
+ break;
+ }
+
+ /* change dstaddr according to FMR */
+ if (fmr)
+ ip4ip6_fmr_calc(&fl6.daddr, iph, skb_tail_pointer(skb), fmr,
+ true, t->parms.draft03);
+
err = ip6_tnl_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu);
if (err != 0) {
/* XXX: send ICMP error even if DF is not set. */
@@ -1269,7 +1464,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
dsfield = ipv6_get_dsfield(ipv6h);
if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
- fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK);
+ fl6.flowlabel |= net_hdr_word(ipv6h) & IPV6_TCLASS_MASK;
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
fl6.flowlabel |= ip6_flowlabel(ipv6h);
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
@@ -1389,6 +1584,14 @@ ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p)
t->parms.flowinfo = p->flowinfo;
t->parms.link = p->link;
t->parms.proto = p->proto;
+
+ while (t->parms.fmrs) {
+ struct __ip6_tnl_fmr *next = t->parms.fmrs->next;
+ kfree(t->parms.fmrs);
+ t->parms.fmrs = next;
+ }
+ t->parms.fmrs = p->fmrs;
+
ip6_tnl_dst_reset(t);
ip6_tnl_link_config(t);
return 0;
@@ -1427,6 +1630,7 @@ ip6_tnl_parm_from_user(struct __ip6_tnl_parm *p, const struct ip6_tnl_parm *u)
p->flowinfo = u->flowinfo;
p->link = u->link;
p->proto = u->proto;
+ p->fmrs = NULL;
memcpy(p->name, u->name, sizeof(u->name));
}
@@ -1608,6 +1812,7 @@ static const struct net_device_ops ip6_tnl_netdev_ops = {
.ndo_start_xmit = ip6_tnl_xmit,
.ndo_do_ioctl = ip6_tnl_ioctl,
.ndo_change_mtu = ip6_tnl_change_mtu,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
.ndo_get_stats = ip6_get_stats,
.ndo_get_iflink = ip6_tnl_get_iflink,
};
@@ -1722,6 +1927,15 @@ static int ip6_tnl_validate(struct nlattr *tb[], struct nlattr *data[])
return 0;
}
+static const struct nla_policy ip6_tnl_fmr_policy[IFLA_IPTUN_FMR_MAX + 1] = {
+ [IFLA_IPTUN_FMR_IP6_PREFIX] = { .len = sizeof(struct in6_addr) },
+ [IFLA_IPTUN_FMR_IP4_PREFIX] = { .len = sizeof(struct in_addr) },
+ [IFLA_IPTUN_FMR_IP6_PREFIX_LEN] = { .type = NLA_U8 },
+ [IFLA_IPTUN_FMR_IP4_PREFIX_LEN] = { .type = NLA_U8 },
+ [IFLA_IPTUN_FMR_EA_LEN] = { .type = NLA_U8 },
+ [IFLA_IPTUN_FMR_OFFSET] = { .type = NLA_U8 }
+};
+
static void ip6_tnl_netlink_parms(struct nlattr *data[],
struct __ip6_tnl_parm *parms)
{
@@ -1753,6 +1967,49 @@ static void ip6_tnl_netlink_parms(struct nlattr *data[],
if (data[IFLA_IPTUN_PROTO])
parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]);
+
+ if (data[IFLA_IPTUN_DRAFT03])
+ parms->draft03 = nla_get_u8(data[IFLA_IPTUN_DRAFT03]);
+
+ if (data[IFLA_IPTUN_FMRS]) {
+ unsigned rem;
+ struct nlattr *fmr;
+ nla_for_each_nested(fmr, data[IFLA_IPTUN_FMRS], rem) {
+ struct nlattr *fmrd[IFLA_IPTUN_FMR_MAX + 1], *c;
+ struct __ip6_tnl_fmr *nfmr;
+
+ nla_parse_nested(fmrd, IFLA_IPTUN_FMR_MAX,
+ fmr, ip6_tnl_fmr_policy);
+
+ if (!(nfmr = kzalloc(sizeof(*nfmr), GFP_KERNEL)))
+ continue;
+
+ nfmr->offset = 6;
+
+ if ((c = fmrd[IFLA_IPTUN_FMR_IP6_PREFIX]))
+ nla_memcpy(&nfmr->ip6_prefix, fmrd[IFLA_IPTUN_FMR_IP6_PREFIX],
+ sizeof(nfmr->ip6_prefix));
+
+ if ((c = fmrd[IFLA_IPTUN_FMR_IP4_PREFIX]))
+ nla_memcpy(&nfmr->ip4_prefix, fmrd[IFLA_IPTUN_FMR_IP4_PREFIX],
+ sizeof(nfmr->ip4_prefix));
+
+ if ((c = fmrd[IFLA_IPTUN_FMR_IP6_PREFIX_LEN]))
+ nfmr->ip6_prefix_len = nla_get_u8(c);
+
+ if ((c = fmrd[IFLA_IPTUN_FMR_IP4_PREFIX_LEN]))
+ nfmr->ip4_prefix_len = nla_get_u8(c);
+
+ if ((c = fmrd[IFLA_IPTUN_FMR_EA_LEN]))
+ nfmr->ea_len = nla_get_u8(c);
+
+ if ((c = fmrd[IFLA_IPTUN_FMR_OFFSET]))
+ nfmr->offset = nla_get_u8(c);
+
+ nfmr->next = parms->fmrs;
+ parms->fmrs = nfmr;
+ }
+ }
}
static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev,
@@ -1805,6 +2062,12 @@ static void ip6_tnl_dellink(struct net_device *dev, struct list_head *head)
static size_t ip6_tnl_get_size(const struct net_device *dev)
{
+ const struct ip6_tnl *t = netdev_priv(dev);
+ struct __ip6_tnl_fmr *c;
+ int fmrs = 0;
+ for (c = t->parms.fmrs; c; c = c->next)
+ ++fmrs;
+
return
/* IFLA_IPTUN_LINK */
nla_total_size(4) +
@@ -1822,6 +2085,24 @@ static size_t ip6_tnl_get_size(const struct net_device *dev)
nla_total_size(4) +
/* IFLA_IPTUN_PROTO */
nla_total_size(1) +
+ /* IFLA_IPTUN_FMRS */
+ nla_total_size(0) +
+ (
+ /* nest */
+ nla_total_size(0) +
+ /* IFLA_IPTUN_FMR_IP6_PREFIX */
+ nla_total_size(sizeof(struct in6_addr)) +
+ /* IFLA_IPTUN_FMR_IP4_PREFIX */
+ nla_total_size(sizeof(struct in_addr)) +
+ /* IFLA_IPTUN_FMR_EA_LEN */
+ nla_total_size(1) +
+ /* IFLA_IPTUN_FMR_IP6_PREFIX_LEN */
+ nla_total_size(1) +
+ /* IFLA_IPTUN_FMR_IP4_PREFIX_LEN */
+ nla_total_size(1) +
+ /* IFLA_IPTUN_FMR_OFFSET */
+ nla_total_size(1)
+ ) * fmrs +
0;
}
@@ -1838,11 +2119,9 @@ static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev)
nla_put_be32(skb, IFLA_IPTUN_FLOWINFO, parm->flowinfo) ||
nla_put_u32(skb, IFLA_IPTUN_FLAGS, parm->flags) ||
nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto))
- goto nla_put_failure;
- return 0;
+ return -EMSGSIZE;
-nla_put_failure:
- return -EMSGSIZE;
+ return 0;
}
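
ip6_tnl_get_size() above reserves room for a nested IFLA_IPTUN_FMRS attribute, but the visible part of ip6_tnl_fill_info() does not emit it. A minimal sketch of how the dump side could look (not part of the patch; the per-FMR nest type chosen here is an assumption):

/* Sketch: emit the tunnel's FMR list as the nested IFLA_IPTUN_FMRS
 * attribute that ip6_tnl_get_size() accounts for.
 */
static int example_fill_fmrs(struct sk_buff *skb,
			     const struct __ip6_tnl_parm *parm)
{
	struct nlattr *nest = nla_nest_start(skb, IFLA_IPTUN_FMRS);
	struct __ip6_tnl_fmr *c;
	int idx = 0;

	if (!nest)
		return -EMSGSIZE;

	for (c = parm->fmrs; c; c = c->next) {
		/* each FMR is itself a nest; using the list index as the
		 * attribute type is an assumption */
		struct nlattr *fmr = nla_nest_start(skb, ++idx);

		if (!fmr ||
		    nla_put(skb, IFLA_IPTUN_FMR_IP6_PREFIX,
			    sizeof(c->ip6_prefix), &c->ip6_prefix) ||
		    nla_put(skb, IFLA_IPTUN_FMR_IP4_PREFIX,
			    sizeof(c->ip4_prefix), &c->ip4_prefix) ||
		    nla_put_u8(skb, IFLA_IPTUN_FMR_IP6_PREFIX_LEN,
			       c->ip6_prefix_len) ||
		    nla_put_u8(skb, IFLA_IPTUN_FMR_IP4_PREFIX_LEN,
			       c->ip4_prefix_len) ||
		    nla_put_u8(skb, IFLA_IPTUN_FMR_EA_LEN, c->ea_len) ||
		    nla_put_u8(skb, IFLA_IPTUN_FMR_OFFSET, c->offset)) {
			nla_nest_cancel(skb, nest);
			return -EMSGSIZE;
		}
		nla_nest_end(skb, fmr);
	}

	nla_nest_end(skb, nest);
	return 0;
}
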
struct net *ip6_tnl_get_link_net(const struct net_device *dev)
@@ -1862,6 +2141,7 @@ static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = {
[IFLA_IPTUN_FLOWINFO] = { .type = NLA_U32 },
[IFLA_IPTUN_FLAGS] = { .type = NLA_U32 },
[IFLA_IPTUN_PROTO] = { .type = NLA_U8 },
+ [IFLA_IPTUN_FMRS] = { .type = NLA_NESTED },
};
static struct rtnl_link_ops ip6_link_ops __read_mostly = {
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -120,6 +120,11 @@ static int ip6mr_rtm_dumproute(struct sk_buff *skb,
struct netlink_callback *cb);
static void mroute_clean_tables(struct mr6_table *mrt, bool all);
static void ipmr_expire_process(unsigned long arg);
+static struct mfc6_cache *ip6mr_cache_find(struct mr6_table *mrt,
+ const struct in6_addr *origin,
+ const struct in6_addr *mcastgrp);
+static ip6mr_mfc_event_offload_callback_t __rcu
+ ip6mr_mfc_event_offload_callback;
#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
#define ip6mr_for_each_table(mrt, net) \
@@ -167,6 +172,8 @@ static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp,
return -ENETUNREACH;
case FR_ACT_PROHIBIT:
return -EACCES;
+ case FR_ACT_POLICY_FAILED:
+ return -EACCES;
case FR_ACT_BLACKHOLE:
default:
return -EINVAL;
@@ -338,6 +345,82 @@ static void ip6mr_free_table(struct mr6_table *mrt)
kfree(mrt);
}
+/* ip6mr_sync_entry_update()
+ * Call the registered offload callback to report an update to a multicast
+ * route entry. The callback receives the list of destination interfaces and
+ * the interface count
+ */
+static void ip6mr_sync_entry_update(struct mr6_table *mrt,
+ struct mfc6_cache *cache)
+{
+ int vifi, dest_if_count = 0;
+ u32 dest_dev[MAXMIFS];
+ struct in6_addr mc_origin, mc_group;
+ ip6mr_mfc_event_offload_callback_t offload_update_cb_f;
+
+ memset(dest_dev, 0, sizeof(dest_dev));
+
+ read_lock(&mrt_lock);
+
+ for (vifi = 0; vifi < cache->mfc_un.res.maxvif; vifi++) {
+ if (!((cache->mfc_un.res.ttls[vifi] > 0) &&
+ (cache->mfc_un.res.ttls[vifi] < 255))) {
+ continue;
+ }
+
+ if (dest_if_count == MAXMIFS) {
+ read_unlock(&mrt_lock);
+ return;
+ }
+
+ if (!MIF_EXISTS(mrt, vifi)) {
+ read_unlock(&mrt_lock);
+ return;
+ }
+
+ dest_dev[dest_if_count] = mrt->vif6_table[vifi].dev->ifindex;
+ dest_if_count++;
+ }
+
+ memcpy(&mc_origin, &cache->mf6c_origin, sizeof(struct in6_addr));
+ memcpy(&mc_group, &cache->mf6c_mcastgrp, sizeof(struct in6_addr));
+ read_unlock(&mrt_lock);
+
+ rcu_read_lock();
+ offload_update_cb_f = rcu_dereference(ip6mr_mfc_event_offload_callback);
+
+ if (!offload_update_cb_f) {
+ rcu_read_unlock();
+ return;
+ }
+
+ offload_update_cb_f(&mc_group, &mc_origin, dest_if_count, dest_dev,
+ IP6MR_MFC_EVENT_UPDATE);
+ rcu_read_unlock();
+}
+
+/* ip6mr_sync_entry_delete()
+ * Call the registered offload callback to inform of a multicast route entry
+ * delete event
+ */
+static void ip6mr_sync_entry_delete(struct in6_addr *mc_origin,
+ struct in6_addr *mc_group)
+{
+ ip6mr_mfc_event_offload_callback_t offload_update_cb_f;
+
+ rcu_read_lock();
+ offload_update_cb_f = rcu_dereference(ip6mr_mfc_event_offload_callback);
+
+ if (!offload_update_cb_f) {
+ rcu_read_unlock();
+ return;
+ }
+
+ offload_update_cb_f(mc_group, mc_origin, 0, NULL,
+ IP6MR_MFC_EVENT_DELETE);
+ rcu_read_unlock();
+}
+
#ifdef CONFIG_PROC_FS
struct ipmr_mfc_iter {
@@ -454,7 +537,7 @@ static int ip6mr_vif_seq_show(struct seq_file *seq, void *v)
const char *name = vif->dev ? vif->dev->name : "none";
seq_printf(seq,
- "%2td %-10s %8ld %7ld %8ld %7ld %05X\n",
+ "%2td %-10s %8llu %7llu %8llu %7llu %05X\n",
vif - mrt->vif6_table,
name, vif->bytes_in, vif->pkt_in,
vif->bytes_out, vif->pkt_out,
@@ -770,6 +853,145 @@ failure:
}
#endif
+/* ip6mr_register_mfc_event_offload_callback()
+ * Register the IPv6 multicast update callback for offload modules
+ */
+bool ip6mr_register_mfc_event_offload_callback(
+ ip6mr_mfc_event_offload_callback_t mfc_offload_cb)
+{
+ ip6mr_mfc_event_offload_callback_t offload_update_cb_f;
+
+ rcu_read_lock();
+ offload_update_cb_f = rcu_dereference(ip6mr_mfc_event_offload_callback);
+
+ if (offload_update_cb_f) {
+ rcu_read_unlock();
+ return false;
+ }
+
+ rcu_assign_pointer(ip6mr_mfc_event_offload_callback, mfc_offload_cb);
+ rcu_read_unlock();
+ return true;
+}
+EXPORT_SYMBOL(ip6mr_register_mfc_event_offload_callback);
+
+/* ip6mr_unregister_mfc_event_offload_callback()
+ * De-register the IPv6 multicast update callback for offload modules
+ */
+void ip6mr_unregister_mfc_event_offload_callback(void)
+{
+ rcu_read_lock();
+ rcu_assign_pointer(ip6mr_mfc_event_offload_callback, NULL);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(ip6mr_unregister_mfc_event_offload_callback);
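
A single offload module can claim the callback slot; registration fails if another module got there first. A sketch of the consumer side (not part of the patch). The callback parameter list below is inferred from the call sites in ip6mr_sync_entry_update()/ip6mr_sync_entry_delete() and should be checked against the ip6mr_mfc_event_offload_callback_t typedef in the header.

static void example_mfc_event_cb(struct in6_addr *group,
				 struct in6_addr *origin,
				 u32 num_dest_dev, u32 dest_dev[],
				 u32 event)
{
	if (event == IP6MR_MFC_EVENT_DELETE) {
		/* tear down the offloaded flow for (origin, group) */
		return;
	}

	/* IP6MR_MFC_EVENT_UPDATE: re-program the destination interface list */
}

static int example_offload_init(void)
{
	/* only one offload module may own the callback slot */
	if (!ip6mr_register_mfc_event_offload_callback(example_mfc_event_cb))
		return -EBUSY;
	return 0;
}

static void example_offload_exit(void)
{
	ip6mr_unregister_mfc_event_offload_callback();
}
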
+
+/* ip6mr_find_mfc_entry()
+ * Return the destination interface list for a particular multicast flow, and
+ * the number of interfaces in the list
+ */
+int ip6mr_find_mfc_entry(struct net *net, struct in6_addr *origin,
+ struct in6_addr *group, u32 max_dest_cnt,
+ u32 dest_dev[])
+{
+ int vifi, dest_if_count = 0;
+ struct mr6_table *mrt;
+ struct mfc6_cache *cache;
+
+ mrt = ip6mr_get_table(net, RT6_TABLE_DFLT);
+ if (!mrt)
+ return -ENOENT;
+
+ read_lock(&mrt_lock);
+ cache = ip6mr_cache_find(mrt, origin, group);
+ if (!cache) {
+ read_unlock(&mrt_lock);
+ return -ENOENT;
+ }
+
+ for (vifi = 0; vifi < cache->mfc_un.res.maxvif; vifi++) {
+ if (!((cache->mfc_un.res.ttls[vifi] > 0) &&
+ (cache->mfc_un.res.ttls[vifi] < 255))) {
+ continue;
+ }
+
+ /* We have another valid destination interface entry. Check
+ * whether the number of destination interfaces for the route
+ * exceeds the size of the array given to us
+ */
+ if (dest_if_count == max_dest_cnt) {
+ read_unlock(&mrt_lock);
+ return -EINVAL;
+ }
+
+ if (!MIF_EXISTS(mrt, vifi)) {
+ read_unlock(&mrt_lock);
+ return -EINVAL;
+ }
+
+ dest_dev[dest_if_count] = mrt->vif6_table[vifi].dev->ifindex;
+ dest_if_count++;
+ }
+ read_unlock(&mrt_lock);
+
+ return dest_if_count;
+}
+EXPORT_SYMBOL(ip6mr_find_mfc_entry);
+
+/* ip6mr_mfc_stats_update()
+ * Update the MFC/VIF statistics for offloaded flows
+ */
+int ip6mr_mfc_stats_update(struct net *net, struct in6_addr *origin,
+ struct in6_addr *group, u64 pkts_in,
+ u64 bytes_in, uint64_t pkts_out,
+ u64 bytes_out)
+{
+ int vif, vifi;
+ struct mr6_table *mrt;
+ struct mfc6_cache *cache;
+
+ mrt = ip6mr_get_table(net, RT6_TABLE_DFLT);
+
+ if (!mrt)
+ return -ENOENT;
+
+ read_lock(&mrt_lock);
+ cache = ip6mr_cache_find(mrt, origin, group);
+ if (!cache) {
+ read_unlock(&mrt_lock);
+ return -ENOENT;
+ }
+
+ vif = cache->mf6c_parent;
+
+ if (!MIF_EXISTS(mrt, vif)) {
+ read_unlock(&mrt_lock);
+ return -EINVAL;
+ }
+
+ mrt->vif6_table[vif].pkt_in += pkts_in;
+ mrt->vif6_table[vif].bytes_in += bytes_in;
+ cache->mfc_un.res.pkt += pkts_out;
+ cache->mfc_un.res.bytes += bytes_out;
+
+ for (vifi = cache->mfc_un.res.minvif;
+ vifi < cache->mfc_un.res.maxvif; vifi++) {
+ if ((cache->mfc_un.res.ttls[vifi] > 0) &&
+ (cache->mfc_un.res.ttls[vifi] < 255)) {
+ if (!MIF_EXISTS(mrt, vifi)) {
+ read_unlock(&mrt_lock);
+ return -EINVAL;
+ }
+ mrt->vif6_table[vifi].pkt_out += pkts_out;
+ mrt->vif6_table[vifi].bytes_out += bytes_out;
+ }
+ }
+
+ read_unlock(&mrt_lock);
+ return 0;
+}
+EXPORT_SYMBOL(ip6mr_mfc_stats_update);
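
Together, ip6mr_find_mfc_entry() and ip6mr_mfc_stats_update() let an offload engine confirm that a flow is still routed and then push its hardware counters back into the MFC/VIF statistics. A caller sketch (not part of the patch; where the counters come from is an assumption):

static void example_push_mfc_stats(struct net *net, struct in6_addr *origin,
				   struct in6_addr *group,
				   u64 pkts_in, u64 bytes_in,
				   u64 pkts_out, u64 bytes_out)
{
	u32 dest_dev[MAXMIFS];
	int n;

	/* confirm the flow is still routed before touching the counters */
	n = ip6mr_find_mfc_entry(net, origin, group, MAXMIFS, dest_dev);
	if (n < 0)
		return;

	ip6mr_mfc_stats_update(net, origin, group, pkts_in, bytes_in,
			       pkts_out, bytes_out);
}
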
+
/*
* Delete a VIF entry
*/
@@ -1302,6 +1524,7 @@ static int ip6mr_mfc_delete(struct mr6_table *mrt, struct mf6cctl *mfc,
{
int line;
struct mfc6_cache *c, *next;
+ struct in6_addr mc_origin, mc_group;
line = MFC6_HASH(&mfc->mf6cc_mcastgrp.sin6_addr, &mfc->mf6cc_origin.sin6_addr);
@@ -1310,12 +1533,20 @@ static int ip6mr_mfc_delete(struct mr6_table *mrt, struct mf6cctl *mfc,
ipv6_addr_equal(&c->mf6c_mcastgrp,
&mfc->mf6cc_mcastgrp.sin6_addr) &&
(parent == -1 || parent == c->mf6c_parent)) {
+ memcpy(&mc_origin, &c->mf6c_origin,
+ sizeof(struct in6_addr));
+ memcpy(&mc_group, &c->mf6c_mcastgrp,
+ sizeof(struct in6_addr));
+
write_lock_bh(&mrt_lock);
list_del(&c->list);
write_unlock_bh(&mrt_lock);
mr6_netlink_event(mrt, c, RTM_DELROUTE);
ip6mr_cache_free(c);
+
+ /* Inform offload modules of the delete event */
+ ip6mr_sync_entry_delete(&mc_origin, &mc_group);
return 0;
}
}
@@ -1486,6 +1717,9 @@ static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt,
c->mfc_flags |= MFC_STATIC;
write_unlock_bh(&mrt_lock);
mr6_netlink_event(mrt, c, RTM_NEWROUTE);
+
+ /* Inform offload modules of the update event */
+ ip6mr_sync_entry_update(mrt, c);
return 0;
}
@@ -1544,6 +1778,7 @@ static void mroute_clean_tables(struct mr6_table *mrt, bool all)
int i;
LIST_HEAD(list);
struct mfc6_cache *c, *next;
+ struct in6_addr mc_origin, mc_group;
/*
* Shut down all active vif entries
@@ -1562,12 +1797,19 @@ static void mroute_clean_tables(struct mr6_table *mrt, bool all)
list_for_each_entry_safe(c, next, &mrt->mfc6_cache_array[i], list) {
if (!all && (c->mfc_flags & MFC_STATIC))
continue;
+ memcpy(&mc_origin, &c->mf6c_origin,
+ sizeof(struct in6_addr));
+ memcpy(&mc_group, &c->mf6c_mcastgrp,
+ sizeof(struct in6_addr));
write_lock_bh(&mrt_lock);
list_del(&c->list);
write_unlock_bh(&mrt_lock);
mr6_netlink_event(mrt, c, RTM_DELROUTE);
ip6mr_cache_free(c);
+
+ /* Inform offload modules of the delete event */
+ ip6mr_sync_entry_delete(&mc_origin, &mc_group);
}
}
--- a/net/ipv6/mcast_snoop.c
+++ b/net/ipv6/mcast_snoop.c
@@ -53,7 +53,7 @@ static int ipv6_mc_check_exthdrs(struct sk_buff *skb)
ip6h = ipv6_hdr(skb);
- if (ip6h->nexthdr != IPPROTO_HOPOPTS)
+ if (ip6h->nexthdr != IPPROTO_HOPOPTS && ip6h->nexthdr != IPPROTO_ICMPV6)
return -ENOMSG;
nexthdr = ip6h->nexthdr;
@@ -63,9 +63,6 @@ static int ipv6_mc_check_exthdrs(struct sk_buff *skb)
if (offset < 0)
return -EINVAL;
- if (nexthdr != IPPROTO_ICMPV6)
- return -ENOMSG;
-
skb_set_transport_header(skb, offset);
return 0;
@@ -120,6 +117,7 @@ static int ipv6_mc_check_mld_msg(struct sk_buff *skb)
switch (mld->mld_type) {
case ICMPV6_MGM_REDUCTION:
case ICMPV6_MGM_REPORT:
+ case ICMPV6_NDISC_NBR_SOLICITATION:
/* fall through */
return 0;
case ICMPV6_MLD2_REPORT:
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -594,6 +594,7 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
ndisc_send_skb(skb, daddr, saddr);
}
+EXPORT_SYMBOL(ndisc_send_ns);
void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr,
const struct in6_addr *daddr)
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -334,5 +334,14 @@ endif # IP6_NF_NAT
endif # IP6_NF_IPTABLES
+config NF_IPV6_DUMMY_HEADER
+ tristate "Retain Dummy fragment header"
+ depends on NF_DEFRAG_IPV6
+ default n
+ help
+ This option allows retaining the dummy fragment header in an IPv6
+ packet. A dummy fragment header is a fragment header whose Fragment
+ Offset field and M bit are both 0.
+
endmenu
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -425,6 +425,10 @@ ip6t_do_table(struct sk_buff *skb,
}
if (table_base + v != ip6t_next_entry(e) &&
!(e->ipv6.flags & IP6T_F_GOTO)) {
+ if (unlikely(stackidx >= private->stacksize)) {
+ verdict = NF_DROP;
+ break;
+ }
jumpstack[stackidx++] = e;
}
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -601,6 +601,23 @@ struct sk_buff *nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 use
hdr = ipv6_hdr(clone);
fhdr = (struct frag_hdr *)skb_transport_header(clone);
+#if IS_ENABLED(CONFIG_NF_IPV6_DUMMY_HEADER)
+ /*
+ * Revoke dummy header removal by IPv6 reassembly code.
+ *
+ * Fragment header with MF and fragment offset field as 0, is a
+ * dummy fragment header.
+ *
+ * MAP-T's RFC mandates CE to add the dummy header in packets and
+ * adds its identification in its ID field. This field should be
+ * conserved and delivered to BR, which uses it to identify the
+ * particular CE.
+ */
+ if (unlikely((fhdr->frag_off & htons(IP6_OFFSET | IP6_MF)) == 0)) {
+ goto ret_orig;
+ }
+#endif
+
fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr,
skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
if (fq == NULL) {
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -66,9 +66,9 @@ static void dump_ipv6_packet(struct nf_log_buf *m,
/* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */
nf_log_buf_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
ntohs(ih->payload_len) + sizeof(struct ipv6hdr),
- (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20,
+ (ntohl(net_hdr_word(ih)) & 0x0ff00000) >> 20,
ih->hop_limit,
- (ntohl(*(__be32 *)ih) & 0x000fffff));
+ (ntohl(net_hdr_word(ih)) & 0x000fffff));
fragment = 0;
ptr = ip6hoff + sizeof(struct ipv6hdr);
--- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
@@ -21,6 +21,10 @@
#include <net/ipv6.h>
#include <net/netfilter/ipv6/nf_nat_masquerade.h>
+#define MAX_WORK_COUNT 16
+
+static atomic_t v6_worker_count;
+
unsigned int
nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range,
const struct net_device *out)
@@ -78,14 +82,78 @@ static struct notifier_block masq_dev_notifier = {
.notifier_call = masq_device_event,
};
+struct masq_dev_work {
+ struct work_struct work;
+ struct net *net;
+ int ifindex;
+};
+
+static void iterate_cleanup_work(struct work_struct *work)
+{
+ struct masq_dev_work *w;
+ long index;
+
+ w = container_of(work, struct masq_dev_work, work);
+
+ index = w->ifindex;
+ nf_ct_iterate_cleanup(w->net, device_cmp, (void *)index, 0, 0);
+
+ put_net(w->net);
+ kfree(w);
+ atomic_dec(&v6_worker_count);
+ module_put(THIS_MODULE);
+}
+
+/* ipv6 inet notifier is an atomic notifier, i.e. we cannot
+ * schedule.
+ *
+ * Unfortunately, nf_ct_iterate_cleanup can run for a long
+ * time if there are lots of conntracks and the system
+ * handles high softirq load, so it frequently calls cond_resched
+ * while iterating the conntrack table.
+ *
+ * So we defer nf_ct_iterate_cleanup walk to the system workqueue.
+ *
+ * As we can have 'a lot' of inet_events (depending on amount
+ * of ipv6 addresses being deleted), we also need to add an upper
+ * limit to the number of queued work items.
+ */
static int masq_inet_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
struct inet6_ifaddr *ifa = ptr;
- struct netdev_notifier_info info;
+ const struct net_device *dev;
+ struct masq_dev_work *w;
+ struct net *net;
+
+ if (event != NETDEV_DOWN ||
+ atomic_read(&v6_worker_count) >= MAX_WORK_COUNT)
+ return NOTIFY_DONE;
+
+ dev = ifa->idev->dev;
+ net = maybe_get_net(dev_net(dev));
+ if (!net)
+ return NOTIFY_DONE;
- netdev_notifier_info_init(&info, ifa->idev->dev);
- return masq_device_event(this, event, &info);
+ if (!try_module_get(THIS_MODULE))
+ goto err_module;
+
+ w = kmalloc(sizeof(*w), GFP_ATOMIC);
+ if (w) {
+ atomic_inc(&v6_worker_count);
+
+ INIT_WORK(&w->work, iterate_cleanup_work);
+ w->ifindex = dev->ifindex;
+ w->net = net;
+ schedule_work(&w->work);
+
+ return NOTIFY_DONE;
+ }
+
+ module_put(THIS_MODULE);
+ err_module:
+ put_net(net);
+ return NOTIFY_DONE;
}
static struct notifier_block masq_inet_notifier = {
--- a/net/ipv6/netfilter/nf_reject_ipv6.c
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -135,7 +135,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
struct sk_buff *nskb;
struct tcphdr _otcph;
const struct tcphdr *otcph;
- unsigned int otcplen, hh_len;
+ unsigned int otcplen;
const struct ipv6hdr *oip6h = ipv6_hdr(oldskb);
struct ipv6hdr *ip6h;
struct dst_entry *dst = NULL;
@@ -157,6 +157,17 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
fl6.daddr = oip6h->saddr;
fl6.fl6_sport = otcph->dest;
fl6.fl6_dport = otcph->source;
+
+ /* For a forwarded packet, skb->skb_iif is the incoming device's
+ * ifindex, but it is 0 for a locally generated skb; use the ifindex
+ * of dst->dev instead.
+ */
+ if (oldskb->skb_iif != 0)
+ fl6.flowi6_oif = oldskb->skb_iif;
+ else
+ fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev);
+
+ fl6.flowi6_mark = IP6_REPLY_MARK(net, oldskb->mark);
security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6));
dst = ip6_route_output(net, NULL, &fl6);
if (dst == NULL || dst->error) {
@@ -167,8 +178,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
if (IS_ERR(dst))
return;
- hh_len = (dst->dev->hard_header_len + 15)&~15;
- nskb = alloc_skb(hh_len + 15 + dst->header_len + sizeof(struct ipv6hdr)
+ nskb = alloc_skb(LL_MAX_HEADER + sizeof(struct ipv6hdr)
+ sizeof(struct tcphdr) + dst->trailer_len,
GFP_ATOMIC);
@@ -180,7 +190,9 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
skb_dst_set(nskb, dst);
- skb_reserve(nskb, hh_len + dst->header_len);
+ nskb->mark = fl6.flowi6_mark;
+
+ skb_reserve(nskb, LL_MAX_HEADER);
ip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP,
ip6_dst_hoplimit(dst));
nf_reject_ip6_tcphdr_put(nskb, oldskb, otcph, otcplen);
--- a/net/ipv6/protocol.c
+++ b/net/ipv6/protocol.c
@@ -72,3 +72,29 @@ int inet6_del_offload(const struct net_offload *prot, unsigned char protocol)
return ret;
}
EXPORT_SYMBOL(inet6_del_offload);
+
+int inet6_update_protocol(const struct inet6_protocol *new_prot,
+ unsigned char protocol, const struct inet6_protocol **old_prot)
+{
+ int ret;
+
+ rcu_read_lock();
+ *old_prot = rcu_dereference(inet6_protos[protocol]);
+ if (!*old_prot) {
+ rcu_read_unlock();
+ return -1;
+ }
+ rcu_read_unlock();
+
+ /*
+ * old_prot is not protected because cmpxchg succeeds only if
+ * old_prot still matches the value in inet6_protos[protocol]
+ */
+ ret = (cmpxchg((const struct inet6_protocol **)&inet6_protos[protocol],
+ *old_prot, new_prot) == *old_prot) ? 0 : -1;
+
+ synchronize_net();
+
+ return ret;
+}
+EXPORT_SYMBOL(inet6_update_protocol);
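
inet6_update_protocol() swaps an IPv6 protocol handler in one step and hands back the previous one, so a module can interpose itself and still fall through to the original handler. A sketch (not part of the patch; the handler names and the choice of IPPROTO_IPIP are assumptions):

static const struct inet6_protocol *example_orig_handler;

static int example_ipip6_rcv(struct sk_buff *skb)
{
	/* try to handle the packet here, otherwise hand it back */
	return example_orig_handler->handler(skb);
}

static const struct inet6_protocol example_proto = {
	.handler = example_ipip6_rcv,
	.flags   = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
};

static int example_takeover(void)
{
	/* returns 0 on success and stores the previous handler */
	return inet6_update_protocol(&example_proto, IPPROTO_IPIP,
				     &example_orig_handler);
}
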
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -90,6 +90,8 @@ static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
+static int ip6_pkt_policy_failed(struct sk_buff *skb);
+static int ip6_pkt_policy_failed_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb, u32 mtu);
@@ -175,6 +177,9 @@ static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
return dst_metrics_write_ptr(rt->dst.from);
}
+/* Define route change notification chain. */
+ATOMIC_NOTIFIER_HEAD(ip6route_chain);
+
static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
{
struct rt6_info *rt = (struct rt6_info *)dst;
@@ -297,6 +302,21 @@ static const struct rt6_info ip6_prohibit_entry_template = {
.rt6i_ref = ATOMIC_INIT(1),
};
+static const struct rt6_info ip6_policy_failed_entry_template = {
+ .dst = {
+ .__refcnt = ATOMIC_INIT(1),
+ .__use = 1,
+ .obsolete = DST_OBSOLETE_FORCE_CHK,
+ .error = -EACCES,
+ .input = ip6_pkt_policy_failed,
+ .output = ip6_pkt_policy_failed_out,
+ },
+ .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
+ .rt6i_protocol = RTPROT_KERNEL,
+ .rt6i_metric = ~(u32) 0,
+ .rt6i_ref = ATOMIC_INIT(1),
+};
+
static const struct rt6_info ip6_blk_hole_entry_template = {
.dst = {
.__refcnt = ATOMIC_INIT(1),
@@ -1885,6 +1905,11 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
rt->dst.output = ip6_pkt_prohibit_out;
rt->dst.input = ip6_pkt_prohibit;
break;
+ case RTN_POLICY_FAILED:
+ rt->dst.error = -EACCES;
+ rt->dst.output = ip6_pkt_policy_failed_out;
+ rt->dst.input = ip6_pkt_policy_failed;
+ break;
case RTN_THROW:
case RTN_UNREACHABLE:
default:
@@ -2012,6 +2037,9 @@ int ip6_route_add(struct fib6_config *cfg)
goto out;
err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
+ if (!err)
+ atomic_notifier_call_chain(&ip6route_chain,
+ RTM_NEWROUTE, rt);
kfree(mxc.mx);
@@ -2040,6 +2068,9 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
err = fib6_del(rt, info);
write_unlock_bh(&table->tb6_lock);
+ if (!err)
+ atomic_notifier_call_chain(&ip6route_chain,
+ RTM_DELROUTE, rt);
out:
ip6_rt_put(rt);
return err;
@@ -2486,6 +2517,17 @@ static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff
return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
+static int ip6_pkt_policy_failed(struct sk_buff *skb)
+{
+ return ip6_pkt_drop(skb, ICMPV6_POLICY_FAIL, IPSTATS_MIB_INNOROUTES);
+}
+
+static int ip6_pkt_policy_failed_out(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ skb->dev = skb_dst(skb)->dev;
+ return ip6_pkt_drop(skb, ICMPV6_POLICY_FAIL, IPSTATS_MIB_OUTNOROUTES);
+}
+
/*
* Allocate a dst for local (unicast / anycast) address.
*/
@@ -2728,7 +2770,8 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
if (rtm->rtm_type == RTN_UNREACHABLE ||
rtm->rtm_type == RTN_BLACKHOLE ||
rtm->rtm_type == RTN_PROHIBIT ||
- rtm->rtm_type == RTN_THROW)
+ rtm->rtm_type == RTN_THROW ||
+ rtm->rtm_type == RTN_POLICY_FAILED)
cfg->fc_flags |= RTF_REJECT;
if (rtm->rtm_type == RTN_LOCAL)
@@ -3087,6 +3130,9 @@ static int rt6_fill_node(struct net *net,
case -EACCES:
rtm->rtm_type = RTN_PROHIBIT;
break;
+ case -EPERM:
+ rtm->rtm_type = RTN_POLICY_FAILED;
+ break;
case -EAGAIN:
rtm->rtm_type = RTN_THROW;
break;
@@ -3363,6 +3409,8 @@ static int ip6_route_dev_notify(struct notifier_block *this,
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
net->ipv6.ip6_prohibit_entry->dst.dev = dev;
net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
+ net->ipv6.ip6_policy_failed_entry->dst.dev = dev;
+ net->ipv6.ip6_policy_failed_entry->rt6i_idev = in6_dev_get(dev);
net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
@@ -3371,6 +3419,18 @@ static int ip6_route_dev_notify(struct notifier_block *this,
return NOTIFY_OK;
}
+int rt6_register_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_register(&ip6route_chain, nb);
+}
+EXPORT_SYMBOL(rt6_register_notifier);
+
+int rt6_unregister_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&ip6route_chain, nb);
+}
+EXPORT_SYMBOL(rt6_unregister_notifier);
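
Modules that want to mirror IPv6 route changes (for example into an offload FIB) can subscribe to the new notifier chain; since it is an atomic chain, the callback runs in atomic context and receives the affected rt6_info. A subscriber sketch (not part of the patch; what the callback does with the route is an assumption):

static int example_rt6_event(struct notifier_block *nb, unsigned long event,
			     void *ptr)
{
	struct rt6_info *rt = ptr;

	/* atomic notifier chain: this runs in atomic context */
	switch (event) {
	case RTM_NEWROUTE:
		pr_debug("new route, metric %u\n", rt->rt6i_metric);
		break;
	case RTM_DELROUTE:
		pr_debug("route deleted, metric %u\n", rt->rt6i_metric);
		break;
	}

	return NOTIFY_DONE;
}

static struct notifier_block example_rt6_nb = {
	.notifier_call = example_rt6_event,
};

/* rt6_register_notifier(&example_rt6_nb) at module init,
 * rt6_unregister_notifier(&example_rt6_nb) at module exit.
 */
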
+
/*
* /proc
*/
@@ -3579,6 +3639,17 @@ static int __net_init ip6_route_net_init(struct net *net)
net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
ip6_template_metrics, true);
+
+ net->ipv6.ip6_policy_failed_entry =
+ kmemdup(&ip6_policy_failed_entry_template,
+ sizeof(*net->ipv6.ip6_policy_failed_entry), GFP_KERNEL);
+ if (!net->ipv6.ip6_policy_failed_entry)
+ goto out_ip6_blk_hole_entry;
+ net->ipv6.ip6_policy_failed_entry->dst.path =
+ (struct dst_entry *)net->ipv6.ip6_policy_failed_entry;
+ net->ipv6.ip6_policy_failed_entry->dst.ops = &net->ipv6.ip6_dst_ops;
+ dst_init_metrics(&net->ipv6.ip6_policy_failed_entry->dst,
+ ip6_template_metrics, true);
#endif
net->ipv6.sysctl.flush_delay = 0;
@@ -3597,6 +3668,8 @@ out:
return ret;
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+out_ip6_blk_hole_entry:
+ kfree(net->ipv6.ip6_blk_hole_entry);
out_ip6_prohibit_entry:
kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
@@ -3614,6 +3687,7 @@ static void __net_exit ip6_route_net_exit(struct net *net)
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
kfree(net->ipv6.ip6_prohibit_entry);
kfree(net->ipv6.ip6_blk_hole_entry);
+ kfree(net->ipv6.ip6_policy_failed_entry);
#endif
dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
@@ -3711,6 +3785,9 @@ int __init ip6_route_init(void)
init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
+ init_net.ipv6.ip6_policy_failed_entry->dst.dev = init_net.loopback_dev;
+ init_net.ipv6.ip6_policy_failed_entry->rt6i_idev =
+ in6_dev_get(init_net.loopback_dev);
#endif
ret = fib6_init();
if (ret)
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -87,6 +87,21 @@ struct sit_net {
struct net_device *fb_tunnel_dev;
};
+void ipip6_update_offload_stats(struct net_device *dev, void *ptr)
+{
+ struct pcpu_sw_netstats *tstats = per_cpu_ptr(dev->tstats, 0);
+ const struct pcpu_sw_netstats *offload_stats =
+ (struct pcpu_sw_netstats *)ptr;
+
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->tx_packets += offload_stats->tx_packets;
+ tstats->tx_bytes += offload_stats->tx_bytes;
+ tstats->rx_packets += offload_stats->rx_packets;
+ tstats->rx_bytes += offload_stats->rx_bytes;
+ u64_stats_update_end(&tstats->syncp);
+}
+EXPORT_SYMBOL(ipip6_update_offload_stats);
+
/*
* Must be invoked with rcu_read_lock
*/
@@ -709,6 +724,8 @@ static int ipip6_rcv(struct sk_buff *skb)
tstats->rx_bytes += skb->len;
u64_stats_update_end(&tstats->syncp);
+ /* Reset the skb_iif to the tunnel's interface index */
+ skb->skb_iif = tunnel->dev->ifindex;
netif_rx(skb);
return 0;
@@ -984,6 +1001,8 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
skb_set_inner_ipproto(skb, IPPROTO_IPV6);
+ /* Reset the skb_iif to the tunnel's interface index */
+ skb->skb_iif = tunnel->dev->ifindex;
err = iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr,
protocol, tos, ttl, df,
!net_eq(tunnel->net, dev_net(dev)));
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -39,6 +39,7 @@
#include <linux/ipsec.h>
#include <linux/times.h>
#include <linux/slab.h>
+#include <asm/unaligned.h>
#include <linux/uaccess.h>
#include <linux/ipv6.h>
#include <linux/icmpv6.h>
@@ -781,10 +782,10 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
topt = (__be32 *)(t1 + 1);
if (tsecr) {
- *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
- (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP);
- *topt++ = htonl(tsval);
- *topt++ = htonl(tsecr);
+ put_unaligned_be32((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+ (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP, topt++);
+ put_unaligned_be32(tsval, topt++);
+ put_unaligned_be32(tsecr, topt++);
}
#ifdef CONFIG_TCP_MD5SIG
@@ -1033,6 +1034,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
#endif
+ newnp->ipv6_mc_list = NULL;
newnp->ipv6_ac_list = NULL;
newnp->ipv6_fl_list = NULL;
newnp->pktoptions = NULL;
@@ -1102,6 +1104,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
First: no IPv4 options.
*/
newinet->inet_opt = NULL;
+ newnp->ipv6_mc_list = NULL;
newnp->ipv6_ac_list = NULL;
newnp->ipv6_fl_list = NULL;
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -1168,11 +1168,10 @@ static int ipxitf_ioctl(unsigned int cmd, void __user *arg)
sipx->sipx_network = ipxif->if_netnum;
memcpy(sipx->sipx_node, ipxif->if_node,
sizeof(sipx->sipx_node));
- rc = -EFAULT;
+ rc = 0;
if (copy_to_user(arg, &ifr, sizeof(ifr)))
- break;
+ rc = -EFAULT;
ipxitf_put(ipxif);
- rc = 0;
break;
}
case SIOCAIPXITFCRT:
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -187,30 +187,22 @@ static int pfkey_release(struct socket *sock)
return 0;
}
-static int pfkey_broadcast_one(struct sk_buff *skb, struct sk_buff **skb2,
- gfp_t allocation, struct sock *sk)
+static int pfkey_broadcast_one(struct sk_buff *skb, gfp_t allocation,
+ struct sock *sk)
{
int err = -ENOBUFS;
- sock_hold(sk);
- if (*skb2 == NULL) {
- if (atomic_read(&skb->users) != 1) {
- *skb2 = skb_clone(skb, allocation);
- } else {
- *skb2 = skb;
- atomic_inc(&skb->users);
- }
- }
- if (*skb2 != NULL) {
- if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) {
- skb_set_owner_r(*skb2, sk);
- skb_queue_tail(&sk->sk_receive_queue, *skb2);
- sk->sk_data_ready(sk);
- *skb2 = NULL;
- err = 0;
- }
+ if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
+ return err;
+
+ skb = skb_clone(skb, allocation);
+
+ if (skb) {
+ skb_set_owner_r(skb, sk);
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ sk->sk_data_ready(sk);
+ err = 0;
}
- sock_put(sk);
return err;
}
@@ -225,7 +217,6 @@ static int pfkey_broadcast(struct sk_buff *skb,
{
struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id);
struct sock *sk;
- struct sk_buff *skb2 = NULL;
int err = -ESRCH;
/* XXX Do we need something like netlink_overrun? I think
@@ -244,7 +235,7 @@ static int pfkey_broadcast(struct sk_buff *skb,
* socket.
*/
if (pfk->promisc)
- pfkey_broadcast_one(skb, &skb2, GFP_ATOMIC, sk);
+ pfkey_broadcast_one(skb, GFP_ATOMIC, sk);
/* the exact target will be processed later */
if (sk == one_sk)
@@ -259,7 +250,7 @@ static int pfkey_broadcast(struct sk_buff *skb,
continue;
}
- err2 = pfkey_broadcast_one(skb, &skb2, GFP_ATOMIC, sk);
+ err2 = pfkey_broadcast_one(skb, GFP_ATOMIC, sk);
/* Error is cleared after successful sending to at least one
* registered KM */
@@ -269,9 +260,8 @@ static int pfkey_broadcast(struct sk_buff *skb,
rcu_read_unlock();
if (one_sk != NULL)
- err = pfkey_broadcast_one(skb, &skb2, GFP_KERNEL, one_sk);
+ err = pfkey_broadcast_one(skb, GFP_KERNEL, one_sk);
- kfree_skb(skb2);
kfree_skb(skb);
return err;
}
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -365,6 +365,30 @@ struct l2tp_tunnel *l2tp_tunnel_find_nth(struct net *net, int nth)
}
EXPORT_SYMBOL_GPL(l2tp_tunnel_find_nth);
+void l2tp_stats_update(struct l2tp_tunnel *tunnel,
+ struct l2tp_session *session,
+ struct l2tp_stats *stats)
+{
+ atomic_long_add(atomic_long_read(&stats->rx_packets),
+ &tunnel->stats.rx_packets);
+ atomic_long_add(atomic_long_read(&stats->rx_bytes),
+ &tunnel->stats.rx_bytes);
+ atomic_long_add(atomic_long_read(&stats->tx_packets),
+ &tunnel->stats.tx_packets);
+ atomic_long_add(atomic_long_read(&stats->tx_bytes),
+ &tunnel->stats.tx_bytes);
+
+ atomic_long_add(atomic_long_read(&stats->rx_packets),
+ &session->stats.rx_packets);
+ atomic_long_add(atomic_long_read(&stats->rx_bytes),
+ &session->stats.rx_bytes);
+ atomic_long_add(atomic_long_read(&stats->tx_packets),
+ &session->stats.tx_packets);
+ atomic_long_add(atomic_long_read(&stats->tx_bytes),
+ &session->stats.tx_bytes);
+}
+EXPORT_SYMBOL_GPL(l2tp_stats_update);
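
l2tp_stats_update() adds a caller-supplied set of counters to both the tunnel and the session statistics. A sketch of how an offload path might use it (not part of the patch; the counter source is an assumption):

static void example_l2tp_sync(struct l2tp_tunnel *tunnel,
			      struct l2tp_session *session,
			      u64 rx_pkts, u64 rx_bytes,
			      u64 tx_pkts, u64 tx_bytes)
{
	struct l2tp_stats delta = { };

	atomic_long_set(&delta.rx_packets, rx_pkts);
	atomic_long_set(&delta.rx_bytes, rx_bytes);
	atomic_long_set(&delta.tx_packets, tx_pkts);
	atomic_long_set(&delta.tx_bytes, tx_bytes);

	l2tp_stats_update(tunnel, session, &delta);
}
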
+
/*****************************************************************************
* Receive data handling
*****************************************************************************/
@@ -1198,7 +1222,6 @@ static void l2tp_tunnel_destruct(struct sock *sk)
l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: closing...\n", tunnel->name);
-
/* Disable udp encapsulation */
switch (tunnel->encap) {
case L2TP_ENCAPTYPE_UDP:
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -247,6 +247,8 @@ struct l2tp_session *l2tp_session_find_nth(struct l2tp_tunnel *tunnel, int nth);
struct l2tp_session *l2tp_session_find_by_ifname(struct net *net, char *ifname);
struct l2tp_tunnel *l2tp_tunnel_find(struct net *net, u32 tunnel_id);
struct l2tp_tunnel *l2tp_tunnel_find_nth(struct net *net, int nth);
+void l2tp_stats_update(struct l2tp_tunnel *tunnel, struct l2tp_session *session,
+ struct l2tp_stats *stats);
int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id,
u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg,
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -95,7 +95,11 @@ static int l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev)
struct l2tp_eth *priv = netdev_priv(dev);
struct l2tp_session *session = priv->session;
unsigned int len = skb->len;
- int ret = l2tp_xmit_skb(session, skb, session->hdr_len);
+ int ret;
+
+ skb->skb_iif = dev->ifindex;
+
+ ret = l2tp_xmit_skb(session, skb, session->hdr_len);
if (likely(ret == NET_XMIT_SUCCESS)) {
atomic_long_add(len, &priv->tx_bytes);
@@ -133,6 +137,7 @@ static void l2tp_eth_dev_setup(struct net_device *dev)
{
ether_setup(dev);
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+ dev->priv_flags |= IFF_PPP_L2TPV3;
dev->features |= NETIF_F_LLTX;
dev->netdev_ops = &l2tp_eth_netdev_ops;
dev->destructor = free_netdev;
@@ -166,6 +171,8 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb,
skb_dst_drop(skb);
nf_reset(skb);
+ skb->skb_iif = dev->ifindex;
+
if (dev_forward_skb(dev, skb) == NET_RX_SUCCESS) {
atomic_long_inc(&priv->rx_packets);
atomic_long_add(data_len, &priv->rx_bytes);
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -98,6 +98,7 @@
#include <net/udp.h>
#include <net/xfrm.h>
#include <net/inet_common.h>
+#include <linux/if_pppox.h>
#include <asm/byteorder.h>
#include <linux/atomic.h>
@@ -131,9 +132,16 @@ struct pppol2tp_session {
};
static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb);
-
-static const struct ppp_channel_ops pppol2tp_chan_ops = {
- .start_xmit = pppol2tp_xmit,
+static int pppol2tp_get_channel_protocol(struct ppp_channel *);
+static int pppol2tp_get_channel_protocol_ver(struct ppp_channel *);
+static void pppol2tp_hold_chan(struct ppp_channel *);
+static void pppol2tp_release_chan(struct ppp_channel *);
+static const struct pppol2tp_channel_ops pppol2tp_chan_ops = {
+ .ops.start_xmit = pppol2tp_xmit,
+ .ops.get_channel_protocol = pppol2tp_get_channel_protocol,
+ .ops.get_channel_protocol_ver = pppol2tp_get_channel_protocol_ver,
+ .ops.hold = pppol2tp_hold_chan,
+ .ops.release = pppol2tp_release_chan,
};
static const struct proto_ops pppol2tp_ops;
@@ -251,6 +259,7 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int
nf_reset(skb);
po = pppox_sk(sk);
+ skb->skb_iif = ppp_dev_index(&po->chan);
ppp_input(&po->chan, skb);
} else {
l2tp_dbg(session, PPPOL2TP_MSG_DATA,
@@ -368,6 +377,126 @@ error:
return error;
}
+/* pppol2tp_hold_chan() */
+static void pppol2tp_hold_chan(struct ppp_channel *chan)
+{
+ struct sock *sk = (struct sock *)chan->private;
+
+ sock_hold(sk);
+}
+
+/* pppol2tp_release_chan() */
+static void pppol2tp_release_chan(struct ppp_channel *chan)
+{
+ struct sock *sk = (struct sock *)chan->private;
+
+ sock_put(sk);
+}
+
+/* pppol2tp_get_channel_protocol()
+ * Return the protocol type of the L2TP over PPP protocol
+ */
+static int pppol2tp_get_channel_protocol(struct ppp_channel *chan)
+{
+ return PX_PROTO_OL2TP;
+}
+
+/* pppol2tp_get_channel_protocol_ver()
+ * Return the protocol version of the L2TP over PPP protocol
+ */
+static int pppol2tp_get_channel_protocol_ver(struct ppp_channel *chan)
+{
+ struct sock *sk;
+ struct l2tp_session *session;
+ struct l2tp_tunnel *tunnel;
+ struct pppol2tp_session *ps;
+ int version = 0;
+
+ if (chan && chan->private)
+ sk = (struct sock *)chan->private;
+ else
+ return -1;
+
+ /* Get session and tunnel contexts from the socket */
+ session = pppol2tp_sock_to_session(sk);
+ if (!session)
+ return -1;
+
+ ps = l2tp_session_priv(session);
+ if (!ps->tunnel_sock) {
+ sock_put(sk);
+ return -1;
+ }
+
+ tunnel = l2tp_sock_to_tunnel(ps->tunnel_sock);
+ if (!tunnel) {
+ sock_put(sk);
+ return -1;
+ }
+
+ version = tunnel->version;
+
+ sock_put(ps->tunnel_sock);
+ sock_put(sk);
+
+ return version;
+}
+
+/* pppol2tp_get_addressing() */
+static int pppol2tp_get_addressing(struct ppp_channel *chan,
+ struct pppol2tp_common_addr *addr)
+{
+ struct sock *sk = (struct sock *)chan->private;
+ struct sock *sk_tun;
+ struct l2tp_session *session;
+ struct l2tp_tunnel *tunnel;
+ struct pppol2tp_session *ps;
+ struct inet_sock *isk = NULL;
+ int err = -ENXIO;
+
+ /* Get session and tunnel contexts from the socket */
+ session = pppol2tp_sock_to_session(sk);
+ if (!session)
+ return err;
+
+ ps = l2tp_session_priv(session);
+ sk_tun = ps->tunnel_sock;
+ if (!sk_tun) {
+ sock_put(sk);
+ return err;
+ }
+
+ tunnel = l2tp_sock_to_tunnel(sk_tun);
+ if (!tunnel) {
+ sock_put(sk_tun);
+ sock_put(sk);
+ return err;
+ }
+ isk = inet_sk(ps->tunnel_sock);
+
+ addr->local_tunnel_id = tunnel->tunnel_id;
+ addr->remote_tunnel_id = tunnel->peer_tunnel_id;
+ addr->local_session_id = session->session_id;
+ addr->remote_session_id = session->peer_session_id;
+
+ addr->local_addr.sin_port = isk->inet_sport;
+ addr->remote_addr.sin_port = isk->inet_dport;
+ addr->local_addr.sin_addr.s_addr = isk->inet_saddr;
+ addr->remote_addr.sin_addr.s_addr = isk->inet_daddr;
+
+ sock_put(sk_tun);
+ sock_put(sk);
+ return 0;
+}
+
+/* pppol2tp_channel_addressing_get() */
+int pppol2tp_channel_addressing_get(struct ppp_channel *chan,
+ struct pppol2tp_common_addr *addr)
+{
+ return pppol2tp_get_addressing(chan, addr);
+}
+EXPORT_SYMBOL(pppol2tp_channel_addressing_get);
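
With the new channel ops in place, a caller holding a ppp_channel whose get_channel_protocol op reports PX_PROTO_OL2TP can retrieve the tunnel/session IDs and UDP addressing through the exported helper. A caller sketch (not part of the patch; the surrounding checks are assumptions). Note that, as implemented above, the helper fills the IDs and socket addresses but leaves tunnel_version untouched.

static void example_inspect_l2tp_chan(struct ppp_channel *chan)
{
	struct pppol2tp_common_addr addr;

	if (pppol2tp_channel_addressing_get(chan, &addr))
		return;

	pr_debug("l2tp tunnel %u/%u session %u/%u sport %u dport %u\n",
		 addr.local_tunnel_id, addr.remote_tunnel_id,
		 addr.local_session_id, addr.remote_session_id,
		 ntohs(addr.local_addr.sin_port),
		 ntohs(addr.remote_addr.sin_port));
}
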
+
/* Transmit function called by generic PPP driver. Sends PPP frame
* over PPPoL2TP socket.
*
@@ -421,6 +550,10 @@ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb)
__skb_push(skb, sizeof(ppph));
skb->data[0] = ppph[0];
skb->data[1] = ppph[1];
+ /* set the incoming interface to the PPP device's ifindex */
+ if ((skb->protocol == htons(ETH_P_IP)) ||
+ (skb->protocol == htons(ETH_P_IPV6)))
+ skb->skb_iif = ppp_dev_index(chan);
local_bh_disable();
l2tp_xmit_skb(session, skb, session->hdr_len);
@@ -779,7 +912,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
po->chan.hdrlen = PPPOL2TP_L2TP_HDR_SIZE_NOSEQ;
po->chan.private = sk;
- po->chan.ops = &pppol2tp_chan_ops;
+ po->chan.ops = &pppol2tp_chan_ops.ops;
po->chan.mtu = session->mtu;
error = ppp_register_net_channel(sock_net(sk), &po->chan);
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -156,10 +156,10 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata,
memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap));
ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap);
+ memset(chandef, 0, sizeof(struct cfg80211_chan_def));
chandef->chan = channel;
chandef->width = NL80211_CHAN_WIDTH_20_NOHT;
chandef->center_freq1 = channel->center_freq;
- chandef->center_freq2 = 0;
if (!ht_cap || !ht_oper || !sta_ht_cap.ht_supported) {
ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT;
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -248,6 +248,8 @@ static int ieee80211_tx_radiotap_len(struct ieee80211_tx_info *info)
/* IEEE80211_RADIOTAP_RATE rate */
if (info->status.rates[0].idx >= 0 &&
!(info->status.rates[0].flags & (IEEE80211_TX_RC_MCS |
+ RATE_INFO_FLAGS_DMG |
+ RATE_INFO_FLAGS_EDMG |
IEEE80211_TX_RC_VHT_MCS)))
len += 2;
@@ -299,6 +301,8 @@ ieee80211_add_tx_radiotap_header(struct ieee80211_local *local,
/* IEEE80211_RADIOTAP_RATE */
if (info->status.rates[0].idx >= 0 &&
!(info->status.rates[0].flags & (IEEE80211_TX_RC_MCS |
+ RATE_INFO_FLAGS_DMG |
+ RATE_INFO_FLAGS_EDMG |
IEEE80211_TX_RC_VHT_MCS))) {
u16 rate;
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -10,7 +10,7 @@ config NETFILTER_INGRESS
infrastructure.
config NETFILTER_NETLINK
- tristate
+ tristate "Netfilter NFNETLINK interface"
config NETFILTER_NETLINK_ACCT
tristate "Netfilter NFACCT over NFNETLINK interface"
@@ -114,6 +114,18 @@ config NF_CONNTRACK_EVENTS
If unsure, say `N'.
+config NF_CONNTRACK_RTCACHE
+ tristate "Cache route entries in conntrack objects"
+ depends on NETFILTER_ADVANCED
+ depends on NF_CONNTRACK
+ help
+ If this option is enabled, the connection tracking code will
+ cache routing information for each connection that is being
+ forwarded, at a cost of 32 bytes per conntrack object.
+
+ To compile it as a module, choose M here. If unsure, say N.
+ The module will be called nf_conntrack_rtcache.
+
config NF_CONNTRACK_TIMEOUT
bool 'Connection tracking timeout'
depends on NETFILTER_ADVANCED
@@ -124,6 +136,21 @@ config NF_CONNTRACK_TIMEOUT
If unsure, say `N'.
+config NF_CONNTRACK_DSCPREMARK_EXT
+ bool 'Connection tracking extension for dscp remark target'
+ depends on NETFILTER_ADVANCED
+ help
+ This option enables support for the connection tracking extension
+ used by the DSCP remark target.
+
+config NF_CONNTRACK_CHAIN_EVENTS
+ bool "Register multiple callbacks to ct events"
+ depends on NF_CONNTRACK_EVENTS
+ help
+ Allow multiple notifier callbacks to be registered for conntrack events.
+
+ If unsure, say `N'.
+
config NF_CONNTRACK_TIMESTAMP
bool 'Connection tracking timestamping'
depends on NETFILTER_ADVANCED
@@ -206,7 +233,6 @@ config NF_CONNTRACK_FTP
config NF_CONNTRACK_H323
tristate "H.323 protocol support"
- depends on IPV6 || IPV6=n
depends on NETFILTER_ADVANCED
help
H.323 is a VoIP signalling protocol from ITU-T. As one of the most
@@ -420,6 +446,15 @@ config NF_NAT_TFTP
depends on NF_CONNTRACK && NF_NAT
default NF_NAT && NF_CONNTRACK_TFTP
+config NF_NAT_TRY_NEXT_RULE
+ tristate
+ depends on NF_CONNTRACK && NF_NAT
+ default n
+ help
+ If this option is enabled, iptables will move on to the next rule
+ in the chain when a unique tuple cannot be found for translation by
+ the currently matched rule.
+
config NF_NAT_REDIRECT
tristate "IPv4/IPv6 redirect support"
depends on NF_NAT
@@ -918,7 +953,6 @@ config NETFILTER_XT_TARGET_SECMARK
config NETFILTER_XT_TARGET_TCPMSS
tristate '"TCPMSS" target support'
- depends on IPV6 || IPV6=n
default m if NETFILTER_ADVANCED=n
---help---
This option adds a `TCPMSS' target, which allows you to alter the
@@ -1170,6 +1204,13 @@ config NETFILTER_XT_MATCH_IPCOMP
To compile it as a module, choose M here. If unsure, say N.
+config NETFILTER_XT_MATCH_ID
+ tristate '"id" match support'
+ depends on NETFILTER_ADVANCED
+ ---help---
+ This option adds an `id' dummy match, which allows you to attach
+ numeric IDs to rules in your iptables ruleset.
+
config NETFILTER_XT_MATCH_IPRANGE
tristate '"iprange" address range match support'
depends on NETFILTER_ADVANCED
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -5,6 +5,7 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o
+nf_conntrack-$(CONFIG_NF_CONNTRACK_DSCPREMARK_EXT) += nf_conntrack_dscpremark_ext.o
obj-$(CONFIG_NETFILTER) = netfilter.o
@@ -16,6 +17,9 @@ obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
# connection tracking
obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o
+# optional conntrack route cache extension
+obj-$(CONFIG_NF_CONNTRACK_RTCACHE) += nf_conntrack_rtcache.o
+
# SCTP protocol connection tracking
obj-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
obj-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o
@@ -141,6 +145,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_ESP) += xt_esp.o
obj-$(CONFIG_NETFILTER_XT_MATCH_HASHLIMIT) += xt_hashlimit.o
obj-$(CONFIG_NETFILTER_XT_MATCH_HELPER) += xt_helper.o
obj-$(CONFIG_NETFILTER_XT_MATCH_HL) += xt_hl.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_ID) += xt_id.o
obj-$(CONFIG_NETFILTER_XT_MATCH_IPCOMP) += xt_ipcomp.o
obj-$(CONFIG_NETFILTER_XT_MATCH_IPRANGE) += xt_iprange.o
obj-$(CONFIG_NETFILTER_XT_MATCH_IPVS) += xt_ipvs.o
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2804,7 +2804,7 @@ static struct genl_family ip_vs_genl_family = {
.hdrsize = 0,
.name = IPVS_GENL_NAME,
.version = IPVS_GENL_VERSION,
- .maxattr = IPVS_CMD_MAX,
+ .maxattr = IPVS_CMD_ATTR_MAX,
.netnsok = true, /* Make ipvsadm to work on netns */
};
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -47,6 +47,7 @@
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_timestamp.h>
#include <net/netfilter/nf_conntrack_timeout.h>
+#include <net/netfilter/nf_conntrack_dscpremark_ext.h>
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
#include <net/netfilter/nf_nat.h>
@@ -237,7 +238,7 @@ EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
static void
clean_from_lists(struct nf_conn *ct)
{
- pr_debug("clean_from_lists(%p)\n", ct);
+ pr_debug("clean_from_lists(%pK)\n", ct);
hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
@@ -330,7 +331,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
struct net *net = nf_ct_net(ct);
struct nf_conntrack_l4proto *l4proto;
- pr_debug("destroy_conntrack(%p)\n", ct);
+ pr_debug("destroy_conntrack(%pK)\n", ct);
NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
NF_CT_ASSERT(!timer_pending(&ct->timeout));
@@ -361,7 +362,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
if (ct->master)
nf_ct_put(ct->master);
- pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
+ pr_debug("destroy_conntrack: returning ct=%pK to slab\n", ct);
nf_conntrack_free(ct);
}
@@ -629,7 +630,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
* confirmed us.
*/
NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
- pr_debug("Confirming conntrack %p\n", ct);
+ pr_debug("Confirming conntrack %pK\n", ct);
/* We have to check the DYING flag after unlink to prevent
* a race against nf_ct_get_next_corpse() possibly called from
* user context, else we insert an already 'dead' hash, blocking
@@ -961,6 +962,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
nf_ct_acct_ext_add(ct, GFP_ATOMIC);
nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
nf_ct_labels_ext_add(ct);
+ nf_ct_dscpremark_ext_add(ct, GFP_ATOMIC);
ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
@@ -972,7 +974,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
spin_lock(&nf_conntrack_expect_lock);
exp = nf_ct_find_expectation(net, zone, tuple);
if (exp) {
- pr_debug("conntrack: expectation arrives ct=%p exp=%p\n",
+ pr_debug("conntrack: expectation arrives ct=%pK exp=%pK\n",
ct, exp);
/* Welcome, Mr. Bond. We've been expecting you... */
__set_bit(IPS_EXPECTED_BIT, &ct->status);
@@ -1063,14 +1065,14 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
} else {
/* Once we've had two way comms, always ESTABLISHED. */
if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
- pr_debug("nf_conntrack_in: normal packet for %p\n", ct);
+ pr_debug("nf_conntrack_in:normal packet for %pK\n", ct);
*ctinfo = IP_CT_ESTABLISHED;
} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
- pr_debug("nf_conntrack_in: related packet for %p\n",
+ pr_debug("nf_conntrack_in: related packet for %pK\n",
ct);
*ctinfo = IP_CT_RELATED;
} else {
- pr_debug("nf_conntrack_in: new packet for %p\n", ct);
+ pr_debug("nf_conntrack_in: new packet for %pK\n", ct);
*ctinfo = IP_CT_NEW;
}
*set_reply = 0;
@@ -1212,7 +1214,7 @@ void nf_conntrack_alter_reply(struct nf_conn *ct,
/* Should be unconfirmed, so not in hash table yet */
NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
- pr_debug("Altering reply tuple of %p to ", ct);
+ pr_debug("Altering reply tuple of %pK to ", ct);
nf_ct_dump_tuple(newreply);
ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
@@ -1394,6 +1396,7 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
}
spin_unlock(lockp);
local_bh_enable();
+ cond_resched();
}
for_each_possible_cpu(cpu) {
@@ -1406,6 +1409,7 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
set_bit(IPS_DYING_BIT, &ct->status);
}
spin_unlock_bh(&pcpu->lock);
+ cond_resched();
}
return NULL;
found:
@@ -1422,6 +1426,8 @@ void nf_ct_iterate_cleanup(struct net *net,
struct nf_conn *ct;
unsigned int bucket = 0;
+ might_sleep();
+
while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
/* Time to push up daises... */
if (del_timer(&ct->timeout))
@@ -1430,6 +1436,7 @@ void nf_ct_iterate_cleanup(struct net *net,
/* ... else the timer will get him soon. */
nf_ct_put(ct);
+ cond_resched();
}
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);
@@ -1478,6 +1485,7 @@ void nf_conntrack_cleanup_end(void)
nf_conntrack_proto_fini();
nf_conntrack_seqadj_fini();
nf_conntrack_labels_fini();
+ nf_conntrack_dscpremark_ext_fini();
nf_conntrack_helper_fini();
nf_conntrack_timeout_fini();
nf_conntrack_ecache_fini();
@@ -1666,6 +1674,10 @@ int nf_conntrack_init_start(void)
NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
nf_conntrack_max);
+ ret = nf_conntrack_dscpremark_ext_init();
+ if (ret < 0)
+ goto err_dscpremark_ext;
+
ret = nf_conntrack_expect_init();
if (ret < 0)
goto err_expect;
@@ -1738,6 +1750,8 @@ err_tstamp:
err_acct:
nf_conntrack_expect_fini();
err_expect:
+ nf_conntrack_dscpremark_ext_fini();
+err_dscpremark_ext:
return ret;
}
@@ -1817,6 +1831,10 @@ int nf_conntrack_init_net(struct net *net)
ret = nf_conntrack_proto_pernet_init(net);
if (ret < 0)
goto err_proto;
+
+#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
+ ATOMIC_INIT_NOTIFIER_HEAD(&net->ct.nf_conntrack_chain);
+#endif
return 0;
err_proto:
new file mode 100644
--- /dev/null
+++ b/net/netfilter/nf_conntrack_dscpremark_ext.c
@@ -0,0 +1,92 @@
+/*
+ **************************************************************************
+ * Copyright (c) 2014-2015, The Linux Foundation. All rights reserved.
+ * Permission to use, copy, modify, and/or distribute this software for
+ * any purpose with or without fee is hereby granted, provided that the
+ * above copyright notice and this permission notice appear in all copies.
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+ * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ **************************************************************************
+ */
+
+/* DSCP remark handling conntrack extension registration. */
+
+#include <linux/netfilter.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+#include <linux/export.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_dscpremark_ext.h>
+
+/* DSCP remark conntrack extension type declaration */
+static struct nf_ct_ext_type dscpremark_extend __read_mostly = {
+ .len = sizeof(struct nf_ct_dscpremark_ext),
+ .align = __alignof__(struct nf_ct_dscpremark_ext),
+ .id = NF_CT_EXT_DSCPREMARK,
+};
+
+/* nf_conntrack_dscpremark_ext_init()
+ * Initializes the DSCP remark conntrack extension.
+ */
+int nf_conntrack_dscpremark_ext_init(void)
+{
+ int ret;
+
+ ret = nf_ct_extend_register(&dscpremark_extend);
+ if (ret < 0) {
+ pr_warn("nf_conntrack_dscpremark: Unable to register extension\n");
+ return ret;
+ }
+
+ return 0;
+}
+
+/* nf_conntrack_dscpremark_ext_set_dscp_rule_valid()
+ * Set DSCP rule validity flag in the extension
+ */
+int nf_conntrack_dscpremark_ext_set_dscp_rule_valid(struct nf_conn *ct)
+{
+ struct nf_ct_dscpremark_ext *ncde;
+
+ ncde = nf_ct_dscpremark_ext_find(ct);
+ if (!ncde)
+ return -1;
+
+ ncde->rule_flags = NF_CT_DSCPREMARK_EXT_DSCP_RULE_VALID;
+ return 0;
+}
+EXPORT_SYMBOL(nf_conntrack_dscpremark_ext_set_dscp_rule_valid);
+
+/* nf_conntrack_dscpremark_ext_get_dscp_rule_validity()
+ * Check if the DSCP rule flag is valid from the extension
+ */
+int nf_conntrack_dscpremark_ext_get_dscp_rule_validity(struct nf_conn *ct)
+{
+ struct nf_ct_dscpremark_ext *ncde;
+
+ ncde = nf_ct_dscpremark_ext_find(ct);
+ if (!ncde)
+ return NF_CT_DSCPREMARK_EXT_RULE_NOT_VALID;
+
+ if (ncde->rule_flags & NF_CT_DSCPREMARK_EXT_DSCP_RULE_VALID)
+ return NF_CT_DSCPREMARK_EXT_RULE_VALID;
+
+ return NF_CT_DSCPREMARK_EXT_RULE_NOT_VALID;
+}
+EXPORT_SYMBOL(nf_conntrack_dscpremark_ext_get_dscp_rule_validity);
+
+/* nf_conntrack_dscpremark_ext_fini()
+ * De-initializes the DSCP remark conntrack extension.
+ */
+void nf_conntrack_dscpremark_ext_fini(void)
+{
+ nf_ct_extend_unregister(&dscpremark_extend);
+}
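
For context (not part of the patch): a minimal sketch of how a QoS or offload module might consume the extension registered above. Only nf_ct_get() and the getter exported by this file are assumed; the function name and the hook it would be called from are illustrative.

/* Illustrative only: check whether an xt_DSCP rule has already marked
 * the connection this skb belongs to (see the xt_DSCP.c hunk later in
 * this series, which sets the flag).
 */
#include <linux/skbuff.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_dscpremark_ext.h>

static bool example_skb_has_valid_dscp_rule(struct sk_buff *skb)
{
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct;

	ct = nf_ct_get(skb, &ctinfo);
	if (!ct)
		return false;

	return nf_conntrack_dscpremark_ext_get_dscp_rule_validity(ct) ==
	       NF_CT_DSCPREMARK_EXT_RULE_VALID;
}
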
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -18,6 +18,9 @@
#include <linux/stddef.h>
#include <linux/err.h>
#include <linux/percpu.h>
+#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
+#include <linux/notifier.h>
+#endif
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
@@ -115,6 +118,52 @@ static void ecache_work(struct work_struct *work)
/* deliver cached events and clear cache entry - must be called with locally
* disabled softirqs */
+#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
+void nf_ct_deliver_cached_events(struct nf_conn *ct)
+{
+ unsigned long events, missed;
+ struct nf_conntrack_ecache *e;
+ struct nf_ct_event item;
+ struct net *net = nf_ct_net(ct);
+ int ret = 0;
+
+ e = nf_ct_ecache_find(ct);
+ if (!e)
+ return;
+
+ events = xchg(&e->cache, 0);
+
+ if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct) || !events)
+ return;
+
+ /*
+ * We make a copy of the missed event cache without taking
+ * the lock, thus we may send missed events twice. However,
+ * this does not harm and it happens very rarely.
+ */
+ missed = e->missed;
+
+ if (!((events | missed) & e->ctmask))
+ return;
+
+ item.ct = ct;
+ item.portid = 0;
+ item.report = 0;
+
+ atomic_notifier_call_chain(&net->ct.nf_conntrack_chain,
+ events | missed, &item);
+
+ if (likely(ret >= 0 && !missed))
+ return;
+
+ spin_lock_bh(&ct->lock);
+ if (ret < 0)
+ e->missed |= events;
+ else
+ e->missed &= ~missed;
+ spin_unlock_bh(&ct->lock);
+}
+#else
void nf_ct_deliver_cached_events(struct nf_conn *ct)
{
struct net *net = nf_ct_net(ct);
@@ -165,8 +214,15 @@ void nf_ct_deliver_cached_events(struct nf_conn *ct)
out_unlock:
rcu_read_unlock();
}
+#endif
EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
+#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
+int nf_conntrack_register_notifier(struct net *net, struct notifier_block *nb)
+{
+ return atomic_notifier_chain_register(&net->ct.nf_conntrack_chain, nb);
+}
+#else
int nf_conntrack_register_notifier(struct net *net,
struct nf_ct_event_notifier *new)
{
@@ -187,8 +243,16 @@ out_unlock:
mutex_unlock(&nf_ct_ecache_mutex);
return ret;
}
+#endif
EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier);
+#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
+int nf_conntrack_unregister_notifier(struct net *net, struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&net->ct.nf_conntrack_chain,
+ nb);
+}
+#else
void nf_conntrack_unregister_notifier(struct net *net,
struct nf_ct_event_notifier *new)
{
@@ -201,6 +265,7 @@ void nf_conntrack_unregister_notifier(struct net *net,
RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL);
mutex_unlock(&nf_ct_ecache_mutex);
}
+#endif
EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);
int nf_ct_expect_register_notifier(struct net *net,
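
As an illustration of the chain-events path (not part of the patch): with CONFIG_NF_CONNTRACK_CHAIN_EVENTS enabled, a listener registers a plain notifier_block on the per-netns atomic chain instead of claiming the single nf_ct_event_notifier slot. The module and callback names below are placeholders, and the sketch assumes the patched headers declare the notifier_block variants of the register/unregister helpers.

#include <linux/module.h>
#include <linux/notifier.h>
#include <net/net_namespace.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_ecache.h>

static int example_ct_event(struct notifier_block *nb,
			    unsigned long events, void *ptr)
{
	struct nf_ct_event *item = ptr;

	if (events & (1 << IPCT_DESTROY))
		pr_debug("conntrack %pK destroyed\n", item->ct);
	return NOTIFY_DONE;
}

static struct notifier_block example_ct_nb = {
	.notifier_call = example_ct_event,
};

static int __init example_init(void)
{
	/* Registers on net->ct.nf_conntrack_chain rather than the
	 * exclusive nf_ct_event_notifier pointer.
	 */
	return nf_conntrack_register_notifier(&init_net, &example_ct_nb);
}

static void __exit example_exit(void)
{
	nf_conntrack_unregister_notifier(&init_net, &example_ct_nb);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
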
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -28,6 +28,9 @@
#include <linux/netlink.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
+#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
+#include <linux/notifier.h>
+#endif
#include <linux/slab.h>
#include <linux/netfilter.h>
@@ -629,19 +632,27 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)
}
#ifdef CONFIG_NF_CONNTRACK_EVENTS
+#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
+static int ctnetlink_conntrack_event(struct notifier_block *this,
+ unsigned long events, void *ptr)
+#else
static int
ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
+#endif
{
const struct nf_conntrack_zone *zone;
struct net *net;
struct nlmsghdr *nlh;
struct nfgenmsg *nfmsg;
struct nlattr *nest_parms;
- struct nf_conn *ct = item->ct;
struct sk_buff *skb;
unsigned int type;
unsigned int flags = 0, group;
int err;
+#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
+ struct nf_ct_event *item = (struct nf_ct_event *)ptr;
+#endif
+ struct nf_conn *ct = item->ct;
/* ignore our fake conntrack entry */
if (nf_ct_is_untracked(ct))
@@ -3258,9 +3269,15 @@ ctnetlink_stat_exp_cpu(struct sock *ctnl, struct sk_buff *skb,
}
#ifdef CONFIG_NF_CONNTRACK_EVENTS
+#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS
+static struct notifier_block ctnl_notifier = {
+ .notifier_call = ctnetlink_conntrack_event,
+};
+#else
static struct nf_ct_event_notifier ctnl_notifier = {
.fcn = ctnetlink_conntrack_event,
};
+#endif
static struct nf_exp_event_notifier ctnl_notifier_exp = {
.fcn = ctnetlink_expect_event,
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -1,4 +1,6 @@
/*
+ * Copyright (c) 2018, The Linux Foundation. All rights reserved.
+ *
* ip_conntrack_proto_gre.c - Version 3.0
*
* Connection tracking protocol helper module for GRE.
@@ -393,17 +395,62 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 __read_mostly = {
.init_net = gre_init_net,
};
+static struct nf_conntrack_l4proto nf_conntrack_l4proto_gre6 __read_mostly = {
+ .l3proto = AF_INET6,
+ .l4proto = IPPROTO_GRE,
+ .name = "gre",
+ .pkt_to_tuple = gre_pkt_to_tuple,
+ .invert_tuple = gre_invert_tuple,
+ .print_tuple = gre_print_tuple,
+ .print_conntrack = gre_print_conntrack,
+ .get_timeouts = gre_get_timeouts,
+ .packet = gre_packet,
+ .new = gre_new,
+ .destroy = gre_destroy,
+ .me = THIS_MODULE,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+ .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
+ .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
+ .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
+ .nla_policy = nf_ct_port_nla_policy,
+#endif
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
+ .ctnl_timeout = {
+ .nlattr_to_obj = gre_timeout_nlattr_to_obj,
+ .obj_to_nlattr = gre_timeout_obj_to_nlattr,
+ .nlattr_max = CTA_TIMEOUT_GRE_MAX,
+ .obj_size = sizeof(unsigned int) * GRE_CT_MAX,
+ .nla_policy = gre_timeout_nla_policy,
+ },
+#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
+ .net_id = &proto_gre_net_id,
+ .init_net = gre_init_net,
+};
+
static int proto_gre_net_init(struct net *net)
{
int ret = 0;
ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_gre4);
- if (ret < 0)
+ if (ret < 0) {
pr_err("nf_conntrack_gre4: pernet registration failed.\n");
+ goto out;
+ }
+
+ ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_gre6);
+ if (ret < 0) {
+ pr_err("nf_conntrack_gre6: pernet registration failed.\n");
+ goto cleanup_gre4;
+ }
+ return 0;
+cleanup_gre4:
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_gre4);
+out:
return ret;
}
static void proto_gre_net_exit(struct net *net)
{
+ nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_gre6);
nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_gre4);
nf_ct_gre_keymap_flush(net);
}
@@ -427,7 +474,13 @@ static int __init nf_ct_proto_gre_init(void)
if (ret < 0)
goto out_gre4;
+ ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_gre6);
+ if (ret < 0)
+ goto out_gre6;
+
return 0;
+out_gre6:
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_gre4);
out_gre4:
unregister_pernet_subsys(&proto_gre_net_ops);
out_pernet:
@@ -436,6 +489,7 @@ out_pernet:
static void __exit nf_ct_proto_gre_fini(void)
{
+ nf_ct_l4proto_unregister(&nf_conntrack_l4proto_gre6);
nf_ct_l4proto_unregister(&nf_conntrack_l4proto_gre4);
unregister_pernet_subsys(&proto_gre_net_ops);
}
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -33,10 +33,15 @@
#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
+/* Do not check the TCP window for incoming packets */
+int nf_ct_tcp_no_window_check __read_mostly = 1;
+EXPORT_SYMBOL_GPL(nf_ct_tcp_no_window_check);
+
/* "Be conservative in what you do,
be liberal in what you accept from others."
If it's non-zero, we mark only out of window RST segments as INVALID. */
-static int nf_ct_tcp_be_liberal __read_mostly = 0;
+int nf_ct_tcp_be_liberal __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ct_tcp_be_liberal);
/* If it is set to zero, we disable picking up already established
connections. */
@@ -453,7 +458,7 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
/* Fast path for timestamp-only option */
if (length == TCPOLEN_TSTAMP_ALIGNED
- && *(__be32 *)ptr == htonl((TCPOPT_NOP << 24)
+ && net_hdr_word(ptr) == htonl((TCPOPT_NOP << 24)
| (TCPOPT_NOP << 16)
| (TCPOPT_TIMESTAMP << 8)
| TCPOLEN_TIMESTAMP))
@@ -515,6 +520,9 @@ static bool tcp_in_window(const struct nf_conn *ct,
s32 receiver_offset;
bool res, in_recv_win;
+ if (nf_ct_tcp_no_window_check)
+ return true;
+
/*
* Get the required data from the packet.
*/
@@ -1481,6 +1489,13 @@ static struct ctl_table tcp_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "nf_conntrack_tcp_no_window_check",
+ .data = &nf_ct_tcp_no_window_check,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
{ }
};
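
A rough user-space sketch of toggling the new knob at run time, assuming the usual nf_conntrack sysctl layout under /proc/sys/net/netfilter. The patch defaults the value to 1 (window checking disabled); writing 0 restores strict TCP window tracking.

#include <stdio.h>

int main(void)
{
	const char *path =
		"/proc/sys/net/netfilter/nf_conntrack_tcp_no_window_check";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fputs("0\n", f);	/* re-enable TCP window checking */
	fclose(f);
	return 0;
}
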
new file mode 100644
--- /dev/null
+++ b/net/netfilter/nf_conntrack_rtcache.c
@@ -0,0 +1,416 @@
+/* route cache for netfilter.
+ *
+ * (C) 2014 Red Hat GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/netfilter.h>
+#include <linux/skbuff.h>
+#include <linux/stddef.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/export.h>
+#include <linux/module.h>
+
+#include <net/dst.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_rtcache.h>
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_IPV6)
+#include <net/ip6_fib.h>
+#endif
+
+static void __nf_conn_rtcache_destroy(struct nf_conn_rtcache *rtc,
+ enum ip_conntrack_dir dir)
+{
+ struct dst_entry *dst = rtc->cached_dst[dir].dst;
+
+ dst_release(dst);
+}
+
+static void nf_conn_rtcache_destroy(struct nf_conn *ct)
+{
+ struct nf_conn_rtcache *rtc = nf_ct_rtcache_find(ct);
+
+ if (!rtc)
+ return;
+
+ __nf_conn_rtcache_destroy(rtc, IP_CT_DIR_ORIGINAL);
+ __nf_conn_rtcache_destroy(rtc, IP_CT_DIR_REPLY);
+}
+
+static void nf_ct_rtcache_ext_add(struct nf_conn *ct)
+{
+ struct nf_conn_rtcache *rtc;
+
+ rtc = nf_ct_ext_add(ct, NF_CT_EXT_RTCACHE, GFP_ATOMIC);
+ if (rtc) {
+ rtc->cached_dst[IP_CT_DIR_ORIGINAL].iif = -1;
+ rtc->cached_dst[IP_CT_DIR_ORIGINAL].dst = NULL;
+ rtc->cached_dst[IP_CT_DIR_REPLY].iif = -1;
+ rtc->cached_dst[IP_CT_DIR_REPLY].dst = NULL;
+ }
+}
+
+static struct nf_conn_rtcache *nf_ct_rtcache_find_usable(struct nf_conn *ct)
+{
+ if (nf_ct_is_untracked(ct))
+ return NULL;
+ return nf_ct_rtcache_find(ct);
+}
+
+static struct dst_entry *
+nf_conn_rtcache_dst_get(const struct nf_conn_rtcache *rtc,
+ enum ip_conntrack_dir dir)
+{
+ return rtc->cached_dst[dir].dst;
+}
+
+static u32 nf_rtcache_get_cookie(int pf, const struct dst_entry *dst)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_IPV6)
+ if (pf == NFPROTO_IPV6) {
+ const struct rt6_info *rt = (const struct rt6_info *)dst;
+
+ if (rt->rt6i_node)
+ return (u32)rt->rt6i_node->fn_sernum;
+ }
+#endif
+ return 0;
+}
+
+static void nf_conn_rtcache_dst_set(int pf,
+ struct nf_conn_rtcache *rtc,
+ struct dst_entry *dst,
+ enum ip_conntrack_dir dir, int iif)
+{
+ if (rtc->cached_dst[dir].iif != iif)
+ rtc->cached_dst[dir].iif = iif;
+
+ if (rtc->cached_dst[dir].dst != dst) {
+ struct dst_entry *old;
+
+ dst_hold(dst);
+
+ old = xchg(&rtc->cached_dst[dir].dst, dst);
+ dst_release(old);
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_IPV6)
+ if (pf == NFPROTO_IPV6)
+ rtc->cached_dst[dir].cookie =
+ nf_rtcache_get_cookie(pf, dst);
+#endif
+ }
+}
+
+static void nf_conn_rtcache_dst_obsolete(struct nf_conn_rtcache *rtc,
+ enum ip_conntrack_dir dir)
+{
+ struct dst_entry *old;
+
+ pr_debug("Invalidate iif %d for dir %d on cache %p\n",
+ rtc->cached_dst[dir].iif, dir, rtc);
+
+ old = xchg(&rtc->cached_dst[dir].dst, NULL);
+ dst_release(old);
+ rtc->cached_dst[dir].iif = -1;
+}
+
+static unsigned int nf_rtcache_in(u_int8_t pf,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_conn_rtcache *rtc;
+ enum ip_conntrack_info ctinfo;
+ enum ip_conntrack_dir dir;
+ struct dst_entry *dst;
+ struct nf_conn *ct;
+ int iif;
+ u32 cookie;
+
+ if (skb_dst(skb) || skb->sk)
+ return NF_ACCEPT;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return NF_ACCEPT;
+
+ rtc = nf_ct_rtcache_find_usable(ct);
+ if (!rtc)
+ return NF_ACCEPT;
+
+ /* if iif changes, don't use cache and let ip stack
+ * do route lookup.
+ *
+ * If rp_filter is enabled it might toss skb, so
+ * we don't want to avoid these checks.
+ */
+ dir = CTINFO2DIR(ctinfo);
+ iif = nf_conn_rtcache_iif_get(rtc, dir);
+ if (state->in->ifindex != iif) {
+ pr_debug("ct %p, iif %d, cached iif %d, skip cached entry\n",
+ ct, iif, state->in->ifindex);
+ return NF_ACCEPT;
+ }
+ dst = nf_conn_rtcache_dst_get(rtc, dir);
+ if (dst == NULL)
+ return NF_ACCEPT;
+
+ cookie = nf_rtcache_get_cookie(pf, dst);
+
+ dst = dst_check(dst, cookie);
+ pr_debug("obtained dst %p for skb %p, cookie %d\n", dst, skb, cookie);
+ if (likely(dst))
+ skb_dst_set_noref(skb, dst);
+ else
+ nf_conn_rtcache_dst_obsolete(rtc, dir);
+
+ return NF_ACCEPT;
+}
+
+static unsigned int nf_rtcache_forward(u_int8_t pf,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_conn_rtcache *rtc;
+ enum ip_conntrack_info ctinfo;
+ enum ip_conntrack_dir dir;
+ struct nf_conn *ct;
+ struct dst_entry *dst = skb_dst(skb);
+ int iif;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return NF_ACCEPT;
+
+ if (dst && dst_xfrm(dst))
+ return NF_ACCEPT;
+
+ if (dst && (dst->flags & DST_FAKE_RTABLE))
+ return NF_ACCEPT;
+
+ if (!nf_ct_is_confirmed(ct)) {
+ if (nf_ct_rtcache_find(ct))
+ return NF_ACCEPT;
+ nf_ct_rtcache_ext_add(ct);
+ return NF_ACCEPT;
+ }
+
+ rtc = nf_ct_rtcache_find_usable(ct);
+ if (!rtc)
+ return NF_ACCEPT;
+
+ dir = CTINFO2DIR(ctinfo);
+ iif = nf_conn_rtcache_iif_get(rtc, dir);
+ pr_debug("ct %p, skb %p, dir %d, iif %d, cached iif %d\n",
+ ct, skb, dir, iif, state->in->ifindex);
+ if (likely(state->in->ifindex == iif))
+ return NF_ACCEPT;
+
+ nf_conn_rtcache_dst_set(pf, rtc, skb_dst(skb), dir, state->in->ifindex);
+ return NF_ACCEPT;
+}
+
+static unsigned int nf_rtcache_in4(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_rtcache_in(NFPROTO_IPV4, skb, state);
+}
+
+static unsigned int nf_rtcache_forward4(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_rtcache_forward(NFPROTO_IPV4, skb, state);
+}
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_IPV6)
+static unsigned int nf_rtcache_in6(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_rtcache_in(NFPROTO_IPV6, skb, state);
+}
+
+static unsigned int nf_rtcache_forward6(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_rtcache_forward(NFPROTO_IPV6, skb, state);
+}
+#endif
+
+static int nf_rtcache_dst_remove(struct nf_conn *ct, void *data)
+{
+ struct nf_conn_rtcache *rtc = nf_ct_rtcache_find(ct);
+ struct net_device *dev = data;
+
+ if (!rtc)
+ return 0;
+
+ if (dev->ifindex == rtc->cached_dst[IP_CT_DIR_ORIGINAL].iif ||
+ dev->ifindex == rtc->cached_dst[IP_CT_DIR_REPLY].iif) {
+ nf_conn_rtcache_dst_obsolete(rtc, IP_CT_DIR_ORIGINAL);
+ nf_conn_rtcache_dst_obsolete(rtc, IP_CT_DIR_REPLY);
+ }
+
+ return 0;
+}
+
+static int nf_rtcache_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+
+ if (event == NETDEV_DOWN)
+ nf_ct_iterate_cleanup(net, nf_rtcache_dst_remove, dev, 0, 0);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block nf_rtcache_notifier = {
+ .notifier_call = nf_rtcache_netdev_event,
+};
+
+static struct nf_hook_ops rtcache_ops[] = {
+ {
+ .hook = nf_rtcache_in4,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP_PRI_LAST,
+ },
+ {
+ .hook = nf_rtcache_forward4,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_FORWARD,
+ .priority = NF_IP_PRI_LAST,
+ },
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_IPV6)
+ {
+ .hook = nf_rtcache_in6,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP_PRI_LAST,
+ },
+ {
+ .hook = nf_rtcache_forward6,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_FORWARD,
+ .priority = NF_IP_PRI_LAST,
+ },
+#endif
+};
+
+static struct nf_ct_ext_type rtcache_extend __read_mostly = {
+ .len = sizeof(struct nf_conn_rtcache),
+ .align = __alignof__(struct nf_conn_rtcache),
+ .id = NF_CT_EXT_RTCACHE,
+ .destroy = nf_conn_rtcache_destroy,
+};
+
+static int __init nf_conntrack_rtcache_init(void)
+{
+ int ret = nf_ct_extend_register(&rtcache_extend);
+
+ if (ret < 0) {
+ pr_err("nf_conntrack_rtcache: Unable to register extension\n");
+ return ret;
+ }
+
+ ret = nf_register_hooks(rtcache_ops, ARRAY_SIZE(rtcache_ops));
+ if (ret < 0) {
+ nf_ct_extend_unregister(&rtcache_extend);
+ return ret;
+ }
+
+ ret = register_netdevice_notifier(&nf_rtcache_notifier);
+ if (ret) {
+ nf_unregister_hooks(rtcache_ops, ARRAY_SIZE(rtcache_ops));
+ nf_ct_extend_unregister(&rtcache_extend);
+ }
+
+ return ret;
+}
+
+static int nf_rtcache_ext_remove(struct nf_conn *ct, void *data)
+{
+ struct nf_conn_rtcache *rtc = nf_ct_rtcache_find(ct);
+
+ return rtc != NULL;
+}
+
+static bool __exit nf_conntrack_rtcache_wait_for_dying(struct net *net)
+{
+ bool wait = false;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct nf_conntrack_tuple_hash *h;
+ struct hlist_nulls_node *n;
+ struct nf_conn *ct;
+ struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+
+ rcu_read_lock();
+ spin_lock_bh(&pcpu->lock);
+
+ hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) {
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ if (nf_ct_rtcache_find(ct) != NULL) {
+ wait = true;
+ break;
+ }
+ }
+ spin_unlock_bh(&pcpu->lock);
+ rcu_read_unlock();
+ }
+
+ return wait;
+}
+
+static void __exit nf_conntrack_rtcache_fini(void)
+{
+ struct net *net;
+ int count = 0;
+
+ /* remove hooks so no new connections get rtcache extension */
+ nf_unregister_hooks(rtcache_ops, ARRAY_SIZE(rtcache_ops));
+
+ synchronize_net();
+
+ unregister_netdevice_notifier(&nf_rtcache_notifier);
+
+ rtnl_lock();
+
+ /* zap all conntracks with rtcache extension */
+ for_each_net(net)
+ nf_ct_iterate_cleanup(net, nf_rtcache_ext_remove, NULL, 0, 0);
+
+ for_each_net(net) {
+ /* .. and make sure they're gone from dying list, too */
+ while (nf_conntrack_rtcache_wait_for_dying(net)) {
+ msleep(200);
+ WARN_ONCE(++count > 25, "Waiting for all rtcache conntracks to go away\n");
+ }
+ }
+
+ rtnl_unlock();
+ synchronize_net();
+ nf_ct_extend_unregister(&rtcache_extend);
+}
+module_init(nf_conntrack_rtcache_init);
+module_exit(nf_conntrack_rtcache_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
+MODULE_DESCRIPTION("Conntrack route cache extension");
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -17,6 +17,7 @@
#include <linux/percpu.h>
#include <linux/netdevice.h>
#include <linux/security.h>
+#include <linux/inet.h>
#include <net/net_namespace.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
@@ -288,10 +289,66 @@ static int ct_open(struct inode *inode, struct file *file)
sizeof(struct ct_iter_state));
}
+struct kill_request {
+ u16 family;
+ union nf_inet_addr addr;
+};
+
+static int kill_matching(struct nf_conn *i, void *data)
+{
+ struct kill_request *kr = data;
+ struct nf_conntrack_tuple *t1 = &i->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ struct nf_conntrack_tuple *t2 = &i->tuplehash[IP_CT_DIR_REPLY].tuple;
+
+ if (!kr->family)
+ return 1;
+
+ if (t1->src.l3num != kr->family)
+ return 0;
+
+ return (nf_inet_addr_cmp(&kr->addr, &t1->src.u3) ||
+ nf_inet_addr_cmp(&kr->addr, &t1->dst.u3) ||
+ nf_inet_addr_cmp(&kr->addr, &t2->src.u3) ||
+ nf_inet_addr_cmp(&kr->addr, &t2->dst.u3));
+}
+
+static ssize_t ct_file_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct seq_file *seq = file->private_data;
+ struct net *net = seq_file_net(seq);
+ struct kill_request kr = { };
+ char req[INET6_ADDRSTRLEN] = { };
+
+ if (count == 0)
+ return 0;
+
+ if (count >= INET6_ADDRSTRLEN)
+ count = INET6_ADDRSTRLEN - 1;
+
+ if (copy_from_user(req, buf, count))
+ return -EFAULT;
+
+ if (strnchr(req, count, ':')) {
+ kr.family = AF_INET6;
+ if (!in6_pton(req, count, (void *)&kr.addr, '\n', NULL))
+ return -EINVAL;
+ } else if (strnchr(req, count, '.')) {
+ kr.family = AF_INET;
+ if (!in4_pton(req, count, (void *)&kr.addr, '\n', NULL))
+ return -EINVAL;
+ }
+
+ nf_ct_iterate_cleanup(net, kill_matching, &kr, 0, 0);
+
+ return count;
+}
+
static const struct file_operations ct_file_ops = {
.owner = THIS_MODULE,
.open = ct_open,
.read = seq_read,
+ .write = ct_file_write,
.llseek = seq_lseek,
.release = seq_release_net,
};
@@ -393,7 +450,7 @@ static int nf_conntrack_standalone_init_proc(struct net *net)
{
struct proc_dir_entry *pde;
- pde = proc_create("nf_conntrack", 0440, net->proc_net, &ct_file_ops);
+ pde = proc_create("nf_conntrack", 0660, net->proc_net, &ct_file_ops);
if (!pde)
goto out_nf_conntrack;
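
The new write handler makes /proc/net/nf_conntrack accept an IPv4 or IPv6 address and flushes every entry whose tuples contain it (an empty family kills all entries). A minimal user-space sketch, with an example address only; note the proc entry mode is relaxed to 0660 above, so root or the owning group is required.

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/net/nf_conntrack", "w");

	if (!f) {
		perror("/proc/net/nf_conntrack");
		return 1;
	}
	/* Kill all conntrack entries whose tuples contain 192.0.2.1 */
	fputs("192.0.2.1\n", f);
	fclose(f);
	return 0;
}
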
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -90,6 +90,9 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
struct dst_entry *dst;
int err;
+ if (skb->dev && !dev_net(skb->dev)->xfrm.policy_count[XFRM_POLICY_OUT])
+ return 0;
+
err = xfrm_decode_session(skb, &fl, family);
if (err < 0)
return err;
@@ -404,6 +407,13 @@ nf_nat_setup_info(struct nf_conn *ct,
get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);
+#if IS_ENABLED(CONFIG_NF_NAT_TRY_NEXT_RULE)
+ if (curr_tuple.src.u.all != 0 && curr_tuple.dst.u.all != 0 &&
+ new_tuple.src.u.all != 0 && new_tuple.dst.u.all != 0 &&
+ nf_nat_used_tuple(&new_tuple, ct))
+ return XT_CONTINUE;
+#endif
+
if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
struct nf_conntrack_tuple reply;
--- a/net/netfilter/nf_nat_ftp.c
+++ b/net/netfilter/nf_nat_ftp.c
@@ -24,8 +24,39 @@ MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
MODULE_DESCRIPTION("ftp NAT helper");
MODULE_ALIAS("ip_nat_ftp");
+static ushort psid = 0;
+module_param(psid, ushort, 0644);
+MODULE_PARM_DESC(psid, "MAP-E device's PSID");
+
+static uint psid_len = 0;
+module_param(psid_len, uint, 0644);
+MODULE_PARM_DESC(psid_len, "MAP-E device's PSID length");
+
+static uint offset = 0;
+module_param(offset, uint, 0644);
+MODULE_PARM_DESC(offset, "MAP-E device's PSID offset");
+
/* FIXME: Time out? --RR */
+/**
+ * nf_nat_port_valid_check - check whether the port lies within the PSID range
+ * @skb: the packet to be translated
+ * @port: the port to be checked
+ */
+static int nf_nat_port_valid_check(struct sk_buff *skb, u16 port)
+{
+ if (psid == 0 || psid_len == 0 || offset == 0)
+ return 1;
+
+ if ((psid_len + offset) > 16)
+ return 1;
+
+ if ((((port >> (16 - psid_len - offset)) & ((1 << psid_len) - 1))) == psid)
+ return 1;
+
+ return 0;
+}
+
static int nf_nat_ftp_fmt_cmd(struct nf_conn *ct, enum nf_ct_ftp_type type,
char *buffer, size_t buflen,
union nf_inet_addr *addr, u16 port)
@@ -65,7 +96,7 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
struct nf_conntrack_expect *exp)
{
union nf_inet_addr newaddr;
- u_int16_t port;
+ u16 port;
int dir = CTINFO2DIR(ctinfo);
struct nf_conn *ct = exp->master;
char buffer[sizeof("|1||65535|") + INET6_ADDRSTRLEN];
@@ -82,10 +113,16 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
* this one. */
exp->expectfn = nf_nat_follow_master;
- /* Try to get same port: if not, try to change it. */
+ /* In the case of MAP-E, the FTP ALG source port must carry the device's
+ * own PSID; otherwise the packets returned by the FTP server would be
+ * mapped to an IPv6 address other than the device's own.
+ * Let the check helper validate each candidate port.
+ */
for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
int ret;
+ if (!nf_nat_port_valid_check(skb, port))
+ continue;
+
exp->tuple.dst.u.tcp.port = htons(port);
ret = nf_ct_expect_related(exp);
if (ret == 0)
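
A small user-space illustration of the PSID arithmetic used by nf_nat_port_valid_check(): a port is usable when the psid_len bits that start offset bits below the MSB of the 16-bit port equal the configured PSID. The psid/psid_len/offset values below are arbitrary examples, not defaults.

#include <stdio.h>

static int port_valid(unsigned short port, unsigned int psid,
		      unsigned int psid_len, unsigned int offset)
{
	if (psid == 0 || psid_len == 0 || offset == 0)
		return 1;			/* checking disabled */
	if (psid_len + offset > 16)
		return 1;			/* malformed configuration */
	return ((port >> (16 - psid_len - offset)) &
		((1u << psid_len) - 1)) == psid;
}

int main(void)
{
	unsigned int psid = 0x34, psid_len = 8, offset = 4;
	unsigned short port;
	int shown = 0;

	/* Print the first few source ports the ALG would accept */
	for (port = 1024; port != 0 && shown < 5; port++) {
		if (port_valid(port, psid, psid_len, offset)) {
			printf("usable source port: %hu\n", port);
			shown++;
		}
	}
	return 0;
}
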
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -326,10 +326,12 @@ replay:
nlh = nlmsg_hdr(skb);
err = 0;
- if (nlmsg_len(nlh) < sizeof(struct nfgenmsg) ||
- skb->len < nlh->nlmsg_len) {
- err = -EINVAL;
- goto ack;
+ if (nlh->nlmsg_len < NLMSG_HDRLEN ||
+ skb->len < nlh->nlmsg_len ||
+ nlmsg_len(nlh) < sizeof(struct nfgenmsg)) {
+ nfnl_err_reset(&err_list);
+ status |= NFNL_BATCH_FAILURE;
+ goto done;
}
/* Only requests are handled by the kernel */
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -192,7 +192,7 @@ static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
u8 genmask = nft_genmask_cur(read_pnet(&set->pnet));
int err;
- err = rhashtable_walk_init(&priv->ht, &hti);
+ err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL);
iter->err = err;
if (err)
return;
@@ -248,7 +248,7 @@ static void nft_hash_gc(struct work_struct *work)
priv = container_of(work, struct nft_hash, gc_work.work);
set = nft_set_container_of(priv);
- err = rhashtable_walk_init(&priv->ht, &hti);
+ err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL);
if (err)
goto schedule;
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -548,7 +548,7 @@ void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr,
m->u.user.match_size = msize;
strlcpy(name, match->name, sizeof(name));
module_put(match->me);
- strncpy(m->u.user.name, name, sizeof(m->u.user.name));
+ strlcpy(m->u.user.name, name, sizeof(m->u.user.name));
*size += off;
*dstptr += msize;
@@ -856,7 +856,7 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr,
t->u.user.target_size = tsize;
strlcpy(name, target->name, sizeof(name));
module_put(target->me);
- strncpy(t->u.user.name, name, sizeof(t->u.user.name));
+ strlcpy(t->u.user.name, name, sizeof(t->u.user.name));
*size += off;
*dstptr += tsize;
--- a/net/netfilter/xt_DSCP.c
+++ b/net/netfilter/xt_DSCP.c
@@ -18,6 +18,7 @@
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_DSCP.h>
+#include <net/netfilter/nf_conntrack_dscpremark_ext.h>
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
MODULE_DESCRIPTION("Xtables: DSCP/TOS field modification");
@@ -32,6 +33,10 @@ dscp_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_DSCP_info *dinfo = par->targinfo;
u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT;
+#ifdef CONFIG_NF_CONNTRACK_DSCPREMARK_EXT
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+#endif
if (dscp != dinfo->dscp) {
if (!skb_make_writable(skb, sizeof(struct iphdr)))
@@ -41,6 +46,13 @@ dscp_tg(struct sk_buff *skb, const struct xt_action_param *par)
(__force __u8)(~XT_DSCP_MASK),
dinfo->dscp << XT_DSCP_SHIFT);
+#ifdef CONFIG_NF_CONNTRACK_DSCPREMARK_EXT
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return XT_CONTINUE;
+
+ nf_conntrack_dscpremark_ext_set_dscp_rule_valid(ct);
+#endif
}
return XT_CONTINUE;
}
@@ -50,7 +62,10 @@ dscp_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_DSCP_info *dinfo = par->targinfo;
u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT;
-
+#ifdef CONFIG_NF_CONNTRACK_DSCPREMARK_EXT
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+#endif
if (dscp != dinfo->dscp) {
if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
return NF_DROP;
@@ -58,6 +73,14 @@ dscp_tg6(struct sk_buff *skb, const struct xt_action_param *par)
ipv6_change_dsfield(ipv6_hdr(skb),
(__force __u8)(~XT_DSCP_MASK),
dinfo->dscp << XT_DSCP_SHIFT);
+
+#ifdef CONFIG_NF_CONNTRACK_DSCPREMARK_EXT
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return XT_CONTINUE;
+
+ nf_conntrack_dscpremark_ext_set_dscp_rule_valid(ct);
+#endif
}
return XT_CONTINUE;
}
new file mode 100644
--- /dev/null
+++ b/net/netfilter/xt_id.c
@@ -0,0 +1,45 @@
+/*
+ * Implements a dummy match to allow attaching IDs to rules
+ *
+ * 2014-08-01 Jo-Philipp Wich <jow@openwrt.org>
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_id.h>
+
+MODULE_AUTHOR("Jo-Philipp Wich <jow@openwrt.org>");
+MODULE_DESCRIPTION("Xtables: No-op match which can be tagged with a 32bit ID");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_id");
+MODULE_ALIAS("ip6t_id");
+
+static bool
+id_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ /* We always match */
+ return true;
+}
+
+static struct xt_match id_mt_reg __read_mostly = {
+ .name = "id",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .match = id_mt,
+ .matchsize = sizeof(struct xt_id_info),
+ .me = THIS_MODULE,
+};
+
+static int __init id_mt_init(void)
+{
+ return xt_register_match(&id_mt_reg);
+}
+
+static void __exit id_mt_exit(void)
+{
+ xt_unregister_match(&id_mt_reg);
+}
+
+module_init(id_mt_init);
+module_exit(id_mt_exit);
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -229,7 +229,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
transparent = xt_socket_sk_is_transparent(sk);
if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
- transparent)
+ transparent && sk_fullsock(sk))
pskb->mark = sk->sk_mark;
if (sk != skb->sk)
@@ -404,7 +404,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
transparent = xt_socket_sk_is_transparent(sk);
if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
- transparent)
+ transparent && sk_fullsock(sk))
pskb->mark = sk->sk_mark;
if (sk != skb->sk)
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -787,7 +787,8 @@ static int netlbl_unlabel_addrinfo_get(struct genl_info *info,
{
u32 addr_len;
- if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR]) {
+ if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR] &&
+ info->attrs[NLBL_UNLABEL_A_IPV4MASK]) {
addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]);
if (addr_len != sizeof(struct in_addr) &&
addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV4MASK]))
--- a/net/netlink/Kconfig
+++ b/net/netlink/Kconfig
@@ -4,6 +4,7 @@
config NETLINK_DIAG
tristate "NETLINK: socket monitoring interface"
+ select SOCK_DIAG
default n
---help---
Support for NETLINK socket monitoring interface used by the ss tool.
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1187,24 +1187,7 @@ void netlink_detachskb(struct sock *sk, struct sk_buff *skb)
static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
{
- int delta;
-
WARN_ON(skb->sk != NULL);
- delta = skb->end - skb->tail;
- if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize)
- return skb;
-
- if (skb_shared(skb)) {
- struct sk_buff *nskb = skb_clone(skb, allocation);
- if (!nskb)
- return skb;
- consume_skb(skb);
- skb = nskb;
- }
-
- if (!pskb_expand_head(skb, 0, -delta, allocation))
- skb->truesize -= delta;
-
return skb;
}
@@ -2362,7 +2345,8 @@ static int netlink_walk_start(struct nl_seq_iter *iter)
{
int err;
- err = rhashtable_walk_init(&nl_table[iter->link].hash, &iter->hti);
+ err = rhashtable_walk_init(&nl_table[iter->link].hash,
+ &iter->hti, GFP_KERNEL);
if (err) {
iter->link = MAX_LINKS;
return err;
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -993,7 +993,7 @@ static struct genl_multicast_group genl_ctrl_groups[] = {
static int genl_bind(struct net *net, int group)
{
- int i, err = -ENOENT;
+ int i, err = 0;
down_read(&cb_lock);
for (i = 0; i < GENL_FAM_TAB_SIZE; i++) {
--- a/net/nfc/hci/core.c
+++ b/net/nfc/hci/core.c
@@ -209,6 +209,11 @@ void nfc_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd,
}
create_info = (struct hci_create_pipe_resp *)skb->data;
+ if (create_info->pipe >= NFC_HCI_MAX_PIPES) {
+ status = NFC_HCI_ANY_E_NOK;
+ goto exit;
+ }
+
/* Save the new created pipe and bind with local gate,
* the description for skb->data[3] is destination gate id
* but since we received this cmd from host controller, we
@@ -232,6 +237,11 @@ void nfc_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd,
}
delete_info = (struct hci_delete_pipe_noti *)skb->data;
+ if (delete_info->pipe >= NFC_HCI_MAX_PIPES) {
+ status = NFC_HCI_ANY_E_NOK;
+ goto exit;
+ }
+
hdev->pipes[delete_info->pipe].gate = NFC_HCI_INVALID_GATE;
hdev->pipes[delete_info->pipe].dest_host = NFC_HCI_INVALID_HOST;
break;
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -61,6 +61,8 @@
int ovs_net_id __read_mostly;
EXPORT_SYMBOL_GPL(ovs_net_id);
+static struct ovs_accel_callback *ovs_accel_cb;
+
static struct genl_family dp_packet_genl_family;
static struct genl_family dp_flow_genl_family;
static struct genl_family dp_datapath_genl_family;
@@ -252,6 +254,126 @@ void ovs_dp_detach_port(struct vport *p)
ovs_vport_del(p);
}
+/* Notify datapath add event to acceleration callback */
+static void ovs_dp_add_notify(struct datapath *dp, struct vport *vp)
+{
+ struct ovs_accel_callback *ovs_cb;
+
+ rcu_read_lock();
+ ovs_cb = rcu_dereference(ovs_accel_cb);
+ if (ovs_cb && ovs_cb->ovs_accel_dp_add)
+ ovs_cb->ovs_accel_dp_add((void *)dp, vp->dev);
+ rcu_read_unlock();
+}
+
+/* Notify datapath delete event to acceleration callback */
+static void ovs_dp_del_notify(struct datapath *dp, struct vport *vp)
+{
+ struct ovs_accel_callback *ovs_cb;
+
+ rcu_read_lock();
+ ovs_cb = rcu_dereference(ovs_accel_cb);
+ if (ovs_cb && ovs_cb->ovs_accel_dp_del)
+ ovs_cb->ovs_accel_dp_del((void *)dp, vp->dev);
+ rcu_read_unlock();
+}
+
+/* Notify datapath port add event to acceleration callback */
+static void ovs_dp_port_add_notify(struct datapath *dp, struct vport *vp,
+ struct nlattr **a)
+{
+ struct ovs_accel_callback *ovs_cb;
+ const char *master = NULL;
+
+ if (a[OVS_VPORT_ATTR_MASTER])
+ master = nla_data(a[OVS_VPORT_ATTR_MASTER]);
+
+ rcu_read_lock();
+ ovs_cb = rcu_dereference(ovs_accel_cb);
+ if (ovs_cb && ovs_cb->ovs_accel_dp_port_add)
+ ovs_cb->ovs_accel_dp_port_add((void *)dp, (void *)vp,
+ vp->port_no, vp->ops->type,
+ master, vp->dev);
+ rcu_read_unlock();
+}
+
+/* Notify datapath port delete event to acceleration callback */
+static void ovs_dp_port_del_notify(struct datapath *dp, struct vport *vp)
+{
+ struct ovs_accel_callback *ovs_cb;
+
+ rcu_read_lock();
+ ovs_cb = rcu_dereference(ovs_accel_cb);
+ if (ovs_cb && ovs_cb->ovs_accel_dp_port_del)
+ ovs_cb->ovs_accel_dp_port_del((void *)dp, (void *)vp, vp->dev);
+ rcu_read_unlock();
+}
+
+/* Notify datapath flow add event to acceleration callback */
+static void ovs_dp_flow_add_notify(struct datapath *dp, struct sw_flow *sf)
+{
+ struct ovs_accel_callback *ovs_cb;
+
+ rcu_read_lock();
+ ovs_cb = rcu_dereference(ovs_accel_cb);
+ if (ovs_cb && ovs_cb->ovs_accel_dp_flow_add)
+ ovs_cb->ovs_accel_dp_flow_add((void *)dp, sf);
+ rcu_read_unlock();
+}
+
+/* Notify datapath flow delete event to acceleration callback */
+static void ovs_dp_flow_del_notify(struct datapath *dp, struct sw_flow *sf)
+{
+ struct ovs_accel_callback *ovs_cb;
+
+ rcu_read_lock();
+ ovs_cb = rcu_dereference(ovs_accel_cb);
+ if (ovs_cb && ovs_cb->ovs_accel_dp_flow_del)
+ ovs_cb->ovs_accel_dp_flow_del((void *)dp, sf);
+ rcu_read_unlock();
+}
+
+/* Notify datapath flow table flush event to acceleration callback */
+static void ovs_dp_flow_tbl_flush_notify(struct datapath *dp)
+{
+ struct ovs_accel_callback *ovs_cb;
+
+ rcu_read_lock();
+ ovs_cb = rcu_dereference(ovs_accel_cb);
+ if (ovs_cb && ovs_cb->ovs_accel_dp_flow_tbl_flush)
+ ovs_cb->ovs_accel_dp_flow_tbl_flush((void *)dp);
+ rcu_read_unlock();
+}
+
+/* Notify datapath flow set/change event to acceleration callback */
+static void ovs_dp_flow_set_notify(struct datapath *dp, struct sw_flow *sf,
+ struct sw_flow_actions *new_sfa)
+{
+ struct ovs_accel_callback *ovs_cb;
+
+ rcu_read_lock();
+ ovs_cb = rcu_dereference(ovs_accel_cb);
+ if (ovs_cb && ovs_cb->ovs_accel_dp_flow_set)
+ ovs_cb->ovs_accel_dp_flow_set((void *)dp, sf, new_sfa);
+ rcu_read_unlock();
+}
+
+/* Forward datapath packet to acceleration callback
+ * Must be called with rcu_read_lock.
+ */
+static void ovs_dp_pkt_process_notify(struct datapath *dp, struct sk_buff *skb,
+ struct sw_flow_key *key, struct sw_flow *sf,
+ struct sw_flow_actions *sfa)
+{
+ struct ovs_accel_callback *ovs_cb;
+
+ WARN_ON(!rcu_read_lock_held());
+
+ ovs_cb = rcu_dereference(ovs_accel_cb);
+ if (ovs_cb && ovs_cb->ovs_accel_dp_pkt_process)
+ ovs_cb->ovs_accel_dp_pkt_process((void *)dp, skb, key, sf, sfa);
+}
+
/* Must be called with rcu_read_lock. */
void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
{
@@ -265,6 +387,8 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
stats = this_cpu_ptr(dp->stats_percpu);
+ ovs_dp_pkt_process_notify(dp, skb, key, NULL, NULL);
+
/* Look up flow. */
flow = ovs_flow_tbl_lookup_stats(&dp->table, key, &n_mask_hit);
if (unlikely(!flow)) {
@@ -286,6 +410,7 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
ovs_flow_stats_update(flow, key->tp.flags, skb);
sf_acts = rcu_dereference(flow->sf_acts);
+ ovs_dp_pkt_process_notify(dp, skb, key, flow, sf_acts);
ovs_execute_actions(dp, skb, sf_acts, key);
stats_counter = &stats->n_hit;
@@ -992,6 +1117,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
goto err_unlock_ovs;
}
+ ovs_dp_flow_add_notify(dp, new_flow);
if (unlikely(reply)) {
error = ovs_flow_cmd_fill_info(new_flow,
ovs_header->dp_ifindex,
@@ -1156,6 +1282,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
if (likely(acts)) {
old_acts = ovsl_dereference(flow->sf_acts);
rcu_assign_pointer(flow->sf_acts, acts);
+ ovs_dp_flow_set_notify(dp, flow, old_acts);
if (unlikely(reply)) {
error = ovs_flow_cmd_fill_info(flow,
@@ -1292,6 +1419,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
if (unlikely(!a[OVS_FLOW_ATTR_KEY] && !ufid_present)) {
err = ovs_flow_tbl_flush(&dp->table);
+ ovs_dp_flow_tbl_flush_notify(dp);
goto unlock;
}
@@ -1304,6 +1432,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
goto unlock;
}
+ ovs_dp_flow_del_notify(dp, flow);
ovs_flow_tbl_remove(&dp->table, flow);
ovs_unlock();
@@ -1606,6 +1735,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
ovs_net = net_generic(ovs_dp_get_net(dp), ovs_net_id);
list_add_tail_rcu(&dp->list_node, &ovs_net->dps);
+ ovs_dp_add_notify(dp, vport);
ovs_unlock();
ovs_notify(&dp_datapath_genl_family, reply, info);
@@ -1642,6 +1772,7 @@ static void __dp_destroy(struct datapath *dp)
list_del_rcu(&dp->list_node);
+ ovs_dp_del_notify(dp, ovs_vport_ovsl(dp, OVSP_LOCAL));
/* OVSP_LOCAL is datapath internal port. We need to make sure that
* all ports in datapath are destroyed first before freeing datapath.
*/
@@ -1975,6 +2106,7 @@ restart:
goto exit_unlock_free;
}
+ ovs_dp_port_add_notify(dp, vport, a);
err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
info->snd_seq, 0, OVS_VPORT_CMD_NEW);
BUG_ON(err < 0);
@@ -2063,6 +2195,7 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
goto exit_unlock_free;
}
+ ovs_dp_port_del_notify(vport->dp, vport);
err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
info->snd_seq, 0, OVS_VPORT_CMD_DEL);
BUG_ON(err < 0);
@@ -2297,6 +2430,163 @@ static struct pernet_operations ovs_net_ops = {
.size = sizeof(struct ovs_net),
};
+/* Register OVS datapath accelerator */
+int ovs_register_accelerator(struct ovs_accel_callback *oac)
+{
+ ovs_lock();
+
+ if (unlikely(rcu_access_pointer(ovs_accel_cb))) {
+ ovs_unlock();
+ return -EEXIST;
+ }
+
+ rcu_assign_pointer(ovs_accel_cb, oac);
+ ovs_unlock();
+ return 0;
+}
+EXPORT_SYMBOL(ovs_register_accelerator);
+
+/* Unregister OVS datapath accelerator */
+void ovs_unregister_accelerator(struct ovs_accel_callback *oac)
+{
+ ovs_lock();
+ rcu_assign_pointer(ovs_accel_cb, NULL);
+ ovs_unlock();
+}
+EXPORT_SYMBOL(ovs_unregister_accelerator);
+
+/* Find datapath flow rule using the key */
+struct sw_flow *ovs_accel_flow_find(void *dp_inst, struct sw_flow_key *key)
+{
+ struct datapath *dp = dp_inst;
+ struct sw_flow *flow;
+
+ rcu_read_lock();
+ flow = ovs_flow_tbl_lookup(&dp->table, key);
+ rcu_read_unlock();
+
+ return flow;
+}
+EXPORT_SYMBOL(ovs_accel_flow_find);
+
+/* Update flow rule statistics */
+int ovs_accel_flow_stats_update(void *dp_inst, void *out_vport,
+ struct sw_flow_key *key, int pkts, int bytes)
+{
+ struct datapath *dp = dp_inst;
+ struct flow_stats *stats;
+ struct sw_flow *flow;
+ struct dp_stats_percpu *dp_stats;
+ int node = numa_node_id();
+ u64 *stats_counter;
+ u32 n_mask_hit;
+
+ rcu_read_lock();
+ flow = ovs_flow_tbl_lookup_stats(&dp->table, key, &n_mask_hit);
+ if (!flow) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+
+ /* Update the node-specific statistics; if no memory is allocated
+ * for this node, fall back to node 0.
+ */
+ stats = rcu_dereference(flow->stats[node]);
+ if (unlikely(!stats))
+ stats = rcu_dereference(flow->stats[0]);
+
+ rcu_read_unlock();
+
+ spin_lock(&stats->lock);
+ stats->used = jiffies;
+ stats->packet_count += pkts;
+ stats->byte_count += bytes;
+
+ /* Update the datapath statistics; only the hit count should be updated
+ * here, the miss count is handled by the datapath itself.
+ * n_mask_hit and stats_counter are normally updated per packet:
+ * stats_counter matches the number of packets processed by the
+ * datapath, while n_mask_hit grows by the number of packets times the
+ * total number of masks traversed. Since these flows are accelerated
+ * and this API is only called to refresh flow statistics, the datapath
+ * counters must be advanced by the packet count.
+ */
+ dp_stats = this_cpu_ptr(dp->stats_percpu);
+ stats_counter = &dp_stats->n_hit;
+
+ u64_stats_update_begin(&dp_stats->syncp);
+ (*stats_counter) += pkts;
+ dp_stats->n_mask_hit += n_mask_hit * pkts;
+ u64_stats_update_end(&dp_stats->syncp);
+
+ spin_unlock(&stats->lock);
+ return 0;
+}
+EXPORT_SYMBOL(ovs_accel_flow_stats_update);
+
+/* Find netdev using vport number */
+struct net_device *ovs_accel_dev_find(void *dp_inst, int vport_no)
+{
+ struct datapath *dp = dp_inst;
+ struct net_device *dev;
+ struct vport *vport;
+
+ rcu_read_lock();
+
+ vport = ovs_vport_rcu(dp, vport_no);
+ if (!vport) {
+ rcu_read_unlock();
+ return NULL;
+ }
+
+ dev = vport->dev;
+ rcu_read_unlock();
+ return dev;
+}
+EXPORT_SYMBOL(ovs_accel_dev_find);
+
+/* Find egress interface using key and skb */
+struct net_device *ovs_accel_egress_dev_find(void *dp_inst,
+ struct sw_flow_key *key,
+ struct sk_buff *skb)
+{
+ struct datapath *dp = dp_inst;
+ struct sw_flow *flow;
+ struct sw_flow_actions *sf_acts;
+ struct net_device *dev;
+ const struct nlattr *a;
+ int rem;
+
+ rcu_read_lock();
+ flow = ovs_accel_flow_find(dp_inst, key);
+ if (unlikely(!flow))
+ goto done;
+
+ sf_acts = rcu_dereference(flow->sf_acts);
+ for (a = sf_acts->actions, rem = sf_acts->actions_len; rem > 0;
+ a = nla_next(a, &rem)) {
+ struct vport *vport;
+ int port_no;
+
+ switch (nla_type(a)) {
+ case OVS_ACTION_ATTR_OUTPUT:
+ port_no = nla_get_u32(a);
+ vport = ovs_vport_ovsl_rcu(dp, port_no);
+ if (!vport)
+ goto done;
+
+ dev = vport->dev;
+ rcu_read_unlock();
+ return dev;
+ }
+ }
+done:
+ rcu_read_unlock();
+ return NULL;
+}
+EXPORT_SYMBOL(ovs_accel_egress_dev_find);
+
static int __init dp_init(void)
{
int err;
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -138,6 +138,37 @@ struct ovs_net {
bool xt_label;
};
+/**
+ * struct ovs_accel_callback - OVS acceleration callbacks
+ * @ovs_accel_dp_add: a new datapath is created
+ * @ovs_accel_dp_del: a datapath is deleted
+ * @ovs_accel_dp_port_add: a new port is added to a datapath
+ * @ovs_accel_dp_port_del: a port is deleted from a datapath
+ * @ovs_accel_dp_flow_add: a new flow rule is added to a datapath
+ * @ovs_accel_dp_flow_del: a flow rule is deleted from a datapath
+ * @ovs_accel_dp_flow_set: an existing flow rule is modified in a datapath
+ * @ovs_accel_dp_flow_tbl_flush: the flow table of a datapath is flushed
+ * @ovs_accel_dp_pkt_process: process a datapath packet
+ */
+struct ovs_accel_callback {
+ void (*ovs_accel_dp_add)(void *dp, struct net_device *dev);
+ void (*ovs_accel_dp_del)(void *dp, struct net_device *dev);
+ void (*ovs_accel_dp_port_add)(void *dp, void *vp,
+ int vp_num, enum ovs_vport_type vp_type,
+ const char *master, struct net_device *dev);
+ void (*ovs_accel_dp_port_del)(void *dp, void *vp,
+ struct net_device *dev);
+ void (*ovs_accel_dp_flow_add)(void *dp, struct sw_flow *sf);
+ void (*ovs_accel_dp_flow_del)(void *dp, struct sw_flow *sf);
+ void (*ovs_accel_dp_flow_set)(void *dp, struct sw_flow *sf,
+ struct sw_flow_actions *sfa);
+ void (*ovs_accel_dp_flow_tbl_flush)(void *dp);
+ void (*ovs_accel_dp_pkt_process)(void *dp, struct sk_buff *skb,
+ struct sw_flow_key *key,
+ struct sw_flow *sf,
+ struct sw_flow_actions *sfa);
+};
+
extern int ovs_net_id;
void ovs_lock(void);
void ovs_unlock(void);
@@ -204,6 +235,16 @@ void ovs_dp_notify_wq(struct work_struct *work);
int action_fifos_init(void);
void action_fifos_exit(void);
+int ovs_register_accelerator(struct ovs_accel_callback *oac);
+void ovs_unregister_accelerator(struct ovs_accel_callback *oac);
+int ovs_accel_flow_stats_update(void *dp, void *out_vport,
+ struct sw_flow_key *sf, int pkts, int bytes);
+struct sw_flow *ovs_accel_flow_find(void *dp, struct sw_flow_key *sfk);
+struct net_device *ovs_accel_dev_find(void *dp, int vport_no);
+struct net_device *ovs_accel_egress_dev_find(void *dp_inst,
+ struct sw_flow_key *key,
+ struct sk_buff *skb);
+
/* 'KEY' must not have any bits set outside of the 'MASK' */
#define OVS_MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK)))
#define OVS_SET_MASKED(OLD, KEY, MASK) ((OLD) = OVS_MASKED(OLD, KEY, MASK))
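
An illustrative accelerator module built against these declarations might look like the sketch below. The callback body and module name are placeholders, unused callbacks are simply left NULL (the notify helpers check each pointer before calling it), and the file would need to be built within the openvswitch module tree to pick up datapath.h.

#include <linux/module.h>
#include <linux/netdevice.h>
#include "datapath.h"

static void example_dp_add(void *dp, struct net_device *dev)
{
	pr_info("ovs datapath %pK created, local port %s\n", dp, dev->name);
}

static struct ovs_accel_callback example_accel_cb = {
	.ovs_accel_dp_add = example_dp_add,
};

static int __init example_accel_init(void)
{
	/* Fails with -EEXIST if another accelerator is already registered */
	return ovs_register_accelerator(&example_accel_cb);
}

static void __exit example_accel_exit(void)
{
	ovs_unregister_accelerator(&example_accel_cb);
}

module_init(example_accel_init);
module_exit(example_accel_exit);
MODULE_LICENSE("GPL");
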
--- a/net/packet/Kconfig
+++ b/net/packet/Kconfig
@@ -18,6 +18,7 @@ config PACKET
config PACKET_DIAG
tristate "Packet: sockets monitoring interface"
depends on PACKET
+ select SOCK_DIAG
default n
---help---
Support for PF_PACKET sockets monitoring interface used by the ss tool.
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1776,6 +1776,7 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
{
struct sock *sk;
struct sockaddr_pkt *spkt;
+ struct packet_sock *po;
/*
* When we registered the protocol we saved the socket in the data
@@ -1783,6 +1784,7 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
*/
sk = pt->af_packet_priv;
+ po = pkt_sk(sk);
/*
* Yank back the headers [hope the device set this
@@ -1795,7 +1797,7 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
* so that this procedure is noop.
*/
- if (skb->pkt_type == PACKET_LOOPBACK)
+ if (!(po->pkt_type & (1 << skb->pkt_type)))
goto out;
if (!net_eq(dev_net(dev), sock_net(sk)))
@@ -1998,12 +2000,12 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
int skb_len = skb->len;
unsigned int snaplen, res;
- if (skb->pkt_type == PACKET_LOOPBACK)
- goto drop;
-
sk = pt->af_packet_priv;
po = pkt_sk(sk);
+ if (!(po->pkt_type & (1 << skb->pkt_type)))
+ goto drop;
+
if (!net_eq(dev_net(dev), sock_net(sk)))
goto drop;
@@ -2123,12 +2125,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
- if (skb->pkt_type == PACKET_LOOPBACK)
- goto drop;
-
sk = pt->af_packet_priv;
po = pkt_sk(sk);
+ if (!(po->pkt_type & (1 << skb->pkt_type)))
+ goto drop;
+
if (!net_eq(dev_net(dev), sock_net(sk)))
goto drop;
@@ -3115,6 +3117,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
mutex_init(&po->pg_vec_lock);
po->rollover = NULL;
po->prot_hook.func = packet_rcv;
+ po->pkt_type = PACKET_MASK_ANY & ~(1 << PACKET_LOOPBACK);
if (sock->type == SOCK_PACKET)
po->prot_hook.func = packet_rcv_spkt;
@@ -3737,6 +3740,16 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
return 0;
}
+ case PACKET_RECV_TYPE:
+ {
+ unsigned int val;
+ if (optlen != sizeof(val))
+ return -EINVAL;
+ if (copy_from_user(&val, optval, sizeof(val)))
+ return -EFAULT;
+ po->pkt_type = val & ~BIT(PACKET_LOOPBACK);
+ return 0;
+ }
default:
return -ENOPROTOOPT;
}
@@ -3789,6 +3802,13 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
case PACKET_VNET_HDR:
val = po->has_vnet_hdr;
break;
+ case PACKET_RECV_TYPE:
+ if (len > sizeof(unsigned int))
+ len = sizeof(unsigned int);
+ val = po->pkt_type;
+
+ data = &val;
+ break;
case PACKET_VERSION:
val = po->tp_version;
break;
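
User-space sketch of the new PACKET_RECV_TYPE option: select which skb->pkt_type classes a packet socket should receive. It assumes headers from a kernel carrying this series, so that PACKET_RECV_TYPE (and PACKET_MASK_ANY) are defined in linux/if_packet.h; PACKET_LOOPBACK is always masked out by the kernel.

#include <stdio.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <arpa/inet.h>

int main(void)
{
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	unsigned int mask;

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	/* Only deliver packets addressed to this host or broadcast */
	mask = (1u << PACKET_HOST) | (1u << PACKET_BROADCAST);
	if (setsockopt(fd, SOL_PACKET, PACKET_RECV_TYPE, &mask, sizeof(mask)))
		perror("setsockopt(PACKET_RECV_TYPE)");
	return 0;
}
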
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -129,6 +129,7 @@ struct packet_sock {
struct net_device __rcu *cached_dev;
int (*xmit)(struct sk_buff *skb);
struct packet_type prot_hook ____cacheline_aligned_in_smp;
+ unsigned int pkt_type;
};
static struct packet_sock *pkt_sk(struct sock *sk)