fix refcnt leaks with net namespaces
see https://github.com/lxc/lxc/issues/2141 and https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1711407/
This commit is contained in:
parent
8a8c16e218
commit
38c79e8118
3 changed files with 276 additions and 0 deletions
|
@ -0,0 +1,127 @@
|
|||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Dan Streetman <ddstreet@ieee.org>
|
||||
Date: Thu, 18 Jan 2018 16:14:26 -0500
|
||||
Subject: [PATCH] net: tcp: close sock if net namespace is exiting
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
When a tcp socket is closed, if it detects that its net namespace is
|
||||
exiting, close immediately and do not wait for FIN sequence.
|
||||
|
||||
For normal sockets, a reference is taken to their net namespace, so it will
|
||||
never exit while the socket is open. However, kernel sockets do not take a
|
||||
reference to their net namespace, so it may begin exiting while the kernel
|
||||
socket is still open. In this case if the kernel socket is a tcp socket,
|
||||
it will stay open trying to complete its close sequence. The sock's dst(s)
|
||||
hold a reference to their interface, which are all transferred to the
|
||||
namespace's loopback interface when the real interfaces are taken down.
|
||||
When the namespace tries to take down its loopback interface, it hangs
|
||||
waiting for all references to the loopback interface to release, which
|
||||
results in messages like:
|
||||
|
||||
unregister_netdevice: waiting for lo to become free. Usage count = 1
|
||||
|
||||
These messages continue until the socket finally times out and closes.
|
||||
Since the net namespace cleanup holds the net_mutex while calling its
|
||||
registered pernet callbacks, any new net namespace initialization is
|
||||
blocked until the current net namespace finishes exiting.
|
||||
|
||||
After this change, the tcp socket notices the exiting net namespace, and
|
||||
closes immediately, releasing its dst(s) and their reference to the
|
||||
loopback interface, which lets the net namespace continue exiting.
|
||||
|
||||
Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1711407
|
||||
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=97811
|
||||
Signed-off-by: Dan Streetman <ddstreet@canonical.com>
|
||||
Signed-off-by: David S. Miller <davem@davemloft.net>
|
||||
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
|
||||
---
|
||||
include/net/net_namespace.h | 10 ++++++++++
|
||||
net/ipv4/tcp.c | 3 +++
|
||||
net/ipv4/tcp_timer.c | 15 +++++++++++++++
|
||||
3 files changed, 28 insertions(+)
|
||||
|
||||
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
|
||||
index 1c401bd4c2e0..a5d023fa78db 100644
|
||||
--- a/include/net/net_namespace.h
|
||||
+++ b/include/net/net_namespace.h
|
||||
@@ -221,6 +221,11 @@ int net_eq(const struct net *net1, const struct net *net2)
|
||||
return net1 == net2;
|
||||
}
|
||||
|
||||
+static inline int check_net(const struct net *net)
|
||||
+{
|
||||
+ return atomic_read(&net->count) != 0;
|
||||
+}
|
||||
+
|
||||
void net_drop_ns(void *);
|
||||
|
||||
#else
|
||||
@@ -245,6 +250,11 @@ int net_eq(const struct net *net1, const struct net *net2)
|
||||
return 1;
|
||||
}
|
||||
|
||||
+static inline int check_net(const struct net *net)
|
||||
+{
|
||||
+ return 1;
|
||||
+}
|
||||
+
|
||||
#define net_drop_ns NULL
|
||||
#endif
|
||||
|
||||
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
|
||||
index a3e91b552edc..fd2a086da910 100644
|
||||
--- a/net/ipv4/tcp.c
|
||||
+++ b/net/ipv4/tcp.c
|
||||
@@ -2258,6 +2258,9 @@ void tcp_close(struct sock *sk, long timeout)
|
||||
tcp_send_active_reset(sk, GFP_ATOMIC);
|
||||
__NET_INC_STATS(sock_net(sk),
|
||||
LINUX_MIB_TCPABORTONMEMORY);
|
||||
+ } else if (!check_net(sock_net(sk))) {
|
||||
+ /* Not possible to send reset; just close */
|
||||
+ tcp_set_state(sk, TCP_CLOSE);
|
||||
}
|
||||
}
|
||||
|
||||
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
|
||||
index e906014890b6..ec1e5de41653 100644
|
||||
--- a/net/ipv4/tcp_timer.c
|
||||
+++ b/net/ipv4/tcp_timer.c
|
||||
@@ -50,11 +50,19 @@ static void tcp_write_err(struct sock *sk)
|
||||
* to prevent DoS attacks. It is called when a retransmission timeout
|
||||
* or zero probe timeout occurs on orphaned socket.
|
||||
*
|
||||
+ * Also close if our net namespace is exiting; in that case there is no
|
||||
+ * hope of ever communicating again since all netns interfaces are already
|
||||
+ * down (or about to be down), and we need to release our dst references,
|
||||
+ * which have been moved to the netns loopback interface, so the namespace
|
||||
+ * can finish exiting. This condition is only possible if we are a kernel
|
||||
+ * socket, as those do not hold references to the namespace.
|
||||
+ *
|
||||
* Criteria is still not confirmed experimentally and may change.
|
||||
* We kill the socket, if:
|
||||
* 1. If number of orphaned sockets exceeds an administratively configured
|
||||
* limit.
|
||||
* 2. If we have strong memory pressure.
|
||||
+ * 3. If our net namespace is exiting.
|
||||
*/
|
||||
static int tcp_out_of_resources(struct sock *sk, bool do_reset)
|
||||
{
|
||||
@@ -83,6 +91,13 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
|
||||
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
|
||||
return 1;
|
||||
}
|
||||
+
|
||||
+ if (!check_net(sock_net(sk))) {
|
||||
+ /* Not possible to send reset; just close */
|
||||
+ tcp_done(sk);
|
||||
+ return 1;
|
||||
+ }
|
||||
+
|
||||
return 0;
|
||||
}
|
||||
|
||||
--
|
||||
2.14.2
|
||||
|
|
@ -0,0 +1,89 @@
|
|||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Tommi Rantala <tommi.t.rantala@nokia.com>
|
||||
Date: Mon, 5 Feb 2018 21:48:14 +0200
|
||||
Subject: [PATCH] sctp: fix dst refcnt leak in sctp_v4_get_dst
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Fix dst reference count leak in sctp_v4_get_dst() introduced in commit
|
||||
410f03831 ("sctp: add routing output fallback"):
|
||||
|
||||
When walking the address_list, successive ip_route_output_key() calls
|
||||
may return the same rt->dst with the reference incremented on each call.
|
||||
|
||||
The code would not decrement the dst refcount when the dst pointer was
|
||||
identical from the previous iteration, causing the dst refcnt leak.
|
||||
|
||||
Testcase:
|
||||
ip netns add TEST
|
||||
ip netns exec TEST ip link set lo up
|
||||
ip link add dummy0 type dummy
|
||||
ip link add dummy1 type dummy
|
||||
ip link add dummy2 type dummy
|
||||
ip link set dev dummy0 netns TEST
|
||||
ip link set dev dummy1 netns TEST
|
||||
ip link set dev dummy2 netns TEST
|
||||
ip netns exec TEST ip addr add 192.168.1.1/24 dev dummy0
|
||||
ip netns exec TEST ip link set dummy0 up
|
||||
ip netns exec TEST ip addr add 192.168.1.2/24 dev dummy1
|
||||
ip netns exec TEST ip link set dummy1 up
|
||||
ip netns exec TEST ip addr add 192.168.1.3/24 dev dummy2
|
||||
ip netns exec TEST ip link set dummy2 up
|
||||
ip netns exec TEST sctp_test -H 192.168.1.2 -P 20002 -h 192.168.1.1 -p 20000 -s -B 192.168.1.3
|
||||
ip netns del TEST
|
||||
|
||||
In 4.4 and 4.9 kernels this results in:
|
||||
[ 354.179591] unregister_netdevice: waiting for lo to become free. Usage count = 1
|
||||
[ 364.419674] unregister_netdevice: waiting for lo to become free. Usage count = 1
|
||||
[ 374.663664] unregister_netdevice: waiting for lo to become free. Usage count = 1
|
||||
[ 384.903717] unregister_netdevice: waiting for lo to become free. Usage count = 1
|
||||
[ 395.143724] unregister_netdevice: waiting for lo to become free. Usage count = 1
|
||||
[ 405.383645] unregister_netdevice: waiting for lo to become free. Usage count = 1
|
||||
...
|
||||
|
||||
Fixes: 410f03831 ("sctp: add routing output fallback")
|
||||
Fixes: 0ca50d12f ("sctp: fix src address selection if using secondary addresses")
|
||||
Signed-off-by: Tommi Rantala <tommi.t.rantala@nokia.com>
|
||||
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
|
||||
Acked-by: Neil Horman <nhorman@tuxdriver.com>
|
||||
Signed-off-by: David S. Miller <davem@davemloft.net>
|
||||
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
|
||||
---
|
||||
net/sctp/protocol.c | 10 ++++------
|
||||
1 file changed, 4 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
|
||||
index 989a900383b5..e1a3ae4f3cab 100644
|
||||
--- a/net/sctp/protocol.c
|
||||
+++ b/net/sctp/protocol.c
|
||||
@@ -514,22 +514,20 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
|
||||
if (IS_ERR(rt))
|
||||
continue;
|
||||
|
||||
- if (!dst)
|
||||
- dst = &rt->dst;
|
||||
-
|
||||
/* Ensure the src address belongs to the output
|
||||
* interface.
|
||||
*/
|
||||
odev = __ip_dev_find(sock_net(sk), laddr->a.v4.sin_addr.s_addr,
|
||||
false);
|
||||
if (!odev || odev->ifindex != fl4->flowi4_oif) {
|
||||
- if (&rt->dst != dst)
|
||||
+ if (!dst)
|
||||
+ dst = &rt->dst;
|
||||
+ else
|
||||
dst_release(&rt->dst);
|
||||
continue;
|
||||
}
|
||||
|
||||
- if (dst != &rt->dst)
|
||||
- dst_release(dst);
|
||||
+ dst_release(dst);
|
||||
dst = &rt->dst;
|
||||
break;
|
||||
}
|
||||
--
|
||||
2.14.2
|
||||
|
|
@ -0,0 +1,60 @@
|
|||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Alexey Kodanev <alexey.kodanev@oracle.com>
|
||||
Date: Mon, 5 Feb 2018 15:10:35 +0300
|
||||
Subject: [PATCH] sctp: fix dst refcnt leak in sctp_v6_get_dst()
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
When going through the bind address list in sctp_v6_get_dst() and
|
||||
the previously found address is better ('matchlen > bmatchlen'),
|
||||
the code continues to the next iteration without releasing currently
|
||||
held destination.
|
||||
|
||||
Fix it by releasing 'bdst' before continuing to the next iteration, and
|
||||
instead of introducing one more '!IS_ERR(bdst)' check for dst_release(),
|
||||
move the already existing one right after ip6_dst_lookup_flow(), i.e. we
|
||||
shouldn't proceed further if we get an error for the route lookup.
|
||||
|
||||
Fixes: dbc2b5e9a09e ("sctp: fix src address selection if using secondary addresses for ipv6")
|
||||
Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
|
||||
Acked-by: Neil Horman <nhorman@tuxdriver.com>
|
||||
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
|
||||
Signed-off-by: David S. Miller <davem@davemloft.net>
|
||||
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
|
||||
---
|
||||
net/sctp/ipv6.c | 10 +++++++---
|
||||
1 file changed, 7 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
|
||||
index a4b6ffb61495..c5a5ad8ac00f 100644
|
||||
--- a/net/sctp/ipv6.c
|
||||
+++ b/net/sctp/ipv6.c
|
||||
@@ -326,8 +326,10 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
|
||||
final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
|
||||
bdst = ip6_dst_lookup_flow(sk, fl6, final_p);
|
||||
|
||||
- if (!IS_ERR(bdst) &&
|
||||
- ipv6_chk_addr(dev_net(bdst->dev),
|
||||
+ if (IS_ERR(bdst))
|
||||
+ continue;
|
||||
+
|
||||
+ if (ipv6_chk_addr(dev_net(bdst->dev),
|
||||
&laddr->a.v6.sin6_addr, bdst->dev, 1)) {
|
||||
if (!IS_ERR_OR_NULL(dst))
|
||||
dst_release(dst);
|
||||
@@ -336,8 +338,10 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
|
||||
}
|
||||
|
||||
bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a);
|
||||
- if (matchlen > bmatchlen)
|
||||
+ if (matchlen > bmatchlen) {
|
||||
+ dst_release(bdst);
|
||||
continue;
|
||||
+ }
|
||||
|
||||
if (!IS_ERR_OR_NULL(dst))
|
||||
dst_release(dst);
|
||||
--
|
||||
2.14.2
|
||||
|
Loading…
Reference in a new issue