38c79e8118
See https://github.com/lxc/lxc/issues/2141 and https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1711407/
127 lines
4.6 KiB
Diff
127 lines
4.6 KiB
Diff
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
From: Dan Streetman <ddstreet@ieee.org>
|
|
Date: Thu, 18 Jan 2018 16:14:26 -0500
|
|
Subject: [PATCH] net: tcp: close sock if net namespace is exiting
|
|
MIME-Version: 1.0
|
|
Content-Type: text/plain; charset=UTF-8
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
When a tcp socket is closed, if it detects that its net namespace is
|
|
exiting, close immediately and do not wait for FIN sequence.
|
|
|
|
For normal sockets, a reference is taken to their net namespace, so it will
|
|
never exit while the socket is open. However, kernel sockets do not take a
|
|
reference to their net namespace, so it may begin exiting while the kernel
|
|
socket is still open. In this case if the kernel socket is a tcp socket,
|
|
it will stay open trying to complete its close sequence. The sock's dst(s)
|
|
hold references to their interfaces, which are all transferred to the
|
|
namespace's loopback interface when the real interfaces are taken down.
|
|
When the namespace tries to take down its loopback interface, it hangs
|
|
waiting for all references to the loopback interface to release, which
|
|
results in messages like:
|
|
|
|
unregister_netdevice: waiting for lo to become free. Usage count = 1
|
|
|
|
These messages continue until the socket finally times out and closes.
|
|
Since the net namespace cleanup holds the net_mutex while calling its
|
|
registered pernet callbacks, any new net namespace initialization is
|
|
blocked until the current net namespace finishes exiting.
|
|
|
|
After this change, the tcp socket notices the exiting net namespace, and
|
|
closes immediately, releasing its dst(s) and their reference to the
|
|
loopback interface, which lets the net namespace continue exiting.
|
|
|
|
Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1711407
|
|
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=97811
|
|
Signed-off-by: Dan Streetman <ddstreet@canonical.com>
|
|
Signed-off-by: David S. Miller <davem@davemloft.net>
|
|
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
|
|
---
|
|
include/net/net_namespace.h | 10 ++++++++++
|
|
net/ipv4/tcp.c | 3 +++
|
|
net/ipv4/tcp_timer.c | 15 +++++++++++++++
|
|
3 files changed, 28 insertions(+)
|
|
|
|
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
|
|
index 1c401bd4c2e0..a5d023fa78db 100644
|
|
--- a/include/net/net_namespace.h
|
|
+++ b/include/net/net_namespace.h
|
|
@@ -221,6 +221,11 @@ int net_eq(const struct net *net1, const struct net *net2)
|
|
return net1 == net2;
|
|
}
|
|
|
|
+static inline int check_net(const struct net *net)
|
|
+{
|
|
+ return atomic_read(&net->count) != 0;
|
|
+}
|
|
+
|
|
void net_drop_ns(void *);
|
|
|
|
#else
|
|
@@ -245,6 +250,11 @@ int net_eq(const struct net *net1, const struct net *net2)
|
|
return 1;
|
|
}
|
|
|
|
+static inline int check_net(const struct net *net)
|
|
+{
|
|
+ return 1;
|
|
+}
|
|
+
|
|
#define net_drop_ns NULL
|
|
#endif
|
|
|
|
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
|
|
index a3e91b552edc..fd2a086da910 100644
|
|
--- a/net/ipv4/tcp.c
|
|
+++ b/net/ipv4/tcp.c
|
|
@@ -2258,6 +2258,9 @@ void tcp_close(struct sock *sk, long timeout)
|
|
tcp_send_active_reset(sk, GFP_ATOMIC);
|
|
__NET_INC_STATS(sock_net(sk),
|
|
LINUX_MIB_TCPABORTONMEMORY);
|
|
+ } else if (!check_net(sock_net(sk))) {
|
|
+ /* Not possible to send reset; just close */
|
|
+ tcp_set_state(sk, TCP_CLOSE);
|
|
}
|
|
}
|
|
|
|
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
|
|
index e906014890b6..ec1e5de41653 100644
|
|
--- a/net/ipv4/tcp_timer.c
|
|
+++ b/net/ipv4/tcp_timer.c
|
|
@@ -50,11 +50,19 @@ static void tcp_write_err(struct sock *sk)
|
|
* to prevent DoS attacks. It is called when a retransmission timeout
|
|
* or zero probe timeout occurs on orphaned socket.
|
|
*
|
|
+ * Also close if our net namespace is exiting; in that case there is no
|
|
+ * hope of ever communicating again since all netns interfaces are already
|
|
+ * down (or about to be down), and we need to release our dst references,
|
|
+ * which have been moved to the netns loopback interface, so the namespace
|
|
+ * can finish exiting. This condition is only possible if we are a kernel
|
|
+ * socket, as those do not hold references to the namespace.
|
|
+ *
|
|
* Criteria is still not confirmed experimentally and may change.
|
|
* We kill the socket, if:
|
|
* 1. If number of orphaned sockets exceeds an administratively configured
|
|
* limit.
|
|
* 2. If we have strong memory pressure.
|
|
+ * 3. If our net namespace is exiting.
|
|
*/
|
|
static int tcp_out_of_resources(struct sock *sk, bool do_reset)
|
|
{
|
|
@@ -83,6 +91,13 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
|
|
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
|
|
return 1;
|
|
}
|
|
+
|
|
+ if (!check_net(sock_net(sk))) {
|
|
+ /* Not possible to send reset; just close */
|
|
+ tcp_done(sk);
|
|
+ return 1;
|
|
+ }
|
|
+
|
|
return 0;
|
|
}
|
|
|
|
--
|
|
2.14.2
|
|
|