diff --git a/patches/kernel/0024-net-tcp-close-sock-if-net-namespace-is-exiting.patch b/patches/kernel/0024-net-tcp-close-sock-if-net-namespace-is-exiting.patch new file mode 100644 index 0000000..d4eb5e6 --- /dev/null +++ b/patches/kernel/0024-net-tcp-close-sock-if-net-namespace-is-exiting.patch @@ -0,0 +1,127 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Dan Streetman +Date: Thu, 18 Jan 2018 16:14:26 -0500 +Subject: [PATCH] net: tcp: close sock if net namespace is exiting +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +When a tcp socket is closed, if it detects that its net namespace is +exiting, close immediately and do not wait for FIN sequence. + +For normal sockets, a reference is taken to their net namespace, so it will +never exit while the socket is open. However, kernel sockets do not take a +reference to their net namespace, so it may begin exiting while the kernel +socket is still open. In this case if the kernel socket is a tcp socket, +it will stay open trying to complete its close sequence. The sock's dst(s) +hold a reference to their interface, which are all transferred to the +namespace's loopback interface when the real interfaces are taken down. +When the namespace tries to take down its loopback interface, it hangs +waiting for all references to the loopback interface to release, which +results in messages like: + +unregister_netdevice: waiting for lo to become free. Usage count = 1 + +These messages continue until the socket finally times out and closes. +Since the net namespace cleanup holds the net_mutex while calling its +registered pernet callbacks, any new net namespace initialization is +blocked until the current net namespace finishes exiting. + +After this change, the tcp socket notices the exiting net namespace, and +closes immediately, releasing its dst(s) and their reference to the +loopback interface, which lets the net namespace continue exiting. 
+ +Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1711407 +Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=97811 +Signed-off-by: Dan Streetman +Signed-off-by: David S. Miller +Signed-off-by: Fabian Grünbichler +--- + include/net/net_namespace.h | 10 ++++++++++ + net/ipv4/tcp.c | 3 +++ + net/ipv4/tcp_timer.c | 15 +++++++++++++++ + 3 files changed, 28 insertions(+) + +diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h +index 1c401bd4c2e0..a5d023fa78db 100644 +--- a/include/net/net_namespace.h ++++ b/include/net/net_namespace.h +@@ -221,6 +221,11 @@ int net_eq(const struct net *net1, const struct net *net2) + return net1 == net2; + } + ++static inline int check_net(const struct net *net) ++{ ++ return atomic_read(&net->count) != 0; ++} ++ + void net_drop_ns(void *); + + #else +@@ -245,6 +250,11 @@ int net_eq(const struct net *net1, const struct net *net2) + return 1; + } + ++static inline int check_net(const struct net *net) ++{ ++ return 1; ++} ++ + #define net_drop_ns NULL + #endif + +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index a3e91b552edc..fd2a086da910 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -2258,6 +2258,9 @@ void tcp_close(struct sock *sk, long timeout) + tcp_send_active_reset(sk, GFP_ATOMIC); + __NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPABORTONMEMORY); ++ } else if (!check_net(sock_net(sk))) { ++ /* Not possible to send reset; just close */ ++ tcp_set_state(sk, TCP_CLOSE); + } + } + +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index e906014890b6..ec1e5de41653 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -50,11 +50,19 @@ static void tcp_write_err(struct sock *sk) + * to prevent DoS attacks. It is called when a retransmission timeout + * or zero probe timeout occurs on orphaned socket. 
+ * ++ * Also close if our net namespace is exiting; in that case there is no ++ * hope of ever communicating again since all netns interfaces are already ++ * down (or about to be down), and we need to release our dst references, ++ * which have been moved to the netns loopback interface, so the namespace ++ * can finish exiting. This condition is only possible if we are a kernel ++ * socket, as those do not hold references to the namespace. ++ * + * Criteria is still not confirmed experimentally and may change. + * We kill the socket, if: + * 1. If number of orphaned sockets exceeds an administratively configured + * limit. + * 2. If we have strong memory pressure. ++ * 3. If our net namespace is exiting. + */ + static int tcp_out_of_resources(struct sock *sk, bool do_reset) + { +@@ -83,6 +91,13 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); + return 1; + } ++ ++ if (!check_net(sock_net(sk))) { ++ /* Not possible to send reset; just close */ ++ tcp_done(sk); ++ return 1; ++ } ++ + return 0; + } + +-- +2.14.2 + diff --git a/patches/kernel/0025-sctp-fix-dst-refcnt-leak-in-sctp_v4_get_dst.patch b/patches/kernel/0025-sctp-fix-dst-refcnt-leak-in-sctp_v4_get_dst.patch new file mode 100644 index 0000000..1bed6b0 --- /dev/null +++ b/patches/kernel/0025-sctp-fix-dst-refcnt-leak-in-sctp_v4_get_dst.patch @@ -0,0 +1,89 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Tommi Rantala +Date: Mon, 5 Feb 2018 21:48:14 +0200 +Subject: [PATCH] sctp: fix dst refcnt leak in sctp_v4_get_dst +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Fix dst reference count leak in sctp_v4_get_dst() introduced in commit +410f03831 ("sctp: add routing output fallback"): + +When walking the address_list, successive ip_route_output_key() calls +may return the same rt->dst with the reference incremented on each call. 
+ +The code would not decrement the dst refcount when the dst pointer was +identical from the previous iteration, causing the dst refcnt leak. + +Testcase: + ip netns add TEST + ip netns exec TEST ip link set lo up + ip link add dummy0 type dummy + ip link add dummy1 type dummy + ip link add dummy2 type dummy + ip link set dev dummy0 netns TEST + ip link set dev dummy1 netns TEST + ip link set dev dummy2 netns TEST + ip netns exec TEST ip addr add 192.168.1.1/24 dev dummy0 + ip netns exec TEST ip link set dummy0 up + ip netns exec TEST ip addr add 192.168.1.2/24 dev dummy1 + ip netns exec TEST ip link set dummy1 up + ip netns exec TEST ip addr add 192.168.1.3/24 dev dummy2 + ip netns exec TEST ip link set dummy2 up + ip netns exec TEST sctp_test -H 192.168.1.2 -P 20002 -h 192.168.1.1 -p 20000 -s -B 192.168.1.3 + ip netns del TEST + +In 4.4 and 4.9 kernels this results in: + [ 354.179591] unregister_netdevice: waiting for lo to become free. Usage count = 1 + [ 364.419674] unregister_netdevice: waiting for lo to become free. Usage count = 1 + [ 374.663664] unregister_netdevice: waiting for lo to become free. Usage count = 1 + [ 384.903717] unregister_netdevice: waiting for lo to become free. Usage count = 1 + [ 395.143724] unregister_netdevice: waiting for lo to become free. Usage count = 1 + [ 405.383645] unregister_netdevice: waiting for lo to become free. Usage count = 1 + ... + +Fixes: 410f03831 ("sctp: add routing output fallback") +Fixes: 0ca50d12f ("sctp: fix src address selection if using secondary addresses") +Signed-off-by: Tommi Rantala +Acked-by: Marcelo Ricardo Leitner +Acked-by: Neil Horman +Signed-off-by: David S. 
Miller +Signed-off-by: Fabian Grünbichler +--- + net/sctp/protocol.c | 10 ++++------ + 1 file changed, 4 insertions(+), 6 deletions(-) + +diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c +index 989a900383b5..e1a3ae4f3cab 100644 +--- a/net/sctp/protocol.c ++++ b/net/sctp/protocol.c +@@ -514,22 +514,20 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, + if (IS_ERR(rt)) + continue; + +- if (!dst) +- dst = &rt->dst; +- + /* Ensure the src address belongs to the output + * interface. + */ + odev = __ip_dev_find(sock_net(sk), laddr->a.v4.sin_addr.s_addr, + false); + if (!odev || odev->ifindex != fl4->flowi4_oif) { +- if (&rt->dst != dst) ++ if (!dst) ++ dst = &rt->dst; ++ else + dst_release(&rt->dst); + continue; + } + +- if (dst != &rt->dst) +- dst_release(dst); ++ dst_release(dst); + dst = &rt->dst; + break; + } +-- +2.14.2 + diff --git a/patches/kernel/0026-sctp-fix-dst-refcnt-leak-in-sctp_v6_get_dst.patch b/patches/kernel/0026-sctp-fix-dst-refcnt-leak-in-sctp_v6_get_dst.patch new file mode 100644 index 0000000..58087ed --- /dev/null +++ b/patches/kernel/0026-sctp-fix-dst-refcnt-leak-in-sctp_v6_get_dst.patch @@ -0,0 +1,60 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Alexey Kodanev +Date: Mon, 5 Feb 2018 15:10:35 +0300 +Subject: [PATCH] sctp: fix dst refcnt leak in sctp_v6_get_dst() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +When going through the bind address list in sctp_v6_get_dst() and +the previously found address is better ('matchlen > bmatchlen'), +the code continues to the next iteration without releasing currently +held destination. + +Fix it by releasing 'bdst' before continue to the next iteration, and +instead of introducing one more '!IS_ERR(bdst)' check for dst_release(), +move the already existed one right after ip6_dst_lookup_flow(), i.e. we +shouldn't proceed further if we get an error for the route lookup. 
+ +Fixes: dbc2b5e9a09e ("sctp: fix src address selection if using secondary addresses for ipv6") +Signed-off-by: Alexey Kodanev +Acked-by: Neil Horman +Acked-by: Marcelo Ricardo Leitner +Signed-off-by: David S. Miller +Signed-off-by: Fabian Grünbichler +--- + net/sctp/ipv6.c | 10 +++++++--- + 1 file changed, 7 insertions(+), 3 deletions(-) + +diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c +index a4b6ffb61495..c5a5ad8ac00f 100644 +--- a/net/sctp/ipv6.c ++++ b/net/sctp/ipv6.c +@@ -326,8 +326,10 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, + final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); + bdst = ip6_dst_lookup_flow(sk, fl6, final_p); + +- if (!IS_ERR(bdst) && +- ipv6_chk_addr(dev_net(bdst->dev), ++ if (IS_ERR(bdst)) ++ continue; ++ ++ if (ipv6_chk_addr(dev_net(bdst->dev), + &laddr->a.v6.sin6_addr, bdst->dev, 1)) { + if (!IS_ERR_OR_NULL(dst)) + dst_release(dst); +@@ -336,8 +338,10 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, + } + + bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a); +- if (matchlen > bmatchlen) ++ if (matchlen > bmatchlen) { ++ dst_release(bdst); + continue; ++ } + + if (!IS_ERR_OR_NULL(dst)) + dst_release(dst); +-- +2.14.2 +