 93bb0ceb75
			
		
	
	
	93bb0ceb75
	
	
	
		
			
			nf_conntrack_lock is a monolithic lock and suffers from huge contention
on current generation servers (8 or more core/threads).
Perf shows the lock contention clearly on the base kernel:
-  72.56%  ksoftirqd/6  [kernel.kallsyms]    [k] _raw_spin_lock_bh
   - _raw_spin_lock_bh
      + 25.33% init_conntrack
      + 24.86% nf_ct_delete_from_lists
      + 24.62% __nf_conntrack_confirm
      + 24.38% destroy_conntrack
      + 0.70% tcp_packet
+   2.21%  ksoftirqd/6  [kernel.kallsyms]    [k] fib_table_lookup
+   1.15%  ksoftirqd/6  [kernel.kallsyms]    [k] __slab_free
+   0.77%  ksoftirqd/6  [kernel.kallsyms]    [k] inet_getpeer
+   0.70%  ksoftirqd/6  [nf_conntrack]       [k] nf_ct_delete
+   0.55%  ksoftirqd/6  [ip_tables]          [k] ipt_do_table
This patch changes conntrack locking and provides a huge performance
improvement.  SYN-flood attack tested on a 24-core E5-2695v2(ES) with
10Gbit/s ixgbe (with tool trafgen):
 Base kernel:   810.405 new conntrack/sec
 After patch: 2.233.876 new conntrack/sec
Note that other flood attacks (SYN+ACK or ACK) can easily be deflected using:
 # iptables -A INPUT -m state --state INVALID -j DROP
 # sysctl -w net/netfilter/nf_conntrack_tcp_loose=0
Use an array of hashed spinlocks to protect insertions/deletions of
conntracks into the hash table. 1024 spinlocks seem to give good
results, at minimal cost (4KB memory). Due to lockdep max depth,
1024 becomes 8 if CONFIG_LOCKDEP=y
The hash resize is a bit tricky, because we need to take all locks in
the array. A seqcount_t is used to synchronize the hash table users
with the resizing process.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Reviewed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
		
	
			
		
			
				
	
	
		
			112 lines
		
	
	
	
		
			2.6 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			112 lines
		
	
	
	
		
			2.6 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
| #ifndef __NETNS_CONNTRACK_H
 | |
| #define __NETNS_CONNTRACK_H
 | |
| 
 | |
| #include <linux/list.h>
 | |
| #include <linux/list_nulls.h>
 | |
| #include <linux/atomic.h>
 | |
| #include <linux/netfilter/nf_conntrack_tcp.h>
 | |
| #include <linux/seqlock.h>
 | |
| 
 | |
| struct ctl_table_header;
 | |
| struct nf_conntrack_ecache;
 | |
| 
 | |
| struct nf_proto_net {
 | |
| #ifdef CONFIG_SYSCTL
 | |
| 	struct ctl_table_header *ctl_table_header;
 | |
| 	struct ctl_table        *ctl_table;
 | |
| #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
 | |
| 	struct ctl_table_header *ctl_compat_header;
 | |
| 	struct ctl_table        *ctl_compat_table;
 | |
| #endif
 | |
| #endif
 | |
| 	unsigned int		users;
 | |
| };
 | |
| 
 | |
| struct nf_generic_net {
 | |
| 	struct nf_proto_net pn;
 | |
| 	unsigned int timeout;
 | |
| };
 | |
| 
 | |
| struct nf_tcp_net {
 | |
| 	struct nf_proto_net pn;
 | |
| 	unsigned int timeouts[TCP_CONNTRACK_TIMEOUT_MAX];
 | |
| 	unsigned int tcp_loose;
 | |
| 	unsigned int tcp_be_liberal;
 | |
| 	unsigned int tcp_max_retrans;
 | |
| };
 | |
| 
 | |
| enum udp_conntrack {
 | |
| 	UDP_CT_UNREPLIED,
 | |
| 	UDP_CT_REPLIED,
 | |
| 	UDP_CT_MAX
 | |
| };
 | |
| 
 | |
| struct nf_udp_net {
 | |
| 	struct nf_proto_net pn;
 | |
| 	unsigned int timeouts[UDP_CT_MAX];
 | |
| };
 | |
| 
 | |
| struct nf_icmp_net {
 | |
| 	struct nf_proto_net pn;
 | |
| 	unsigned int timeout;
 | |
| };
 | |
| 
 | |
| struct nf_ip_net {
 | |
| 	struct nf_generic_net   generic;
 | |
| 	struct nf_tcp_net	tcp;
 | |
| 	struct nf_udp_net	udp;
 | |
| 	struct nf_icmp_net	icmp;
 | |
| 	struct nf_icmp_net	icmpv6;
 | |
| #if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
 | |
| 	struct ctl_table_header *ctl_table_header;
 | |
| 	struct ctl_table	*ctl_table;
 | |
| #endif
 | |
| };
 | |
| 
 | |
| struct ct_pcpu {
 | |
| 	spinlock_t		lock;
 | |
| 	struct hlist_nulls_head unconfirmed;
 | |
| 	struct hlist_nulls_head dying;
 | |
| 	struct hlist_nulls_head tmpl;
 | |
| };
 | |
| 
 | |
| struct netns_ct {
 | |
| 	atomic_t		count;
 | |
| 	unsigned int		expect_count;
 | |
| #ifdef CONFIG_SYSCTL
 | |
| 	struct ctl_table_header	*sysctl_header;
 | |
| 	struct ctl_table_header	*acct_sysctl_header;
 | |
| 	struct ctl_table_header	*tstamp_sysctl_header;
 | |
| 	struct ctl_table_header	*event_sysctl_header;
 | |
| 	struct ctl_table_header	*helper_sysctl_header;
 | |
| #endif
 | |
| 	char			*slabname;
 | |
| 	unsigned int		sysctl_log_invalid; /* Log invalid packets */
 | |
| 	unsigned int		sysctl_events_retry_timeout;
 | |
| 	int			sysctl_events;
 | |
| 	int			sysctl_acct;
 | |
| 	int			sysctl_auto_assign_helper;
 | |
| 	bool			auto_assign_helper_warned;
 | |
| 	int			sysctl_tstamp;
 | |
| 	int			sysctl_checksum;
 | |
| 
 | |
| 	unsigned int		htable_size;
 | |
| 	seqcount_t		generation;
 | |
| 	struct kmem_cache	*nf_conntrack_cachep;
 | |
| 	struct hlist_nulls_head	*hash;
 | |
| 	struct hlist_head	*expect_hash;
 | |
| 	struct ct_pcpu __percpu *pcpu_lists;
 | |
| 	struct ip_conntrack_stat __percpu *stat;
 | |
| 	struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
 | |
| 	struct nf_exp_event_notifier __rcu *nf_expect_event_cb;
 | |
| 	struct nf_ip_net	nf_ct_proto;
 | |
| #if defined(CONFIG_NF_CONNTRACK_LABELS)
 | |
| 	unsigned int		labels_used;
 | |
| 	u8			label_words;
 | |
| #endif
 | |
| #ifdef CONFIG_NF_NAT_NEEDED
 | |
| 	struct hlist_head	*nat_bysource;
 | |
| 	unsigned int		nat_htable_size;
 | |
| #endif
 | |
| };
 | |
| #endif
 |