 c9bee3b7fd
			
		
	
	
	c9bee3b7fd
	
	
	
		
			
			The idea of this patch is to add an optional limit on the number of
unsent bytes in TCP sockets, to reduce usage of kernel memory.
TCP receiver might announce a big window, and TCP sender autotuning
might allow a large amount of bytes in write queue, but this has little
performance impact if a large part of this buffering is wasted :
Write queue needs to be large only to deal with large BDP, not
necessarily to cope with scheduling delays (incoming ACKS make room
for the application to queue more bytes)
For most workloads, using a value of 128 KB or less is OK to give
applications enough time to react to POLLOUT events in time
(or being awakened in a blocking sendmsg())
This patch adds two ways to set the limit :
1) Per socket option TCP_NOTSENT_LOWAT
2) A sysctl (/proc/sys/net/ipv4/tcp_notsent_lowat) for sockets
not using TCP_NOTSENT_LOWAT socket option (or setting a zero value)
Default value being UINT_MAX (0xFFFFFFFF), meaning this has no effect.
This changes poll()/select()/epoll() to report POLLOUT
only if number of unsent bytes is below tp->notsent_lowat
Note this might increase number of sendmsg()/sendfile() calls
when using non blocking sockets,
and increase number of context switches for blocking sockets.
Note this is not related to SO_SNDLOWAT (as SO_SNDLOWAT is
defined as :
 Specify the minimum number of bytes in the buffer until
 the socket layer will pass the data to the protocol)
Tested:
netperf sessions, and watching /proc/net/protocols "memory" column for TCP
With 200 concurrent netperf -t TCP_STREAM sessions, amount of kernel memory
used by TCP buffers shrinks by ~55 % (20567 pages instead of 45458)
lpq83:~# echo -1 >/proc/sys/net/ipv4/tcp_notsent_lowat
lpq83:~# (super_netperf 200 -t TCP_STREAM -H remote -l 90 &); sleep 60 ; grep TCP /proc/net/protocols
TCPv6     1880      2   45458   no     208   yes  ipv6        y  y  y  y  y  y  y  y  y  y  y  y  y  n  y  y  y  y  y
TCP       1696    508   45458   no     208   yes  kernel      y  y  y  y  y  y  y  y  y  y  y  y  y  n  y  y  y  y  y
lpq83:~# echo 131072 >/proc/sys/net/ipv4/tcp_notsent_lowat
lpq83:~# (super_netperf 200 -t TCP_STREAM -H remote -l 90 &); sleep 60 ; grep TCP /proc/net/protocols
TCPv6     1880      2   20567   no     208   yes  ipv6        y  y  y  y  y  y  y  y  y  y  y  y  y  n  y  y  y  y  y
TCP       1696    508   20567   no     208   yes  kernel      y  y  y  y  y  y  y  y  y  y  y  y  y  n  y  y  y  y  y
Using 128KB has no bad effect on the throughput or cpu usage
of a single flow, although there is an increase of context switches.
A bonus is that we hold socket lock for a shorter amount
of time and should improve latencies of ACK processing.
lpq83:~# echo -1 >/proc/sys/net/ipv4/tcp_notsent_lowat
lpq83:~# perf stat -e context-switches ./netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3
OMNI Send TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : +/-2.500% @ 99% conf.
Local       Remote      Local  Elapsed Throughput Throughput  Local Local  Remote Remote Local   Remote  Service
Send Socket Recv Socket Send   Time               Units       CPU   CPU    CPU    CPU    Service Service Demand
Size        Size        Size   (sec)                          Util  Util   Util   Util   Demand  Demand  Units
Final       Final                                             %     Method %      Method
1651584     6291456     16384  20.00   17447.90   10^6bits/s  3.13  S      -1.00  U      0.353   -1.000  usec/KB
 Performance counter stats for './netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3':
           412,514 context-switches
     200.034645535 seconds time elapsed
lpq83:~# echo 131072 >/proc/sys/net/ipv4/tcp_notsent_lowat
lpq83:~# perf stat -e context-switches ./netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3
OMNI Send TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : +/-2.500% @ 99% conf.
Local       Remote      Local  Elapsed Throughput Throughput  Local Local  Remote Remote Local   Remote  Service
Send Socket Recv Socket Send   Time               Units       CPU   CPU    CPU    CPU    Service Service Demand
Size        Size        Size   (sec)                          Util  Util   Util   Util   Demand  Demand  Units
Final       Final                                             %     Method %      Method
1593240     6291456     16384  20.00   17321.16   10^6bits/s  3.35  S      -1.00  U      0.381   -1.000  usec/KB
 Performance counter stats for './netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3':
         2,675,818 context-switches
     200.029651391 seconds time elapsed
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-By: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
		
	
			
		
			
				
	
	
		
			202 lines
		
	
	
	
		
			5.5 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			202 lines
		
	
	
	
		
			5.5 KiB
			
		
	
	
	
		
			C
		
	
	
	
	
	
| /*
 | |
|  * INET		An implementation of the TCP/IP protocol suite for the LINUX
 | |
|  *		operating system.  INET is implemented using the  BSD Socket
 | |
|  *		interface as the means of communication with the user level.
 | |
|  *
 | |
|  *		Definitions for the TCP protocol.
 | |
|  *
 | |
|  * Version:	@(#)tcp.h	1.0.2	04/28/93
 | |
|  *
 | |
|  * Author:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 | |
|  *
 | |
|  *		This program is free software; you can redistribute it and/or
 | |
|  *		modify it under the terms of the GNU General Public License
 | |
|  *		as published by the Free Software Foundation; either version
 | |
|  *		2 of the License, or (at your option) any later version.
 | |
|  */
 | |
| #ifndef _UAPI_LINUX_TCP_H
 | |
| #define _UAPI_LINUX_TCP_H
 | |
| 
 | |
| #include <linux/types.h>
 | |
| #include <asm/byteorder.h>
 | |
| #include <linux/socket.h>
 | |
| 
 | |
| struct tcphdr {	/* TCP header fields; the bitfields plus window are the word read by tcp_flag_word() below */
 | |
| 	__be16	source;
 | |
| 	__be16	dest;
 | |
| 	__be32	seq;
 | |
| 	__be32	ack_seq;
 | |
| #if defined(__LITTLE_ENDIAN_BITFIELD)
 | |
| 	__u16	res1:4,	/* both branches declare the same 16 bits; only the declaration order differs */
 | |
| 		doff:4,
 | |
| 		fin:1,	/* one-bit flags; names match the TCP_FLAG_* masks defined below */
 | |
| 		syn:1,
 | |
| 		rst:1,
 | |
| 		psh:1,
 | |
| 		ack:1,
 | |
| 		urg:1,
 | |
| 		ece:1,
 | |
| 		cwr:1;
 | |
| #elif defined(__BIG_ENDIAN_BITFIELD)
 | |
| 	__u16	doff:4,	/* same fields as above, declared in the opposite order within each byte */
 | |
| 		res1:4,
 | |
| 		cwr:1,
 | |
| 		ece:1,
 | |
| 		urg:1,
 | |
| 		ack:1,
 | |
| 		psh:1,
 | |
| 		rst:1,
 | |
| 		syn:1,
 | |
| 		fin:1;
 | |
| #else
 | |
| #error	"Adjust your <asm/byteorder.h> defines"
 | |
| #endif	/* bitfield endianness selection */
 | |
| 	__be16	window;
 | |
| 	__sum16	check;
 | |
| 	__be16	urg_ptr;
 | |
| };
 | |
| 
 | |
| /*
 | |
|  *	The union cast uses a gcc extension to avoid aliasing problems
 | |
|  *  (a union is compatible with any of its members)
 | |
|  *  This means this part of the code is -fstrict-aliasing safe now.
 | |
|  */
 | |
| union tcp_word_hdr { /* lets the same storage be read as a struct tcphdr or as 32-bit words */
 | |
| 	struct tcphdr hdr;
 | |
| 	__be32 		  words[5];	/* five __be32 words sharing storage with hdr */
 | |
| }; 
 | |
| 
 | |
| #define tcp_flag_word(tp) ( ((union tcp_word_hdr *)(tp))->words [3]) /* fourth 32-bit word of the header at tp, i.e. the word that follows ack_seq */
 | |
| 
 | |
| enum { /* 32-bit masks wrapped in __constant_cpu_to_be32(); NOTE(review): presumably meant to be tested against tcp_flag_word() -- confirm */
 | |
| 	TCP_FLAG_CWR = __constant_cpu_to_be32(0x00800000),
 | |
| 	TCP_FLAG_ECE = __constant_cpu_to_be32(0x00400000),
 | |
| 	TCP_FLAG_URG = __constant_cpu_to_be32(0x00200000),
 | |
| 	TCP_FLAG_ACK = __constant_cpu_to_be32(0x00100000),
 | |
| 	TCP_FLAG_PSH = __constant_cpu_to_be32(0x00080000),
 | |
| 	TCP_FLAG_RST = __constant_cpu_to_be32(0x00040000),
 | |
| 	TCP_FLAG_SYN = __constant_cpu_to_be32(0x00020000),
 | |
| 	TCP_FLAG_FIN = __constant_cpu_to_be32(0x00010000),
 | |
| 	TCP_RESERVED_BITS = __constant_cpu_to_be32(0x0F000000),	/* multi-bit mask (4 bits) */
 | |
| 	TCP_DATA_OFFSET = __constant_cpu_to_be32(0xF0000000)	/* multi-bit mask (top 4 bits) */
 | |
| }; 
 | |
| 
 | |
| /*
 | |
|  * TCP general constants (unsigned; RFC references are cited per constant)
 | |
|  */
 | |
| #define TCP_MSS_DEFAULT		 536U	/* IPv4 (RFC1122, RFC2581) */
 | |
| #define TCP_MSS_DESIRED		1220U	/* IPv6 (tunneled), EDNS0 (RFC3226) */
 | |
| 
 | |
| /* TCP socket options (value 15 is not defined in this header) */
 | |
| #define TCP_NODELAY		1	/* Turn off Nagle's algorithm. */
 | |
| #define TCP_MAXSEG		2	/* Limit MSS */
 | |
| #define TCP_CORK		3	/* Never send partially complete segments */
 | |
| #define TCP_KEEPIDLE		4	/* Start keepalives after this period */
 | |
| #define TCP_KEEPINTVL		5	/* Interval between keepalives */
 | |
| #define TCP_KEEPCNT		6	/* Number of keepalives before death */
 | |
| #define TCP_SYNCNT		7	/* Number of SYN retransmits */
 | |
| #define TCP_LINGER2		8	/* Life time of orphaned FIN-WAIT-2 state */
 | |
| #define TCP_DEFER_ACCEPT	9	/* Wake up listener only when data arrive */
 | |
| #define TCP_WINDOW_CLAMP	10	/* Bound advertised window */
 | |
| #define TCP_INFO		11	/* Information about this connection. */
 | |
| #define TCP_QUICKACK		12	/* Block/reenable quick acks */
 | |
| #define TCP_CONGESTION		13	/* Congestion control algorithm */
 | |
| #define TCP_MD5SIG		14	/* TCP MD5 Signature (RFC2385) */
 | |
| #define TCP_THIN_LINEAR_TIMEOUTS 16      /* Use linear timeouts for thin streams */
 | |
| #define TCP_THIN_DUPACK         17      /* Fast retransmit after 1 dupack */
 | |
| #define TCP_USER_TIMEOUT	18	/* How long for loss retry before timeout */
 | |
| #define TCP_REPAIR		19	/* TCP sock is under repair right now */
 | |
| #define TCP_REPAIR_QUEUE	20	/* NOTE(review): presumably selects one of the TCP_*_QUEUE values below -- confirm */
 | |
| #define TCP_QUEUE_SEQ		21
 | |
| #define TCP_REPAIR_OPTIONS	22	/* NOTE(review): presumably takes struct tcp_repair_opt entries -- confirm */
 | |
| #define TCP_FASTOPEN		23	/* Enable FastOpen on listeners */
 | |
| #define TCP_TIMESTAMP		24
 | |
| #define TCP_NOTSENT_LOWAT	25	/* limit number of unsent bytes in write queue */
 | |
| 
 | |
| struct tcp_repair_opt {	/* NOTE(review): presumably the payload for TCP_REPAIR_OPTIONS -- confirm */
 | |
| 	__u32	opt_code;
 | |
| 	__u32	opt_val;
 | |
| };
 | |
| 
 | |
| enum {	/* NOTE(review): presumably the values accepted by TCP_REPAIR_QUEUE -- confirm */
 | |
| 	TCP_NO_QUEUE,
 | |
| 	TCP_RECV_QUEUE,
 | |
| 	TCP_SEND_QUEUE,
 | |
| 	TCP_QUEUES_NR,	/* implicit value 3: the number of entries above */
 | |
| };
 | |
| 
 | |
| /* for TCP_INFO socket option: single-bit values (1, 2, 4, ...) that can be OR-ed together */
 | |
| #define TCPI_OPT_TIMESTAMPS	1
 | |
| #define TCPI_OPT_SACK		2
 | |
| #define TCPI_OPT_WSCALE		4
 | |
| #define TCPI_OPT_ECN		8 /* ECN was negotiated at TCP session init */
 | |
| #define TCPI_OPT_ECN_SEEN	16 /* we received at least one packet with ECT */
 | |
| #define TCPI_OPT_SYN_DATA	32 /* SYN-ACK acked data in SYN sent or rcvd */
 | |
| 
 | |
| enum tcp_ca_state {	/* each state N is paired with a TCPF_* bitmask defined as (1<<N) */
 | |
| 	TCP_CA_Open = 0,
 | |
| #define TCPF_CA_Open	(1<<TCP_CA_Open)
 | |
| 	TCP_CA_Disorder = 1,
 | |
| #define TCPF_CA_Disorder (1<<TCP_CA_Disorder)
 | |
| 	TCP_CA_CWR = 2,
 | |
| #define TCPF_CA_CWR	(1<<TCP_CA_CWR)
 | |
| 	TCP_CA_Recovery = 3,
 | |
| #define TCPF_CA_Recovery (1<<TCP_CA_Recovery)
 | |
| 	TCP_CA_Loss = 4
 | |
| #define TCPF_CA_Loss	(1<<TCP_CA_Loss)
 | |
| };
 | |
| 
 | |
| struct tcp_info {	/* NOTE(review): presumably what the TCP_INFO socket option reports -- confirm */
 | |
| 	__u8	tcpi_state;
 | |
| 	__u8	tcpi_ca_state;	/* NOTE(review): presumably an enum tcp_ca_state value -- confirm */
 | |
| 	__u8	tcpi_retransmits;
 | |
| 	__u8	tcpi_probes;
 | |
| 	__u8	tcpi_backoff;
 | |
| 	__u8	tcpi_options;	/* NOTE(review): presumably a mask of TCPI_OPT_* bits -- confirm */
 | |
| 	__u8	tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4;	/* two 4-bit fields declared on one __u8 */
 | |
| 
 | |
| 	__u32	tcpi_rto;
 | |
| 	__u32	tcpi_ato;
 | |
| 	__u32	tcpi_snd_mss;
 | |
| 	__u32	tcpi_rcv_mss;
 | |
| 
 | |
| 	__u32	tcpi_unacked;
 | |
| 	__u32	tcpi_sacked;
 | |
| 	__u32	tcpi_lost;
 | |
| 	__u32	tcpi_retrans;
 | |
| 	__u32	tcpi_fackets;
 | |
| 
 | |
| 	/* Times. */
 | |
| 	__u32	tcpi_last_data_sent;
 | |
| 	__u32	tcpi_last_ack_sent;     /* Not remembered, sorry. */
 | |
| 	__u32	tcpi_last_data_recv;
 | |
| 	__u32	tcpi_last_ack_recv;
 | |
| 
 | |
| 	/* Metrics. */
 | |
| 	__u32	tcpi_pmtu;
 | |
| 	__u32	tcpi_rcv_ssthresh;
 | |
| 	__u32	tcpi_rtt;
 | |
| 	__u32	tcpi_rttvar;
 | |
| 	__u32	tcpi_snd_ssthresh;
 | |
| 	__u32	tcpi_snd_cwnd;
 | |
| 	__u32	tcpi_advmss;
 | |
| 	__u32	tcpi_reordering;
 | |
| 
 | |
| 	__u32	tcpi_rcv_rtt;
 | |
| 	__u32	tcpi_rcv_space;
 | |
| 
 | |
| 	__u32	tcpi_total_retrans;
 | |
| };
 | |
| 
 | |
| /* for TCP_MD5SIG socket option */
 | |
| #define TCP_MD5SIG_MAXKEYLEN	80
 | |
| 
 | |
| struct tcp_md5sig {	/* used with the TCP_MD5SIG socket option */
 | |
| 	struct __kernel_sockaddr_storage tcpm_addr;	/* address associated */
 | |
| 	__u16	__tcpm_pad1;				/* zero */
 | |
| 	__u16	tcpm_keylen;				/* key length */
 | |
| 	__u32	__tcpm_pad2;				/* zero */
 | |
| 	__u8	tcpm_key[TCP_MD5SIG_MAXKEYLEN];		/* key (binary); buffer holds TCP_MD5SIG_MAXKEYLEN (80) entries */
 | |
| };
 | |
| 
 | |
| #endif /* _UAPI_LINUX_TCP_H */
 |