#ifndef __PACKET_INTERNAL_H__
#define __PACKET_INTERNAL_H__

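/* One multicast/promiscuous membership held by a packet socket, as
 * installed via setsockopt(PACKET_ADD_MEMBERSHIP); entries are chained
 * off the owning socket's mclist in a singly linked list.
 */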
struct packet_mclist {
	struct packet_mclist	*next;
	int			ifindex;
	int			count;
	unsigned short		type;
	unsigned short		alen;
	unsigned char		addr[MAX_ADDR_LEN];
};

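/* The TPACKET_V3 receive ring groups frames into fixed-size blocks that
 * are handed to user space as a unit; the state below tracks the block
 * currently being filled and the timer that retires it even when no new
 * packets arrive.
 */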
/* kbdq - kernel block descriptor queue */
struct tpacket_kbdq_core {
	struct pgv	*pkbdq;
	unsigned int	feature_req_word;
	unsigned int	hdrlen;
	unsigned char	reset_pending_on_curr_blk;
	unsigned char	delete_blk_timer;
	unsigned short	kactive_blk_num;
	unsigned short	blk_sizeof_priv;

	/* last_kactive_blk_num:
	 * trick to see if user-space has caught up
	 * in order to avoid refreshing the timer when every single pkt arrives.
	 */
	unsigned short	last_kactive_blk_num;

	char		*pkblk_start;
	char		*pkblk_end;
	int		kblk_size;
	unsigned int	knum_blocks;
	uint64_t	knxt_seq_num;
	char		*prev;
	char		*nxt_offset;
	struct sk_buff	*skb;

	atomic_t	blk_fill_in_prog;

	/* Default is set to 8ms */
#define DEFAULT_PRB_RETIRE_TOV	(8)

	unsigned short	retire_blk_tov;
	unsigned short	version;
	unsigned long	tov_in_jiffies;

	/* timer to retire an outstanding block */
	struct timer_list retire_blk_timer;
};

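/* Illustrative user-space setup of a TPACKET_V3 ring serviced by the
 * fields above (a sketch, not part of this header; error handling
 * omitted). tp_retire_blk_tov is in ms; 0 lets the kernel pick a
 * default:
 *
 *	int ver = TPACKET_V3;
 *	struct tpacket_req3 req = {
 *		.tp_block_size	   = 1 << 22,
 *		.tp_block_nr	   = 64,
 *		.tp_frame_size	   = 2048,
 *		.tp_frame_nr	   = ((1 << 22) / 2048) * 64,
 *		.tp_retire_blk_tov = 60,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *
 * tp_retire_blk_tov seeds retire_blk_tov, tp_sizeof_priv seeds
 * blk_sizeof_priv, and tp_feature_req_word seeds feature_req_word.
 */
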
struct pgv {
	char *buffer;
};

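/* One mmap()ed ring (rx or tx): pg_vec points at the backing memory
 * chunks, head indexes the frame the kernel will use next, frame_max is
 * the highest valid frame index, and prb_bdqc holds the TPACKET_V3
 * block-queue state for receive rings.
 */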
struct packet_ring_buffer {
	struct pgv		*pg_vec;
	unsigned int		head;
	unsigned int		frames_per_block;
	unsigned int		frame_size;
	unsigned int		frame_max;

	unsigned int		pg_vec_order;
	unsigned int		pg_vec_pages;
	unsigned int		pg_vec_len;

	struct tpacket_kbdq_core	prb_bdqc;
	atomic_t		pending;
};

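/* A fanout group spreads packets arriving on a shared prot_hook across
 * up to PACKET_FANOUT_MAX member sockets, according to the policy in
 * 'type' (hash, round-robin, cpu, rollover, ...). Illustrative
 * user-space join (a sketch, not part of this header):
 *
 *	int val = group_id | (PACKET_FANOUT_HASH << 16);
 *	setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &val, sizeof(val));
 *
 * Groups live on a global list protected by fanout_mutex.
 */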
extern struct mutex fanout_mutex;
#define PACKET_FANOUT_MAX	256

struct packet_fanout {
#ifdef CONFIG_NET_NS
	struct net		*net;
#endif
	unsigned int		num_members;
	u16			id;
	u8			type;
	u8			flags;
	atomic_t		rr_cur;
	struct list_head	list;
	struct sock		*arr[PACKET_FANOUT_MAX];
	int			next[PACKET_FANOUT_MAX];
	spinlock_t		lock;
	atomic_t		sk_ref;
	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
};

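/* Per-socket state of an AF_PACKET socket. struct sock is overlaid as
 * the first member so that a generic sock pointer can be cast back via
 * pkt_sk() below.
 */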
struct packet_sock {
	/* struct sock has to be the first member of packet_sock */
	struct sock		sk;
	struct packet_fanout	*fanout;
	struct tpacket_stats	stats;
	union  tpacket_stats_u	stats_u;
	struct packet_ring_buffer	rx_ring;
	struct packet_ring_buffer	tx_ring;
	int			copy_thresh;
	spinlock_t		bind_lock;
	struct mutex		pg_vec_lock;
	unsigned int		running:1,	/* prot_hook is attached */
				auxdata:1,
				origdev:1,
				has_vnet_hdr:1;
	int			ifindex;	/* bound device		*/
	__be16			num;
	struct packet_mclist	*mclist;
	atomic_t		mapped;
	enum tpacket_versions	tp_version;
	unsigned int		tp_hdrlen;
	unsigned int		tp_reserve;
	unsigned int		tp_loss:1;
	unsigned int		tp_tx_has_off:1;
	unsigned int		tp_tstamp;
	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
};
					
						
static struct packet_sock *pkt_sk(struct sock *sk)
{
	return (struct packet_sock *)sk;
}

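/* Illustrative use (as throughout af_packet.c): recover the packet_sock
 * from a generic sock pointer, valid because 'sk' is the first member:
 *
 *	struct packet_sock *po = pkt_sk(sk);
 */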
					
						
#endif