| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | #ifndef __LINUX_PKT_SCHED_H
 | 
					
						
							|  |  |  | #define __LINUX_PKT_SCHED_H
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-30 22:07:05 +05:30
										 |  |  | #include <linux/types.h>
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | /* Logical priority bands not depending on specific packet scheduler.
 | 
					
						
							|  |  |  |    Every scheduler will map them to real traffic classes, if it has | 
					
						
							|  |  |  |    no more precise mechanism to classify packets. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |    These numbers have no special meaning, though their coincidence | 
					
						
							|  |  |  |    with obsolete IPv6 values is not accidental :-). New IPv6 drafts | 
					
						
							|  |  |  |    preferred full anarchy inspired by diffserv group. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |    Note: TC_PRIO_BESTEFFORT does not mean that it is the most unhappy | 
					
						
							|  |  |  |    class, actually, as a rule it will be handled with more care than | 
					
						
							|  |  |  |    filler or even bulk. | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define TC_PRIO_BESTEFFORT		0
 | 
					
						
							|  |  |  | #define TC_PRIO_FILLER			1
 | 
					
						
							|  |  |  | #define TC_PRIO_BULK			2
 | 
					
						
							|  |  |  | #define TC_PRIO_INTERACTIVE_BULK	4
 | 
					
						
							|  |  |  | #define TC_PRIO_INTERACTIVE		6
 | 
					
						
							|  |  |  | #define TC_PRIO_CONTROL			7
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define TC_PRIO_MAX			15
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* Generic queue statistics, available for all the elements.
 | 
					
						
							|  |  |  |    Particular schedulers may also have their own private records. | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | struct tc_stats { | 
					
						
							| 
									
										
										
										
											2011-11-21 06:53:46 +00:00
										 |  |  | 	__u64	bytes;			/* Number of enqueued bytes */ | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	__u32	packets;		/* Number of enqueued packets	*/ | 
					
						
							|  |  |  | 	__u32	drops;			/* Packets dropped because of lack of resources */ | 
					
						
							|  |  |  | 	__u32	overlimits;		/* Number of throttle events when this
 | 
					
						
							|  |  |  | 					 * flow goes out of allocated bandwidth */ | 
					
						
							|  |  |  | 	__u32	bps;			/* Current flow byte rate */ | 
					
						
							|  |  |  | 	__u32	pps;			/* Current flow packet rate */ | 
					
						
							|  |  |  | 	__u32	qlen; | 
					
						
							|  |  |  | 	__u32	backlog; | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | struct tc_estimator { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	signed char	interval; | 
					
						
							|  |  |  | 	unsigned char	ewma_log; | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* "Handles"
 | 
					
						
							|  |  |  |    --------- | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     All the traffic control objects have 32bit identifiers, or "handles". | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     They can be considered as opaque numbers from user API viewpoint, | 
					
						
							|  |  |  |     but actually they always consist of two fields: major and | 
					
						
							|  |  |  |     minor numbers, which are interpreted by kernel specially, | 
					
						
							|  |  |  |     that may be used by applications, though not recommended. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     F.e. qdisc handles always have minor number equal to zero, | 
					
						
							|  |  |  |     classes (or flows) have major equal to parent qdisc major, and | 
					
						
							|  |  |  |     minor uniquely identifying class inside qdisc. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     Macros to manipulate handles: | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define TC_H_MAJ_MASK (0xFFFF0000U)	/* major number: upper 16 bits of a handle */
 | 
					
						
							|  |  |  | #define TC_H_MIN_MASK (0x0000FFFFU)	/* minor number: lower 16 bits of a handle */
 | 
					
						
							|  |  |  | #define TC_H_MAJ(h) ((h)&TC_H_MAJ_MASK)	/* major part of h, left in place (masked, not shifted) */
 | 
					
						
							|  |  |  | #define TC_H_MIN(h) ((h)&TC_H_MIN_MASK)	/* minor part of h */
 | 
					
						
							|  |  |  | #define TC_H_MAKE(maj,min) (((maj)&TC_H_MAJ_MASK)|((min)&TC_H_MIN_MASK))	/* build a handle; maj must already sit in the upper 16 bits */
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define TC_H_UNSPEC	(0U)
 | 
					
						
							|  |  |  | #define TC_H_ROOT	(0xFFFFFFFFU)
 | 
					
						
							|  |  |  | #define TC_H_INGRESS    (0xFFFFFFF1U)
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-08-14 23:47:11 +02:00
										 |  |  | /* Needs to correspond to iproute2 tc/tc_core.h "enum link_layer" */ | 
					
						
							|  |  |  | enum tc_link_layer { | 
					
						
							|  |  |  | 	TC_LINKLAYER_UNAWARE, /* Indicate unaware old iproute2 util */ | 
					
						
							|  |  |  | 	TC_LINKLAYER_ETHERNET, | 
					
						
							|  |  |  | 	TC_LINKLAYER_ATM, | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | #define TC_LINKLAYER_MASK 0x0F /* limit use to lower 4 bits */
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | struct tc_ratespec { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	unsigned char	cell_log; | 
					
						
							| 
									
										
										
										
											2013-08-14 23:47:11 +02:00
										 |  |  | 	__u8		linklayer; /* lower 4 bits */ | 
					
						
							| 
									
										
										
										
											2007-09-12 16:36:28 +02:00
										 |  |  | 	unsigned short	overhead; | 
					
						
							|  |  |  | 	short		cell_align; | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	unsigned short	mpu; | 
					
						
							|  |  |  | 	__u32		rate; | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-01-23 20:35:19 -08:00
										 |  |  | #define TC_RTAB_SIZE	1024
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-07-20 00:08:47 -07:00
										 |  |  | struct tc_sizespec { | 
					
						
							|  |  |  | 	unsigned char	cell_log; | 
					
						
							|  |  |  | 	unsigned char	size_log; | 
					
						
							|  |  |  | 	short		cell_align; | 
					
						
							|  |  |  | 	int		overhead; | 
					
						
							|  |  |  | 	unsigned int	linklayer; | 
					
						
							|  |  |  | 	unsigned int	mpu; | 
					
						
							|  |  |  | 	unsigned int	mtu; | 
					
						
							|  |  |  | 	unsigned int	tsize; | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | enum { | 
					
						
							|  |  |  | 	TCA_STAB_UNSPEC, | 
					
						
							|  |  |  | 	TCA_STAB_BASE, | 
					
						
							|  |  |  | 	TCA_STAB_DATA, | 
					
						
							|  |  |  | 	__TCA_STAB_MAX | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define TCA_STAB_MAX (__TCA_STAB_MAX - 1)
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | /* FIFO section */ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | struct tc_fifo_qopt { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	__u32	limit;	/* Queue length: bytes for bfifo, packets for pfifo */ | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* PRIO section */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define TCQ_PRIO_BANDS	16
 | 
					
						
							| 
									
										
										
										
											2005-11-05 21:14:28 +01:00
										 |  |  | #define TCQ_MIN_PRIO_BANDS 2
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | struct tc_prio_qopt { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	int	bands;			/* Number of bands */ | 
					
						
							|  |  |  | 	__u8	priomap[TC_PRIO_MAX+1];	/* Map: logical priority -> PRIO band */ | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-09-12 16:29:34 -07:00
										 |  |  | /* MULTIQ section */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | struct tc_multiq_qopt { | 
					
						
							|  |  |  | 	__u16	bands;			/* Number of bands */ | 
					
						
							|  |  |  | 	__u16	max_bands;		/* Maximum number of queues */ | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2012-02-05 13:51:32 +00:00
										 |  |  | /* PLUG section */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define TCQ_PLUG_BUFFER                0
 | 
					
						
							|  |  |  | #define TCQ_PLUG_RELEASE_ONE           1
 | 
					
						
							|  |  |  | #define TCQ_PLUG_RELEASE_INDEFINITE    2
 | 
					
						
							|  |  |  | #define TCQ_PLUG_LIMIT                 3
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | struct tc_plug_qopt { | 
					
						
							|  |  |  | 	/* TCQ_PLUG_BUFFER: Insert a plug into the queue and
 | 
					
						
							|  |  |  | 	 *  buffer any incoming packets | 
					
						
							|  |  |  | 	 * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head | 
					
						
							|  |  |  | 	 *   to beginning of the next plug. | 
					
						
							|  |  |  | 	 * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue. | 
					
						
							|  |  |  | 	 *   Stop buffering packets until the next TCQ_PLUG_BUFFER | 
					
						
							|  |  |  | 	 *   command is received (just act as a pass-thru queue). | 
					
						
							|  |  |  | 	 * TCQ_PLUG_LIMIT: Increase/decrease queue size | 
					
						
							|  |  |  | 	 */ | 
					
						
							|  |  |  | 	int             action;	/* one of the TCQ_PLUG_* commands listed above */ | 
					
						
							|  |  |  | 	__u32           limit;	/* NOTE(review): presumably the queue size applied by TCQ_PLUG_LIMIT -- confirm */ | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | /* TBF section */ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | struct tc_tbf_qopt { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	struct tc_ratespec rate; | 
					
						
							|  |  |  | 	struct tc_ratespec peakrate; | 
					
						
							|  |  |  | 	__u32		limit; | 
					
						
							|  |  |  | 	__u32		buffer; | 
					
						
							|  |  |  | 	__u32		mtu; | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | enum { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	TCA_TBF_UNSPEC, | 
					
						
							|  |  |  | 	TCA_TBF_PARMS, | 
					
						
							|  |  |  | 	TCA_TBF_RTAB, | 
					
						
							|  |  |  | 	TCA_TBF_PTAB, | 
					
						
							| 
									
										
										
										
											2013-11-08 10:23:34 +08:00
										 |  |  | 	TCA_TBF_RATE64, | 
					
						
							|  |  |  | 	TCA_TBF_PRATE64, | 
					
						
							| 
									
										
										
										
											2013-12-20 09:24:47 +08:00
										 |  |  | 	TCA_TBF_BURST, | 
					
						
							|  |  |  | 	TCA_TBF_PBURST, | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	__TCA_TBF_MAX, | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define TCA_TBF_MAX (__TCA_TBF_MAX - 1)
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* TEQL section */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* TEQL does not require any parameters */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* SFQ section */ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | struct tc_sfq_qopt { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	unsigned	quantum;	/* Bytes per round allocated to flow */ | 
					
						
							|  |  |  | 	int		perturb_period;	/* Period of hash perturbation */ | 
					
						
							|  |  |  | 	__u32		limit;		/* Maximal packets in queue */ | 
					
						
							|  |  |  | 	unsigned	divisor;	/* Hash divisor  */ | 
					
						
							|  |  |  | 	unsigned	flows;		/* Maximal number of flows  */ | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												net_sched: sfq: add optional RED on top of SFQ
Adds an optional Random Early Detection on each SFQ flow queue.
Traditional SFQ limits count of packets, while RED permits to also
control number of bytes per flow, and adds ECN capability as well.
1) We dont handle the idle time management in this RED implementation,
since each 'new flow' begins with a null qavg. We really want to address
backlogged flows.
2) if headdrop is selected, we try to ecn mark first packet instead of
currently enqueued packet. This gives faster feedback for tcp flows
compared to traditional RED [ marking the last packet in queue ]
Example of use :
tc qdisc add dev $DEV parent 1:1 handle 10: est 1sec 4sec sfq \
	limit 3000 headdrop flows 512 divisor 16384 \
	redflowlimit 100000 min 8000 max 60000 probability 0.20 ecn
qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop
flows 512/16384 divisor 16384
 ewma 6 min 8000b max 60000b probability 0.2 ecn
 prob_mark 0 prob_mark_head 4876 prob_drop 6131
 forced_mark 0 forced_mark_head 0 forced_drop 0
 Sent 1175211782 bytes 777537 pkt (dropped 6131, overlimits 11007
requeues 0)
 rate 99483Kbit 8219pps backlog 689392b 456p requeues 0
In this test, with 64 netperf TCP_STREAM sessions, 50% using ECN enabled
flows, we can see number of packets CE marked is smaller than number of
drops (for non ECN flows)
If same test is run, without RED, we can check backlog is much bigger.
qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop
flows 512/16384 divisor 16384
 Sent 1148683617 bytes 795006 pkt (dropped 0, overlimits 0 requeues 0)
 rate 98429Kbit 8521pps backlog 1221290b 841p requeues 0
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Stephen Hemminger <shemminger@vyatta.com>
CC: Dave Taht <dave.taht@gmail.com>
Tested-by: Dave Taht <dave.taht@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
											
										 
											2012-01-06 06:31:44 +00:00
										 |  |  | struct tc_sfqred_stats { | 
					
						
							|  |  |  | 	__u32           prob_drop;      /* Early drops, below max threshold */ | 
					
						
							|  |  |  | 	__u32           forced_drop;	/* Early drops, after max threshold */ | 
					
						
							|  |  |  | 	__u32           prob_mark;      /* Marked packets, below max threshold */ | 
					
						
							|  |  |  | 	__u32           forced_mark;    /* Marked packets, after max threshold */ | 
					
						
							|  |  |  | 	__u32           prob_mark_head; /* Marked head-of-queue packets, below max threshold */ | 
					
						
							|  |  |  | 	__u32           forced_mark_head;/* Marked head-of-queue packets, after max threshold */ | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2012-01-04 14:18:38 +00:00
										 |  |  | struct tc_sfq_qopt_v1 { | 
					
						
							|  |  |  | 	struct tc_sfq_qopt v0; | 
					
						
							|  |  |  | 	unsigned int	depth;		/* max number of packets per flow */ | 
					
						
							|  |  |  | 	unsigned int	headdrop; | 
					
						
							| 
									
										
											  
											
												net_sched: sfq: add optional RED on top of SFQ
Adds an optional Random Early Detection on each SFQ flow queue.
Traditional SFQ limits count of packets, while RED permits to also
control number of bytes per flow, and adds ECN capability as well.
1) We dont handle the idle time management in this RED implementation,
since each 'new flow' begins with a null qavg. We really want to address
backlogged flows.
2) if headdrop is selected, we try to ecn mark first packet instead of
currently enqueued packet. This gives faster feedback for tcp flows
compared to traditional RED [ marking the last packet in queue ]
Example of use :
tc qdisc add dev $DEV parent 1:1 handle 10: est 1sec 4sec sfq \
	limit 3000 headdrop flows 512 divisor 16384 \
	redflowlimit 100000 min 8000 max 60000 probability 0.20 ecn
qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop
flows 512/16384 divisor 16384
 ewma 6 min 8000b max 60000b probability 0.2 ecn
 prob_mark 0 prob_mark_head 4876 prob_drop 6131
 forced_mark 0 forced_mark_head 0 forced_drop 0
 Sent 1175211782 bytes 777537 pkt (dropped 6131, overlimits 11007
requeues 0)
 rate 99483Kbit 8219pps backlog 689392b 456p requeues 0
In this test, with 64 netperf TCP_STREAM sessions, 50% using ECN enabled
flows, we can see number of packets CE marked is smaller than number of
drops (for non ECN flows)
If same test is run, without RED, we can check backlog is much bigger.
qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop
flows 512/16384 divisor 16384
 Sent 1148683617 bytes 795006 pkt (dropped 0, overlimits 0 requeues 0)
 rate 98429Kbit 8521pps backlog 1221290b 841p requeues 0
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Stephen Hemminger <shemminger@vyatta.com>
CC: Dave Taht <dave.taht@gmail.com>
Tested-by: Dave Taht <dave.taht@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
											
										 
											2012-01-06 06:31:44 +00:00
										 |  |  | /* SFQRED parameters */ | 
					
						
							|  |  |  | 	__u32		limit;		/* HARD maximal flow queue length (bytes) */ | 
					
						
							|  |  |  | 	__u32		qth_min;	/* Min average length threshold (bytes) */ | 
					
						
							|  |  |  | 	__u32		qth_max;	/* Max average length threshold (bytes) */ | 
					
						
							|  |  |  | 	unsigned char   Wlog;		/* log(W)		*/ | 
					
						
							|  |  |  | 	unsigned char   Plog;		/* log(P_max/(qth_max-qth_min))	*/ | 
					
						
							|  |  |  | 	unsigned char   Scell_log;	/* cell size for idle damping */ | 
					
						
							|  |  |  | 	unsigned char	flags; | 
					
						
							|  |  |  | 	__u32		max_P;		/* probability, high resolution */ | 
					
						
							|  |  |  | /* SFQRED stats */ | 
					
						
							|  |  |  | 	struct tc_sfqred_stats stats; | 
					
						
							| 
									
										
										
										
											2012-01-04 14:18:38 +00:00
										 |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | struct tc_sfq_xstats { | 
					
						
							| 
									
										
										
										
											2008-01-31 18:37:16 -08:00
										 |  |  | 	__s32		allot; | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | /* RED section */ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | enum { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	TCA_RED_UNSPEC, | 
					
						
							|  |  |  | 	TCA_RED_PARMS, | 
					
						
							|  |  |  | 	TCA_RED_STAB, | 
					
						
							| 
									
										
											  
											
												sch_red: Adaptative RED AQM
Adaptative RED AQM for linux, based on paper from Sally FLoyd,
Ramakrishna Gummadi, and Scott Shenker, August 2001 :
http://icir.org/floyd/papers/adaptiveRed.pdf
Goal of Adaptative RED is to make max_p a dynamic value between 1% and
50% to reach the target average queue : (max_th - min_th) / 2
Every 500 ms:
 if (avg > target and max_p <= 0.5)
  increase max_p : max_p += alpha;
 else if (avg < target and max_p >= 0.01)
  decrease max_p : max_p *= beta;
target :[min_th + 0.4*(min_th - max_th),
          min_th + 0.6*(min_th - max_th)].
alpha : min(0.01, max_p / 4)
beta : 0.9
max_P is a Q0.32 fixed point number (unsigned, with 32 bits mantissa)
Changes against our RED implementation are :
max_p is no longer a negative power of two (1/(2^Plog)), but a Q0.32
fixed point number, to allow full range described in Adatative paper.
To deliver a random number, we now use a reciprocal divide (thats really
a multiply), but this operation is done once per marked/droped packet
when in RED_BETWEEN_TRESH window, so added cost (compared to previous
AND operation) is near zero.
dump operation gives current max_p value in a new TCA_RED_MAX_P
attribute.
Example on a 10Mbit link :
tc qdisc add dev $DEV parent 1:1 handle 10: est 1sec 8sec red \
   limit 400000 min 30000 max 90000 avpkt 1000 \
   burst 55 ecn adaptative bandwidth 10Mbit
# tc -s -d qdisc show dev eth3
...
qdisc red 10: parent 1:1 limit 400000b min 30000b max 90000b ecn
adaptative ewma 5 max_p=0.113335 Scell_log 15
 Sent 50414282 bytes 34504 pkt (dropped 35, overlimits 1392 requeues 0)
 rate 9749Kbit 831pps backlog 72056b 16p requeues 0
  marked 1357 early 35 pdrop 0 other 0
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
											
										 
											2011-12-08 06:06:03 +00:00
										 |  |  | 	TCA_RED_MAX_P, | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	__TCA_RED_MAX, | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define TCA_RED_MAX (__TCA_RED_MAX - 1)
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | struct tc_red_qopt { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	__u32		limit;		/* HARD maximal queue length (bytes)	*/ | 
					
						
							|  |  |  | 	__u32		qth_min;	/* Min average length threshold (bytes) */ | 
					
						
							|  |  |  | 	__u32		qth_max;	/* Max average length threshold (bytes) */ | 
					
						
							|  |  |  | 	unsigned char   Wlog;		/* log(W)		*/ | 
					
						
							|  |  |  | 	unsigned char   Plog;		/* log(P_max/(qth_max-qth_min))	*/ | 
					
						
							|  |  |  | 	unsigned char   Scell_log;	/* cell size for idle damping */ | 
					
						
							|  |  |  | 	unsigned char	flags; | 
					
						
							| 
									
										
											  
											
												sch_red: Adaptative RED AQM
Adaptative RED AQM for linux, based on paper from Sally FLoyd,
Ramakrishna Gummadi, and Scott Shenker, August 2001 :
http://icir.org/floyd/papers/adaptiveRed.pdf
Goal of Adaptative RED is to make max_p a dynamic value between 1% and
50% to reach the target average queue : (max_th - min_th) / 2
Every 500 ms:
 if (avg > target and max_p <= 0.5)
  increase max_p : max_p += alpha;
 else if (avg < target and max_p >= 0.01)
  decrease max_p : max_p *= beta;
target :[min_th + 0.4*(min_th - max_th),
          min_th + 0.6*(min_th - max_th)].
alpha : min(0.01, max_p / 4)
beta : 0.9
max_P is a Q0.32 fixed point number (unsigned, with 32 bits mantissa)
Changes against our RED implementation are :
max_p is no longer a negative power of two (1/(2^Plog)), but a Q0.32
fixed point number, to allow full range described in Adatative paper.
To deliver a random number, we now use a reciprocal divide (thats really
a multiply), but this operation is done once per marked/droped packet
when in RED_BETWEEN_TRESH window, so added cost (compared to previous
AND operation) is near zero.
dump operation gives current max_p value in a new TCA_RED_MAX_P
attribute.
Example on a 10Mbit link :
tc qdisc add dev $DEV parent 1:1 handle 10: est 1sec 8sec red \
   limit 400000 min 30000 max 90000 avpkt 1000 \
   burst 55 ecn adaptative bandwidth 10Mbit
# tc -s -d qdisc show dev eth3
...
qdisc red 10: parent 1:1 limit 400000b min 30000b max 90000b ecn
adaptative ewma 5 max_p=0.113335 Scell_log 15
 Sent 50414282 bytes 34504 pkt (dropped 35, overlimits 1392 requeues 0)
 rate 9749Kbit 831pps backlog 72056b 16p requeues 0
  marked 1357 early 35 pdrop 0 other 0
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
											
										 
											2011-12-08 06:06:03 +00:00
										 |  |  | #define TC_RED_ECN		1
 | 
					
						
							|  |  |  | #define TC_RED_HARDDROP		2
 | 
					
						
							|  |  |  | #define TC_RED_ADAPTATIVE	4
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | struct tc_red_xstats { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	__u32           early;          /* Early drops */ | 
					
						
							|  |  |  | 	__u32           pdrop;          /* Drops due to queue limits */ | 
					
						
							|  |  |  | 	__u32           other;          /* Drops due to drop() calls */ | 
					
						
							|  |  |  | 	__u32           marked;         /* Marked packets */ | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* GRED section */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define MAX_DPs 16
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | enum { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  |        TCA_GRED_UNSPEC, | 
					
						
							|  |  |  |        TCA_GRED_PARMS, | 
					
						
							|  |  |  |        TCA_GRED_STAB, | 
					
						
							|  |  |  |        TCA_GRED_DPS, | 
					
						
							| 
									
										
										
										
											2011-12-09 02:46:45 +00:00
										 |  |  |        TCA_GRED_MAX_P, | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	   __TCA_GRED_MAX, | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define TCA_GRED_MAX (__TCA_GRED_MAX - 1)
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | struct tc_gred_qopt { | 
					
						
							| 
									
										
										
										
											2005-11-05 21:14:25 +01:00
										 |  |  | 	__u32		limit;        /* HARD maximal queue length (bytes)    */ | 
					
						
							|  |  |  | 	__u32		qth_min;      /* Min average length threshold (bytes) */ | 
					
						
							|  |  |  | 	__u32		qth_max;      /* Max average length threshold (bytes) */ | 
					
						
							| 
									
										
										
										
											2011-03-30 22:57:33 -03:00
										 |  |  | 	__u32		DP;           /* up to 2^32 DPs */ | 
					
						
							| 
									
										
										
										
											2005-11-05 21:14:25 +01:00
										 |  |  | 	__u32		backlog; | 
					
						
							|  |  |  | 	__u32		qave; | 
					
						
							|  |  |  | 	__u32		forced; | 
					
						
							|  |  |  | 	__u32		early; | 
					
						
							|  |  |  | 	__u32		other; | 
					
						
							|  |  |  | 	__u32		pdrop; | 
					
						
							|  |  |  | 	__u8		Wlog;         /* log(W)               */ | 
					
						
							|  |  |  | 	__u8		Plog;         /* log(P_max/(qth_max-qth_min)) */ | 
					
						
							|  |  |  | 	__u8		Scell_log;    /* cell size for idle damping */ | 
					
						
							|  |  |  | 	__u8		prio;         /* prio of this VQ */ | 
					
						
							|  |  |  | 	__u32		packets; | 
					
						
							|  |  |  | 	__u32		bytesin; | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | }; | 
					
						
							| 
									
										
										
										
											2005-11-05 21:14:25 +01:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | /* gred setup */ | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | struct tc_gred_sopt { | 
					
						
							| 
									
										
										
										
											2005-11-05 21:14:25 +01:00
										 |  |  | 	__u32		DPs; | 
					
						
							|  |  |  | 	__u32		def_DP; | 
					
						
							|  |  |  | 	__u8		grio; | 
					
						
							| 
									
										
										
										
											2005-11-05 21:14:27 +01:00
										 |  |  | 	__u8		flags; | 
					
						
							|  |  |  | 	__u16		pad1; | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2011-02-02 15:21:10 +00:00
										 |  |  | /* CHOKe section */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | enum { | 
					
						
							|  |  |  | 	TCA_CHOKE_UNSPEC, | 
					
						
							|  |  |  | 	TCA_CHOKE_PARMS, | 
					
						
							|  |  |  | 	TCA_CHOKE_STAB, | 
					
						
							| 
									
										
										
										
											2011-12-09 02:46:45 +00:00
										 |  |  | 	TCA_CHOKE_MAX_P, | 
					
						
							| 
									
										
										
										
											2011-02-02 15:21:10 +00:00
										 |  |  | 	__TCA_CHOKE_MAX, | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define TCA_CHOKE_MAX (__TCA_CHOKE_MAX - 1)
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | struct tc_choke_qopt { | 
					
						
							|  |  |  | 	__u32		limit;		/* Hard queue length (packets)	*/ | 
					
						
							|  |  |  | 	__u32		qth_min;	/* Min average threshold (packets) */ | 
					
						
							|  |  |  | 	__u32		qth_max;	/* Max average threshold (packets) */ | 
					
						
							|  |  |  | 	unsigned char   Wlog;		/* log(W)		*/ | 
					
						
							|  |  |  | 	unsigned char   Plog;		/* log(P_max/(qth_max-qth_min))	*/ | 
					
						
							|  |  |  | 	unsigned char   Scell_log;	/* cell size for idle damping */ | 
					
						
							|  |  |  | 	unsigned char	flags;		/* see RED flags */ | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | struct tc_choke_xstats { | 
					
						
							|  |  |  | 	__u32		early;          /* Early drops */ | 
					
						
							|  |  |  | 	__u32		pdrop;          /* Drops due to queue limits */ | 
					
						
							|  |  |  | 	__u32		other;          /* Drops due to drop() calls */ | 
					
						
							|  |  |  | 	__u32		marked;         /* Marked packets */ | 
					
						
							|  |  |  | 	__u32		matched;	/* Drops due to flow match */ | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | /* HTB section */ | 
					
						
							|  |  |  | #define TC_HTB_NUMPRIO		8
 | 
					
						
							|  |  |  | #define TC_HTB_MAXDEPTH		8
 | 
					
						
							|  |  |  | #define TC_HTB_PROTOVER		3 /* the same as HTB and TC's major */
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | struct tc_htb_opt { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	struct tc_ratespec 	rate; | 
					
						
							|  |  |  | 	struct tc_ratespec 	ceil; | 
					
						
							|  |  |  | 	__u32	buffer; | 
					
						
							|  |  |  | 	__u32	cbuffer; | 
					
						
							|  |  |  | 	__u32	quantum; | 
					
						
							|  |  |  | 	__u32	level;		/* out only */ | 
					
						
							|  |  |  | 	__u32	prio; | 
					
						
							|  |  |  | }; | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | struct tc_htb_glob { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	__u32 version;		/* to match HTB/TC */ | 
					
						
							|  |  |  |     	__u32 rate2quantum;	/* bps->quantum divisor */ | 
					
						
							|  |  |  |     	__u32 defcls;		/* default class number */ | 
					
						
							|  |  |  | 	__u32 debug;		/* debug flags */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	/* stats */ | 
					
						
							| 
									
										
										
										
											2011-11-21 06:53:46 +00:00
										 |  |  | 	__u32 direct_pkts; /* count of non shaped packets */ | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | }; | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | enum { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	TCA_HTB_UNSPEC, | 
					
						
							|  |  |  | 	TCA_HTB_PARMS, | 
					
						
							|  |  |  | 	TCA_HTB_INIT, | 
					
						
							|  |  |  | 	TCA_HTB_CTAB, | 
					
						
							|  |  |  | 	TCA_HTB_RTAB, | 
					
						
							| 
									
										
										
										
											2013-03-06 06:49:21 +00:00
										 |  |  | 	TCA_HTB_DIRECT_QLEN, | 
					
						
							| 
									
										
										
										
											2013-09-19 09:10:20 -07:00
										 |  |  | 	TCA_HTB_RATE64, | 
					
						
							|  |  |  | 	TCA_HTB_CEIL64, | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	__TCA_HTB_MAX, | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define TCA_HTB_MAX (__TCA_HTB_MAX - 1)
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | struct tc_htb_xstats { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	__u32 lends; | 
					
						
							|  |  |  | 	__u32 borrows; | 
					
						
							|  |  |  | 	__u32 giants;	/* too big packets (rate will not be accurate) */ | 
					
						
							|  |  |  | 	__u32 tokens; | 
					
						
							|  |  |  | 	__u32 ctokens; | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* HFSC section */ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | struct tc_hfsc_qopt { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	__u16	defcls;		/* default class */ | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | struct tc_service_curve { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	__u32	m1;		/* slope of the first segment in bps */ | 
					
						
							|  |  |  | 	__u32	d;		/* x-projection of the first segment in us */ | 
					
						
							|  |  |  | 	__u32	m2;		/* slope of the second segment in bps */ | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | struct tc_hfsc_stats { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	__u64	work;		/* total work done */ | 
					
						
							|  |  |  | 	__u64	rtwork;		/* work done by real-time criteria */ | 
					
						
							|  |  |  | 	__u32	period;		/* current period */ | 
					
						
							|  |  |  | 	__u32	level;		/* class level in hierarchy */ | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | enum { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	TCA_HFSC_UNSPEC, | 
					
						
							|  |  |  | 	TCA_HFSC_RSC, | 
					
						
							|  |  |  | 	TCA_HFSC_FSC, | 
					
						
							|  |  |  | 	TCA_HFSC_USC, | 
					
						
							|  |  |  | 	__TCA_HFSC_MAX, | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define TCA_HFSC_MAX (__TCA_HFSC_MAX - 1)
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* CBQ section */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define TC_CBQ_MAXPRIO		8
 | 
					
						
							|  |  |  | #define TC_CBQ_MAXLEVEL		8
 | 
					
						
							|  |  |  | #define TC_CBQ_DEF_EWMA		5
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | struct tc_cbq_lssopt { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	unsigned char	change; | 
					
						
							|  |  |  | 	unsigned char	flags; | 
					
						
							|  |  |  | #define TCF_CBQ_LSS_BOUNDED	1
 | 
					
						
							|  |  |  | #define TCF_CBQ_LSS_ISOLATED	2
 | 
					
						
							|  |  |  | 	unsigned char  	ewma_log; | 
					
						
							|  |  |  | 	unsigned char  	level; | 
					
						
							|  |  |  | #define TCF_CBQ_LSS_FLAGS	1
 | 
					
						
							|  |  |  | #define TCF_CBQ_LSS_EWMA	2
 | 
					
						
							|  |  |  | #define TCF_CBQ_LSS_MAXIDLE	4
 | 
					
						
							|  |  |  | #define TCF_CBQ_LSS_MINIDLE	8
 | 
					
						
							|  |  |  | #define TCF_CBQ_LSS_OFFTIME	0x10
 | 
					
						
							|  |  |  | #define TCF_CBQ_LSS_AVPKT	0x20
 | 
					
						
							|  |  |  | 	__u32		maxidle; | 
					
						
							|  |  |  | 	__u32		minidle; | 
					
						
							|  |  |  | 	__u32		offtime; | 
					
						
							|  |  |  | 	__u32		avpkt; | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | struct tc_cbq_wrropt { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	unsigned char	flags; | 
					
						
							|  |  |  | 	unsigned char	priority; | 
					
						
							|  |  |  | 	unsigned char	cpriority; | 
					
						
							|  |  |  | 	unsigned char	__reserved; | 
					
						
							|  |  |  | 	__u32		allot; | 
					
						
							|  |  |  | 	__u32		weight; | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | struct tc_cbq_ovl { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	unsigned char	strategy; | 
					
						
							|  |  |  | #define	TC_CBQ_OVL_CLASSIC	0
 | 
					
						
							|  |  |  | #define	TC_CBQ_OVL_DELAY	1
 | 
					
						
							|  |  |  | #define	TC_CBQ_OVL_LOWPRIO	2
 | 
					
						
							|  |  |  | #define	TC_CBQ_OVL_DROP		3
 | 
					
						
							|  |  |  | #define	TC_CBQ_OVL_RCLASSIC	4
 | 
					
						
							|  |  |  | 	unsigned char	priority2; | 
					
						
							| 
									
										
										
										
											2005-06-28 12:56:45 -07:00
										 |  |  | 	__u16		pad; | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	__u32		penalty; | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | struct tc_cbq_police { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	unsigned char	police; | 
					
						
							|  |  |  | 	unsigned char	__res1; | 
					
						
							|  |  |  | 	unsigned short	__res2; | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | struct tc_cbq_fopt { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	__u32		split; | 
					
						
							|  |  |  | 	__u32		defmap; | 
					
						
							|  |  |  | 	__u32		defchange; | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | struct tc_cbq_xstats { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	__u32		borrows; | 
					
						
							|  |  |  | 	__u32		overactions; | 
					
						
							|  |  |  | 	__s32		avgidle; | 
					
						
							|  |  |  | 	__s32		undertime; | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | enum { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	TCA_CBQ_UNSPEC, | 
					
						
							|  |  |  | 	TCA_CBQ_LSSOPT, | 
					
						
							|  |  |  | 	TCA_CBQ_WRROPT, | 
					
						
							|  |  |  | 	TCA_CBQ_FOPT, | 
					
						
							|  |  |  | 	TCA_CBQ_OVL_STRATEGY, | 
					
						
							|  |  |  | 	TCA_CBQ_RATE, | 
					
						
							|  |  |  | 	TCA_CBQ_RTAB, | 
					
						
							|  |  |  | 	TCA_CBQ_POLICE, | 
					
						
							|  |  |  | 	__TCA_CBQ_MAX, | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define TCA_CBQ_MAX	(__TCA_CBQ_MAX - 1)
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* dsmark section */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | enum { | 
					
						
							|  |  |  | 	TCA_DSMARK_UNSPEC, | 
					
						
							|  |  |  | 	TCA_DSMARK_INDICES, | 
					
						
							|  |  |  | 	TCA_DSMARK_DEFAULT_INDEX, | 
					
						
							|  |  |  | 	TCA_DSMARK_SET_TC_INDEX, | 
					
						
							|  |  |  | 	TCA_DSMARK_MASK, | 
					
						
							|  |  |  | 	TCA_DSMARK_VALUE, | 
					
						
							|  |  |  | 	__TCA_DSMARK_MAX, | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define TCA_DSMARK_MAX (__TCA_DSMARK_MAX - 1)
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* ATM  section */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | enum { | 
					
						
							|  |  |  | 	TCA_ATM_UNSPEC, | 
					
						
							|  |  |  | 	TCA_ATM_FD,		/* file/socket descriptor */ | 
					
						
							|  |  |  | 	TCA_ATM_PTR,		/* pointer to descriptor - later */ | 
					
						
							|  |  |  | 	TCA_ATM_HDR,		/* LL header */ | 
					
						
							|  |  |  | 	TCA_ATM_EXCESS,		/* excess traffic class (0 for CLP)  */ | 
					
						
							|  |  |  | 	TCA_ATM_ADDR,		/* PVC address (for output only) */ | 
					
						
							|  |  |  | 	TCA_ATM_STATE,		/* VC state (ATM_VS_*; for output only) */ | 
					
						
							|  |  |  | 	__TCA_ATM_MAX, | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define TCA_ATM_MAX	(__TCA_ATM_MAX - 1)
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* Network emulator */ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | enum { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	TCA_NETEM_UNSPEC, | 
					
						
							|  |  |  | 	TCA_NETEM_CORR, | 
					
						
							|  |  |  | 	TCA_NETEM_DELAY_DIST, | 
					
						
							| 
									
										
										
										
											2005-05-26 12:55:48 -07:00
										 |  |  | 	TCA_NETEM_REORDER, | 
					
						
							| 
									
										
										
										
											2005-12-21 19:03:44 -08:00
										 |  |  | 	TCA_NETEM_CORRUPT, | 
					
						
							| 
									
										
										
										
											2011-02-23 13:04:21 +00:00
										 |  |  | 	TCA_NETEM_LOSS, | 
					
						
							| 
									
										
										
										
											2011-11-30 12:20:26 +00:00
										 |  |  | 	TCA_NETEM_RATE, | 
					
						
							| 
									
										
										
										
											2012-04-30 23:11:05 +00:00
										 |  |  | 	TCA_NETEM_ECN, | 
					
						
							| 
									
										
										
										
											2013-12-25 17:35:15 +08:00
										 |  |  | 	TCA_NETEM_RATE64, | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	__TCA_NETEM_MAX, | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define TCA_NETEM_MAX (__TCA_NETEM_MAX - 1)
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | struct tc_netem_qopt { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	__u32	latency;	/* added delay (us) */ | 
					
						
							|  |  |  | 	__u32   limit;		/* fifo limit (packets) */ | 
					
						
							|  |  |  | 	__u32	loss;		/* random packet loss (0=none ~0=100%) */ | 
					
						
							| 
									
										
										
										
											2005-05-26 12:55:48 -07:00
										 |  |  | 	__u32	gap;		/* re-ordering gap (0 for none) */ | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	__u32   duplicate;	/* random packet dup  (0=none ~0=100%) */ | 
					
						
							|  |  |  | 	__u32	jitter;		/* random jitter in latency (us) */ | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | struct tc_netem_corr { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	__u32	delay_corr;	/* delay correlation */ | 
					
						
							|  |  |  | 	__u32	loss_corr;	/* packet loss correlation */ | 
					
						
							|  |  |  | 	__u32	dup_corr;	/* duplicate correlation  */ | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | struct tc_netem_reorder { | 
					
						
							| 
									
										
										
										
											2005-05-26 12:55:48 -07:00
										 |  |  | 	__u32	probability; | 
					
						
							|  |  |  | 	__u32	correlation; | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | struct tc_netem_corrupt { | 
					
						
							| 
									
										
										
										
											2005-12-21 19:03:44 -08:00
										 |  |  | 	__u32	probability; | 
					
						
							|  |  |  | 	__u32	correlation; | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2011-11-30 12:20:26 +00:00
										 |  |  | struct tc_netem_rate { | 
					
						
							|  |  |  | 	__u32	rate;	/* byte/s */ | 
					
						
							| 
									
										
										
										
											2011-12-12 14:30:00 +00:00
										 |  |  | 	__s32	packet_overhead; | 
					
						
							|  |  |  | 	__u32	cell_size; | 
					
						
							|  |  |  | 	__s32	cell_overhead; | 
					
						
							| 
									
										
										
										
											2011-11-30 12:20:26 +00:00
										 |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2011-02-23 13:04:21 +00:00
										 |  |  | enum { | 
					
						
							|  |  |  | 	NETEM_LOSS_UNSPEC, | 
					
						
							|  |  |  | 	NETEM_LOSS_GI,		/* General Intuitive - 4 state model */ | 
					
						
							|  |  |  | 	NETEM_LOSS_GE,		/* Gilbert Elliot models */ | 
					
						
							|  |  |  | 	__NETEM_LOSS_MAX | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | #define NETEM_LOSS_MAX (__NETEM_LOSS_MAX - 1)
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2011-11-21 06:53:46 +00:00
										 |  |  | /* State transition probabilities for 4 state model */ | 
					
						
							| 
									
										
										
										
											2011-02-23 13:04:21 +00:00
										 |  |  | struct tc_netem_gimodel { | 
					
						
							|  |  |  | 	__u32	p13; | 
					
						
							|  |  |  | 	__u32	p31; | 
					
						
							|  |  |  | 	__u32	p32; | 
					
						
							|  |  |  | 	__u32	p14; | 
					
						
							|  |  |  | 	__u32	p23; | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* Gilbert-Elliot models */ | 
					
						
							|  |  |  | struct tc_netem_gemodel { | 
					
						
							|  |  |  | 	__u32 p; | 
					
						
							|  |  |  | 	__u32 r; | 
					
						
							|  |  |  | 	__u32 h; | 
					
						
							|  |  |  | 	__u32 k1; | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | #define NETEM_DIST_SCALE	8192
 | 
					
						
							| 
									
										
										
										
											2011-02-23 13:04:19 +00:00
										 |  |  | #define NETEM_DIST_MAX		16384
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-11-20 04:10:00 -08:00
										 |  |  | /* DRR */ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | enum { | 
					
						
							| 
									
										
										
										
											2008-11-20 04:10:00 -08:00
										 |  |  | 	TCA_DRR_UNSPEC, | 
					
						
							|  |  |  | 	TCA_DRR_QUANTUM, | 
					
						
							|  |  |  | 	__TCA_DRR_MAX | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define TCA_DRR_MAX	(__TCA_DRR_MAX - 1)
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-11-04 09:50:58 -08:00
										 |  |  | struct tc_drr_stats { | 
					
						
							| 
									
										
										
										
											2009-02-10 17:18:17 -08:00
										 |  |  | 	__u32	deficit; | 
					
						
							| 
									
										
										
										
											2008-11-20 04:10:00 -08:00
										 |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2011-01-17 08:06:09 +00:00
										 |  |  | /* MQPRIO */ | 
					
						
							|  |  |  | #define TC_QOPT_BITMASK 15
 | 
					
						
							|  |  |  | #define TC_QOPT_MAX_QUEUE 16
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | struct tc_mqprio_qopt { | 
					
						
							|  |  |  | 	__u8	num_tc; | 
					
						
							|  |  |  | 	__u8	prio_tc_map[TC_QOPT_BITMASK + 1]; | 
					
						
							|  |  |  | 	__u8	hw; | 
					
						
							|  |  |  | 	__u16	count[TC_QOPT_MAX_QUEUE]; | 
					
						
							|  |  |  | 	__u16	offset[TC_QOPT_MAX_QUEUE]; | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												net_sched: SFB flow scheduler
This is the Stochastic Fair Blue scheduler, based on work from :
W. Feng, D. Kandlur, D. Saha, K. Shin. Blue: A New Class of Active Queue
Management Algorithms. U. Michigan CSE-TR-387-99, April 1999.
http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf
This implementation is based on work done by Juliusz Chroboczek
General SFB algorithm can be found in figure 14, page 15:
B[l][n] : L x N array of bins (L levels, N bins per level)
enqueue()
Calculate hash function values h{0}, h{1}, .. h{L-1}
Update bins at each level
for i = 0 to L - 1
   if (B[i][h{i}].qlen > bin_size)
      B[i][h{i}].p_mark += p_increment;
   else if (B[i][h{i}].qlen == 0)
      B[i][h{i}].p_mark -= p_decrement;
p_min = min(B[0][h{0}].p_mark ... B[L-1][h{L-1}].p_mark);
if (p_min == 1.0)
    ratelimit();
else
    mark/drop with probability p_min;
I did the adaptation of Juliusz code to meet current kernel standards,
and various changes to address previous comments :
http://thread.gmane.org/gmane.linux.network/90225
http://thread.gmane.org/gmane.linux.network/90375
Default flow classifier is the rxhash introduced by RPS in 2.6.35, but
we can use an external flow classifier if wanted.
tc qdisc add dev $DEV parent 1:11 handle 11:  \
        est 0.5sec 2sec sfb limit 128
tc filter add dev $DEV protocol ip parent 11: handle 3 \
        flow hash keys dst divisor 1024
Notes:
1) SFB default child qdisc is pfifo_fast. It can be changed by another
qdisc but a child qdisc MUST not drop a packet previously queued. This
is because SFB needs to handle a dequeued packet in order to maintain
its virtual queue states. pfifo_head_drop or CHOKe should not be used.
2) ECN is enabled by default, unlike RED/CHOKe/GRED
With help from Patrick McHardy & Andi Kleen
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Juliusz Chroboczek <Juliusz.Chroboczek@pps.jussieu.fr>
CC: Stephen Hemminger <shemminger@vyatta.com>
CC: Patrick McHardy <kaber@trash.net>
CC: Andi Kleen <andi@firstfloor.org>
CC: John W. Linville <linville@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
											
										 
											2011-02-23 10:56:17 +00:00
										 |  |  | /* SFB */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | enum { | 
					
						
							|  |  |  | 	TCA_SFB_UNSPEC, | 
					
						
							|  |  |  | 	TCA_SFB_PARMS, | 
					
						
							|  |  |  | 	__TCA_SFB_MAX, | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define TCA_SFB_MAX (__TCA_SFB_MAX - 1)
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /*
 | 
					
						
							|  |  |  |  * Note: increment, decrement are Q0.16 fixed-point values. | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | struct tc_sfb_qopt { | 
					
						
							|  |  |  | 	__u32 rehash_interval;	/* delay between hash move, in ms */ | 
					
						
							|  |  |  | 	__u32 warmup_time;	/* double buffering warmup time in ms (warmup_time < rehash_interval) */ | 
					
						
							|  |  |  | 	__u32 max;		/* max len of qlen_min */ | 
					
						
							|  |  |  | 	__u32 bin_size;		/* maximum queue length per bin */ | 
					
						
							|  |  |  | 	__u32 increment;	/* probability increment, (d1 in Blue) */ | 
					
						
							|  |  |  | 	__u32 decrement;	/* probability decrement, (d2 in Blue) */ | 
					
						
							|  |  |  | 	__u32 limit;		/* max SFB queue length */ | 
					
						
							|  |  |  | 	__u32 penalty_rate;	/* inelastic flows are rate limited to 'rate' pps */ | 
					
						
							|  |  |  | 	__u32 penalty_burst; | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | struct tc_sfb_xstats { | 
					
						
							|  |  |  | 	__u32 earlydrop; | 
					
						
							|  |  |  | 	__u32 penaltydrop; | 
					
						
							|  |  |  | 	__u32 bucketdrop; | 
					
						
							|  |  |  | 	__u32 queuedrop; | 
					
						
							|  |  |  | 	__u32 childdrop; /* drops in child qdisc */ | 
					
						
							|  |  |  | 	__u32 marked; | 
					
						
							|  |  |  | 	__u32 maxqlen; | 
					
						
							|  |  |  | 	__u32 maxprob; | 
					
						
							|  |  |  | 	__u32 avgprob; | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define SFB_MAX_PROB 0xFFFF
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2011-04-04 05:30:58 +00:00
										 |  |  | /* QFQ */ | 
					
						
							|  |  |  | enum { | 
					
						
							|  |  |  | 	TCA_QFQ_UNSPEC, | 
					
						
							|  |  |  | 	TCA_QFQ_WEIGHT, | 
					
						
							|  |  |  | 	TCA_QFQ_LMAX, | 
					
						
							|  |  |  | 	__TCA_QFQ_MAX | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define TCA_QFQ_MAX	(__TCA_QFQ_MAX - 1)
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | struct tc_qfq_stats { | 
					
						
							|  |  |  | 	__u32 weight; | 
					
						
							|  |  |  | 	__u32 lmax; | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												codel: Controlled Delay AQM
An implementation of CoDel AQM, from Kathleen Nichols and Van Jacobson.
http://queue.acm.org/detail.cfm?id=2209336
This AQM's main input is no longer the queue size in bytes or packets, but the
delay packets experience in the (FIFO) queue.
As we don't have infinite memory, we can still drop packets in enqueue()
in case of massive load, but the main idea of CoDel is to drop packets in
dequeue(), using a control law based on two simple parameters :
target : target sojourn time (default 5ms)
interval : width of moving time window (default 100ms)
Based on initial work from Dave Taht.
Refactored to help future codel inclusion as a plugin for other linux
qdisc (FQ_CODEL, ...), like RED.
include/net/codel.h contains the codel algorithm, kept as close as possible to
Kathleen's reference.
net/sched/sch_codel.c contains the linux qdisc specific glue.
Separate structures permit a memory efficient implementation of fq_codel
(to be sent as a separate work) : Each flow has its own struct
codel_vars.
timestamps are taken at enqueue() time with 1024 ns precision, allowing
a range of 2199 seconds in queue, and 100Gb links support. iproute2 uses
usec as base unit.
Selected packets are dropped, unless ECN is enabled and packets can get
ECN mark instead.
Tested from 2Mb to 10Gb speeds with no particular problems, on ixgbe and
tg3 drivers (BQL enabled).
Usage: tc qdisc ... codel [ limit PACKETS ] [ target TIME ]
                          [ interval TIME ] [ ecn ]
qdisc codel 10: parent 1:1 limit 2000p target 3.0ms interval 60.0ms ecn
 Sent 13347099587 bytes 8815805 pkt (dropped 0, overlimits 0 requeues 0)
 rate 202365Kbit 16708pps backlog 113550b 75p requeues 0
  count 116 lastcount 98 ldelay 4.3ms dropping drop_next 816us
  maxpacket 1514 ecn_mark 84399 drop_overlimit 0
CoDel must be seen as a base module, and should be used keeping in mind
there is still a FIFO queue. So a typical setup will probably need a
hierarchy of several qdiscs and packet classifiers to be able to meet
whatever constraints a user might have.
One possible example would be to use fq_codel, which combines Fair
Queueing and CoDel, in replacement of sfq / sfq_red.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Dave Taht <dave.taht@bufferbloat.net>
Cc: Kathleen Nichols <nichols@pollere.com>
Cc: Van Jacobson <van@pollere.net>
Cc: Tom Herbert <therbert@google.com>
Cc: Matt Mathis <mattmathis@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
											
										 
											2012-05-10 07:51:25 +00:00
										 |  |  | /* CODEL */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | enum { | 
					
						
							|  |  |  | 	TCA_CODEL_UNSPEC, | 
					
						
							|  |  |  | 	TCA_CODEL_TARGET, | 
					
						
							|  |  |  | 	TCA_CODEL_LIMIT, | 
					
						
							|  |  |  | 	TCA_CODEL_INTERVAL, | 
					
						
							|  |  |  | 	TCA_CODEL_ECN, | 
					
						
							|  |  |  | 	__TCA_CODEL_MAX | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define TCA_CODEL_MAX	(__TCA_CODEL_MAX - 1)
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | struct tc_codel_xstats { | 
					
						
							|  |  |  | 	__u32	maxpacket; /* largest packet we've seen so far */ | 
					
						
							|  |  |  | 	__u32	count;	   /* how many drops we've done since the last time we
 | 
					
						
							|  |  |  | 			    * entered dropping state | 
					
						
							|  |  |  | 			    */ | 
					
						
							|  |  |  | 	__u32	lastcount; /* count at entry to dropping state */ | 
					
						
							|  |  |  | 	__u32	ldelay;    /* in-queue delay seen by most recently dequeued packet */ | 
					
						
							|  |  |  | 	__s32	drop_next; /* time to drop next packet */ | 
					
						
							|  |  |  | 	__u32	drop_overlimit; /* number of time max qdisc packet limit was hit */ | 
					
						
							|  |  |  | 	__u32	ecn_mark;  /* number of packets we ECN marked instead of dropped */ | 
					
						
							|  |  |  | 	__u32	dropping;  /* are we in dropping state ? */ | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												fq_codel: Fair Queue Codel AQM
Fair Queue Codel packet scheduler
Principles :
- Packets are classified (internal classifier or external) on flows.
- This is a Stochastic model (as we use a hash, several flows might
                              be hashed on same slot)
- Each flow has a CoDel managed queue.
- Flows are linked onto two (Round Robin) lists,
  so that new flows have priority on old ones.
- For a given flow, packets are not reordered (CoDel uses a FIFO)
- head drops only.
- ECN capability is on by default.
- Very low memory footprint (64 bytes per flow)
tc qdisc ... fq_codel [ limit PACKETS ] [ flows number ]
                      [ target TIME ] [ interval TIME ] [ noecn ]
                      [ quantum BYTES ]
defaults : 1024 flows, 10240 packets limit, quantum : device MTU
           target : 5ms (CoDel default)
           interval : 100ms (CoDel default)
Impressive results on load :
class htb 1:1 root leaf 10: prio 0 quantum 1514 rate 200000Kbit ceil 200000Kbit burst 1475b/8 mpu 0b overhead 0b cburst 1475b/8 mpu 0b overhead 0b level 0
 Sent 43304920109 bytes 33063109 pkt (dropped 0, overlimits 0 requeues 0)
 rate 201691Kbit 28595pps backlog 0b 312p requeues 0
 lended: 33063109 borrowed: 0 giants: 0
 tokens: -912 ctokens: -912
class fq_codel 10:1735 parent 10:
 (dropped 1292, overlimits 0 requeues 0)
 backlog 15140b 10p requeues 0
  deficit 1514 count 1 lastcount 1 ldelay 7.1ms
class fq_codel 10:4524 parent 10:
 (dropped 1291, overlimits 0 requeues 0)
 backlog 16654b 11p requeues 0
  deficit 1514 count 1 lastcount 1 ldelay 7.1ms
class fq_codel 10:4e74 parent 10:
 (dropped 1290, overlimits 0 requeues 0)
 backlog 6056b 4p requeues 0
  deficit 1514 count 1 lastcount 1 ldelay 6.4ms dropping drop_next 92.0ms
class fq_codel 10:628a parent 10:
 (dropped 1289, overlimits 0 requeues 0)
 backlog 7570b 5p requeues 0
  deficit 1514 count 1 lastcount 1 ldelay 5.4ms dropping drop_next 90.9ms
class fq_codel 10:a4b3 parent 10:
 (dropped 302, overlimits 0 requeues 0)
 backlog 16654b 11p requeues 0
  deficit 1514 count 1 lastcount 1 ldelay 7.1ms
class fq_codel 10:c3c2 parent 10:
 (dropped 1284, overlimits 0 requeues 0)
 backlog 13626b 9p requeues 0
  deficit 1514 count 1 lastcount 1 ldelay 5.9ms
class fq_codel 10:d331 parent 10:
 (dropped 299, overlimits 0 requeues 0)
 backlog 15140b 10p requeues 0
  deficit 1514 count 1 lastcount 1 ldelay 7.0ms
class fq_codel 10:d526 parent 10:
 (dropped 12160, overlimits 0 requeues 0)
 backlog 35870b 211p requeues 0
  deficit 1508 count 12160 lastcount 1 ldelay 15.3ms dropping drop_next 247us
class fq_codel 10:e2c6 parent 10:
 (dropped 1288, overlimits 0 requeues 0)
 backlog 15140b 10p requeues 0
  deficit 1514 count 1 lastcount 1 ldelay 7.1ms
class fq_codel 10:eab5 parent 10:
 (dropped 1285, overlimits 0 requeues 0)
 backlog 16654b 11p requeues 0
  deficit 1514 count 1 lastcount 1 ldelay 5.9ms
class fq_codel 10:f220 parent 10:
 (dropped 1289, overlimits 0 requeues 0)
 backlog 15140b 10p requeues 0
  deficit 1514 count 1 lastcount 1 ldelay 7.1ms
qdisc htb 1: root refcnt 6 r2q 10 default 1 direct_packets_stat 0 ver 3.17
 Sent 43331086547 bytes 33092812 pkt (dropped 0, overlimits 66063544 requeues 71)
 rate 201697Kbit 28602pps backlog 0b 260p requeues 71
qdisc fq_codel 10: parent 1:1 limit 10240p flows 65536 target 5.0ms interval 100.0ms ecn
 Sent 43331086547 bytes 33092812 pkt (dropped 949359, overlimits 0 requeues 0)
 rate 201697Kbit 28602pps backlog 189352b 260p requeues 0
  maxpacket 1514 drop_overlimit 0 new_flow_count 5582 ecn_mark 125593
  new_flows_len 0 old_flows_len 11
PING 172.30.42.18 (172.30.42.18) 56(84) bytes of data.
64 bytes from 172.30.42.18: icmp_req=1 ttl=64 time=0.227 ms
64 bytes from 172.30.42.18: icmp_req=2 ttl=64 time=0.165 ms
64 bytes from 172.30.42.18: icmp_req=3 ttl=64 time=0.166 ms
64 bytes from 172.30.42.18: icmp_req=4 ttl=64 time=0.151 ms
64 bytes from 172.30.42.18: icmp_req=5 ttl=64 time=0.164 ms
64 bytes from 172.30.42.18: icmp_req=6 ttl=64 time=0.172 ms
64 bytes from 172.30.42.18: icmp_req=7 ttl=64 time=0.175 ms
64 bytes from 172.30.42.18: icmp_req=8 ttl=64 time=0.183 ms
64 bytes from 172.30.42.18: icmp_req=9 ttl=64 time=0.158 ms
64 bytes from 172.30.42.18: icmp_req=10 ttl=64 time=0.200 ms
10 packets transmitted, 10 received, 0% packet loss, time 8999ms
rtt min/avg/max/mdev = 0.151/0.176/0.227/0.022 ms
Much better than SFQ because of the priority given to new flows, and a fast
path dirtying fewer cache lines.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
											
										 
											2012-05-11 09:30:50 +00:00
										 |  |  | /* FQ_CODEL */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | enum { /* fq_codel attribute identifiers; values are assigned sequentially from 0 */ | 
					
						
							|  |  |  | 	TCA_FQ_CODEL_UNSPEC,	/* 0: placeholder, no attribute */ | 
					
						
							|  |  |  | 	TCA_FQ_CODEL_TARGET,	/* presumably the tc "target TIME" option - confirm */ | 
					
						
							|  |  |  | 	TCA_FQ_CODEL_LIMIT,	/* presumably the tc "limit PACKETS" option - confirm */ | 
					
						
							|  |  |  | 	TCA_FQ_CODEL_INTERVAL,	/* presumably the tc "interval TIME" option - confirm */ | 
					
						
							|  |  |  | 	TCA_FQ_CODEL_ECN,	/* presumably the ECN on/off switch (tc "noecn") - confirm */ | 
					
						
							|  |  |  | 	TCA_FQ_CODEL_FLOWS,	/* presumably the tc "flows number" option - confirm */ | 
					
						
							|  |  |  | 	TCA_FQ_CODEL_QUANTUM,	/* presumably the tc "quantum BYTES" option - confirm */ | 
					
						
							|  |  |  | 	__TCA_FQ_CODEL_MAX	/* sentinel: one past the last real attribute */ | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define TCA_FQ_CODEL_MAX	(__TCA_FQ_CODEL_MAX - 1)	/* equals TCA_FQ_CODEL_QUANTUM, the last real attribute */
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | enum { /* record kinds; presumably the values stored in tc_fq_codel_xstats.type - confirm */ | 
					
						
							|  |  |  | 	TCA_FQ_CODEL_XSTATS_QDISC,	/* 0: presumably selects qdisc_stats */ | 
					
						
							|  |  |  | 	TCA_FQ_CODEL_XSTATS_CLASS,	/* 1: presumably selects class_stats */ | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | struct tc_fq_codel_qd_stats { /* fq_codel statistics for the qdisc as a whole */ | 
					
						
							|  |  |  | 	__u32	maxpacket;	/* largest packet we've seen so far */ | 
					
						
							|  |  |  | 	__u32	drop_overlimit; /* number of times the max qdisc
 | 
					
						
							|  |  |  | 				 * packet limit was hit | 
					
						
							|  |  |  | 				 */ | 
					
						
							|  |  |  | 	__u32	ecn_mark;	/* number of packets we ECN marked
 | 
					
						
							|  |  |  | 				 * instead of being dropped | 
					
						
							|  |  |  | 				 */ | 
					
						
							|  |  |  | 	__u32	new_flow_count; /* number of times packets
 | 
					
						
							|  |  |  | 				 * created a 'new flow' | 
					
						
							|  |  |  | 				 */ | 
					
						
							|  |  |  | 	__u32	new_flows_len;	/* count of flows in new list */ | 
					
						
							|  |  |  | 	__u32	old_flows_len;	/* count of flows in old list */ | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | struct tc_fq_codel_cl_stats { /* fq_codel statistics for one class (flow) */ | 
					
						
							|  |  |  | 	__s32	deficit;	/* signed; presumably the flow's remaining byte credit - TODO confirm */ | 
					
						
							|  |  |  | 	__u32	ldelay;		/* in-queue delay seen by most recently
 | 
					
						
							|  |  |  | 				 * dequeued packet | 
					
						
							|  |  |  | 				 */ | 
					
						
							|  |  |  | 	__u32	count;		/* presumably as tc_codel_xstats.count: drops since entering dropping state */ | 
					
						
							|  |  |  | 	__u32	lastcount;	/* presumably as tc_codel_xstats.lastcount: count at entry to dropping state */ | 
					
						
							|  |  |  | 	__u32	dropping;	/* presumably as tc_codel_xstats.dropping: are we in dropping state? */ | 
					
						
							|  |  |  | 	__s32	drop_next;	/* presumably as tc_codel_xstats.drop_next: time to drop next packet */ | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | struct tc_fq_codel_xstats { /* tagged record: a type word followed by one of two stats layouts */ | 
					
						
							|  |  |  | 	__u32	type;	/* presumably TCA_FQ_CODEL_XSTATS_QDISC or _CLASS, naming the valid union member - confirm */ | 
					
						
							|  |  |  | 	union {	/* anonymous: members are accessed directly on the outer struct */ | 
					
						
							|  |  |  | 		struct tc_fq_codel_qd_stats qdisc_stats; | 
					
						
							|  |  |  | 		struct tc_fq_codel_cl_stats class_stats; | 
					
						
							|  |  |  | 	}; | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												pkt_sched: fq: Fair Queue packet scheduler
- Uses perfect flow match (not stochastic hash like SFQ/FQ_codel)
- Uses the new_flow/old_flow separation from FQ_codel
- New flows get an initial credit allowing IW10 without added delay.
- Special FIFO queue for high prio packets (no need for PRIO + FQ)
- Uses a hash table of RB trees to locate the flows at enqueue() time
- Smart on demand gc (at enqueue() time, RB tree lookup evicts old
  unused flows)
- Dynamic memory allocations.
- Designed to allow millions of concurrent flows per Qdisc.
- Small memory footprint : ~8K per Qdisc, and 104 bytes per flow.
- Single high resolution timer for throttled flows (if any).
- One RB tree to link throttled flows.
- Ability to have a max rate per flow. We might add a socket option
  to add per socket limitation.
Attempts have been made to add TCP pacing in TCP stack, but this
seems to add complex code to an already complex stack.
TCP pacing is welcomed for flows having idle times, as the cwnd
permits TCP stack to queue a possibly large number of packets.
This removes the 'slow start after idle' choice, hitting badly
large BDP flows, and applications delivering chunks of data
as video streams.
Nicely spaced packets :
Here interface is 10Gbit, but flow bottleneck is ~20Mbit
cwin is big, yet FQ avoids the typical bursts generated by TCP
(as in netperf TCP_RR -- -r 100000,100000)
15:01:23.545279 IP A > B: . 78193:81089(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.545394 IP B > A: . ack 81089 win 3668 <nop,nop,timestamp 11597985 1115>
15:01:23.546488 IP A > B: . 81089:83985(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.546565 IP B > A: . ack 83985 win 3668 <nop,nop,timestamp 11597986 1115>
15:01:23.547713 IP A > B: . 83985:86881(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.547778 IP B > A: . ack 86881 win 3668 <nop,nop,timestamp 11597987 1115>
15:01:23.548911 IP A > B: . 86881:89777(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.548949 IP B > A: . ack 89777 win 3668 <nop,nop,timestamp 11597988 1115>
15:01:23.550116 IP A > B: . 89777:92673(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.550182 IP B > A: . ack 92673 win 3668 <nop,nop,timestamp 11597989 1115>
15:01:23.551333 IP A > B: . 92673:95569(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.551406 IP B > A: . ack 95569 win 3668 <nop,nop,timestamp 11597991 1115>
15:01:23.552539 IP A > B: . 95569:98465(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.552576 IP B > A: . ack 98465 win 3668 <nop,nop,timestamp 11597992 1115>
15:01:23.553756 IP A > B: . 98465:99913(1448) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.554138 IP A > B: P 99913:100001(88) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.554204 IP B > A: . ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.554234 IP B > A: . 65248:68144(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.555620 IP B > A: . 68144:71040(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.557005 IP B > A: . 71040:73936(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.558390 IP B > A: . 73936:76832(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.559773 IP B > A: . 76832:79728(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.561158 IP B > A: . 79728:82624(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.562543 IP B > A: . 82624:85520(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.563928 IP B > A: . 85520:88416(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.565313 IP B > A: . 88416:91312(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.566698 IP B > A: . 91312:94208(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.568083 IP B > A: . 94208:97104(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.569467 IP B > A: . 97104:100000(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.570852 IP B > A: . 100000:102896(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.572237 IP B > A: . 102896:105792(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.573639 IP B > A: . 105792:108688(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.575024 IP B > A: . 108688:111584(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.576408 IP B > A: . 111584:114480(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.577793 IP B > A: . 114480:117376(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
TCP timestamps show that most packets from B were queued in the same ms
timeframe (TSval 1159799{3,4}), but FQ managed to send them right
in time to avoid a big burst.
In slow start or steady state, very few packets are throttled [1]
FQ gets a bunch of tunables as :
  limit : max number of packets on whole Qdisc (default 10000)
  flow_limit : max number of packets per flow (default 100)
  quantum : the credit per RR round (default is 2 MTU)
  initial_quantum : initial credit for new flows (default is 10 MTU)
  maxrate : max per flow rate (default : unlimited)
  buckets : number of RB trees (default : 1024) in hash table.
               (consumes 8 bytes per bucket)
  [no]pacing : disable/enable pacing (default is enable)
All of them can be changed on a live qdisc.
$ tc qd add dev eth0 root fq help
Usage: ... fq [ limit PACKETS ] [ flow_limit PACKETS ]
              [ quantum BYTES ] [ initial_quantum BYTES ]
              [ maxrate RATE  ] [ buckets NUMBER ]
              [ [no]pacing ]
$ tc -s -d qd
qdisc fq 8002: dev eth0 root refcnt 32 limit 10000p flow_limit 100p buckets 256 quantum 3028 initial_quantum 15140
 Sent 216532416 bytes 148395 pkt (dropped 0, overlimits 0 requeues 14)
 backlog 0b 0p requeues 14
  511 flows, 511 inactive, 0 throttled
  110 gc, 0 highprio, 0 retrans, 1143 throttled, 0 flows_plimit
[1] Except if initial srtt is overestimated, as if using
cached srtt in tcp metrics. We'll provide a fix for this issue.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
											
										 
											2013-08-29 15:49:55 -07:00
										 |  |  | /* FQ */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | enum { | 
					
						
							|  |  |  | 	TCA_FQ_UNSPEC, | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	TCA_FQ_PLIMIT,		/* limit of total number of packets in queue */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	TCA_FQ_FLOW_PLIMIT,	/* limit of packets per flow */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	TCA_FQ_QUANTUM,		/* RR quantum */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	TCA_FQ_INITIAL_QUANTUM,		/* RR quantum for new flow */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	TCA_FQ_RATE_ENABLE,	/* enable/disable rate limiting */ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2013-11-15 08:57:26 -08:00
										 |  |  | 	TCA_FQ_FLOW_DEFAULT_RATE,/* obsolete, do not use */ | 
					
						
							| 
									
										
											  
											
												pkt_sched: fq: Fair Queue packet scheduler
- Uses perfect flow match (not stochastic hash like SFQ/FQ_codel)
- Uses the new_flow/old_flow separation from FQ_codel
- New flows get an initial credit allowing IW10 without added delay.
- Special FIFO queue for high prio packets (no need for PRIO + FQ)
- Uses a hash table of RB trees to locate the flows at enqueue() time
- Smart on demand gc (at enqueue() time, RB tree lookup evicts old
  unused flows)
- Dynamic memory allocations.
- Designed to allow millions of concurrent flows per Qdisc.
- Small memory footprint : ~8K per Qdisc, and 104 bytes per flow.
- Single high resolution timer for throttled flows (if any).
- One RB tree to link throttled flows.
- Ability to have a max rate per flow. We might add a socket option
  to add per socket limitation.
Attempts have been made to add TCP pacing in TCP stack, but this
seems to add complex code to an already complex stack.
TCP pacing is welcomed for flows having idle times, as the cwnd
permits TCP stack to queue a possibly large number of packets.
This removes the 'slow start after idle' choice, hitting badly
large BDP flows, and applications delivering chunks of data
as video streams.
Nicely spaced packets :
Here interface is 10Gbit, but flow bottleneck is ~20Mbit
cwin is big, yet FQ avoids the typical bursts generated by TCP
(as in netperf TCP_RR -- -r 100000,100000)
15:01:23.545279 IP A > B: . 78193:81089(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.545394 IP B > A: . ack 81089 win 3668 <nop,nop,timestamp 11597985 1115>
15:01:23.546488 IP A > B: . 81089:83985(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.546565 IP B > A: . ack 83985 win 3668 <nop,nop,timestamp 11597986 1115>
15:01:23.547713 IP A > B: . 83985:86881(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.547778 IP B > A: . ack 86881 win 3668 <nop,nop,timestamp 11597987 1115>
15:01:23.548911 IP A > B: . 86881:89777(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.548949 IP B > A: . ack 89777 win 3668 <nop,nop,timestamp 11597988 1115>
15:01:23.550116 IP A > B: . 89777:92673(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.550182 IP B > A: . ack 92673 win 3668 <nop,nop,timestamp 11597989 1115>
15:01:23.551333 IP A > B: . 92673:95569(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.551406 IP B > A: . ack 95569 win 3668 <nop,nop,timestamp 11597991 1115>
15:01:23.552539 IP A > B: . 95569:98465(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.552576 IP B > A: . ack 98465 win 3668 <nop,nop,timestamp 11597992 1115>
15:01:23.553756 IP A > B: . 98465:99913(1448) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.554138 IP A > B: P 99913:100001(88) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.554204 IP B > A: . ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.554234 IP B > A: . 65248:68144(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.555620 IP B > A: . 68144:71040(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.557005 IP B > A: . 71040:73936(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.558390 IP B > A: . 73936:76832(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.559773 IP B > A: . 76832:79728(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.561158 IP B > A: . 79728:82624(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.562543 IP B > A: . 82624:85520(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.563928 IP B > A: . 85520:88416(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.565313 IP B > A: . 88416:91312(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.566698 IP B > A: . 91312:94208(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.568083 IP B > A: . 94208:97104(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.569467 IP B > A: . 97104:100000(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.570852 IP B > A: . 100000:102896(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.572237 IP B > A: . 102896:105792(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.573639 IP B > A: . 105792:108688(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.575024 IP B > A: . 108688:111584(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.576408 IP B > A: . 111584:114480(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.577793 IP B > A: . 114480:117376(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
TCP timestamps show that most packets from B were queued in the same ms
timeframe (TSval 1159799{3,4}), but FQ managed to send them right
in time to avoid a big burst.
In slow start or steady state, very few packets are throttled [1]
FQ gets a bunch of tunables as :
  limit : max number of packets on whole Qdisc (default 10000)
  flow_limit : max number of packets per flow (default 100)
  quantum : the credit per RR round (default is 2 MTU)
  initial_quantum : initial credit for new flows (default is 10 MTU)
  maxrate : max per flow rate (default : unlimited)
  buckets : number of RB trees (default : 1024) in hash table.
               (consumes 8 bytes per bucket)
  [no]pacing : disable/enable pacing (default is enable)
All of them can be changed on a live qdisc.
$ tc qd add dev eth0 root fq help
Usage: ... fq [ limit PACKETS ] [ flow_limit PACKETS ]
              [ quantum BYTES ] [ initial_quantum BYTES ]
              [ maxrate RATE  ] [ buckets NUMBER ]
              [ [no]pacing ]
$ tc -s -d qd
qdisc fq 8002: dev eth0 root refcnt 32 limit 10000p flow_limit 100p buckets 256 quantum 3028 initial_quantum 15140
 Sent 216532416 bytes 148395 pkt (dropped 0, overlimits 0 requeues 14)
 backlog 0b 0p requeues 14
  511 flows, 511 inactive, 0 throttled
  110 gc, 0 highprio, 0 retrans, 1143 throttled, 0 flows_plimit
[1] Except if initial srtt is overestimated, as if using
cached srtt in tcp metrics. We'll provide a fix for this issue.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
											
										 
											2013-08-29 15:49:55 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	TCA_FQ_FLOW_MAX_RATE,	/* per flow max rate */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	TCA_FQ_BUCKETS_LOG,	/* log2(number of buckets) */ | 
					
						
							| 
									
										
										
										
											2013-11-15 08:58:14 -08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	TCA_FQ_FLOW_REFILL_DELAY,	/* flow credit refill delay in usec */ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												pkt_sched: fq: better control of DDOS traffic
FQ has a fast path for skbs attached to a socket, as it does not
have to compute a flow hash. But for other packets, FQ being non
stochastic means that hosts exposed to random Internet traffic
can allocate millions of flow structures (104 bytes each) pretty
easily. Not only can the host OOM, but lookups in the RB trees can
take too much CPU and memory.
This patch adds a new attribute, orphan_mask, which makes it
possible to use a stochastic hash for orphaned skbs.
Its default value is 1024 slots, to mimic SFQ behavior.
Note: This does not apply to locally generated TCP traffic,
and no locally generated traffic will share a flow structure
with another perfect or stochastic flow.
This patch also handles the specific case of SYNACK messages:
They are attached to the listener socket, and therefore all map
to a single hash bucket. If the listener has set SO_MAX_PACING_RATE,
hoping to have newly accepted sockets inherit this rate, SYNACKs
might be paced and even dropped.
This is very similar to an internal patch Google has used for more
than one year.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
											
										 
											2015-02-04 21:30:40 -08:00
										 |  |  | 	TCA_FQ_ORPHAN_MASK,	/* mask applied to orphaned skb hashes */ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												pkt_sched: fq: Fair Queue packet scheduler
- Uses perfect flow match (not stochastic hash like SFQ/FQ_codel)
- Uses the new_flow/old_flow separation from FQ_codel
- New flows get an initial credit allowing IW10 without added delay.
- Special FIFO queue for high prio packets (no need for PRIO + FQ)
- Uses a hash table of RB trees to locate the flows at enqueue() time
- Smart on demand gc (at enqueue() time, RB tree lookup evicts old
  unused flows)
- Dynamic memory allocations.
- Designed to allow millions of concurrent flows per Qdisc.
- Small memory footprint : ~8K per Qdisc, and 104 bytes per flow.
- Single high resolution timer for throttled flows (if any).
- One RB tree to link throttled flows.
- Ability to have a max rate per flow. We might add a socket option
  to add per socket limitation.
Attempts have been made to add TCP pacing in TCP stack, but this
seems to add complex code to an already complex stack.
TCP pacing is welcomed for flows having idle times, as the cwnd
permits TCP stack to queue a possibly large number of packets.
This removes the 'slow start after idle' choice, hitting badly
large BDP flows, and applications delivering chunks of data
as video streams.
Nicely spaced packets :
Here interface is 10Gbit, but flow bottleneck is ~20Mbit
cwin is big, yet FQ avoids the typical bursts generated by TCP
(as in netperf TCP_RR -- -r 100000,100000)
15:01:23.545279 IP A > B: . 78193:81089(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.545394 IP B > A: . ack 81089 win 3668 <nop,nop,timestamp 11597985 1115>
15:01:23.546488 IP A > B: . 81089:83985(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.546565 IP B > A: . ack 83985 win 3668 <nop,nop,timestamp 11597986 1115>
15:01:23.547713 IP A > B: . 83985:86881(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.547778 IP B > A: . ack 86881 win 3668 <nop,nop,timestamp 11597987 1115>
15:01:23.548911 IP A > B: . 86881:89777(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.548949 IP B > A: . ack 89777 win 3668 <nop,nop,timestamp 11597988 1115>
15:01:23.550116 IP A > B: . 89777:92673(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.550182 IP B > A: . ack 92673 win 3668 <nop,nop,timestamp 11597989 1115>
15:01:23.551333 IP A > B: . 92673:95569(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.551406 IP B > A: . ack 95569 win 3668 <nop,nop,timestamp 11597991 1115>
15:01:23.552539 IP A > B: . 95569:98465(2896) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.552576 IP B > A: . ack 98465 win 3668 <nop,nop,timestamp 11597992 1115>
15:01:23.553756 IP A > B: . 98465:99913(1448) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.554138 IP A > B: P 99913:100001(88) ack 65248 win 3125 <nop,nop,timestamp 1115 11597805>
15:01:23.554204 IP B > A: . ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.554234 IP B > A: . 65248:68144(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.555620 IP B > A: . 68144:71040(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.557005 IP B > A: . 71040:73936(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.558390 IP B > A: . 73936:76832(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.559773 IP B > A: . 76832:79728(2896) ack 100001 win 3668 <nop,nop,timestamp 11597993 1115>
15:01:23.561158 IP B > A: . 79728:82624(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.562543 IP B > A: . 82624:85520(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.563928 IP B > A: . 85520:88416(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.565313 IP B > A: . 88416:91312(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.566698 IP B > A: . 91312:94208(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.568083 IP B > A: . 94208:97104(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.569467 IP B > A: . 97104:100000(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.570852 IP B > A: . 100000:102896(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.572237 IP B > A: . 102896:105792(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.573639 IP B > A: . 105792:108688(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.575024 IP B > A: . 108688:111584(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.576408 IP B > A: . 111584:114480(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
15:01:23.577793 IP B > A: . 114480:117376(2896) ack 100001 win 3668 <nop,nop,timestamp 11597994 1115>
TCP timestamps show that most packets from B were queued in the same ms
timeframe (TSval 1159799{3,4}), but FQ managed to send them right
in time to avoid a big burst.
In slow start or steady state, very few packets are throttled [1]
FQ gets a bunch of tunables as :
  limit : max number of packets on whole Qdisc (default 10000)
  flow_limit : max number of packets per flow (default 100)
  quantum : the credit per RR round (default is 2 MTU)
  initial_quantum : initial credit for new flows (default is 10 MTU)
  maxrate : max per flow rate (default : unlimited)
  buckets : number of RB trees (default : 1024) in hash table.
               (consumes 8 bytes per bucket)
  [no]pacing : disable/enable pacing (default is enable)
All of them can be changed on a live qdisc.
$ tc qd add dev eth0 root fq help
Usage: ... fq [ limit PACKETS ] [ flow_limit PACKETS ]
              [ quantum BYTES ] [ initial_quantum BYTES ]
              [ maxrate RATE  ] [ buckets NUMBER ]
              [ [no]pacing ]
$ tc -s -d qd
qdisc fq 8002: dev eth0 root refcnt 32 limit 10000p flow_limit 100p buckets 256 quantum 3028 initial_quantum 15140
 Sent 216532416 bytes 148395 pkt (dropped 0, overlimits 0 requeues 14)
 backlog 0b 0p requeues 14
  511 flows, 511 inactive, 0 throttled
  110 gc, 0 highprio, 0 retrans, 1143 throttled, 0 flows_plimit
[1] Except if initial srtt is overestimated, as if using
cached srtt in tcp metrics. We'll provide a fix for this issue.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
											
										 
											2013-08-29 15:49:55 -07:00
										 |  |  | 	__TCA_FQ_MAX | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define TCA_FQ_MAX	(__TCA_FQ_MAX - 1)	/* value of the last attribute before the __TCA_FQ_MAX sentinel */
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | struct tc_fq_qd_stats { /* FQ qdisc statistics: eight 64-bit fields followed by four 32-bit fields */ | 
					
						
							|  |  |  | 	__u64	gc_flows;		/* presumably the "gc" figure in tc -s output - confirm */ | 
					
						
							|  |  |  | 	__u64	highprio_packets;	/* presumably the "highprio" figure in tc -s output - confirm */ | 
					
						
							|  |  |  | 	__u64	tcp_retrans;		/* presumably the "retrans" figure in tc -s output - confirm */ | 
					
						
							|  |  |  | 	__u64	throttled;		/* presumably the cumulative "throttled" figure - confirm */ | 
					
						
							|  |  |  | 	__u64	flows_plimit;		/* presumably the "flows_plimit" figure in tc -s output - confirm */ | 
					
						
							|  |  |  | 	__u64	pkts_too_long; | 
					
						
							|  |  |  | 	__u64	allocation_errors; | 
					
						
							|  |  |  | 	__s64	time_next_delayed_flow;	/* signed 64-bit; units not stated here */ | 
					
						
							|  |  |  | 	__u32	flows;			/* presumably the "flows" figure in tc -s output - confirm */ | 
					
						
							|  |  |  | 	__u32	inactive_flows;		/* presumably the "inactive" figure in tc -s output - confirm */ | 
					
						
							|  |  |  | 	__u32	throttled_flows;	/* presumably the current "throttled" flow count - confirm */ | 
					
						
							|  |  |  | 	__u32	pad;			/* presumably explicit padding so the size stays a multiple of 8 bytes - confirm */ | 
					
						
							|  |  |  | }; | 
					
						
							| 
									
										
											  
											
												net-qdisc-hhf: Heavy-Hitter Filter (HHF) qdisc
This patch implements the first size-based qdisc that attempts to
differentiate between small flows and heavy-hitters.  The goal is to
catch the heavy-hitters and move them to a separate queue with less
priority so that bulk traffic does not affect the latency of critical
traffic.  Currently "less priority" means less weight (2:1 in
particular) in a Weighted Deficit Round Robin (WDRR) scheduler.
In essence, this patch addresses the "delay-bloat" problem due to
bloated buffers. In some systems, large queues may be necessary for
obtaining CPU efficiency, or due to the presence of unresponsive
traffic like UDP, or just a large number of connections with each
having a small amount of outstanding traffic. In these circumstances,
HHF aims to reduce the HoL blocking for latency sensitive traffic,
while not impacting the queues built up by bulk traffic.  HHF can also
be used in conjunction with other AQM mechanisms such as CoDel.
To capture heavy-hitters, we implement the "multi-stage filter" design
in the following paper:
C. Estan and G. Varghese, "New Directions in Traffic Measurement and
Accounting", in ACM SIGCOMM, 2002.
Some configurable qdisc settings through 'tc':
- hhf_reset_timeout: period to reset counter values in the multi-stage
                     filter (default 40ms)
- hhf_admit_bytes:   threshold to classify heavy-hitters
                     (default 128KB)
- hhf_evict_timeout: threshold to evict idle heavy-hitters
                     (default 1s)
- hhf_non_hh_weight: Weighted Deficit Round Robin (WDRR) weight for
                     non-heavy-hitters (default 2)
- hh_flows_limit:    max number of heavy-hitter flow entries
                     (default 2048)
Note that the ratio between hhf_admit_bytes and hhf_reset_timeout
reflects the bandwidth of heavy-hitters that we attempt to capture
(25Mbps with the above default settings).
The false negative rate (heavy-hitter flows getting away unclassified)
is zero by the design of the multi-stage filter algorithm.
With 100 heavy-hitter flows, using four hashes and 4000 counters yields
a false positive rate (non-heavy-hitters mistakenly classified as
heavy-hitters) of less than 1e-4.
Signed-off-by: Terry Lam <vtlam@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
											
										 
											2013-12-15 00:30:21 -08:00
										 |  |  | 
 | 
					
						
							|  |  |  | /* Heavy-Hitter Filter */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | enum { | 
					
						
							|  |  |  | 	TCA_HHF_UNSPEC, | 
					
						
							|  |  |  | 	TCA_HHF_BACKLOG_LIMIT, | 
					
						
							|  |  |  | 	TCA_HHF_QUANTUM, | 
					
						
							|  |  |  | 	TCA_HHF_HH_FLOWS_LIMIT, | 
					
						
							|  |  |  | 	TCA_HHF_RESET_TIMEOUT, | 
					
						
							|  |  |  | 	TCA_HHF_ADMIT_BYTES, | 
					
						
							|  |  |  | 	TCA_HHF_EVICT_TIMEOUT, | 
					
						
							|  |  |  | 	TCA_HHF_NON_HH_WEIGHT, | 
					
						
							|  |  |  | 	__TCA_HHF_MAX | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define TCA_HHF_MAX	(__TCA_HHF_MAX - 1)
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | struct tc_hhf_xstats { | 
					
						
							|  |  |  | 	__u32	drop_overlimit; /* number of times max qdisc packet limit
 | 
					
						
							|  |  |  | 				 * was hit | 
					
						
							|  |  |  | 				 */ | 
					
						
							|  |  |  | 	__u32	hh_overlimit;   /* number of times max heavy-hitters was hit */ | 
					
						
							|  |  |  | 	__u32	hh_tot_count;   /* number of captured heavy-hitters so far */ | 
					
						
							|  |  |  | 	__u32	hh_cur_count;   /* number of current heavy-hitters */ | 
					
						
							|  |  |  | }; | 
					
						
							| 
									
										
											  
											
												net: pkt_sched: PIE AQM scheme
Proportional Integral controller Enhanced (PIE) is a scheduler to address the
bufferbloat problem.
From the IETF draft below:
" Bufferbloat is a phenomenon where excess buffers in the network cause high
latency and jitter. As more and more interactive applications (e.g. voice over
IP, real time video streaming and financial transactions) run in the Internet,
high latency and jitter degrade application performance. There is a pressing
need to design intelligent queue management schemes that can control latency and
jitter; and hence provide desirable quality of service to users.
We present here a lightweight design, PIE(Proportional Integral controller
Enhanced) that can effectively control the average queueing latency to a target
value. Simulation results, theoretical analysis and Linux testbed results have
shown that PIE can ensure low latency and achieve high link utilization under
various congestion situations. The design does not require per-packet
timestamp, so it incurs very small overhead and is simple enough to implement
in both hardware and software.  "
Many thanks to Dave Taht for extensive feedback, reviews, testing and
suggestions. Thanks also to Stephen Hemminger and Eric Dumazet for reviews and
suggestions.  Naeem Khademi and Dave Taht independently contributed to ECN
support.
For more information, please see technical paper about PIE in the IEEE
Conference on High Performance Switching and Routing 2013. A copy of the paper
can be found at ftp://ftpeng.cisco.com/pie/.
Please also refer to the IETF draft submission at
http://tools.ietf.org/html/draft-pan-tsvwg-pie-00
All relevant code, documents and test scripts and results can be found at
ftp://ftpeng.cisco.com/pie/.
For problems with the iproute2/tc or Linux kernel code, please contact Vijay
Subramanian (vijaynsu@cisco.com or subramanian.vijay@gmail.com) or Mythili
Prabhu (mysuryan@cisco.com).
Signed-off-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: Mythili Prabhu <mysuryan@cisco.com>
CC: Dave Taht <dave.taht@bufferbloat.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
											
										 
											2014-01-04 17:33:55 -08:00
										 |  |  | 
 | 
					
						
							|  |  |  | /* PIE */ | 
					
						
							|  |  |  | enum { | 
					
						
							|  |  |  | 	TCA_PIE_UNSPEC, | 
					
						
							|  |  |  | 	TCA_PIE_TARGET, | 
					
						
							|  |  |  | 	TCA_PIE_LIMIT, | 
					
						
							|  |  |  | 	TCA_PIE_TUPDATE, | 
					
						
							|  |  |  | 	TCA_PIE_ALPHA, | 
					
						
							|  |  |  | 	TCA_PIE_BETA, | 
					
						
							|  |  |  | 	TCA_PIE_ECN, | 
					
						
							|  |  |  | 	TCA_PIE_BYTEMODE, | 
					
						
							|  |  |  | 	__TCA_PIE_MAX | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | #define TCA_PIE_MAX   (__TCA_PIE_MAX - 1)
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | struct tc_pie_xstats { | 
					
						
							|  |  |  | 	__u32 prob;             /* current probability */ | 
					
						
							|  |  |  | 	__u32 delay;            /* current delay in ms */ | 
					
						
							|  |  |  | 	__u32 avg_dq_rate;      /* current average dq_rate in bits/pie_time */ | 
					
						
							|  |  |  | 	__u32 packets_in;       /* total number of packets enqueued */ | 
					
						
							|  |  |  | 	__u32 dropped;          /* packets dropped due to pie_action */ | 
					
						
							|  |  |  | 	__u32 overlimit;        /* dropped due to lack of space in queue */ | 
					
						
							|  |  |  | 	__u32 maxq;             /* maximum queue size */ | 
					
						
							|  |  |  | 	__u32 ecn_mark;         /* packets marked with ecn*/ | 
					
						
							|  |  |  | }; | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | #endif
 |