| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | #ifndef _RAID10_H
 | 
					
						
							|  |  |  | #define _RAID10_H
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2012-07-31 10:03:52 +10:00
										 |  |  | struct raid10_info { | 
					
						
							| 
									
										
										
										
											2011-12-23 10:17:54 +11:00
										 |  |  | 	struct md_rdev	*rdev, *replacement; | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	sector_t	head_position; | 
					
						
							| 
									
										
										
										
											2011-07-27 11:00:36 +10:00
										 |  |  | 	int		recovery_disabled;	/* matches
 | 
					
						
							|  |  |  | 						 * mddev->recovery_disabled | 
					
						
							|  |  |  | 						 * when we shouldn't try | 
					
						
							|  |  |  | 						 * recovering this device. | 
					
						
							|  |  |  | 						 */ | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2011-10-11 16:49:02 +11:00
										 |  |  | struct r10conf { | 
					
						
							| 
									
										
										
										
											2011-10-11 16:47:53 +11:00
										 |  |  | 	struct mddev		*mddev; | 
					
						
							| 
									
										
										
										
											2012-07-31 10:03:52 +10:00
										 |  |  | 	struct raid10_info	*mirrors; | 
					
						
							|  |  |  | 	struct raid10_info	*mirrors_new, *mirrors_old; | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	spinlock_t		device_lock; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	/* geometry */ | 
					
						
							| 
									
										
										
										
											2012-05-21 09:28:20 +10:00
										 |  |  | 	struct geom { | 
					
						
							|  |  |  | 		int		raid_disks; | 
					
						
							|  |  |  | 		int		near_copies;  /* number of copies laid out
 | 
					
						
							| 
									
										
										
										
											2011-12-23 10:17:54 +11:00
										 |  |  | 					       * raid0 style */ | 
					
						
							| 
									
										
										
										
											2012-05-21 09:28:20 +10:00
										 |  |  | 		int		far_copies;   /* number of copies laid out
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 					       * at large strides across drives | 
					
						
							|  |  |  | 					       */ | 
					
						
							| 
									
										
										
										
											2012-05-21 09:28:20 +10:00
										 |  |  | 		int		far_offset;   /* far_copies are offset by 1
 | 
					
						
							| 
									
										
										
										
											2011-12-23 10:17:54 +11:00
										 |  |  | 					       * stripe instead of many | 
					
						
							| 
									
										
										
										
											2006-06-26 00:27:41 -07:00
										 |  |  | 					       */ | 
					
						
							| 
									
										
										
										
											2012-05-21 09:28:20 +10:00
										 |  |  | 		sector_t	stride;	      /* distance between far copies.
 | 
					
						
							| 
									
										
										
										
											2006-06-26 00:27:41 -07:00
										 |  |  | 					       * This is size / far_copies unless | 
					
						
							|  |  |  | 					       * far_offset, in which case it is | 
					
						
							|  |  |  | 					       * 1 stripe. | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 					       */ | 
					
						
							| 
									
										
											  
											
												MD RAID10: Improve redundancy for 'far' and 'offset' algorithms (part 1)
The MD RAID10 'far' and 'offset' algorithms make copies of entire stripe
widths - copying them to a different location on the same devices after
shifting the stripe.  An example layout of each follows below:
	        "far" algorithm
	dev1 dev2 dev3 dev4 dev5 dev6
	==== ==== ==== ==== ==== ====
	 A    B    C    D    E    F
	 G    H    I    J    K    L
	            ...
	 F    A    B    C    D    E  --> Copy of stripe0, but shifted by 1
	 L    G    H    I    J    K
	            ...
		"offset" algorithm
	dev1 dev2 dev3 dev4 dev5 dev6
	==== ==== ==== ==== ==== ====
	 A    B    C    D    E    F
	 F    A    B    C    D    E  --> Copy of stripe0, but shifted by 1
	 G    H    I    J    K    L
	 L    G    H    I    J    K
	            ...
Redundancy for these algorithms is gained by shifting the copied stripes
one device to the right.  This patch proposes that array be divided into
sets of adjacent devices and when the stripe copies are shifted, they wrap
on set boundaries rather than the array size boundary.  That is, for the
purposes of shifting, the copies are confined to their sets within the
array.  The sets are 'near_copies * far_copies' in size.
The above "far" algorithm example would change to:
	        "far" algorithm
	dev1 dev2 dev3 dev4 dev5 dev6
	==== ==== ==== ==== ==== ====
	 A    B    C    D    E    F
	 G    H    I    J    K    L
	            ...
	 B    A    D    C    F    E  --> Copy of stripe0, shifted 1, 2-dev sets
	 H    G    J    I    L    K      Dev sets are 1-2, 3-4, 5-6
	            ...
This has the affect of improving the redundancy of the array.  We can
always sustain at least one failure, but sometimes more than one can
be handled.  In the first examples, the pairs of devices that CANNOT fail
together are:
	(1,2) (2,3) (3,4) (4,5) (5,6) (1, 6) [40% of possible pairs]
In the example where the copies are confined to sets, the pairs of
devices that cannot fail together are:
	(1,2) (3,4) (5,6)                    [20% of possible pairs]
We cannot simply replace the old algorithms, so the 17th bit of the 'layout'
variable is used to indicate whether we use the old or new method of computing
the shift.  (This is similar to the way the 16th bit indicates whether the
"far" algorithm or the "offset" algorithm is being used.)
This patch only handles the cases where the number of total raid disks is
a multiple of 'far_copies'.  A follow-on patch addresses the condition where
this is not true.
Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.de>
											
										 
											2013-02-21 13:28:10 +11:00
										 |  |  | 		int             far_set_size; /* The number of devices in a set,
 | 
					
						
							|  |  |  | 					       * where a 'set' are devices that | 
					
						
							|  |  |  | 					       * contain far/offset copies of | 
					
						
							|  |  |  | 					       * each other. | 
					
						
							|  |  |  | 					       */ | 
					
						
							| 
									
										
										
										
											2012-05-21 09:28:20 +10:00
										 |  |  | 		int		chunk_shift; /* shift from chunks to sectors */ | 
					
						
							|  |  |  | 		sector_t	chunk_mask; | 
					
						
							| 
									
										
										
										
											2012-05-21 09:28:33 +10:00
										 |  |  | 	} prev, geo; | 
					
						
							| 
									
										
										
										
											2012-05-21 09:28:20 +10:00
										 |  |  | 	int			copies;	      /* near_copies * far_copies.
 | 
					
						
							|  |  |  | 					       * must be <= raid_disks | 
					
						
							|  |  |  | 					       */ | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2011-12-23 10:17:54 +11:00
										 |  |  | 	sector_t		dev_sectors;  /* temp copy of
 | 
					
						
							|  |  |  | 					       * mddev->dev_sectors */ | 
					
						
							| 
									
										
										
										
											2012-05-21 09:28:33 +10:00
										 |  |  | 	sector_t		reshape_progress; | 
					
						
							| 
									
										
										
										
											2012-05-22 13:53:47 +10:00
										 |  |  | 	sector_t		reshape_safe; | 
					
						
							|  |  |  | 	unsigned long		reshape_checkpoint; | 
					
						
							|  |  |  | 	sector_t		offset_diff; | 
					
						
							| 
									
										
										
										
											2010-03-08 16:02:45 +11:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	struct list_head	retry_list; | 
					
						
							| 
									
										
										
										
											2006-01-06 00:20:16 -08:00
										 |  |  | 	/* queue pending writes and submit them on unplug */ | 
					
						
							|  |  |  | 	struct bio_list		pending_bio_list; | 
					
						
							| 
									
										
										
										
											2011-10-11 16:50:01 +11:00
										 |  |  | 	int			pending_count; | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	spinlock_t		resync_lock; | 
					
						
							| 
									
										
										
										
											2011-12-23 10:17:54 +11:00
										 |  |  | 	int			nr_pending; | 
					
						
							|  |  |  | 	int			nr_waiting; | 
					
						
							|  |  |  | 	int			nr_queued; | 
					
						
							|  |  |  | 	int			barrier; | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	sector_t		next_resync; | 
					
						
							| 
									
										
										
										
											2006-01-06 00:20:16 -08:00
										 |  |  | 	int			fullsync;  /* set to 1 if a full sync is needed,
 | 
					
						
							|  |  |  | 					    * (fresh device added). | 
					
						
							|  |  |  | 					    * Cleared when a sync completes. | 
					
						
							|  |  |  | 					    */ | 
					
						
							| 
									
										
										
										
											2011-12-23 10:17:54 +11:00
										 |  |  | 	int			have_replacement; /* There is at least one
 | 
					
						
							|  |  |  | 						   * replacement device. | 
					
						
							|  |  |  | 						   */ | 
					
						
							| 
									
										
										
										
											2006-01-06 00:20:13 -08:00
										 |  |  | 	wait_queue_head_t	wait_barrier; | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2011-12-23 10:17:54 +11:00
										 |  |  | 	mempool_t		*r10bio_pool; | 
					
						
							|  |  |  | 	mempool_t		*r10buf_pool; | 
					
						
							| 
									
										
										
										
											2006-01-06 00:20:28 -08:00
										 |  |  | 	struct page		*tmppage; | 
					
						
							| 
									
										
										
										
											2010-03-08 16:02:45 +11:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	/* When taking over an array from a different personality, we store
 | 
					
						
							|  |  |  | 	 * the new thread here until we fully activate the array. | 
					
						
							|  |  |  | 	 */ | 
					
						
							| 
									
										
										
										
											2011-10-11 16:48:23 +11:00
										 |  |  | 	struct md_thread	*thread; | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /*
 | 
					
						
							|  |  |  |  * this is our 'private' RAID10 bio. | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * it contains information about what kind of IO operations were started | 
					
						
							|  |  |  |  * for this RAID10 operation, and about their status: | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2011-10-11 16:48:43 +11:00
										 |  |  | struct r10bio { | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	atomic_t		remaining; /* 'have we finished' count,
 | 
					
						
							|  |  |  | 					    * used from IRQ handlers | 
					
						
							|  |  |  | 					    */ | 
					
						
							|  |  |  | 	sector_t		sector;	/* virtual sector number */ | 
					
						
							|  |  |  | 	int			sectors; | 
					
						
							|  |  |  | 	unsigned long		state; | 
					
						
							| 
									
										
										
										
											2011-10-11 16:47:53 +11:00
										 |  |  | 	struct mddev		*mddev; | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	/*
 | 
					
						
							|  |  |  | 	 * original bio going to /dev/mdx | 
					
						
							|  |  |  | 	 */ | 
					
						
							|  |  |  | 	struct bio		*master_bio; | 
					
						
							|  |  |  | 	/*
 | 
					
						
							|  |  |  | 	 * if the IO is in READ direction, then this is where we read | 
					
						
							|  |  |  | 	 */ | 
					
						
							|  |  |  | 	int			read_slot; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	struct list_head	retry_list; | 
					
						
							|  |  |  | 	/*
 | 
					
						
							|  |  |  | 	 * if the IO is in WRITE direction, then multiple bios are used, | 
					
						
							|  |  |  | 	 * one for each copy. | 
					
						
							|  |  |  | 	 * When resyncing we also use one for each copy. | 
					
						
							|  |  |  | 	 * When reconstructing, we use 2 bios, one for read, one for write. | 
					
						
							|  |  |  | 	 * We choose the number when they are allocated. | 
					
						
							| 
									
										
										
										
											2011-12-23 10:17:54 +11:00
										 |  |  | 	 * We sometimes need an extra bio to write to the replacement. | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	 */ | 
					
						
							| 
									
										
										
										
											2012-08-18 09:51:42 +10:00
										 |  |  | 	struct r10dev { | 
					
						
							| 
									
										
										
										
											2011-12-23 10:17:54 +11:00
										 |  |  | 		struct bio	*bio; | 
					
						
							|  |  |  | 		union { | 
					
						
							|  |  |  | 			struct bio	*repl_bio; /* used for resync and
 | 
					
						
							|  |  |  | 						    * writes */ | 
					
						
							|  |  |  | 			struct md_rdev	*rdev;	   /* used for reads
 | 
					
						
							|  |  |  | 						    * (read_slot >= 0) */ | 
					
						
							|  |  |  | 		}; | 
					
						
							|  |  |  | 		sector_t	addr; | 
					
						
							|  |  |  | 		int		devnum; | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	} devs[0]; | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* bits for r10bio.state */ | 
					
						
							| 
									
										
										
										
											2011-12-23 10:17:54 +11:00
										 |  |  | enum r10bio_state { | 
					
						
							|  |  |  | 	R10BIO_Uptodate, | 
					
						
							|  |  |  | 	R10BIO_IsSync, | 
					
						
							|  |  |  | 	R10BIO_IsRecover, | 
					
						
							| 
									
										
										
										
											2012-05-22 13:53:47 +10:00
										 |  |  | 	R10BIO_IsReshape, | 
					
						
							| 
									
										
										
										
											2011-12-23 10:17:54 +11:00
										 |  |  | 	R10BIO_Degraded, | 
					
						
							| 
									
										
										
										
											2011-07-28 11:39:23 +10:00
										 |  |  | /* Set ReadError on bios that experience a read error
 | 
					
						
							|  |  |  |  * so that raid10d knows what to do with them. | 
					
						
							|  |  |  |  */ | 
					
						
							| 
									
										
										
										
											2011-12-23 10:17:54 +11:00
										 |  |  | 	R10BIO_ReadError, | 
					
						
							| 
									
										
										
										
											2011-07-28 11:39:24 +10:00
										 |  |  | /* If a write for this request means we can clear some
 | 
					
						
							|  |  |  |  * known-bad-block records, we set this flag. | 
					
						
							|  |  |  |  */ | 
					
						
							| 
									
										
										
										
											2011-12-23 10:17:54 +11:00
										 |  |  | 	R10BIO_MadeGood, | 
					
						
							|  |  |  | 	R10BIO_WriteError, | 
					
						
							| 
									
										
										
										
											2012-05-21 09:28:33 +10:00
										 |  |  | /* During a reshape we might be performing IO on the
 | 
					
						
							|  |  |  |  * 'previous' part of the array, in which case this | 
					
						
							|  |  |  |  * flag is set | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | 	R10BIO_Previous, | 
					
						
							| 
									
										
										
										
											2011-12-23 10:17:54 +11:00
										 |  |  | }; | 
					
						
							| 
									
										
										
										
											2012-07-31 10:03:53 +10:00
										 |  |  | 
 | 
					
						
							|  |  |  | extern int md_raid10_congested(struct mddev *mddev, int bits); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | #endif
 |