| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | /*
 | 
					
						
							|  |  |  |    md_p.h : physical layout of Linux RAID devices | 
					
						
							|  |  |  |           Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman | 
					
						
							|  |  |  | 	   | 
					
						
							|  |  |  |    This program is free software; you can redistribute it and/or modify | 
					
						
							|  |  |  |    it under the terms of the GNU General Public License as published by | 
					
						
							|  |  |  |    the Free Software Foundation; either version 2, or (at your option) | 
					
						
							|  |  |  |    any later version. | 
					
						
							|  |  |  |     | 
					
						
							|  |  |  |    You should have received a copy of the GNU General Public License | 
					
						
							|  |  |  |    (for example /usr/src/linux/COPYING); if not, write to the Free | 
					
						
							|  |  |  |    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.   | 
					
						
							|  |  |  | */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #ifndef _MD_P_H
 | 
					
						
							|  |  |  | #define _MD_P_H
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2009-01-30 20:06:44 +05:30
										 |  |  | #include <linux/types.h>
 | 
					
						
							| 
									
										
										
										
											2013-11-14 15:16:19 +11:00
										 |  |  | #include <asm/byteorder.h>
 | 
					
						
							| 
									
										
										
										
											2009-01-30 20:06:44 +05:30
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | /*
 | 
					
						
							|  |  |  |  * RAID superblock. | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * The RAID superblock maintains some statistics on each RAID configuration. | 
					
						
							|  |  |  |  * Each real device in the RAID set contains it near the end of the device. | 
					
						
							|  |  |  |  * Some of the ideas are copied from the ext2fs implementation. | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * We currently use 4096 bytes as follows: | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  *	word offset	function | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  *	   0  -    31	Constant generic RAID device information. | 
					
						
							|  |  |  |  *        32  -    63   Generic state information. | 
					
						
							|  |  |  |  *	  64  -   127	Personality specific information. | 
					
						
							|  |  |  |  *	 128  -   991	27 32-word descriptors of the disks in the raid set. | 
					
						
							|  |  |  |  *	   (none)		Reserved (MD_SB_RESERVED_WORDS evaluates to 0). | 
					
						
							|  |  |  |  *	 992  -  1023	Disk specific descriptor. | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /*
 | 
					
						
							|  |  |  |  * If x is the real device size in bytes, we return an apparent size of: | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  *	y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * and place the 4kB superblock at offset y. | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | #define MD_RESERVED_BYTES		(64 * 1024)
 | 
					
						
							|  |  |  | #define MD_RESERVED_SECTORS		(MD_RESERVED_BYTES / 512)
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define MD_NEW_SIZE_SECTORS(x)		(((x) & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS) /* (x) is parenthesised so compound arguments expand correctly */
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define MD_SB_BYTES			4096
 | 
					
						
							|  |  |  | #define MD_SB_WORDS			(MD_SB_BYTES / 4)
 | 
					
						
							|  |  |  | #define MD_SB_SECTORS			(MD_SB_BYTES / 512)
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /*
 | 
					
						
							|  |  |  |  * The following are counted in 32-bit words | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | #define	MD_SB_GENERIC_OFFSET		0
 | 
					
						
							|  |  |  | #define MD_SB_PERSONALITY_OFFSET	64
 | 
					
						
							|  |  |  | #define MD_SB_DISKS_OFFSET		128
 | 
					
						
							|  |  |  | #define MD_SB_DESCRIPTOR_OFFSET		992
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define MD_SB_GENERIC_CONSTANT_WORDS	32
 | 
					
						
							|  |  |  | #define MD_SB_GENERIC_STATE_WORDS	32
 | 
					
						
							|  |  |  | #define MD_SB_GENERIC_WORDS		(MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS)
 | 
					
						
							|  |  |  | #define MD_SB_PERSONALITY_WORDS		64
 | 
					
						
							|  |  |  | #define MD_SB_DESCRIPTOR_WORDS		32
 | 
					
						
							|  |  |  | #define MD_SB_DISKS			27
 | 
					
						
							|  |  |  | #define MD_SB_DISKS_WORDS		(MD_SB_DISKS*MD_SB_DESCRIPTOR_WORDS)
 | 
					
						
							|  |  |  | #define MD_SB_RESERVED_WORDS		(1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS)
 | 
					
						
							|  |  |  | #define MD_SB_EQUAL_WORDS		(MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS)
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /*
 | 
					
						
							|  |  |  |  * Device "operational" state bits | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | #define MD_DISK_FAULTY		0 /* disk is faulty / operational */
 | 
					
						
							|  |  |  | #define MD_DISK_ACTIVE		1 /* disk is running or spare disk */
 | 
					
						
							|  |  |  | #define MD_DISK_SYNC		2 /* disk is in sync with the raid set */
 | 
					
						
							|  |  |  | #define MD_DISK_REMOVED		3 /* disk has been removed from the raid set -- TODO confirm */
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2005-09-09 16:23:45 -07:00
										 |  |  | #define	MD_DISK_WRITEMOSTLY	9 /* disk is "write-mostly" in RAID1 config.
 | 
					
						
							|  |  |  | 				   * read requests will only be sent here in | 
					
						
							|  |  |  | 				   * dire need | 
					
						
							|  |  |  | 				   */ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | typedef struct mdp_device_descriptor_s { | 
					
						
							|  |  |  | 	__u32 number;		/* 0 Device number in the entire set	      */ | 
					
						
							|  |  |  | 	__u32 major;		/* 1 Device major number		      */ | 
					
						
							|  |  |  | 	__u32 minor;		/* 2 Device minor number		      */ | 
					
						
							|  |  |  | 	__u32 raid_disk;	/* 3 The role of the device in the raid set   */ | 
					
						
							|  |  |  | 	__u32 state;		/* 4 Operational state			      */ | 
					
						
							|  |  |  | 	__u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5]; | 
					
						
							|  |  |  | } mdp_disk_t; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #define MD_SB_MAGIC		0xa92b4efc
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /*
 | 
					
						
							|  |  |  |  * Superblock state bits | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | #define MD_SB_CLEAN		0
 | 
					
						
							|  |  |  | #define MD_SB_ERRORS		1
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2005-06-21 17:17:27 -07:00
										 |  |  | #define	MD_SB_BITMAP_PRESENT	8 /* bitmap may be present nearby */
 | 
					
						
							| 
									
										
										
										
											2006-03-27 01:18:11 -08:00
										 |  |  | 
 | 
					
						
							|  |  |  | /*
 | 
					
						
							|  |  |  |  * Notes: | 
					
						
							|  |  |  |  * - if an array is being reshaped (restriped) in order to change the | 
					
						
							|  |  |  |  *   number of active devices in the array, 'raid_disks' will be | 
					
						
							|  |  |  |  *   the larger of the old and new numbers.  'delta_disks' will | 
					
						
							|  |  |  |  *   be the "new - old".  So if +ve, raid_disks is the new value, and | 
					
						
							|  |  |  |  *   "raid_disks-delta_disks" is the old.  If -ve, raid_disks is the | 
					
						
							|  |  |  |  *   old value and "raid_disks+delta_disks" is the new (smaller) value. | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | typedef struct mdp_superblock_s { | 
					
						
							|  |  |  | 	/*
 | 
					
						
							|  |  |  | 	 * Constant generic information | 
					
						
							|  |  |  | 	 */ | 
					
						
							|  |  |  | 	__u32 md_magic;		/*  0 MD identifier 			      */ | 
					
						
							|  |  |  | 	__u32 major_version;	/*  1 major version to which the set conforms */ | 
					
						
							|  |  |  | 	__u32 minor_version;	/*  2 minor version ...			      */ | 
					
						
							|  |  |  | 	__u32 patch_version;	/*  3 patchlevel version ...		      */ | 
					
						
							|  |  |  | 	__u32 gvalid_words;	/*  4 Number of used words in this section    */ | 
					
						
							|  |  |  | 	__u32 set_uuid0;	/*  5 Raid set identifier		      */ | 
					
						
							|  |  |  | 	__u32 ctime;		/*  6 Creation time			      */ | 
					
						
							|  |  |  | 	__u32 level;		/*  7 Raid personality			      */ | 
					
						
							|  |  |  | 	__u32 size;		/*  8 Apparent size of each individual disk   */ | 
					
						
							|  |  |  | 	__u32 nr_disks;		/*  9 total disks in the raid set	      */ | 
					
						
							|  |  |  | 	__u32 raid_disks;	/* 10 disks in a fully functional raid set    */ | 
					
						
							|  |  |  | 	__u32 md_minor;		/* 11 preferred MD minor device number	      */ | 
					
						
							|  |  |  | 	__u32 not_persistent;	/* 12 does it have a persistent superblock    */ | 
					
						
							|  |  |  | 	__u32 set_uuid1;	/* 13 Raid set identifier #2		      */ | 
					
						
							|  |  |  | 	__u32 set_uuid2;	/* 14 Raid set identifier #3		      */ | 
					
						
							|  |  |  | 	__u32 set_uuid3;	/* 15 Raid set identifier #4		      */ | 
					
						
							|  |  |  | 	__u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 16]; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	/*
 | 
					
						
							|  |  |  | 	 * Generic state information | 
					
						
							|  |  |  | 	 */ | 
					
						
							|  |  |  | 	__u32 utime;		/*  0 Superblock update time		      */ | 
					
						
							|  |  |  | 	__u32 state;		/*  1 State bits (clean, ...)		      */ | 
					
						
							|  |  |  | 	__u32 active_disks;	/*  2 Number of currently active disks	      */ | 
					
						
							|  |  |  | 	__u32 working_disks;	/*  3 Number of working disks		      */ | 
					
						
							|  |  |  | 	__u32 failed_disks;	/*  4 Number of failed disks		      */ | 
					
						
							|  |  |  | 	__u32 spare_disks;	/*  5 Number of spare disks		      */ | 
					
						
							|  |  |  | 	__u32 sb_csum;		/*  6 checksum of the whole superblock        */ | 
					
						
							| 
									
										
										
										
											2013-03-13 14:59:47 -07:00
										 |  |  | #if defined(__BYTE_ORDER) ? __BYTE_ORDER == __BIG_ENDIAN : defined(__BIG_ENDIAN)
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	__u32 events_hi;	/*  7 high-order of superblock update count   */ | 
					
						
							|  |  |  | 	__u32 events_lo;	/*  8 low-order of superblock update count    */ | 
					
						
							|  |  |  | 	__u32 cp_events_hi;	/*  9 high-order of checkpoint update count   */ | 
					
						
							|  |  |  | 	__u32 cp_events_lo;	/* 10 low-order of checkpoint update count    */ | 
					
						
							| 
									
										
										
										
											2013-03-13 14:59:47 -07:00
										 |  |  | #elif defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN)
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	__u32 events_lo;	/*  7 low-order of superblock update count    */ | 
					
						
							|  |  |  | 	__u32 events_hi;	/*  8 high-order of superblock update count   */ | 
					
						
							|  |  |  | 	__u32 cp_events_lo;	/*  9 low-order of checkpoint update count    */ | 
					
						
							|  |  |  | 	__u32 cp_events_hi;	/* 10 high-order of checkpoint update count   */ | 
					
						
							| 
									
										
										
										
											2013-03-13 14:59:47 -07:00
										 |  |  | #else
 | 
					
						
							|  |  |  | #error unspecified endianness
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | #endif
 | 
					
						
							|  |  |  | 	__u32 recovery_cp;	/* 11 recovery checkpoint sector count	      */ | 
					
						
							| 
									
										
										
										
											2006-03-27 01:18:11 -08:00
										 |  |  | 	/* These are only valid for minor_version > 90 */ | 
					
						
							|  |  |  | 	__u64 reshape_position;	/* 12,13 next address in array-space for reshape */ | 
					
						
							|  |  |  | 	__u32 new_level;	/* 14 new level we are reshaping to	      */ | 
					
						
							|  |  |  | 	__u32 delta_disks;	/* 15 change in number of raid_disks	      */ | 
					
						
							|  |  |  | 	__u32 new_layout;	/* 16 new layout			      */ | 
					
						
							|  |  |  | 	__u32 new_chunk;	/* 17 new chunk size (bytes)		      */ | 
					
						
							|  |  |  | 	__u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 18]; | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	/*
 | 
					
						
							|  |  |  | 	 * Personality information | 
					
						
							|  |  |  | 	 */ | 
					
						
							|  |  |  | 	__u32 layout;		/*  0 the array's physical layout	      */ | 
					
						
							|  |  |  | 	__u32 chunk_size;	/*  1 chunk size in bytes		      */ | 
					
						
							|  |  |  | 	__u32 root_pv;		/*  2 LV root PV */ | 
					
						
							|  |  |  | 	__u32 root_block;	/*  3 LV root block */ | 
					
						
							|  |  |  | 	__u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 4]; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	/*
 | 
					
						
							|  |  |  | 	 * Disks information | 
					
						
							|  |  |  | 	 */ | 
					
						
							|  |  |  | 	mdp_disk_t disks[MD_SB_DISKS]; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	/*
 | 
					
						
							|  |  |  | 	 * Reserved | 
					
						
							|  |  |  | 	 */ | 
					
						
							|  |  |  | 	__u32 reserved[MD_SB_RESERVED_WORDS]; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	/*
 | 
					
						
							|  |  |  | 	 * Active descriptor | 
					
						
							|  |  |  | 	 */ | 
					
						
							|  |  |  | 	mdp_disk_t this_disk; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | } mdp_super_t; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static inline __u64 md_event(mdp_super_t *sb) { | 
					
						
							|  |  |  | 	__u64 ev = sb->events_hi; | 
					
						
							|  |  |  | 	return (ev<<32)| sb->events_lo; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												md: need another print_sb for mdp_superblock_1
md_print_devices is called in two code path: MD_BUG(...), and md_ioctl
with PRINT_RAID_DEBUG.  it will dump out all in use md devices
information;
However, it wrongly processed two types of superblock in one:
The header file <linux/raid/md_p.h> has defined two types of superblock,
struct mdp_superblock_s (typedefed with mdp_super_t) according to md with
metadata 0.90, and struct mdp_superblock_1 according to md with metadata
1.0 and later,
These two types of superblock are very different,
The md_print_devices code processed them both in mdp_super_t, that would
lead to wrong information dump like:
	[ 6742.345877]
	[ 6742.345887] md:	**********************************
	[ 6742.345890] md:	* <COMPLETE RAID STATE PRINTOUT> *
	[ 6742.345892] md:	**********************************
	[ 6742.345896] md1: <ram7><ram6><ram5><ram4>
	[ 6742.345907] md: rdev ram7, SZ:00065472 F:0 S:1 DN:3
	[ 6742.345909] md: rdev superblock:
	[ 6742.345914] md:  SB: (V:0.90.0) ID:<42ef13c7.598c059a.5f9f1645.801e9ee6> CT:4919856d
	[ 6742.345918] md:     L5 S00065472 ND:4 RD:4 md1 LO:2 CS:65536
	[ 6742.345922] md:     UT:4919856d ST:1 AD:4 WD:4 FD:0 SD:0 CSUM:b7992907 E:00000001
	[ 6742.345924]      D  0:  DISK<N:0,(1,8),R:0,S:6>
	[ 6742.345930]      D  1:  DISK<N:1,(1,10),R:1,S:6>
	[ 6742.345933]      D  2:  DISK<N:2,(1,12),R:2,S:6>
	[ 6742.345937]      D  3:  DISK<N:3,(1,14),R:3,S:6>
	[ 6742.345942] md:     THIS:  DISK<N:3,(1,14),R:3,S:6>
	...
	[ 6742.346058] md0: <ram3><ram2><ram1><ram0>
	[ 6742.346067] md: rdev ram3, SZ:00065472 F:0 S:1 DN:3
	[ 6742.346070] md: rdev superblock:
	[ 6742.346073] md:  SB: (V:1.0.0) ID:<369aad81.00000000.00000000.00000000> CT:9a322a9c
	[ 6742.346077] md:     L-1507699579 S976570180 ND:48 RD:0 md0 LO:65536 CS:196610
	[ 6742.346081] md:     UT:00000018 ST:0 AD:131048 WD:0 FD:8 SD:0 CSUM:00000000 E:00000000
	[ 6742.346084]      D  0:  DISK<N:-1,(-1,-1),R:-1,S:-1>
	[ 6742.346089]      D  1:  DISK<N:-1,(-1,-1),R:-1,S:-1>
	[ 6742.346092]      D  2:  DISK<N:-1,(-1,-1),R:-1,S:-1>
	[ 6742.346096]      D  3:  DISK<N:-1,(-1,-1),R:-1,S:-1>
	[ 6742.346102] md:     THIS:  DISK<N:0,(0,0),R:0,S:0>
	...
	[ 6742.346219] md:	**********************************
	[ 6742.346221]
Here md1 is metadata 0.90.0, and md0 is metadata 1.2
After some more code to distinguish these two types of superblock, in this patch,
it will generate dump information like:
	[ 7906.755790]
	[ 7906.755799] md:	**********************************
	[ 7906.755802] md:	* <COMPLETE RAID STATE PRINTOUT> *
	[ 7906.755804] md:	**********************************
	[ 7906.755808] md1: <ram7><ram6><ram5><ram4>
	[ 7906.755819] md: rdev ram7, SZ:00065472 F:0 S:1 DN:3
	[ 7906.755821] md: rdev superblock (MJ:0):
	[ 7906.755826] md:  SB: (V:0.90.0) ID:<3fca7a0d.a612bfed.5f9f1645.801e9ee6> CT:491989f3
	[ 7906.755830] md:     L5 S00065472 ND:4 RD:4 md1 LO:2 CS:65536
	[ 7906.755834] md:     UT:491989f3 ST:1 AD:4 WD:4 FD:0 SD:0 CSUM:00fb52ad E:00000001
	[ 7906.755836]      D  0:  DISK<N:0,(1,8),R:0,S:6>
	[ 7906.755842]      D  1:  DISK<N:1,(1,10),R:1,S:6>
	[ 7906.755845]      D  2:  DISK<N:2,(1,12),R:2,S:6>
	[ 7906.755849]      D  3:  DISK<N:3,(1,14),R:3,S:6>
	[ 7906.755855] md:     THIS:  DISK<N:3,(1,14),R:3,S:6>
	...
	[ 7906.755972] md0: <ram3><ram2><ram1><ram0>
	[ 7906.755981] md: rdev ram3, SZ:00065472 F:0 S:1 DN:3
	[ 7906.755984] md: rdev superblock (MJ:1):
	[ 7906.755989] md:  SB: (V:1) (F:0) Array-ID:<5fbcf158:55aa:5fbe:9a79:1e939880dcbd>
	[ 7906.755990] md:    Name: "DG5:0" CT:1226410480
	[ 7906.755998] md:       L5 SZ130944 RD:4 LO:2 CS:128 DO:24 DS:131048 SO:8 RO:0
	[ 7906.755999] md:     Dev:00000003 UUID: 9194d744:87f7:a448:85f2:7497b84ce30a
	[ 7906.756001] md:       (F:0) UT:1226410480 Events:0 ResyncOffset:-1 CSUM:0dbcd829
	[ 7906.756003] md:         (MaxDev:384)
	...
	[ 7906.756113] md:	**********************************
	[ 7906.756116]
this md0 (metadata 1.2) information dumping is exactly according to struct
mdp_superblock_1.
Signed-off-by: Cheng Renquan <crquan@gmail.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Dan Williams <dan.j.williams@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: NeilBrown <neilb@suse.de>
											
										 
											2009-01-09 08:31:08 +11:00
										 |  |  | #define MD_SUPERBLOCK_1_TIME_SEC_MASK ((1ULL<<40) - 1)
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | /*
 | 
					
						
							|  |  |  |  * The version-1 superblock : | 
					
						
							|  |  |  |  * All numeric fields are little-endian. | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * total size: 256 bytes plus 2 per device. | 
					
						
							|  |  |  |  *  1K allows 384 devices. | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | struct mdp_superblock_1 { | 
					
						
							|  |  |  | 	/* constant array information - 128 bytes */ | 
					
						
							| 
									
										
										
										
											2006-10-21 10:24:08 -07:00
										 |  |  | 	__le32	magic;		/* MD_SB_MAGIC: 0xa92b4efc - little endian */ | 
					
						
							|  |  |  | 	__le32	major_version;	/* 1 */ | 
					
						
							|  |  |  | 	__le32	feature_map;	/* bit 0 set if 'bitmap_offset' is meaningful */ | 
					
						
							|  |  |  | 	__le32	pad0;		/* always set to 0 when writing */ | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	__u8	set_uuid[16];	/* user-space generated. */ | 
					
						
							|  |  |  | 	char	set_name[32];	/* set and interpreted by user-space */ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2006-10-21 10:24:08 -07:00
										 |  |  | 	__le64	ctime;		/* lo 40 bits are seconds, top 24 are microseconds or 0*/ | 
					
						
							|  |  |  | 	__le32	level;		/* -4 (multipath), -1 (linear), 0,1,4,5 */ | 
					
						
							|  |  |  | 	__le32	layout;		/* only for raid5 and raid10 currently */ | 
					
						
							|  |  |  | 	__le64	size;		/* used size of component devices, in 512byte sectors */ | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2006-10-21 10:24:08 -07:00
										 |  |  | 	__le32	chunksize;	/* in 512byte sectors */ | 
					
						
							|  |  |  | 	__le32	raid_disks; | 
					
						
							|  |  |  | 	__le32	bitmap_offset;	/* sectors after start of superblock that bitmap starts
 | 
					
						
							| 
									
										
										
										
											2005-06-21 17:17:27 -07:00
										 |  |  | 				 * NOTE: signed, so bitmap can be before superblock | 
					
						
							|  |  |  | 				 * only meaningful if feature_map[0] is set. | 
					
						
							|  |  |  | 				 */ | 
					
						
							| 
									
										
										
										
											2006-03-27 01:18:11 -08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	/* These are only valid with feature bit '4' */ | 
					
						
							| 
									
										
										
										
											2006-10-21 10:24:08 -07:00
										 |  |  | 	__le32	new_level;	/* new level we are reshaping to		*/ | 
					
						
							|  |  |  | 	__le64	reshape_position;	/* next address in array-space for reshape */ | 
					
						
							|  |  |  | 	__le32	delta_disks;	/* change in number of raid_disks		*/ | 
					
						
							|  |  |  | 	__le32	new_layout;	/* new layout					*/ | 
					
						
							| 
									
										
										
										
											2009-06-18 08:46:47 +10:00
										 |  |  | 	__le32	new_chunk;	/* new chunk size (512byte sectors)		*/ | 
					
						
							| 
									
										
										
										
											2012-05-21 09:27:00 +10:00
										 |  |  | 	__le32  new_offset;	/* signed number to add to data_offset in new
 | 
					
						
							|  |  |  | 				 * layout.  0 == no-change.  This can be | 
					
						
							|  |  |  | 				 * different on each device in the array. | 
					
						
							|  |  |  | 				 */ | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	/* constant this-device information - 64 bytes */ | 
					
						
							| 
									
										
										
										
											2006-10-21 10:24:08 -07:00
										 |  |  | 	__le64	data_offset;	/* sector start of data, often 0 */ | 
					
						
							|  |  |  | 	__le64	data_size;	/* sectors in this device that can be used for data */ | 
					
						
							|  |  |  | 	__le64	super_offset;	/* sector start of this superblock */ | 
					
						
							|  |  |  | 	__le64	recovery_offset;/* sectors before this offset (from data_offset) have been recovered */ | 
					
						
							|  |  |  | 	__le32	dev_number;	/* permanent identifier of this  device - not role in raid */ | 
					
						
							|  |  |  | 	__le32	cnt_corrected_read; /* number of read errors that were corrected by re-writing */ | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	__u8	device_uuid[16]; /* user-space settable, ignored by kernel */ | 
					
						
							| 
									
										
										
										
											2005-09-09 16:23:45 -07:00
										 |  |  | 	__u8	devflags;	/* per-device flags.  Only one defined...*/ | 
					
						
							|  |  |  | #define	WriteMostly1	1	/* mask for writemostly flag in above */
 | 
					
						
							| 
									
										
										
										
											2011-07-28 11:31:47 +10:00
										 |  |  | 	/* Bad block log.  If there are any bad blocks the feature flag is set.
 | 
					
						
							|  |  |  | 	 * If offset and size are non-zero, that space is reserved and available | 
					
						
							|  |  |  | 	 */ | 
					
						
							|  |  |  | 	__u8	bblog_shift;	/* shift from sectors to block size */ | 
					
						
							|  |  |  | 	__le16	bblog_size;	/* number of sectors reserved for list */ | 
					
						
							|  |  |  | 	__le32	bblog_offset;	/* sector offset from superblock to bblog,
 | 
					
						
							|  |  |  | 				 * signed - not unsigned */ | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | 	/* array state information - 64 bytes */ | 
					
						
							| 
									
										
										
										
											2011-07-28 11:31:47 +10:00
										 |  |  | 	__le64	utime;		/* 40 bits second, 24 bits microseconds */ | 
					
						
							| 
									
										
										
										
											2006-10-21 10:24:08 -07:00
										 |  |  | 	__le64	events;		/* incremented when superblock updated */ | 
					
						
							|  |  |  | 	__le64	resync_offset;	/* data before this offset (from data_offset) known to be in sync */ | 
					
						
							| 
									
										
										
										
											2011-03-30 22:57:33 -03:00
										 |  |  | 	__le32	sb_csum;	/* checksum up to devs[max_dev] */ | 
					
						
							| 
									
										
										
										
											2006-10-21 10:24:08 -07:00
										 |  |  | 	__le32	max_dev;	/* size of devs[] array to consider */ | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | 	__u8	pad3[64-32];	/* set to 0 when writing */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 	/* device state information. Indexed by dev_number.
 | 
					
						
							|  |  |  | 	 * 2 bytes per device | 
					
						
							|  |  |  | 	 * Note there are no per-device state flags. State information is rolled | 
					
						
							|  |  |  | 	 * into the 'roles' value.  If a device is spare or faulty, then it doesn't | 
					
						
							|  |  |  | 	 * have a meaningful role. | 
					
						
							|  |  |  | 	 */ | 
					
						
							| 
									
										
										
										
											2006-10-21 10:24:08 -07:00
										 |  |  | 	__le16	dev_roles[0];	/* role in array, or 0xffff for a spare, or 0xfffe for faulty */ | 
					
						
							| 
									
										
										
										
											2005-04-16 15:20:36 -07:00
										 |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2005-09-09 16:23:51 -07:00
										 |  |  | /* feature_map bits */ | 
					
						
							|  |  |  | #define MD_FEATURE_BITMAP_OFFSET	1
 | 
					
						
							| 
									
										
										
										
											2006-06-26 00:27:40 -07:00
										 |  |  | #define	MD_FEATURE_RECOVERY_OFFSET	2 /* recovery_offset is present and
 | 
					
						
							|  |  |  | 					   * must be honoured | 
					
						
							|  |  |  | 					   */ | 
					
						
							| 
									
										
										
										
											2006-03-27 01:18:11 -08:00
										 |  |  | #define	MD_FEATURE_RESHAPE_ACTIVE	4
 | 
					
						
							| 
									
										
										
										
											2011-07-28 11:31:47 +10:00
										 |  |  | #define	MD_FEATURE_BAD_BLOCKS		8 /* badblock list is not empty */
 | 
					
						
							| 
									
										
										
										
											2011-12-23 10:17:51 +11:00
										 |  |  | #define	MD_FEATURE_REPLACEMENT		16 /* This device is replacing an
 | 
					
						
							|  |  |  | 					    * active device with same 'role'. | 
					
						
							|  |  |  | 					    * 'recovery_offset' is also set. | 
					
						
							|  |  |  | 					    */ | 
					
						
							| 
									
										
										
										
											2012-05-21 09:27:00 +10:00
										 |  |  | #define	MD_FEATURE_RESHAPE_BACKWARDS	32 /* Reshape doesn't change number
 | 
					
						
							|  |  |  | 					    * of devices, but is going | 
					
						
							|  |  |  | 					    * backwards anyway. | 
					
						
							|  |  |  | 					    */ | 
					
						
							| 
									
										
										
										
											2012-05-21 09:27:00 +10:00
										 |  |  | #define	MD_FEATURE_NEW_OFFSET		64 /* new_offset must be honoured */
 | 
					
						
							| 
									
										
											  
											
												md: Change handling of save_raid_disk and metadata update during recovery.
Since commit d70ed2e4fafdbef0800e739
   MD: Allow restarting an interrupted incremental recovery.
we don't write out the metadata to devices while they are recovering.
This had a good reason, but has unfortunate consequences.  This patch
changes things to make them work better.
At issue is what happens if the array is shut down while a recovery is
happening, particularly a bitmap-guided recovery.
Ideally the recovery should pick up where it left off.
However the metadata cannot represent the state "A recovery is in
process which is guided by the bitmap".
Before the above mentioned commit, we wrote metadata to the device
which said "this is being recovered and it is up to <here>".  So after
a restart, a full recovery (not bitmap-guided) would happen from
where-ever it was up to.
After the commit the metadata wasn't updated so it still said "This
device is fully in sync with <this> event count".  That leads to a
bitmap-based recovery following the whole bitmap, which should be a
lot less work than a full recovery from some starting point.  So this
was an improvement.
However, updating some metadata but not all leads to other problems.
In particular, the metadata written to the fully-up-to-date device
record that the array has all devices present (even though some are
recovering).  So on restart, mdadm wants to find all devices and
expects them to have current event counts.
Obviously it doesn't (some have old event counts) so (when assembling
with --incremental) it waits indefinitely for the rest of the expected
devices.
It really is wrong to not update all the metadata together.  Doing that
is bound to cause confusion.
Instead, we should make it possible to record the truth in the
metadata.  i.e. we need to be able to record that a device is being
recovered based on the bitmap.
We already have a Feature flag to say that recovery is happening.  We
now add another one to say that it is a bitmap-based recovery.
With this we can remove the code that disables the write-out of
metadata on some devices.
So this patch:
 - moves the setting of 'saved_raid_disk' from add_new_disk to
   the validate_super methods.  This makes sure it is always set
   properly, both when adding a new device to an array, and when
   assembling an array from a collection of devices.
 - Adds a metadata flag MD_FEATURE_RECOVERY_BITMAP which is only
   used if MD_FEATURE_RECOVERY_OFFSET is set, and record that a
   bitmap-based recovery is allowed.
   This is only present in v1.x metadata. v0.90 doesn't support
   devices which are in the middle of recovery at all.
 - Only skips writing metadata to Faulty devices.
 - Also allows rdev state to be set to "-insync" via sysfs.
   This can be used for external-metadata arrays.  When the
   'role' is set the device is assumed to be in-sync.  If, after
   setting the role, we set the state to "-insync", the role is
   moved to saved_raid_disk which effectively says the device is
   partly in-sync with that slot and needs a bitmap recovery.
Cc: Andrei Warkentin <andreiw@vmware.com>
Signed-off-by: NeilBrown <neilb@suse.de>
											
										 
											2013-12-09 12:04:56 +11:00
										 |  |  | #define	MD_FEATURE_RECOVERY_BITMAP	128 /* recovery that is happening
 | 
					
						
							|  |  |  | 					     * is guided by bitmap. | 
					
						
							|  |  |  | 					     */ | 
					
						
							| 
									
										
										
										
											2012-03-13 11:21:23 +11:00
										 |  |  | #define	MD_FEATURE_ALL			(MD_FEATURE_BITMAP_OFFSET	\
 | 
					
						
							|  |  |  | 					|MD_FEATURE_RECOVERY_OFFSET	\ | 
					
						
							|  |  |  | 					|MD_FEATURE_RESHAPE_ACTIVE	\ | 
					
						
							|  |  |  | 					|MD_FEATURE_BAD_BLOCKS		\ | 
					
						
							| 
									
										
										
										
											2012-05-21 09:27:00 +10:00
										 |  |  | 					|MD_FEATURE_REPLACEMENT		\ | 
					
						
							| 
									
										
										
										
											2012-05-21 09:27:00 +10:00
										 |  |  | 					|MD_FEATURE_RESHAPE_BACKWARDS	\ | 
					
						
							|  |  |  | 					|MD_FEATURE_NEW_OFFSET		\ | 
					
						
							| 
									
										
											  
											
												md: Change handling of save_raid_disk and metadata update during recovery.
Since commit d70ed2e4fafdbef0800e739
   MD: Allow restarting an interrupted incremental recovery.
we don't write out the metadata to devices while they are recovering.
This had a good reason, but has unfortunate consequences.  This patch
changes things to make them work better.
At issue is what happens if the array is shut down while a recovery is
happening, particularly a bitmap-guided recovery.
Ideally the recovery should pick up where it left off.
However the metadata cannot represent the state "A recovery is in
process which is guided by the bitmap".
Before the above mentioned commit, we wrote metadata to the device
which said "this is being recovered and it is up to <here>".  So after
a restart, a full recovery (not bitmap-guided) would happen from
where-ever it was up to.
After the commit the metadata wasn't updated so it still said "This
device is fully in sync with <this> event count".  That leads to a
bitmap-based recovery following the whole bitmap, which should be a
lot less work than a full recovery from some starting point.  So this
was an improvement.
However updates some metadata but not all leads to other problems.
In particular, the metadata written to the fully-up-to-date device
record that the array has all devices present (even though some are
recovering).  So on restart, mdadm wants to find all devices and
expects them to have current event counts.
Obviously it doesn't (some have old event counts) so (when assembling
with --incremental) it waits indefinitely for the rest of the expected
devices.
It really is wrong to not update all the metadata together.  Do that
is bound to cause confusion.
Instead, we should make it possible to record the truth in the
metadata.  i.e. we need to be able to record that a device is being
recovered based on the bitmap.
We already have a Feature flag to say that recovery is happening.  We
now add another one to say that it is a bitmap-based recovery.
With this we can remove the code that disables the write-out of
metadata on some devices.
So this patch:
 - moves the setting of 'saved_raid_disk' from add_new_disk to
   the validate_super methods.  This makes sure it is always set
   properly, both when adding a new device to an array, and when
   assembling an array from a collection of devices.
 - Adds a metadata flag MD_FEATURE_RECOVERY_BITMAP which is only
   used if MD_FEATURE_RECOVERY_OFFSET is set, and record that a
   bitmap-based recovery is allowed.
   This is only present in v1.x metadata. v0.90 doesn't support
   devices which are in the middle of recovery at all.
 - Only skips writing metadata to Faulty devices.
 - Also allows rdev state to be set to "-insync" via sysfs.
   This can be used for external-metadata arrays.  When the
   'role' is set the device is assumed to be in-sync.  If, after
   setting the role, we set the state to "-insync", the role is
   moved to saved_raid_disk which effectively says the device is
   partly in-sync with that slot and needs a bitmap recovery.
Cc: Andrei Warkentin <andreiw@vmware.com>
Signed-off-by: NeilBrown <neilb@suse.de>
											
										 
											2013-12-09 12:04:56 +11:00
										 |  |  | 					|MD_FEATURE_RECOVERY_BITMAP	\ | 
					
						
							| 
									
										
										
										
											2012-05-21 09:27:00 +10:00
										 |  |  | 					) | 
					
						
							| 
									
										
										
										
											2005-09-09 16:23:51 -07:00
										 |  |  | 
 | 
					
						
							| 
									
										
											  
											
												md: Change handling of save_raid_disk and metadata update during recovery.
Since commit d70ed2e4fafdbef0800e739
   MD: Allow restarting an interrupted incremental recovery.
we don't write out the metadata to devices while they are recovering.
This had a good reason, but has unfortunate consequences.  This patch
changes things to make them work better.
At issue is what happens if the array is shut down while a recovery is
happening, particularly a bitmap-guided recovery.
Ideally the recovery should pick up where it left off.
However the metadata cannot represent the state "A recovery is in
process which is guided by the bitmap".
Before the above mentioned commit, we wrote metadata to the device
which said "this is being recovered and it is up to <here>".  So after
a restart, a full recovery (not bitmap-guided) would happen from
where-ever it was up to.
After the commit the metadata wasn't updated so it still said "This
device is fully in sync with <this> event count".  That leads to a
bitmap-based recovery following the whole bitmap, which should be a
lot less work than a full recovery from some starting point.  So this
was an improvement.
However, updating some metadata but not all leads to other problems.
In particular, the metadata written to the fully-up-to-date device
records that the array has all devices present (even though some are
recovering).  So on restart, mdadm wants to find all devices and
expects them to have current event counts.
Obviously it doesn't (some have old event counts) so (when assembling
with --incremental) it waits indefinitely for the rest of the expected
devices.
It really is wrong to not update all the metadata together.  Doing that
is bound to cause confusion.
Instead, we should make it possible to record the truth in the
metadata.  i.e. we need to be able to record that a device is being
recovered based on the bitmap.
We already have a Feature flag to say that recovery is happening.  We
now add another one to say that it is a bitmap-based recovery.
With this we can remove the code that disables the write-out of
metadata on some devices.
So this patch:
 - moves the setting of 'saved_raid_disk' from add_new_disk to
   the validate_super methods.  This makes sure it is always set
   properly, both when adding a new device to an array, and when
   assembling an array from a collection of devices.
 - Adds a metadata flag MD_FEATURE_RECOVERY_BITMAP which is only
   used if MD_FEATURE_RECOVERY_OFFSET is set, and record that a
   bitmap-based recovery is allowed.
   This is only present in v1.x metadata. v0.90 doesn't support
   devices which are in the middle of recovery at all.
 - Only skips writing metadata to Faulty devices.
 - Also allows rdev state to be set to "-insync" via sysfs.
   This can be used for external-metadata arrays.  When the
   'role' is set the device is assumed to be in-sync.  If, after
   setting the role, we set the state to "-insync", the role is
   moved to saved_raid_disk which effectively says the device is
   partly in-sync with that slot and needs a bitmap recovery.
Cc: Andrei Warkentin <andreiw@vmware.com>
Signed-off-by: NeilBrown <neilb@suse.de>
											
										 
											2013-12-09 12:04:56 +11:00
										 |  |  | #endif
 |